| author    | Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | 2013-05-15 10:26:50 -0400 |
| committer | Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | 2013-05-15 10:26:50 -0400 |
| commit    | 12e04ffcd93b25dfd726d46338c2ee7d23de556e (patch) | |
| tree      | f91479a62805619168994fd3ee55e3ffa23fc24e /net/netfilter | |
| parent    | 9eff37a8713939f218ab8bf0dc93f1d67af7b8b4 (diff) | |
| parent    | f722406faae2d073cc1d01063d1123c35425939e (diff) | |
Merge tag 'v3.10-rc1' into stable/for-linus-3.10
Linux 3.10-rc1
* tag 'v3.10-rc1': (12273 commits)
Linux 3.10-rc1
[SCSI] qla2xxx: Update firmware link in Kconfig file.
[SCSI] iscsi class, qla4xxx: fix sess/conn refcounting when find fns are used
[SCSI] sas: unify the pointlessly separated enums sas_dev_type and sas_device_type
[SCSI] pm80xx: thermal, sas controller config and error handling update
[SCSI] pm80xx: NCQ error handling changes
[SCSI] pm80xx: WWN Modification for PM8081/88/89 controllers
[SCSI] pm80xx: Changed module name and debug messages update
[SCSI] pm80xx: Firmware flash memory free fix, with addition of new memory region for it
[SCSI] pm80xx: SPC new firmware changes for device id 0x8081 alone
[SCSI] pm80xx: Added SPCv/ve specific hardware functionalities and relevant changes in common files
[SCSI] pm80xx: MSI-X implementation for using 64 interrupts
[SCSI] pm80xx: Updated common functions common for SPC and SPCv/ve
[SCSI] pm80xx: Multiple inbound/outbound queue configuration
[SCSI] pm80xx: Added SPCv/ve specific ids, variables and modify for SPC
[SCSI] lpfc: fix up Kconfig dependencies
[SCSI] Handle MLQUEUE busy response in scsi_send_eh_cmnd
dm cache: set config value
dm cache: move config fns
dm thin: generate event when metadata threshold passed
...
Diffstat (limited to 'net/netfilter')
78 files changed, 5866 insertions, 5260 deletions
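Most of the netfilter churn below comes from the ipset rework: timeout and counter handling move into per-element extension blocks, and the shared bitmap logic is factored into the new ip_set_bitmap_gen.h header, which each bitmap type (bitmap:ip, bitmap:ip,mac, bitmap:port) instantiates by defining MTYPE before including it. The sketch below is a minimal, standalone illustration of the CONCAT/TOKEN token-pasting pattern that header relies on; the function body and names other than CONCAT/TOKEN/MTYPE are illustrative only, not kernel code.

```c
/*
 * Standalone sketch (not kernel code) of the token-pasting pattern used by
 * ip_set_bitmap_gen.h: the including file defines MTYPE (e.g. bitmap_ip),
 * and the generic header stamps out functions whose names are built from it.
 */
#include <stdio.h>

#define CONCAT(a, b)	a##b
#define TOKEN(a, b)	CONCAT(a, b)

#define MTYPE		bitmap_ip
#define mtype_do_test	TOKEN(MTYPE, _do_test)

/* Expands to: static int bitmap_ip_do_test(unsigned int id) */
static int mtype_do_test(unsigned int id)
{
	/* stand-in for test_bit(id, map->members) */
	return id & 1;
}

int main(void)
{
	/* The generated, type-specific name is callable directly. */
	printf("%d\n", bitmap_ip_do_test(3));
	return 0;
}
```

The extra TOKEN() indirection matters: pasting through CONCAT() directly would glue the literal token MTYPE into the name, whereas the second macro layer forces MTYPE to be expanded to bitmap_ip before the paste.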
diff --git a/net/netfilter/core.c b/net/netfilter/core.c index a9c488b6c50..07c865a31a3 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -5,6 +5,7 @@ * way. * * Rusty Russell (C)2000 -- This code is GPL. + * Patrick McHardy (c) 2006-2012 */ #include <linux/kernel.h> #include <linux/netfilter.h> @@ -276,10 +277,30 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif +static int __net_init netfilter_net_init(struct net *net) +{ #ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_net_netfilter; -EXPORT_SYMBOL(proc_net_netfilter); + net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", + net->proc_net); + if (!net->nf.proc_netfilter) { + if (!net_eq(net, &init_net)) + pr_err("cannot create netfilter proc entry"); + + return -ENOMEM; + } #endif + return 0; +} + +static void __net_exit netfilter_net_exit(struct net *net) +{ + remove_proc_entry("netfilter", net->proc_net); +} + +static struct pernet_operations netfilter_net_ops = { + .init = netfilter_net_init, + .exit = netfilter_net_exit, +}; void __init netfilter_init(void) { @@ -289,11 +310,8 @@ void __init netfilter_init(void) INIT_LIST_HEAD(&nf_hooks[i][h]); } -#ifdef CONFIG_PROC_FS - proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net); - if (!proc_net_netfilter) + if (register_pernet_subsys(&netfilter_net_ops) < 0) panic("cannot create netfilter proc entry"); -#endif if (netfilter_log_init() < 0) panic("cannot initialize nf_log"); diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h new file mode 100644 index 00000000000..25243379b88 --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -0,0 +1,277 @@ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __IP_SET_BITMAP_IP_GEN_H +#define __IP_SET_BITMAP_IP_GEN_H + +#define CONCAT(a, b) a##b +#define TOKEN(a,b) CONCAT(a, b) + +#define mtype_do_test TOKEN(MTYPE, _do_test) +#define mtype_gc_test TOKEN(MTYPE, _gc_test) +#define mtype_is_filled TOKEN(MTYPE, _is_filled) +#define mtype_do_add TOKEN(MTYPE, _do_add) +#define mtype_do_del TOKEN(MTYPE, _do_del) +#define mtype_do_list TOKEN(MTYPE, _do_list) +#define mtype_do_head TOKEN(MTYPE, _do_head) +#define mtype_adt_elem TOKEN(MTYPE, _adt_elem) +#define mtype_add_timeout TOKEN(MTYPE, _add_timeout) +#define mtype_gc_init TOKEN(MTYPE, _gc_init) +#define mtype_kadt TOKEN(MTYPE, _kadt) +#define mtype_uadt TOKEN(MTYPE, _uadt) +#define mtype_destroy TOKEN(MTYPE, _destroy) +#define mtype_flush TOKEN(MTYPE, _flush) +#define mtype_head TOKEN(MTYPE, _head) +#define mtype_same_set TOKEN(MTYPE, _same_set) +#define mtype_elem TOKEN(MTYPE, _elem) +#define mtype_test TOKEN(MTYPE, _test) +#define mtype_add TOKEN(MTYPE, _add) +#define mtype_del TOKEN(MTYPE, _del) +#define mtype_list TOKEN(MTYPE, _list) +#define mtype_gc TOKEN(MTYPE, _gc) +#define mtype MTYPE + +#define ext_timeout(e, m) \ + (unsigned long *)((e) + (m)->offset[IPSET_OFFSET_TIMEOUT]) +#define ext_counter(e, m) \ + (struct ip_set_counter *)((e) + (m)->offset[IPSET_OFFSET_COUNTER]) +#define get_ext(map, id) ((map)->extensions + (map)->dsize * (id)) + +static void +mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ + struct mtype *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static void +mtype_destroy(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + del_timer_sync(&map->gc); + + ip_set_free(map->members); + if (map->dsize) + ip_set_free(map->extensions); + kfree(map); + + set->data = NULL; +} + +static void +mtype_flush(struct ip_set *set) +{ + struct mtype *map = set->data; + + memset(map->members, 0, map->memsize); +} + +static int +mtype_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct mtype *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + if (mtype_do_head(skb, map) || + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + + map->memsize + + map->dsize * map->elements)) || + (SET_WITH_TIMEOUT(set) && + nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout))) || + (SET_WITH_COUNTER(set) && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, + htonl(IPSET_FLAG_WITH_COUNTERS)))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static int +mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + void *x = get_ext(map, e->id); + int ret = mtype_do_test(e, map); + + if (ret <= 0) + return ret; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, map))) + return 0; + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(x, map), ext, mext, flags); + return 1; +} + +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + void *x = 
get_ext(map, e->id); + int ret = mtype_do_add(e, map, flags); + + if (ret == IPSET_ADD_FAILED) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, map))) + ret = 0; + else if (!(flags & IPSET_FLAG_EXIST)) + return -IPSET_ERR_EXIST; + } + + if (SET_WITH_TIMEOUT(set)) +#ifdef IP_SET_BITMAP_STORED_TIMEOUT + mtype_add_timeout(ext_timeout(x, map), e, ext, map, ret); +#else + ip_set_timeout_set(ext_timeout(x, map), ext->timeout); +#endif + + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(x, map), ext); + return 0; +} + +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + const void *x = get_ext(map, e->id); + + if (mtype_do_del(e, map) || + (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, map)))) + return -IPSET_ERR_EXIST; + + return 0; +} + +static int +mtype_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + struct mtype *map = set->data; + struct nlattr *adt, *nested; + void *x; + u32 id, first = cb->args[2]; + + adt = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!adt) + return -EMSGSIZE; + for (; cb->args[2] < map->elements; cb->args[2]++) { + id = cb->args[2]; + x = get_ext(map, id); + if (!test_bit(id, map->members) || + (SET_WITH_TIMEOUT(set) && +#ifdef IP_SET_BITMAP_STORED_TIMEOUT + mtype_is_filled((const struct mtype_elem *) x) && +#endif + ip_set_timeout_expired(ext_timeout(x, map)))) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, adt); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + if (mtype_do_list(skb, map, id)) + goto nla_put_failure; + if (SET_WITH_TIMEOUT(set)) { +#ifdef IP_SET_BITMAP_STORED_TIMEOUT + if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_stored(map, id, + ext_timeout(x, map))))) + goto nla_put_failure; +#else + if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get( + ext_timeout(x, map))))) + goto nla_put_failure; +#endif + } + if (SET_WITH_COUNTER(set) && + ip_set_put_counter(skb, ext_counter(x, map))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, adt); + + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, adt); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +static void +mtype_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct mtype *map = set->data; + const void *x; + u32 id; + + /* We run parallel with other readers (test element) + * but adding/deleting new entries is locked out */ + read_lock_bh(&set->lock); + for (id = 0; id < map->elements; id++) + if (mtype_gc_test(id, map)) { + x = get_ext(map, id); + if (ip_set_timeout_expired(ext_timeout(x, map))) + clear_bit(id, map->members); + } + read_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static const struct ip_set_type_variant mtype = { + .kadt = mtype_kadt, + .uadt = mtype_uadt, + .adt = { + [IPSET_ADD] = mtype_add, + [IPSET_DEL] = mtype_del, + [IPSET_TEST] = mtype_test, + }, + .destroy = mtype_destroy, + .flush = mtype_flush, + .head = mtype_head, + .list = mtype_list, + .same_set = mtype_same_set, +}; + +#endif /* __IP_SET_BITMAP_IP_GEN_H */ diff --git 
a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 4a92fd47bd4..f1a8128bef0 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -1,6 +1,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -24,31 +24,37 @@ #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> -#define IP_SET_BITMAP_TIMEOUT -#include <linux/netfilter/ipset/ip_set_timeout.h> #define REVISION_MIN 0 -#define REVISION_MAX 0 +#define REVISION_MAX 1 /* Counter support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("bitmap:ip", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); +#define MTYPE bitmap_ip + /* Type structure */ struct bitmap_ip { void *members; /* the set members */ + void *extensions; /* data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ u32 hosts; /* number of hosts in a subnet */ size_t memsize; /* members size */ + size_t dsize; /* extensions struct size */ + size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ u8 netmask; /* subnet netmask */ u32 timeout; /* timeout parameter */ struct timer_list gc; /* garbage collection */ }; -/* Base variant */ +/* ADT structure for generic function args */ +struct bitmap_ip_adt_elem { + u16 id; +}; static inline u32 ip_to_id(const struct bitmap_ip *m, u32 ip) @@ -56,188 +62,67 @@ ip_to_id(const struct bitmap_ip *m, u32 ip) return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; } -static int -bitmap_ip_test(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - const struct bitmap_ip *map = set->data; - u16 id = *(u16 *)value; - - return !!test_bit(id, map->members); -} +/* Common functions */ -static int -bitmap_ip_add(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) { - struct bitmap_ip *map = set->data; - u16 id = *(u16 *)value; - - if (test_and_set_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; + return !!test_bit(e->id, map->members); } -static int -bitmap_ip_del(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map) { - struct bitmap_ip *map = set->data; - u16 id = *(u16 *)value; - - if (!test_and_clear_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; -} - -static int -bitmap_ip_list(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) -{ - const struct bitmap_ip *map = set->data; - struct nlattr *atd, *nested; - u32 id, first = cb->args[2]; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] < map->elements; cb->args[2]++) { - id = cb->args[2]; - if (!test_bit(id, map->members)) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if 
(nla_put_ipaddr4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id * map->hosts))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + return !!test_bit(id, map->members); } -/* Timeout variant */ - -static int -bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, + u32 flags) { - const struct bitmap_ip *map = set->data; - const unsigned long *members = map->members; - u16 id = *(u16 *)value; - - return ip_set_timeout_test(members[id]); + return !!test_and_set_bit(e->id, map->members); } -static int -bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) { - struct bitmap_ip *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - - if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST)) - return -IPSET_ERR_EXIST; - - members[id] = ip_set_timeout_set(timeout); - - return 0; + return !test_and_clear_bit(e->id, map->members); } -static int -bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id) { - struct bitmap_ip *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - int ret = -IPSET_ERR_EXIST; - - if (ip_set_timeout_test(members[id])) - ret = 0; - - members[id] = IPSET_ELEM_UNSET; - return ret; + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id * map->hosts)); } -static int -bitmap_ip_tlist(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map) { - const struct bitmap_ip *map = set->data; - struct nlattr *adt, *nested; - u32 id, first = cb->args[2]; - const unsigned long *members = map->members; - - adt = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!adt) - return -EMSGSIZE; - for (; cb->args[2] < map->elements; cb->args[2]++) { - id = cb->args[2]; - if (!ip_set_timeout_test(members[id])) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, adt); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id * map->hosts)) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(members[id])))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, adt); - - /* Set listing finished */ - cb->args[2] = 0; - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, adt); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) || + (map->netmask != 32 && + nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask)); } static int bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_ip *map 
= set->data; ipset_adtfn adtfn = set->variant->adt[adt]; + struct bitmap_ip_adt_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); u32 ip; ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; - ip = ip_to_id(map, ip); + e.id = ip_to_id(map, ip); - return adtfn(set, &ip, opt_timeout(opt, map), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int @@ -246,33 +131,31 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], { struct bitmap_ip *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - u32 timeout = map->timeout; - u32 ip, ip_to, id; + u32 ip, ip_to; + struct bitmap_ip_adt_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(map); int ret = 0; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(map->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - if (adt == IPSET_TEST) { - id = ip_to_id(map, ip); - return adtfn(set, &id, timeout, flags); + e.id = ip_to_id(map, ip); + return adtfn(set, &e, &ext, &ext, flags); } if (tb[IPSET_ATTR_IP_TO]) { @@ -297,8 +180,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; for (; !before(ip_to, ip); ip += map->hosts) { - id = ip_to_id(map, ip); - ret = adtfn(set, &id, timeout, flags); + e.id = ip_to_id(map, ip); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -308,54 +191,6 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static void -bitmap_ip_destroy(struct ip_set *set) -{ - struct bitmap_ip *map = set->data; - - if (with_timeout(map->timeout)) - del_timer_sync(&map->gc); - - ip_set_free(map->members); - kfree(map); - - set->data = NULL; -} - -static void -bitmap_ip_flush(struct ip_set *set) -{ - struct bitmap_ip *map = set->data; - - memset(map->members, 0, map->memsize); -} - -static int -bitmap_ip_head(struct ip_set *set, struct sk_buff *skb) -{ - const struct bitmap_ip *map = set->data; - struct nlattr *nested; - - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) - goto nla_put_failure; - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || - nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) || - (map->netmask != 32 && - nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask)) || - nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || - nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->memsize)) || - (with_timeout(map->timeout) && - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - - return 0; -nla_put_failure: - return -EMSGSIZE; -} - static bool bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -365,70 +200,35 @@ bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b) return 
x->first_ip == y->first_ip && x->last_ip == y->last_ip && x->netmask == y->netmask && - x->timeout == y->timeout; + x->timeout == y->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant bitmap_ip = { - .kadt = bitmap_ip_kadt, - .uadt = bitmap_ip_uadt, - .adt = { - [IPSET_ADD] = bitmap_ip_add, - [IPSET_DEL] = bitmap_ip_del, - [IPSET_TEST] = bitmap_ip_test, - }, - .destroy = bitmap_ip_destroy, - .flush = bitmap_ip_flush, - .head = bitmap_ip_head, - .list = bitmap_ip_list, - .same_set = bitmap_ip_same_set, +/* Plain variant */ + +struct bitmap_ip_elem { }; -static const struct ip_set_type_variant bitmap_tip = { - .kadt = bitmap_ip_kadt, - .uadt = bitmap_ip_uadt, - .adt = { - [IPSET_ADD] = bitmap_ip_tadd, - [IPSET_DEL] = bitmap_ip_tdel, - [IPSET_TEST] = bitmap_ip_ttest, - }, - .destroy = bitmap_ip_destroy, - .flush = bitmap_ip_flush, - .head = bitmap_ip_head, - .list = bitmap_ip_tlist, - .same_set = bitmap_ip_same_set, +/* Timeout variant */ + +struct bitmap_ipt_elem { + unsigned long timeout; }; -static void -bitmap_ip_gc(unsigned long ul_set) -{ - struct ip_set *set = (struct ip_set *) ul_set; - struct bitmap_ip *map = set->data; - unsigned long *table = map->members; - u32 id; - - /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); - for (id = 0; id < map->elements; id++) - if (ip_set_timeout_expired(table[id])) - table[id] = IPSET_ELEM_UNSET; - read_unlock_bh(&set->lock); - - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +/* Plain variant with counter */ -static void -bitmap_ip_gc_init(struct ip_set *set) -{ - struct bitmap_ip *map = set->data; +struct bitmap_ipc_elem { + struct ip_set_counter counter; +}; - init_timer(&map->gc); - map->gc.data = (unsigned long) set; - map->gc.function = bitmap_ip_gc; - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +/* Timeout variant with counter */ + +struct bitmap_ipct_elem { + unsigned long timeout; + struct ip_set_counter counter; +}; + +#include "ip_set_bitmap_gen.h" /* Create bitmap:ip type of sets */ @@ -440,6 +240,13 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; + if (map->dsize) { + map->extensions = ip_set_alloc(map->dsize * elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; @@ -457,13 +264,14 @@ static int bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) { struct bitmap_ip *map; - u32 first_ip, last_ip, hosts; + u32 first_ip, last_ip, hosts, cadt_flags = 0; u64 elements; u8 netmask = 32; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); @@ -526,8 +334,45 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) if (!map) return -ENOMEM; - if (tb[IPSET_ATTR_TIMEOUT]) { - map->memsize = elements * sizeof(unsigned long); + map->memsize = bitmap_bytes(0, elements - 1); + set->variant = &bitmap_ip; + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { + set->extensions |= IPSET_EXT_COUNTER; + if 
(tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_ipct_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_ipct_elem, timeout); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_ipct_elem, counter); + + if (!init_map_ip(set, map, first_ip, last_ip, + elements, hosts, netmask)) { + kfree(map); + return -ENOMEM; + } + + map->timeout = ip_set_timeout_uget( + tb[IPSET_ATTR_TIMEOUT]); + set->extensions |= IPSET_EXT_TIMEOUT; + + bitmap_ip_gc_init(set, bitmap_ip_gc); + } else { + map->dsize = sizeof(struct bitmap_ipc_elem); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_ipc_elem, counter); + + if (!init_map_ip(set, map, first_ip, last_ip, + elements, hosts, netmask)) { + kfree(map); + return -ENOMEM; + } + } + } else if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_ipt_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_ipt_elem, timeout); if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { @@ -536,19 +381,16 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) } map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - set->variant = &bitmap_tip; + set->extensions |= IPSET_EXT_TIMEOUT; - bitmap_ip_gc_init(set); + bitmap_ip_gc_init(set, bitmap_ip_gc); } else { - map->memsize = bitmap_bytes(0, elements - 1); - + map->dsize = 0; if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { kfree(map); return -ENOMEM; } - - set->variant = &bitmap_ip; } return 0; } @@ -568,6 +410,7 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -575,6 +418,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 0f92dc24cb8..3b30e0bef89 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -23,340 +23,208 @@ #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> #define REVISION_MIN 0 -#define REVISION_MAX 0 +#define REVISION_MAX 1 /* Counter support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("bitmap:ip,mac", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); +#define MTYPE bitmap_ipmac +#define IP_SET_BITMAP_STORED_TIMEOUT + enum { - MAC_EMPTY, /* element is not set */ - MAC_FILLED, /* element is set with MAC */ MAC_UNSET, /* element is set, without MAC */ + MAC_FILLED, /* 
element is set with MAC */ }; /* Type structure */ struct bitmap_ipmac { void *members; /* the set members */ + void *extensions; /* MAC + data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ + u32 elements; /* number of max elements in the set */ u32 timeout; /* timeout value */ struct timer_list gc; /* garbage collector */ + size_t memsize; /* members size */ size_t dsize; /* size of element */ + size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ }; /* ADT structure for generic function args */ -struct ipmac { - u32 id; /* id in array */ - unsigned char *ether; /* ethernet address */ +struct bitmap_ipmac_adt_elem { + u16 id; + unsigned char *ether; }; -/* Member element without and with timeout */ - -struct ipmac_elem { +struct bitmap_ipmac_elem { unsigned char ether[ETH_ALEN]; - unsigned char match; + unsigned char filled; } __attribute__ ((aligned)); -struct ipmac_telem { - unsigned char ether[ETH_ALEN]; - unsigned char match; - unsigned long timeout; -} __attribute__ ((aligned)); - -static inline void * -bitmap_ipmac_elem(const struct bitmap_ipmac *map, u32 id) +static inline u32 +ip_to_id(const struct bitmap_ipmac *m, u32 ip) { - return (void *)((char *)map->members + id * map->dsize); + return ip - m->first_ip; } -static inline bool -bitmap_timeout(const struct bitmap_ipmac *map, u32 id) +static inline struct bitmap_ipmac_elem * +get_elem(void *extensions, u16 id, size_t dsize) { - const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id); - - return ip_set_timeout_test(elem->timeout); + return (struct bitmap_ipmac_elem *)(extensions + id * dsize); } -static inline bool -bitmap_expired(const struct bitmap_ipmac *map, u32 id) -{ - const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id); - - return ip_set_timeout_expired(elem->timeout); -} +/* Common functions */ static inline int -bitmap_ipmac_exist(const struct ipmac_telem *elem) +bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, + const struct bitmap_ipmac *map) { - return elem->match == MAC_UNSET || - (elem->match == MAC_FILLED && - !ip_set_timeout_expired(elem->timeout)); -} + const struct bitmap_ipmac_elem *elem; -/* Base variant */ - -static int -bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - const struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - switch (elem->match) { - case MAC_UNSET: - /* Trigger kernel to fill out the ethernet address */ - return -EAGAIN; - case MAC_FILLED: - return data->ether == NULL || - ether_addr_equal(data->ether, elem->ether); - } - return 0; -} - -static int -bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - switch (elem->match) { - case MAC_UNSET: - if (!data->ether) - /* Already added without ethernet address */ - return -IPSET_ERR_EXIST; - /* Fill the MAC address */ - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - break; - case MAC_FILLED: - return -IPSET_ERR_EXIST; - case MAC_EMPTY: - if (data->ether) { - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - } else - elem->match = MAC_UNSET; - } - - return 0; -} - -static int -bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - struct bitmap_ipmac *map = set->data; - const struct 
ipmac *data = value; - struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - if (elem->match == MAC_EMPTY) - return -IPSET_ERR_EXIST; - - elem->match = MAC_EMPTY; - - return 0; + if (!test_bit(e->id, map->members)) + return 0; + elem = get_elem(map->extensions, e->id, map->dsize); + if (elem->filled == MAC_FILLED) + return e->ether == NULL || + ether_addr_equal(e->ether, elem->ether); + /* Trigger kernel to fill out the ethernet address */ + return -EAGAIN; } -static int -bitmap_ipmac_list(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map) { - const struct bitmap_ipmac *map = set->data; - const struct ipmac_elem *elem; - struct nlattr *atd, *nested; - u32 id, first = cb->args[2]; - u32 last = map->last_ip - map->first_ip; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - elem = bitmap_ipmac_elem(map, id); - if (elem->match == MAC_EMPTY) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id)) || - (elem->match == MAC_FILLED && - nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, - elem->ether))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; - - return 0; + const struct bitmap_ipmac_elem *elem; -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + if (!test_bit(id, map->members)) + return 0; + elem = get_elem(map->extensions, id, map->dsize); + /* Timer not started for the incomplete elements */ + return elem->filled == MAC_FILLED; } -/* Timeout variant */ - -static int -bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { - const struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - switch (elem->match) { - case MAC_UNSET: - /* Trigger kernel to fill out the ethernet address */ - return -EAGAIN; - case MAC_FILLED: - return (data->ether == NULL || - ether_addr_equal(data->ether, elem->ether)) && - !bitmap_expired(map, data->id); - } - return 0; + return elem->filled == MAC_FILLED; } -static int -bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ipmac_add_timeout(unsigned long *timeout, + const struct bitmap_ipmac_adt_elem *e, + const struct ip_set_ext *ext, + struct bitmap_ipmac *map, int mode) { - struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id); - bool flag_exist = flags & IPSET_FLAG_EXIST; + u32 t = ext->timeout; - switch (elem->match) { - case MAC_UNSET: - if (!(data->ether || flag_exist)) - /* Already added without ethernet address */ - return -IPSET_ERR_EXIST; - /* Fill the MAC address and activate the timer */ - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - if (timeout == map->timeout) + if (mode == IPSET_ADD_START_STORED_TIMEOUT) { + if (t == map->timeout) /* Timeout was not specified, get stored one */ - timeout = 
elem->timeout; - elem->timeout = ip_set_timeout_set(timeout); - break; - case MAC_FILLED: - if (!(bitmap_expired(map, data->id) || flag_exist)) - return -IPSET_ERR_EXIST; - /* Fall through */ - case MAC_EMPTY: - if (data->ether) { - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - } else - elem->match = MAC_UNSET; + t = *timeout; + ip_set_timeout_set(timeout, t); + } else { /* If MAC is unset yet, we store plain timeout value * because the timer is not activated yet * and we can reuse it later when MAC is filled out, * possibly by the kernel */ - elem->timeout = data->ether ? ip_set_timeout_set(timeout) - : timeout; - break; + if (e->ether) + ip_set_timeout_set(timeout, t); + else + *timeout = t; } - return 0; } -static int -bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, + struct bitmap_ipmac *map, u32 flags) { - struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id); + struct bitmap_ipmac_elem *elem; + + elem = get_elem(map->extensions, e->id, map->dsize); + if (test_and_set_bit(e->id, map->members)) { + if (elem->filled == MAC_FILLED) { + if (e->ether && (flags & IPSET_FLAG_EXIST)) + memcpy(elem->ether, e->ether, ETH_ALEN); + return IPSET_ADD_FAILED; + } else if (!e->ether) + /* Already added without ethernet address */ + return IPSET_ADD_FAILED; + /* Fill the MAC address and trigger the timer activation */ + memcpy(elem->ether, e->ether, ETH_ALEN); + elem->filled = MAC_FILLED; + return IPSET_ADD_START_STORED_TIMEOUT; + } else if (e->ether) { + /* We can store MAC too */ + memcpy(elem->ether, e->ether, ETH_ALEN); + elem->filled = MAC_FILLED; + return 0; + } else { + elem->filled = MAC_UNSET; + /* MAC is not stored yet, don't start timer */ + return IPSET_ADD_STORE_PLAIN_TIMEOUT; + } +} - if (elem->match == MAC_EMPTY || bitmap_expired(map, data->id)) - return -IPSET_ERR_EXIST; +static inline int +bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, + struct bitmap_ipmac *map) +{ + return !test_and_clear_bit(e->id, map->members); +} - elem->match = MAC_EMPTY; +static inline unsigned long +ip_set_timeout_stored(struct bitmap_ipmac *map, u32 id, unsigned long *timeout) +{ + const struct bitmap_ipmac_elem *elem = + get_elem(map->extensions, id, map->dsize); - return 0; + return elem->filled == MAC_FILLED ? ip_set_timeout_get(timeout) : + *timeout; } -static int -bitmap_ipmac_tlist(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, + u32 id) { - const struct bitmap_ipmac *map = set->data; - const struct ipmac_telem *elem; - struct nlattr *atd, *nested; - u32 id, first = cb->args[2]; - u32 timeout, last = map->last_ip - map->first_ip; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - elem = bitmap_ipmac_elem(map, id); - if (!bitmap_ipmac_exist(elem)) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id)) || - (elem->match == MAC_FILLED && - nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, - elem->ether))) - goto nla_put_failure; - timeout = elem->match == MAC_UNSET ? 
elem->timeout - : ip_set_timeout_get(elem->timeout); - if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(timeout))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; + const struct bitmap_ipmac_elem *elem = + get_elem(map->extensions, id, map->dsize); - return 0; + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id)) || + (elem->filled == MAC_FILLED && + nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); +} -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - return -EMSGSIZE; +static inline int +bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) +{ + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); } static int bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct ipmac data; + struct bitmap_ipmac_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); + u32 ip; /* MAC can be src only */ if (!(opt->flags & IPSET_DIM_TWO_SRC)) return 0; - data.id = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); - if (data.id < map->first_ip || data.id > map->last_ip) + ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); + if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; /* Backward compatibility: we don't check the second flag */ @@ -364,10 +232,10 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - data.id -= map->first_ip; - data.ether = eth_hdr(skb)->h_source; + e.id = ip_to_id(map, ip); + e.ether = eth_hdr(skb)->h_source; - return adtfn(set, &data, opt_timeout(opt, map), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int @@ -376,91 +244,39 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], { const struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct ipmac data; - u32 timeout = map->timeout; + struct bitmap_ipmac_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(map); + u32 ip; int ret = 0; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &data.id); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (data.id < map->first_ip || data.id > map->last_ip) + if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; + e.id = ip_to_id(map, ip); if (tb[IPSET_ATTR_ETHER]) - data.ether = nla_data(tb[IPSET_ATTR_ETHER]); + e.ether = nla_data(tb[IPSET_ATTR_ETHER]); else - data.ether = NULL; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(map->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + e.ether = NULL; - data.id -= map->first_ip; - - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); 
return ip_set_eexist(ret, flags) ? 0 : ret; } -static void -bitmap_ipmac_destroy(struct ip_set *set) -{ - struct bitmap_ipmac *map = set->data; - - if (with_timeout(map->timeout)) - del_timer_sync(&map->gc); - - ip_set_free(map->members); - kfree(map); - - set->data = NULL; -} - -static void -bitmap_ipmac_flush(struct ip_set *set) -{ - struct bitmap_ipmac *map = set->data; - - memset(map->members, 0, - (map->last_ip - map->first_ip + 1) * map->dsize); -} - -static int -bitmap_ipmac_head(struct ip_set *set, struct sk_buff *skb) -{ - const struct bitmap_ipmac *map = set->data; - struct nlattr *nested; - - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) - goto nla_put_failure; - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || - nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) || - nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || - nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + - ((map->last_ip - map->first_ip + 1) * - map->dsize))) || - (with_timeout(map->timeout) && - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - - return 0; -nla_put_failure: - return -EMSGSIZE; -} - static bool bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -469,85 +285,64 @@ bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) return x->first_ip == y->first_ip && x->last_ip == y->last_ip && - x->timeout == y->timeout; + x->timeout == y->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant bitmap_ipmac = { - .kadt = bitmap_ipmac_kadt, - .uadt = bitmap_ipmac_uadt, - .adt = { - [IPSET_ADD] = bitmap_ipmac_add, - [IPSET_DEL] = bitmap_ipmac_del, - [IPSET_TEST] = bitmap_ipmac_test, - }, - .destroy = bitmap_ipmac_destroy, - .flush = bitmap_ipmac_flush, - .head = bitmap_ipmac_head, - .list = bitmap_ipmac_list, - .same_set = bitmap_ipmac_same_set, -}; +/* Plain variant */ -static const struct ip_set_type_variant bitmap_tipmac = { - .kadt = bitmap_ipmac_kadt, - .uadt = bitmap_ipmac_uadt, - .adt = { - [IPSET_ADD] = bitmap_ipmac_tadd, - [IPSET_DEL] = bitmap_ipmac_tdel, - [IPSET_TEST] = bitmap_ipmac_ttest, - }, - .destroy = bitmap_ipmac_destroy, - .flush = bitmap_ipmac_flush, - .head = bitmap_ipmac_head, - .list = bitmap_ipmac_tlist, - .same_set = bitmap_ipmac_same_set, +/* Timeout variant */ + +struct bitmap_ipmact_elem { + struct { + unsigned char ether[ETH_ALEN]; + unsigned char filled; + } __attribute__ ((aligned)); + unsigned long timeout; }; -static void -bitmap_ipmac_gc(unsigned long ul_set) -{ - struct ip_set *set = (struct ip_set *) ul_set; - struct bitmap_ipmac *map = set->data; - struct ipmac_telem *elem; - u32 id, last = map->last_ip - map->first_ip; - - /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); - for (id = 0; id <= last; id++) { - elem = bitmap_ipmac_elem(map, id); - if (elem->match == MAC_FILLED && - ip_set_timeout_expired(elem->timeout)) - elem->match = MAC_EMPTY; - } - read_unlock_bh(&set->lock); +/* Plain variant with counter */ - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +struct bitmap_ipmacc_elem { + struct { + unsigned char ether[ETH_ALEN]; + unsigned char filled; + } __attribute__ ((aligned)); + struct ip_set_counter counter; +}; -static void -bitmap_ipmac_gc_init(struct ip_set *set) -{ - struct bitmap_ipmac *map = set->data; +/* Timeout variant with 
counter */ - init_timer(&map->gc); - map->gc.data = (unsigned long) set; - map->gc.function = bitmap_ipmac_gc; - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +struct bitmap_ipmacct_elem { + struct { + unsigned char ether[ETH_ALEN]; + unsigned char filled; + } __attribute__ ((aligned)); + unsigned long timeout; + struct ip_set_counter counter; +}; + +#include "ip_set_bitmap_gen.h" /* Create bitmap:ip,mac type of sets */ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, - u32 first_ip, u32 last_ip) + u32 first_ip, u32 last_ip, u32 elements) { map->members = ip_set_alloc((last_ip - first_ip + 1) * map->dsize); if (!map->members) return false; + if (map->dsize) { + map->extensions = ip_set_alloc(map->dsize * elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } map->first_ip = first_ip; map->last_ip = last_ip; + map->elements = elements; map->timeout = IPSET_NO_TIMEOUT; set->data = map; @@ -560,13 +355,14 @@ static int bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[], u32 flags) { - u32 first_ip, last_ip; + u32 first_ip, last_ip, cadt_flags = 0; u64 elements; struct bitmap_ipmac *map; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); @@ -601,28 +397,59 @@ bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - if (tb[IPSET_ATTR_TIMEOUT]) { - map->dsize = sizeof(struct ipmac_telem); + map->memsize = bitmap_bytes(0, elements - 1); + set->variant = &bitmap_ipmac; + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { + set->extensions |= IPSET_EXT_COUNTER; + if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_ipmacct_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_ipmacct_elem, timeout); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_ipmacct_elem, counter); + + if (!init_map_ipmac(set, map, first_ip, last_ip, + elements)) { + kfree(map); + return -ENOMEM; + } + map->timeout = ip_set_timeout_uget( + tb[IPSET_ATTR_TIMEOUT]); + set->extensions |= IPSET_EXT_TIMEOUT; + bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); + } else { + map->dsize = sizeof(struct bitmap_ipmacc_elem); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_ipmacc_elem, counter); + + if (!init_map_ipmac(set, map, first_ip, last_ip, + elements)) { + kfree(map); + return -ENOMEM; + } + } + } else if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_ipmact_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_ipmact_elem, timeout); - if (!init_map_ipmac(set, map, first_ip, last_ip)) { + if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); return -ENOMEM; } - map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = &bitmap_tipmac; - - bitmap_ipmac_gc_init(set); + set->extensions |= IPSET_EXT_TIMEOUT; + bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); } else { - map->dsize = sizeof(struct ipmac_elem); + map->dsize = sizeof(struct bitmap_ipmac_elem); - if (!init_map_ipmac(set, map, first_ip, last_ip)) { + if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); return -ENOMEM; } set->variant = &bitmap_ipmac; - } return 0; } @@ -641,6 
+468,7 @@ static struct ip_set_type bitmap_ipmac_type = { [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -648,6 +476,8 @@ static struct ip_set_type bitmap_ipmac_type = { .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index e6b2db76f4c..8207d1fda52 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,205 +19,94 @@ #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> #include <linux/netfilter/ipset/ip_set_getport.h> -#define IP_SET_BITMAP_TIMEOUT -#include <linux/netfilter/ipset/ip_set_timeout.h> #define REVISION_MIN 0 -#define REVISION_MAX 0 +#define REVISION_MAX 1 /* Counter support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("bitmap:port", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_bitmap:port"); +#define MTYPE bitmap_port + /* Type structure */ struct bitmap_port { void *members; /* the set members */ + void *extensions; /* data extensions */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ + u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ + size_t dsize; /* extensions struct size */ + size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ u32 timeout; /* timeout parameter */ struct timer_list gc; /* garbage collection */ }; -/* Base variant */ +/* ADT structure for generic function args */ +struct bitmap_port_adt_elem { + u16 id; +}; -static int -bitmap_port_test(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline u16 +port_to_id(const struct bitmap_port *m, u16 port) { - const struct bitmap_port *map = set->data; - u16 id = *(u16 *)value; - - return !!test_bit(id, map->members); + return port - m->first_port; } -static int -bitmap_port_add(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - struct bitmap_port *map = set->data; - u16 id = *(u16 *)value; - - if (test_and_set_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; -} +/* Common functions */ -static int -bitmap_port_del(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_test(const struct bitmap_port_adt_elem *e, + const struct bitmap_port *map) { - struct bitmap_port *map = set->data; - u16 id = *(u16 *)value; - - if (!test_and_clear_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; + return !!test_bit(e->id, map->members); } -static int -bitmap_port_list(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_port_gc_test(u16 id, const struct bitmap_port *map) { - const struct bitmap_port *map = set->data; - struct nlattr *atd, *nested; - u16 id, first = 
cb->args[2]; - u16 last = map->last_port - map->first_port; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - if (!test_bit(id, map->members)) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if (nla_put_net16(skb, IPSET_ATTR_PORT, - htons(map->first_port + id))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + return !!test_bit(id, map->members); } -/* Timeout variant */ - -static int -bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_add(const struct bitmap_port_adt_elem *e, + struct bitmap_port *map, u32 flags) { - const struct bitmap_port *map = set->data; - const unsigned long *members = map->members; - u16 id = *(u16 *)value; - - return ip_set_timeout_test(members[id]); + return !!test_and_set_bit(e->id, map->members); } -static int -bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_del(const struct bitmap_port_adt_elem *e, + struct bitmap_port *map) { - struct bitmap_port *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - - if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST)) - return -IPSET_ERR_EXIST; - - members[id] = ip_set_timeout_set(timeout); - - return 0; + return !test_and_clear_bit(e->id, map->members); } -static int -bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id) { - struct bitmap_port *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - int ret = -IPSET_ERR_EXIST; - - if (ip_set_timeout_test(members[id])) - ret = 0; - - members[id] = IPSET_ELEM_UNSET; - return ret; + return nla_put_net16(skb, IPSET_ATTR_PORT, + htons(map->first_port + id)); } -static int -bitmap_port_tlist(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) { - const struct bitmap_port *map = set->data; - struct nlattr *adt, *nested; - u16 id, first = cb->args[2]; - u16 last = map->last_port - map->first_port; - const unsigned long *members = map->members; - - adt = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!adt) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - if (!ip_set_timeout_test(members[id])) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, adt); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if (nla_put_net16(skb, IPSET_ATTR_PORT, - htons(map->first_port + id)) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(members[id])))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, adt); - - /* Set listing finished */ - cb->args[2] = 0; - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, adt); - if (unlikely(id == first)) { - cb->args[2] = 0; - return 
-EMSGSIZE; - } - return 0; + return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || + nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); } static int bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; + struct bitmap_port_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); __be16 __port; u16 port = 0; @@ -230,9 +119,9 @@ bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; - port -= map->first_port; + e.id = port_to_id(map, port); - return adtfn(set, &port, opt_timeout(opt, map), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int @@ -241,14 +130,17 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - u32 timeout = map->timeout; + struct bitmap_port_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(map); u32 port; /* wraparound */ - u16 id, port_to; + u16 port_to; int ret = 0; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -257,16 +149,13 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(map->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; if (adt == IPSET_TEST) { - id = port - map->first_port; - return adtfn(set, &id, timeout, flags); + e.id = port_to_id(map, port); + return adtfn(set, &e, &ext, &ext, flags); } if (tb[IPSET_ATTR_PORT_TO]) { @@ -283,8 +172,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; for (; port <= port_to; port++) { - id = port - map->first_port; - ret = adtfn(set, &id, timeout, flags); + e.id = port_to_id(map, port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -294,52 +183,6 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static void -bitmap_port_destroy(struct ip_set *set) -{ - struct bitmap_port *map = set->data; - - if (with_timeout(map->timeout)) - del_timer_sync(&map->gc); - - ip_set_free(map->members); - kfree(map); - - set->data = NULL; -} - -static void -bitmap_port_flush(struct ip_set *set) -{ - struct bitmap_port *map = set->data; - - memset(map->members, 0, map->memsize); -} - -static int -bitmap_port_head(struct ip_set *set, struct sk_buff *skb) -{ - const struct bitmap_port *map = set->data; - struct nlattr *nested; - - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) - goto nla_put_failure; - if (nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || - nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)) || - nla_put_net32(skb, 
IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || - nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->memsize)) || - (with_timeout(map->timeout) && - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - - return 0; -nla_put_failure: - return -EMSGSIZE; -} - static bool bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -348,71 +191,35 @@ bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b) return x->first_port == y->first_port && x->last_port == y->last_port && - x->timeout == y->timeout; + x->timeout == y->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant bitmap_port = { - .kadt = bitmap_port_kadt, - .uadt = bitmap_port_uadt, - .adt = { - [IPSET_ADD] = bitmap_port_add, - [IPSET_DEL] = bitmap_port_del, - [IPSET_TEST] = bitmap_port_test, - }, - .destroy = bitmap_port_destroy, - .flush = bitmap_port_flush, - .head = bitmap_port_head, - .list = bitmap_port_list, - .same_set = bitmap_port_same_set, +/* Plain variant */ + +struct bitmap_port_elem { }; -static const struct ip_set_type_variant bitmap_tport = { - .kadt = bitmap_port_kadt, - .uadt = bitmap_port_uadt, - .adt = { - [IPSET_ADD] = bitmap_port_tadd, - [IPSET_DEL] = bitmap_port_tdel, - [IPSET_TEST] = bitmap_port_ttest, - }, - .destroy = bitmap_port_destroy, - .flush = bitmap_port_flush, - .head = bitmap_port_head, - .list = bitmap_port_tlist, - .same_set = bitmap_port_same_set, +/* Timeout variant */ + +struct bitmap_portt_elem { + unsigned long timeout; }; -static void -bitmap_port_gc(unsigned long ul_set) -{ - struct ip_set *set = (struct ip_set *) ul_set; - struct bitmap_port *map = set->data; - unsigned long *table = map->members; - u32 id; /* wraparound */ - u16 last = map->last_port - map->first_port; - - /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); - for (id = 0; id <= last; id++) - if (ip_set_timeout_expired(table[id])) - table[id] = IPSET_ELEM_UNSET; - read_unlock_bh(&set->lock); - - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +/* Plain variant with counter */ -static void -bitmap_port_gc_init(struct ip_set *set) -{ - struct bitmap_port *map = set->data; +struct bitmap_portc_elem { + struct ip_set_counter counter; +}; - init_timer(&map->gc); - map->gc.data = (unsigned long) set; - map->gc.function = bitmap_port_gc; - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +/* Timeout variant with counter */ + +struct bitmap_portct_elem { + unsigned long timeout; + struct ip_set_counter counter; +}; + +#include "ip_set_bitmap_gen.h" /* Create bitmap:ip type of sets */ @@ -423,6 +230,13 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; + if (map->dsize) { + map->extensions = ip_set_alloc(map->dsize * map->elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } map->first_port = first_port; map->last_port = last_port; map->timeout = IPSET_NO_TIMEOUT; @@ -434,15 +248,16 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, } static int -bitmap_port_create(struct ip_set *set, struct nlattr *tb[], - u32 flags) +bitmap_port_create(struct ip_set *set, struct nlattr *tb[], u32 flags) { struct bitmap_port *map; u16 first_port, last_port; + u32 cadt_flags = 0; if (unlikely(!ip_set_attr_netorder(tb, 
IPSET_ATTR_PORT) || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); @@ -458,28 +273,56 @@ bitmap_port_create(struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - if (tb[IPSET_ATTR_TIMEOUT]) { - map->memsize = (last_port - first_port + 1) - * sizeof(unsigned long); - + map->elements = last_port - first_port + 1; + map->memsize = map->elements * sizeof(unsigned long); + set->variant = &bitmap_port; + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { + set->extensions |= IPSET_EXT_COUNTER; + if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_portct_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_portct_elem, timeout); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_portct_elem, counter); + if (!init_map_port(set, map, first_port, last_port)) { + kfree(map); + return -ENOMEM; + } + + map->timeout = + ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->extensions |= IPSET_EXT_TIMEOUT; + bitmap_port_gc_init(set, bitmap_port_gc); + } else { + map->dsize = sizeof(struct bitmap_portc_elem); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_portc_elem, counter); + if (!init_map_port(set, map, first_port, last_port)) { + kfree(map); + return -ENOMEM; + } + } + } else if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_portt_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_portt_elem, timeout); if (!init_map_port(set, map, first_port, last_port)) { kfree(map); return -ENOMEM; } map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - set->variant = &bitmap_tport; - - bitmap_port_gc_init(set); + set->extensions |= IPSET_EXT_TIMEOUT; + bitmap_port_gc_init(set, bitmap_port_gc); } else { - map->memsize = bitmap_bytes(0, last_port - first_port); - pr_debug("memsize: %zu\n", map->memsize); + map->dsize = 0; if (!init_map_port(set, map, first_port, last_port)) { kfree(map); return -ENOMEM; } - set->variant = &bitmap_port; } return 0; } @@ -497,12 +340,15 @@ static struct ip_set_type bitmap_port_type = { [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 1ba9dbc0e10..f7713900798 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1,6 +1,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -15,7 +15,6 @@ #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/spinlock.h> -#include 
<linux/netlink.h> #include <linux/rculist.h> #include <net/netlink.h> @@ -316,6 +315,29 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); +int +ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], + struct ip_set_ext *ext) +{ + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!(set->extensions & IPSET_EXT_TIMEOUT)) + return -IPSET_ERR_TIMEOUT; + ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { + if (!(set->extensions & IPSET_EXT_COUNTER)) + return -IPSET_ERR_COUNTER; + if (tb[IPSET_ATTR_BYTES]) + ext->bytes = be64_to_cpu(nla_get_be64( + tb[IPSET_ATTR_BYTES])); + if (tb[IPSET_ATTR_PACKETS]) + ext->packets = be64_to_cpu(nla_get_be64( + tb[IPSET_ATTR_PACKETS])); + } + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_extensions); + /* * Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace @@ -366,8 +388,7 @@ ip_set_rcu_get(ip_set_id_t index) int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, - const struct xt_action_param *par, - const struct ip_set_adt_opt *opt) + const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(index); int ret = 0; @@ -392,7 +413,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, ret = 1; } else { /* --return-nomatch: invert matched element */ - if ((opt->flags & IPSET_RETURN_NOMATCH) && + if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) && (set->type->features & IPSET_TYPE_NOMATCH) && (ret > 0 || ret == -ENOTEMPTY)) ret = -ret; @@ -405,8 +426,7 @@ EXPORT_SYMBOL_GPL(ip_set_test); int ip_set_add(ip_set_id_t index, const struct sk_buff *skb, - const struct xt_action_param *par, - const struct ip_set_adt_opt *opt) + const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(index); int ret; @@ -428,8 +448,7 @@ EXPORT_SYMBOL_GPL(ip_set_add); int ip_set_del(ip_set_id_t index, const struct sk_buff *skb, - const struct xt_action_param *par, - const struct ip_set_adt_opt *opt) + const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(index); int ret = 0; @@ -1085,7 +1104,7 @@ static int dump_init(struct netlink_callback *cb) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); - int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; @@ -1301,7 +1320,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, struct sk_buff *skb2; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg) + nlmsg_len(nlh); - int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; struct nlattr *cmdattr; u32 *errline; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h new file mode 100644 index 00000000000..57beb1762b2 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -0,0 +1,1100 @@ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _IP_SET_HASH_GEN_H +#define _IP_SET_HASH_GEN_H + +#include <linux/rcupdate.h> +#include <linux/jhash.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#ifndef rcu_dereference_bh +#define rcu_dereference_bh(p) rcu_dereference(p) +#endif + +#define CONCAT(a, b) a##b +#define TOKEN(a, b) CONCAT(a, b) + +/* Hashing which uses arrays to resolve clashing. The hash table is resized + * (doubled) when searching becomes too long. + * Internally jhash is used with the assumption that the size of the + * stored data is a multiple of sizeof(u32). If storage supports timeout, + * the timeout field must be the last one in the data structure - that field + * is ignored when computing the hash key. + * + * Readers and resizing + * + * Resizing can be triggered by userspace command only, and those + * are serialized by the nfnl mutex. During resizing the set is + * read-locked, so the only possible concurrent operations are + * the kernel side readers. Those must be protected by proper RCU locking. + */ + +/* Number of elements to store in an initial array block */ +#define AHASH_INIT_SIZE 4 +/* Max number of elements to store in an array block */ +#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE) + +/* Max number of elements can be tuned */ +#ifdef IP_SET_HASH_WITH_MULTI +#define AHASH_MAX(h) ((h)->ahash_max) + +static inline u8 +tune_ahash_max(u8 curr, u32 multi) +{ + u32 n; + + if (multi < curr) + return curr; + + n = curr + AHASH_INIT_SIZE; + /* Currently, at listing one hash bucket must fit into a message. + * Therefore we have a hard limit here. + */ + return n > curr && n <= 64 ? n : curr; +} +#define TUNE_AHASH_MAX(h, multi) \ + ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) +#else +#define AHASH_MAX(h) AHASH_MAX_SIZE +#define TUNE_AHASH_MAX(h, multi) +#endif + +/* A hash bucket */ +struct hbucket { + void *value; /* the array of the values */ + u8 size; /* size of the array */ + u8 pos; /* position of the first free entry */ +}; + +/* The hash table: the table size stored here in order to make resizing easy */ +struct htable { + u8 htable_bits; /* size of hash table == 2^htable_bits */ + struct hbucket bucket[0]; /* hashtable buckets */ +}; + +#define hbucket(h, i) (&((h)->bucket[i])) + +/* Book-keeping of the prefixes added to the set */ +struct net_prefixes { + u8 cidr; /* the different cidr values in the set */ + u32 nets; /* number of elements per cidr */ +}; + +/* Compute the hash table size */ +static size_t +htable_size(u8 hbits) +{ + size_t hsize; + + /* We must fit both into u32 in jhash and size_t */ + if (hbits > 31) + return 0; + hsize = jhash_size(hbits); + if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket) + < hsize) + return 0; + + return hsize * sizeof(struct hbucket) + sizeof(struct htable); +} + +/* Compute htable_bits from the user input parameter hashsize */ +static u8 +htable_bits(u32 hashsize) +{ + /* Assume that hashsize == 2^htable_bits */ + u8 bits = fls(hashsize - 1); + if (jhash_size(bits) != hashsize) + /* Round up to the first 2^n value */ + bits = fls(hashsize); + + return bits; +} + +/* Destroy the hashtable part of the set */ +static void +ahash_destroy(struct htable *t) +{ + struct hbucket *n; + u32 i; + + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + if (n->size) + /* FIXME: use slab cache */ + kfree(n->value); + } + + ip_set_free(t); +} + +static int +hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) +{ + if (n->pos >= n->size) { + void *tmp; + + if (n->size >= ahash_max) + /* 
Trigger rehashing */ + return -EAGAIN; + + tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + if (n->size) { + memcpy(tmp, n->value, n->size * dsize); + kfree(n->value); + } + n->value = tmp; + n->size += AHASH_INIT_SIZE; + } + return 0; +} + +#ifdef IP_SET_HASH_WITH_NETS +#ifdef IP_SET_HASH_WITH_NETS_PACKED +/* When cidr is packed with nomatch, cidr - 1 is stored in the entry */ +#define CIDR(cidr) (cidr + 1) +#else +#define CIDR(cidr) (cidr) +#endif + +#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128) + +#ifdef IP_SET_HASH_WITH_MULTI +#define NETS_LENGTH(family) (SET_HOST_MASK(family) + 1) +#else +#define NETS_LENGTH(family) SET_HOST_MASK(family) +#endif + +#else +#define NETS_LENGTH(family) 0 +#endif /* IP_SET_HASH_WITH_NETS */ + +#define ext_timeout(e, h) \ +(unsigned long *)(((void *)(e)) + (h)->offset[IPSET_OFFSET_TIMEOUT]) +#define ext_counter(e, h) \ +(struct ip_set_counter *)(((void *)(e)) + (h)->offset[IPSET_OFFSET_COUNTER]) + +#endif /* _IP_SET_HASH_GEN_H */ + +/* Family dependent templates */ + +#undef ahash_data +#undef mtype_data_equal +#undef mtype_do_data_match +#undef mtype_data_set_flags +#undef mtype_data_reset_flags +#undef mtype_data_netmask +#undef mtype_data_list +#undef mtype_data_next +#undef mtype_elem + +#undef mtype_add_cidr +#undef mtype_del_cidr +#undef mtype_ahash_memsize +#undef mtype_flush +#undef mtype_destroy +#undef mtype_gc_init +#undef mtype_same_set +#undef mtype_kadt +#undef mtype_uadt +#undef mtype + +#undef mtype_add +#undef mtype_del +#undef mtype_test_cidrs +#undef mtype_test +#undef mtype_expire +#undef mtype_resize +#undef mtype_head +#undef mtype_list +#undef mtype_gc +#undef mtype_gc_init +#undef mtype_variant +#undef mtype_data_match + +#undef HKEY + +#define mtype_data_equal TOKEN(MTYPE, _data_equal) +#ifdef IP_SET_HASH_WITH_NETS +#define mtype_do_data_match TOKEN(MTYPE, _do_data_match) +#else +#define mtype_do_data_match(d) 1 +#endif +#define mtype_data_set_flags TOKEN(MTYPE, _data_set_flags) +#define mtype_data_reset_flags TOKEN(MTYPE, _data_reset_flags) +#define mtype_data_netmask TOKEN(MTYPE, _data_netmask) +#define mtype_data_list TOKEN(MTYPE, _data_list) +#define mtype_data_next TOKEN(MTYPE, _data_next) +#define mtype_elem TOKEN(MTYPE, _elem) +#define mtype_add_cidr TOKEN(MTYPE, _add_cidr) +#define mtype_del_cidr TOKEN(MTYPE, _del_cidr) +#define mtype_ahash_memsize TOKEN(MTYPE, _ahash_memsize) +#define mtype_flush TOKEN(MTYPE, _flush) +#define mtype_destroy TOKEN(MTYPE, _destroy) +#define mtype_gc_init TOKEN(MTYPE, _gc_init) +#define mtype_same_set TOKEN(MTYPE, _same_set) +#define mtype_kadt TOKEN(MTYPE, _kadt) +#define mtype_uadt TOKEN(MTYPE, _uadt) +#define mtype MTYPE + +#define mtype_elem TOKEN(MTYPE, _elem) +#define mtype_add TOKEN(MTYPE, _add) +#define mtype_del TOKEN(MTYPE, _del) +#define mtype_test_cidrs TOKEN(MTYPE, _test_cidrs) +#define mtype_test TOKEN(MTYPE, _test) +#define mtype_expire TOKEN(MTYPE, _expire) +#define mtype_resize TOKEN(MTYPE, _resize) +#define mtype_head TOKEN(MTYPE, _head) +#define mtype_list TOKEN(MTYPE, _list) +#define mtype_gc TOKEN(MTYPE, _gc) +#define mtype_variant TOKEN(MTYPE, _variant) +#define mtype_data_match TOKEN(MTYPE, _data_match) + +#ifndef HKEY_DATALEN +#define HKEY_DATALEN sizeof(struct mtype_elem) +#endif + +#define HKEY(data, initval, htable_bits) \ +(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \ + & jhash_mask(htable_bits)) + +#ifndef htype +#define htype HTYPE + +/* The generic hash structure */ +struct htype { + 
struct htable *table; /* the hash table */ + u32 maxelem; /* max elements in the hash */ + u32 elements; /* current element (vs timeout) */ + u32 initval; /* random jhash init value */ + u32 timeout; /* timeout value, if enabled */ + size_t dsize; /* data struct size */ + size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ + struct timer_list gc; /* garbage collection when timeout enabled */ + struct mtype_elem next; /* temporary storage for uadd */ +#ifdef IP_SET_HASH_WITH_MULTI + u8 ahash_max; /* max elements in an array block */ +#endif +#ifdef IP_SET_HASH_WITH_NETMASK + u8 netmask; /* netmask value for subnets to store */ +#endif +#ifdef IP_SET_HASH_WITH_RBTREE + struct rb_root rbtree; +#endif +#ifdef IP_SET_HASH_WITH_NETS + struct net_prefixes nets[0]; /* book-keeping of prefixes */ +#endif +}; +#endif + +#ifdef IP_SET_HASH_WITH_NETS +/* Network cidr size book keeping when the hash stores different + * sized networks */ +static void +mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length) +{ + int i, j; + + /* Add in increasing prefix order, so larger cidr first */ + for (i = 0, j = -1; i < nets_length && h->nets[i].nets; i++) { + if (j != -1) + continue; + else if (h->nets[i].cidr < cidr) + j = i; + else if (h->nets[i].cidr == cidr) { + h->nets[i].nets++; + return; + } + } + if (j != -1) { + for (; i > j; i--) { + h->nets[i].cidr = h->nets[i - 1].cidr; + h->nets[i].nets = h->nets[i - 1].nets; + } + } + h->nets[i].cidr = cidr; + h->nets[i].nets = 1; +} + +static void +mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length) +{ + u8 i, j; + + for (i = 0; i < nets_length - 1 && h->nets[i].cidr != cidr; i++) + ; + h->nets[i].nets--; + + if (h->nets[i].nets != 0) + return; + + for (j = i; j < nets_length - 1 && h->nets[j].nets; j++) { + h->nets[j].cidr = h->nets[j + 1].cidr; + h->nets[j].nets = h->nets[j + 1].nets; + } +} +#endif + +/* Calculate the actual memory size of the set data */ +static size_t +mtype_ahash_memsize(const struct htype *h, u8 nets_length) +{ + u32 i; + struct htable *t = h->table; + size_t memsize = sizeof(*h) + + sizeof(*t) +#ifdef IP_SET_HASH_WITH_NETS + + sizeof(struct net_prefixes) * nets_length +#endif + + jhash_size(t->htable_bits) * sizeof(struct hbucket); + + for (i = 0; i < jhash_size(t->htable_bits); i++) + memsize += t->bucket[i].size * h->dsize; + + return memsize; +} + +/* Flush a hash type of set: destroy all elements */ +static void +mtype_flush(struct ip_set *set) +{ + struct htype *h = set->data; + struct htable *t = h->table; + struct hbucket *n; + u32 i; + + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + if (n->size) { + n->size = n->pos = 0; + /* FIXME: use slab cache */ + kfree(n->value); + } + } +#ifdef IP_SET_HASH_WITH_NETS + memset(h->nets, 0, sizeof(struct net_prefixes) + * NETS_LENGTH(set->family)); +#endif + h->elements = 0; +} + +/* Destroy a hash type of set */ +static void +mtype_destroy(struct ip_set *set) +{ + struct htype *h = set->data; + + if (set->extensions & IPSET_EXT_TIMEOUT) + del_timer_sync(&h->gc); + + ahash_destroy(h->table); +#ifdef IP_SET_HASH_WITH_RBTREE + rbtree_destroy(&h->rbtree); +#endif + kfree(h); + + set->data = NULL; +} + +static void +mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ + struct htype *h = set->data; + + init_timer(&h->gc); + h->gc.data = (unsigned long) set; + h->gc.function = gc; + h->gc.expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ; + add_timer(&h->gc); + pr_debug("gc initialized, run in every %u\n", + IPSET_GC_PERIOD(h->timeout)); 
+} + +static bool +mtype_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct htype *x = a->data; + const struct htype *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout && +#ifdef IP_SET_HASH_WITH_NETMASK + x->netmask == y->netmask && +#endif + a->extensions == b->extensions; +} + +/* Get the ith element from the array block n */ +#define ahash_data(n, i, dsize) \ + ((struct mtype_elem *)((n)->value + ((i) * (dsize)))) + +/* Delete expired elements from the hashtable */ +static void +mtype_expire(struct htype *h, u8 nets_length, size_t dsize) +{ + struct htable *t = h->table; + struct hbucket *n; + struct mtype_elem *data; + u32 i; + int j; + + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + for (j = 0; j < n->pos; j++) { + data = ahash_data(n, j, dsize); + if (ip_set_timeout_expired(ext_timeout(data, h))) { + pr_debug("expired %u/%u\n", i, j); +#ifdef IP_SET_HASH_WITH_NETS + mtype_del_cidr(h, CIDR(data->cidr), + nets_length); +#endif + if (j != n->pos - 1) + /* Not last one */ + memcpy(data, + ahash_data(n, n->pos - 1, dsize), + dsize); + n->pos--; + h->elements--; + } + } + if (n->pos + AHASH_INIT_SIZE < n->size) { + void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) + * dsize, + GFP_ATOMIC); + if (!tmp) + /* Still try to delete expired elements */ + continue; + n->size -= AHASH_INIT_SIZE; + memcpy(tmp, n->value, n->size * dsize); + kfree(n->value); + n->value = tmp; + } + } +} + +static void +mtype_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct htype *h = set->data; + + pr_debug("called\n"); + write_lock_bh(&set->lock); + mtype_expire(h, NETS_LENGTH(set->family), h->dsize); + write_unlock_bh(&set->lock); + + h->gc.expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ; + add_timer(&h->gc); +} + +/* Resize a hash: create a new hash table with doubling the hashsize + * and inserting the elements to it. Repeat until we succeed or + * fail due to memory pressures. 
*/ +static int +mtype_resize(struct ip_set *set, bool retried) +{ + struct htype *h = set->data; + struct htable *t, *orig = h->table; + u8 htable_bits = orig->htable_bits; +#ifdef IP_SET_HASH_WITH_NETS + u8 flags; +#endif + struct mtype_elem *data; + struct mtype_elem *d; + struct hbucket *n, *m; + u32 i, j; + int ret; + + /* Try to cleanup once */ + if (SET_WITH_TIMEOUT(set) && !retried) { + i = h->elements; + write_lock_bh(&set->lock); + mtype_expire(set->data, NETS_LENGTH(set->family), + h->dsize); + write_unlock_bh(&set->lock); + if (h->elements < i) + return 0; + } + +retry: + ret = 0; + htable_bits++; + pr_debug("attempt to resize set %s from %u to %u, t %p\n", + set->name, orig->htable_bits, htable_bits, orig); + if (!htable_bits) { + /* In case we have plenty of memory :-) */ + pr_warning("Cannot increase the hashsize of set %s further\n", + set->name); + return -IPSET_ERR_HASH_FULL; + } + t = ip_set_alloc(sizeof(*t) + + jhash_size(htable_bits) * sizeof(struct hbucket)); + if (!t) + return -ENOMEM; + t->htable_bits = htable_bits; + + read_lock_bh(&set->lock); + for (i = 0; i < jhash_size(orig->htable_bits); i++) { + n = hbucket(orig, i); + for (j = 0; j < n->pos; j++) { + data = ahash_data(n, j, h->dsize); +#ifdef IP_SET_HASH_WITH_NETS + flags = 0; + mtype_data_reset_flags(data, &flags); +#endif + m = hbucket(t, HKEY(data, h->initval, htable_bits)); + ret = hbucket_elem_add(m, AHASH_MAX(h), h->dsize); + if (ret < 0) { +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_reset_flags(data, &flags); +#endif + read_unlock_bh(&set->lock); + ahash_destroy(t); + if (ret == -EAGAIN) + goto retry; + return ret; + } + d = ahash_data(m, m->pos++, h->dsize); + memcpy(d, data, h->dsize); +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_reset_flags(d, &flags); +#endif + } + } + + rcu_assign_pointer(h->table, t); + read_unlock_bh(&set->lock); + + /* Give time to other readers of the set */ + synchronize_rcu_bh(); + + pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, + orig->htable_bits, orig, t->htable_bits, t); + ahash_destroy(orig); + + return 0; +} + +/* Add an element to a hash and update the internal counters when succeeded, + * otherwise report the proper error code. 
*/ +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t; + const struct mtype_elem *d = value; + struct mtype_elem *data; + struct hbucket *n; + int i, ret = 0; + int j = AHASH_MAX(h) + 1; + bool flag_exist = flags & IPSET_FLAG_EXIST; + u32 key, multi = 0; + + if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) + /* FIXME: when set is full, we slow down here */ + mtype_expire(h, NETS_LENGTH(set->family), h->dsize); + + if (h->elements >= h->maxelem) { + if (net_ratelimit()) + pr_warning("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; + } + + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, h->dsize); + if (mtype_data_equal(data, d, &multi)) { + if (flag_exist || + (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, h)))) { + /* Just the extensions could be overwritten */ + j = i; + goto reuse_slot; + } else { + ret = -IPSET_ERR_EXIST; + goto out; + } + } + /* Reuse first timed out entry */ + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, h)) && + j != AHASH_MAX(h) + 1) + j = i; + } +reuse_slot: + if (j != AHASH_MAX(h) + 1) { + /* Fill out reused slot */ + data = ahash_data(n, j, h->dsize); +#ifdef IP_SET_HASH_WITH_NETS + mtype_del_cidr(h, CIDR(data->cidr), NETS_LENGTH(set->family)); + mtype_add_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family)); +#endif + } else { + /* Use/create a new slot */ + TUNE_AHASH_MAX(h, multi); + ret = hbucket_elem_add(n, AHASH_MAX(h), h->dsize); + if (ret != 0) { + if (ret == -EAGAIN) + mtype_data_next(&h->next, d); + goto out; + } + data = ahash_data(n, n->pos++, h->dsize); +#ifdef IP_SET_HASH_WITH_NETS + mtype_add_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family)); +#endif + h->elements++; + } + memcpy(data, d, sizeof(struct mtype_elem)); +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_set_flags(data, flags); +#endif + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(data, h), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(data, h), ext); + +out: + rcu_read_unlock_bh(); + return ret; +} + +/* Delete an element from the hash: swap it with the last element + * and free up space if possible. 
+ */ +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t = h->table; + const struct mtype_elem *d = value; + struct mtype_elem *data; + struct hbucket *n; + int i; + u32 key, multi = 0; + + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, h->dsize); + if (!mtype_data_equal(data, d, &multi)) + continue; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, h))) + return -IPSET_ERR_EXIST; + if (i != n->pos - 1) + /* Not last one */ + memcpy(data, ahash_data(n, n->pos - 1, h->dsize), + h->dsize); + + n->pos--; + h->elements--; +#ifdef IP_SET_HASH_WITH_NETS + mtype_del_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family)); +#endif + if (n->pos + AHASH_INIT_SIZE < n->size) { + void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) + * h->dsize, + GFP_ATOMIC); + if (!tmp) + return 0; + n->size -= AHASH_INIT_SIZE; + memcpy(tmp, n->value, n->size * h->dsize); + kfree(n->value); + n->value = tmp; + } + return 0; + } + + return -IPSET_ERR_EXIST; +} + +static inline int +mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, + struct ip_set_ext *mext, struct ip_set *set, u32 flags) +{ + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(data, + (struct htype *)(set->data)), + ext, mext, flags); + return mtype_do_data_match(data); +} + +#ifdef IP_SET_HASH_WITH_NETS +/* Special test function which takes into account the different network + * sizes added to the set */ +static int +mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t = h->table; + struct hbucket *n; + struct mtype_elem *data; + int i, j = 0; + u32 key, multi = 0; + u8 nets_length = NETS_LENGTH(set->family); + + pr_debug("test by nets\n"); + for (; j < nets_length && h->nets[j].nets && !multi; j++) { + mtype_data_netmask(d, h->nets[j].cidr); + key = HKEY(d, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, h->dsize); + if (!mtype_data_equal(data, d, &multi)) + continue; + if (SET_WITH_TIMEOUT(set)) { + if (!ip_set_timeout_expired( + ext_timeout(data, h))) + return mtype_data_match(data, ext, + mext, set, + flags); +#ifdef IP_SET_HASH_WITH_MULTI + multi = 0; +#endif + } else + return mtype_data_match(data, ext, + mext, set, flags); + } + } + return 0; +} +#endif + +/* Test whether the element is added to the set */ +static int +mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t = h->table; + struct mtype_elem *d = value; + struct hbucket *n; + struct mtype_elem *data; + int i; + u32 key, multi = 0; + +#ifdef IP_SET_HASH_WITH_NETS + /* If we test an IP address and not a network address, + * try all possible network sizes */ + if (CIDR(d->cidr) == SET_HOST_MASK(set->family)) + return mtype_test_cidrs(set, d, ext, mext, flags); +#endif + + key = HKEY(d, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, h->dsize); + if (mtype_data_equal(data, d, &multi) && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, h)))) + return mtype_data_match(data, ext, mext, set, flags); + } + return 0; +} + +/* Reply a HEADER request: fill out the 
header part of the set */ +static int +mtype_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct htype *h = set->data; + struct nlattr *nested; + size_t memsize; + + read_lock_bh(&set->lock); + memsize = mtype_ahash_memsize(h, NETS_LENGTH(set->family)); + read_unlock_bh(&set->lock); + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, + htonl(jhash_size(h->table->htable_bits))) || + nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) + goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_NETMASK + if (h->netmask != HOST_MASK && + nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) + goto nla_put_failure; +#endif + if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || + ((set->extensions & IPSET_EXT_TIMEOUT) && + nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(h->timeout))) || + ((set->extensions & IPSET_EXT_COUNTER) && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, + htonl(IPSET_FLAG_WITH_COUNTERS)))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +/* Reply a LIST/SAVE request: dump the elements of the specified set */ +static int +mtype_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct htype *h = set->data; + const struct htable *t = h->table; + struct nlattr *atd, *nested; + const struct hbucket *n; + const struct mtype_elem *e; + u32 first = cb->args[2]; + /* We assume that one hash bucket fills into one page */ + void *incomplete; + int i; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + pr_debug("list hash set %s\n", set->name); + for (; cb->args[2] < jhash_size(t->htable_bits); cb->args[2]++) { + incomplete = skb_tail_pointer(skb); + n = hbucket(t, cb->args[2]); + pr_debug("cb->args[2]: %lu, t %p n %p\n", cb->args[2], t, n); + for (i = 0; i < n->pos; i++) { + e = ahash_data(n, i, h->dsize); + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, h))) + continue; + pr_debug("list hash %lu hbucket %p i %u, data %p\n", + cb->args[2], n, i, e); + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (cb->args[2] == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + if (mtype_data_list(skb, e)) + goto nla_put_failure; + if (SET_WITH_TIMEOUT(set) && + nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get( + ext_timeout(e, h))))) + goto nla_put_failure; + if (SET_WITH_COUNTER(set) && + ip_set_put_counter(skb, ext_counter(e, h))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + } + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nlmsg_trim(skb, incomplete); + ipset_nest_end(skb, atd); + if (unlikely(first == cb->args[2])) { + pr_warning("Can't list set %s: one bucket does not fit into " + "a message. 
Please report it!\n", set->name); + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +static int +TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt); + +static int +TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); + +static const struct ip_set_type_variant mtype_variant = { + .kadt = mtype_kadt, + .uadt = mtype_uadt, + .adt = { + [IPSET_ADD] = mtype_add, + [IPSET_DEL] = mtype_del, + [IPSET_TEST] = mtype_test, + }, + .destroy = mtype_destroy, + .flush = mtype_flush, + .head = mtype_head, + .list = mtype_list, + .resize = mtype_resize, + .same_set = mtype_same_set, +}; + +#ifdef IP_SET_EMIT_CREATE +static int +TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u32 cadt_flags = 0; + u8 hbits; +#ifdef IP_SET_HASH_WITH_NETMASK + u8 netmask; +#endif + size_t hsize; + struct HTYPE *h; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; +#ifdef IP_SET_HASH_WITH_NETMASK + netmask = set->family == NFPROTO_IPV4 ? 32 : 128; + pr_debug("Create set %s with family %s\n", + set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); +#endif + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + +#ifdef IP_SET_HASH_WITH_NETMASK + if (tb[IPSET_ATTR_NETMASK]) { + netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + + if ((set->family == NFPROTO_IPV4 && netmask > 32) || + (set->family == NFPROTO_IPV6 && netmask > 128) || + netmask == 0) + return -IPSET_ERR_INVALID_NETMASK; + } +#endif + + hsize = sizeof(*h); +#ifdef IP_SET_HASH_WITH_NETS + hsize += sizeof(struct net_prefixes) * + (set->family == NFPROTO_IPV4 ? 
32 : 128); +#endif + h = kzalloc(hsize, GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; +#ifdef IP_SET_HASH_WITH_NETMASK + h->netmask = netmask; +#endif + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + h->table = ip_set_alloc(hsize); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + if (set->family == NFPROTO_IPV4) + set->variant = &TOKEN(HTYPE, 4_variant); + else + set->variant = &TOKEN(HTYPE, 6_variant); + + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { + set->extensions |= IPSET_EXT_COUNTER; + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = + ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->extensions |= IPSET_EXT_TIMEOUT; + if (set->family == NFPROTO_IPV4) { + h->dsize = + sizeof(struct TOKEN(HTYPE, 4ct_elem)); + h->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct TOKEN(HTYPE, 4ct_elem), + timeout); + h->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct TOKEN(HTYPE, 4ct_elem), + counter); + TOKEN(HTYPE, 4_gc_init)(set, + TOKEN(HTYPE, 4_gc)); + } else { + h->dsize = + sizeof(struct TOKEN(HTYPE, 6ct_elem)); + h->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct TOKEN(HTYPE, 6ct_elem), + timeout); + h->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct TOKEN(HTYPE, 6ct_elem), + counter); + TOKEN(HTYPE, 6_gc_init)(set, + TOKEN(HTYPE, 6_gc)); + } + } else { + if (set->family == NFPROTO_IPV4) { + h->dsize = + sizeof(struct TOKEN(HTYPE, 4c_elem)); + h->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct TOKEN(HTYPE, 4c_elem), + counter); + } else { + h->dsize = + sizeof(struct TOKEN(HTYPE, 6c_elem)); + h->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct TOKEN(HTYPE, 6c_elem), + counter); + } + } + } else if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->extensions |= IPSET_EXT_TIMEOUT; + if (set->family == NFPROTO_IPV4) { + h->dsize = sizeof(struct TOKEN(HTYPE, 4t_elem)); + h->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct TOKEN(HTYPE, 4t_elem), + timeout); + TOKEN(HTYPE, 4_gc_init)(set, TOKEN(HTYPE, 4_gc)); + } else { + h->dsize = sizeof(struct TOKEN(HTYPE, 6t_elem)); + h->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct TOKEN(HTYPE, 6t_elem), + timeout); + TOKEN(HTYPE, 6_gc_init)(set, TOKEN(HTYPE, 6_gc)); + } + } else { + if (set->family == NFPROTO_IPV4) + h->dsize = sizeof(struct TOKEN(HTYPE, 4_elem)); + else + h->dsize = sizeof(struct TOKEN(HTYPE, 6_elem)); + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} +#endif /* IP_SET_EMIT_CREATE */ diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index b7d4cb475ae..c74e6e14cd9 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,11 +21,10 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include 
<linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 -#define REVISION_MAX 0 +#define REVISION_MAX 1 /* Counters support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -33,58 +32,47 @@ IP_SET_MODULE_DESC("hash:ip", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ -#define TYPE hash_ip - -static bool -hash_ip_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_ip4_same_set hash_ip_same_set -#define hash_ip6_same_set hash_ip_same_set +#define HTYPE hash_ip +#define IP_SET_HASH_WITH_NETMASK -/* The type variant functions: IPv4 */ +/* IPv4 variants */ -/* Member elements without timeout */ +/* Member elements */ struct hash_ip4_elem { + /* Zero valued IP addresses cannot be stored */ __be32 ip; }; -/* Member elements with timeout support */ -struct hash_ip4_telem { +struct hash_ip4t_elem { __be32 ip; unsigned long timeout; }; -static inline bool -hash_ip4_data_equal(const struct hash_ip4_elem *ip1, - const struct hash_ip4_elem *ip2, - u32 *multi) -{ - return ip1->ip == ip2->ip; -} +struct hash_ip4c_elem { + __be32 ip; + struct ip_set_counter counter; +}; -static inline bool -hash_ip4_data_isnull(const struct hash_ip4_elem *elem) -{ - return elem->ip == 0; -} +struct hash_ip4ct_elem { + __be32 ip; + struct ip_set_counter counter; + unsigned long timeout; +}; -static inline void -hash_ip4_data_copy(struct hash_ip4_elem *dst, const struct hash_ip4_elem *src) -{ - dst->ip = src->ip; -} +/* Common functions */ -/* Zero valued IP addresses cannot be stored */ -static inline void -hash_ip4_data_zero_out(struct hash_ip4_elem *elem) +static inline bool +hash_ip4_data_equal(const struct hash_ip4_elem *e1, + const struct hash_ip4_elem *e2, + u32 *multi) { - elem->ip = 0; + return e1->ip == e2->ip; } static inline bool -hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *data) +hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) { - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip)) + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) goto nla_put_failure; return 0; @@ -92,41 +80,26 @@ nla_put_failure: return 1; } -static bool -hash_ip4_data_tlist(struct sk_buff *skb, const struct hash_ip4_elem *data) +static inline void +hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) { - const struct hash_ip4_telem *tdata = - (const struct hash_ip4_telem *)data; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout)))) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return 1; + next->ip = e->ip; } -#define IP_SET_HASH_WITH_NETMASK +#define MTYPE hash_ip4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ip4_data_next(struct ip_set_hash *h, const struct hash_ip4_elem *d) -{ - h->next.ip = d->ip; -} +#include "ip_set_hash_gen.h" static int hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ip4_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); __be32 ip; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); @@ -134,43 +107,42 @@ 
hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, if (ip == 0) return -EINVAL; - return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags); + e.ip = ip; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - u32 ip, ip_to, hosts, timeout = h->timeout; - __be32 nip; + struct hash_ip4_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); + u32 ip, ip_to, hosts; int ret = 0; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ip &= ip_set_hostmask(h->netmask); - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - if (adt == IPSET_TEST) { - nip = htonl(ip); - if (nip == 0) + e.ip = htonl(ip); + if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; - return adtfn(set, &nip, timeout, flags); + return adtfn(set, &e, &ext, &ext, flags); } ip_to = ip; @@ -193,10 +165,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); for (; !before(ip_to, ip); ip += hosts) { - nip = htonl(ip); - if (nip == 0) + e.ip = htonl(ip); + if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; - ret = adtfn(set, &nip, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -206,29 +178,31 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ip_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout && - x->netmask == y->netmask; -} +/* Member elements */ +struct hash_ip6_elem { + union nf_inet_addr ip; +}; -/* The type variant functions: IPv6 */ +struct hash_ip6t_elem { + union nf_inet_addr ip; + unsigned long timeout; +}; -struct hash_ip6_elem { +struct hash_ip6c_elem { union nf_inet_addr ip; + struct ip_set_counter counter; }; -struct hash_ip6_telem { +struct hash_ip6ct_elem { union nf_inet_addr ip; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1, const struct hash_ip6_elem *ip2, @@ -237,37 +211,16 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static inline bool -hash_ip6_data_isnull(const struct hash_ip6_elem *elem) -{ - return ipv6_addr_any(&elem->ip.in6); -} - static inline void -hash_ip6_data_copy(struct hash_ip6_elem *dst, const struct hash_ip6_elem *src) +hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) { - dst->ip.in6 = src->ip.in6; -} - -static inline void -hash_ip6_data_zero_out(struct hash_ip6_elem *elem) -{ - ipv6_addr_set(&elem->ip.in6, 0, 0, 0, 0); -} - -static 
inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) -{ - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + ip6_netmask(ip, prefix); } static bool -hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *data) +hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6)) + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) goto nla_put_failure; return 0; @@ -275,69 +228,55 @@ nla_put_failure: return 1; } -static bool -hash_ip6_data_tlist(struct sk_buff *skb, const struct hash_ip6_elem *data) +static inline void +hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) { - const struct hash_ip6_telem *e = - (const struct hash_ip6_telem *)data; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; } +#undef MTYPE #undef PF #undef HOST_MASK +#undef HKEY_DATALEN +#define MTYPE hash_ip6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> -static inline void -hash_ip6_data_next(struct ip_set_hash *h, const struct hash_ip6_elem *d) -{ -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - union nf_inet_addr ip; + struct hash_ip6_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip.in6); - ip6_netmask(&ip, h->netmask); - if (ipv6_addr_any(&ip.in6)) + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + hash_ip6_netmask(&e.ip, h->netmask); + if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; - return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } -static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = { - [IPSET_ATTR_IP] = { .type = NLA_NESTED }, - [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, - [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, -}; - static int hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - union nf_inet_addr ip; - u32 timeout = h->timeout; + struct hash_ip6_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -345,110 +284,20 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ip6_netmask(&ip, h->netmask); - if (ipv6_addr_any(&ip.in6)) + 
hash_ip6_netmask(&e.ip, h->netmask); + if (ipv6_addr_any(&e.ip.in6)) return -IPSET_ERR_HASH_ELEM; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - - ret = adtfn(set, &ip, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } -/* Create hash:ip type of sets */ - -static int -hash_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 netmask, hbits; - size_t hsize; - struct ip_set_hash *h; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - netmask = set->family == NFPROTO_IPV4 ? 32 : 128; - pr_debug("Create set %s with family %s\n", - set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - if (tb[IPSET_ATTR_NETMASK]) { - netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); - - if ((set->family == NFPROTO_IPV4 && netmask > 32) || - (set->family == NFPROTO_IPV6 && netmask > 128) || - netmask == 0) - return -IPSET_ERR_INVALID_NETMASK; - } - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - h->netmask = netmask; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_ip4_tvariant : &hash_ip6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_ip4_gc_init(set); - else - hash_ip6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? 
&hash_ip4_variant : &hash_ip6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ip_type __read_mostly = { .name = "hash:ip", .protocol = IPSET_PROTOCOL, @@ -465,6 +314,7 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -472,6 +322,8 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index d8f77bacae8..7a2d2bd98d0 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,12 +21,12 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 -#define REVISION_MAX 1 /* SCTP and UDPLITE support added */ +/* 1 SCTP and UDPLITE support added */ +#define REVISION_MAX 2 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -34,33 +34,45 @@ IP_SET_MODULE_DESC("hash:ip,port", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ -#define TYPE hash_ipport +#define HTYPE hash_ipport -static bool -hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b); +/* IPv4 variants */ -#define hash_ipport4_same_set hash_ipport_same_set -#define hash_ipport6_same_set hash_ipport_same_set +/* Member elements */ +struct hash_ipport4_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 padding; +}; -/* The type variant functions: IPv4 */ +struct hash_ipport4t_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; -/* Member elements without timeout */ -struct hash_ipport4_elem { +struct hash_ipport4c_elem { __be32 ip; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; }; -/* Member elements with timeout support */ -struct hash_ipport4_telem { +struct hash_ipport4ct_elem { __be32 ip; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, const struct hash_ipport4_elem *ip2, @@ -71,27 +83,6 @@ hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipport4_data_isnull(const struct hash_ipport4_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipport4_data_copy(struct hash_ipport4_elem 
*dst, - const struct hash_ipport4_elem *src) -{ - dst->ip = src->ip; - dst->port = src->port; - dst->proto = src->proto; -} - -static inline void -hash_ipport4_data_zero_out(struct hash_ipport4_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipport4_data_list(struct sk_buff *skb, const struct hash_ipport4_elem *data) @@ -106,111 +97,91 @@ nla_put_failure: return 1; } -static bool -hash_ipport4_data_tlist(struct sk_buff *skb, - const struct hash_ipport4_elem *data) -{ - const struct hash_ipport4_telem *tdata = - (const struct hash_ipport4_telem *)data; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; -} - -#define PF 4 -#define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - static inline void -hash_ipport4_data_next(struct ip_set_hash *h, +hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct hash_ipport4_elem *d) { - h->next.ip = d->ip; - h->next.port = d->port; + next->ip = d->ip; + next->port = d->port; } +#define MTYPE hash_ipport4 +#define PF 4 +#define HOST_MASK 32 +#define HKEY_DATALEN sizeof(struct hash_ipport4_elem) +#include "ip_set_hash_gen.h" + static int hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport4_elem data = { }; + struct hash_ipport4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport4_elem data = { }; + struct hash_ipport4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 ip, ip_to, p = 0, port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = 
nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } - ip_to = ip = ntohl(data.ip); + ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) @@ -225,7 +196,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } - port_to = port = ntohs(data.port); + port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) @@ -238,9 +209,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { - data.ip = htonl(ip); - data.port = htons(p); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -251,34 +222,42 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} +struct hash_ipport6_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 padding; +}; -/* The type variant functions: IPv6 */ +struct hash_ipport6t_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; -struct hash_ipport6_elem { +struct hash_ipport6c_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; }; -struct hash_ipport6_telem { +struct hash_ipport6ct_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, @@ -289,25 +268,6 @@ hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipport6_data_isnull(const struct hash_ipport6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipport6_data_copy(struct hash_ipport6_elem *dst, - const struct hash_ipport6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipport6_data_zero_out(struct hash_ipport6_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipport6_data_list(struct sk_buff *skb, const struct hash_ipport6_elem *data) @@ -322,66 +282,52 @@ nla_put_failure: return 1; } -static bool -hash_ipport6_data_tlist(struct sk_buff *skb, - const struct hash_ipport6_elem *data) 
+static inline void +hash_ipport6_data_next(struct hash_ipport4_elem *next, + const struct hash_ipport6_elem *d) { - const struct hash_ipport6_telem *e = - (const struct hash_ipport6_telem *)data; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#undef HKEY_DATALEN +#define MTYPE hash_ipport6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipport6_data_next(struct ip_set_hash *h, - const struct hash_ipport6_elem *d) -{ - h->next.port = d->port; -} +#define HKEY_DATALEN sizeof(struct hash_ipport6_elem) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport6_elem data = { }; + struct hash_ipport6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport6_elem data = { }; + struct hash_ipport6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; @@ -389,6 +335,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -396,39 +344,34 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == 
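
The MTYPE/PF/HOST_MASK defines above parameterize ip_set_hash_gen.h, which is re-included once per variant so one generic implementation is stamped out with type-specific names. A minimal userspace sketch of that token-pasting pattern, assuming nothing beyond the C preprocessor (DEFINE_VARIANT, hash_demo4/6 and the key typedefs are made-up names, not kernel identifiers):

#include <stdio.h>

/* Generic "template" stamped out once per element type, mimicking how the
 * generator header is re-included with a different MTYPE/HOST_MASK each time. */
#define DEFINE_VARIANT(mtype, elemtype, host_mask)                        \
static int mtype##_host_mask(void) { return (host_mask); }                \
static void mtype##_describe(const elemtype *e)                           \
{                                                                         \
	printf(#mtype ": key=0x%lx, host mask=%d bits\n",                 \
	       (unsigned long)*e, mtype##_host_mask());                    \
}

typedef unsigned int  ip4_key;	/* stand-ins for the per-family elements */
typedef unsigned long ip6_key;

DEFINE_VARIANT(hash_demo4, ip4_key, 32)
DEFINE_VARIANT(hash_demo6, ip6_key, 128)

int main(void)
{
	ip4_key v4 = 0x0a000001;
	ip6_key v6 = 0x20010db8;

	hash_demo4_describe(&v4);
	hash_demo6_describe(&v6);
	return 0;
}
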
IPPROTO_ICMPV6)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); @@ -436,8 +379,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -447,78 +390,6 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_ipport_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_ipport4_tvariant : &hash_ipport6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_ipport4_gc_init(set); - else - hash_ipport6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? 
&hash_ipport4_variant : &hash_ipport6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ipport_type __read_mostly = { .name = "hash:ip,port", .protocol = IPSET_PROTOCOL, @@ -535,6 +406,7 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -545,6 +417,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 1da1e955f38..34e8a1acce4 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,12 +21,12 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 -#define REVISION_MAX 1 /* SCTP and UDPLITE support added */ +/* 1 SCTP and UDPLITE support added */ +#define REVISION_MAX 2 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -34,32 +34,44 @@ IP_SET_MODULE_DESC("hash:ip,port,ip", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); /* Type specific function prefix */ -#define TYPE hash_ipportip +#define HTYPE hash_ipportip -static bool -hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b); +/* IPv4 variants */ -#define hash_ipportip4_same_set hash_ipportip_same_set -#define hash_ipportip6_same_set hash_ipportip_same_set +/* Member elements */ +struct hash_ipportip4_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 proto; + u8 padding; +}; -/* The type variant functions: IPv4 */ +struct hash_ipportip4t_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; -/* Member elements without timeout */ -struct hash_ipportip4_elem { +struct hash_ipportip4c_elem { __be32 ip; __be32 ip2; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; }; -/* Member elements with timeout support */ -struct hash_ipportip4_telem { +struct hash_ipportip4ct_elem { __be32 ip; __be32 ip2; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; unsigned long timeout; }; @@ -74,25 +86,6 @@ hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipportip4_data_isnull(const struct hash_ipportip4_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipportip4_data_copy(struct hash_ipportip4_elem *dst, - const 
struct hash_ipportip4_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipportip4_data_zero_out(struct hash_ipportip4_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipportip4_data_list(struct sk_buff *skb, const struct hash_ipportip4_elem *data) @@ -108,117 +101,96 @@ nla_put_failure: return 1; } -static bool -hash_ipportip4_data_tlist(struct sk_buff *skb, - const struct hash_ipportip4_elem *data) +static inline void +hash_ipportip4_data_next(struct hash_ipportip4_elem *next, + const struct hash_ipportip4_elem *d) { - const struct hash_ipportip4_telem *tdata = - (const struct hash_ipportip4_telem *)data; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_ipaddr4(skb, IPSET_ATTR_IP2, tdata->ip2) || - nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; + next->port = d->port; } +/* Common functions */ +#define MTYPE hash_ipportip4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportip4_data_next(struct ip_set_hash *h, - const struct hash_ipportip4_elem *d) -{ - h->next.ip = d->ip; - h->next.port = d->port; -} +#include "ip_set_hash_gen.h" static int hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip4_elem data = { }; + struct hash_ipportip4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip4_elem data = { }; + struct hash_ipportip4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 ip, ip_to, p = 0, port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ret = 
ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } - ip_to = ip = ntohl(data.ip); + ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) @@ -233,7 +205,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } - port_to = port = ntohs(data.port); + port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) @@ -246,9 +218,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { - data.ip = htonl(ip); - data.port = htons(p); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -259,36 +231,46 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} +struct hash_ipportip6_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 proto; + u8 padding; +}; -/* The type variant functions: IPv6 */ +struct hash_ipportip6t_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; -struct hash_ipportip6_elem { +struct hash_ipportip6c_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; }; -struct hash_ipportip6_telem { +struct hash_ipportip6ct_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, @@ -300,25 +282,6 @@ hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipportip6_data_isnull(const struct hash_ipportip6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void 
-hash_ipportip6_data_copy(struct hash_ipportip6_elem *dst, - const struct hash_ipportip6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipportip6_data_zero_out(struct hash_ipportip6_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipportip6_data_list(struct sk_buff *skb, const struct hash_ipportip6_elem *data) @@ -334,68 +297,51 @@ nla_put_failure: return 1; } -static bool -hash_ipportip6_data_tlist(struct sk_buff *skb, - const struct hash_ipportip6_elem *data) +static inline void +hash_ipportip6_data_next(struct hash_ipportip4_elem *next, + const struct hash_ipportip6_elem *d) { - const struct hash_ipportip6_telem *e = - (const struct hash_ipportip6_telem *)data; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || - nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_ipportip6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportip6_data_next(struct ip_set_hash *h, - const struct hash_ipportip6_elem *d) -{ - h->next.port = d->port; -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip6_elem data = { }; + struct hash_ipportip6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip6_elem data = { }; + struct hash_ipportip6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; @@ -403,6 +349,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -410,43 +358,38 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], 
&data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); @@ -454,8 +397,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -465,78 +408,6 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_ipportip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_ipportip4_tvariant : &hash_ipportip6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_ipportip4_gc_init(set); - else - hash_ipportip6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? 
&hash_ipportip4_variant : &hash_ipportip6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ipportip_type __read_mostly = { .name = "hash:ip,port,ip", .protocol = IPSET_PROTOCOL, @@ -552,6 +423,7 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -563,6 +435,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index f2627226a08..c6a525373be 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,14 +21,14 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ -#define REVISION_MAX 3 /* nomatch flag support added */ +/* 3 nomatch flag support added */ +#define REVISION_MAX 4 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -36,23 +36,19 @@ IP_SET_MODULE_DESC("hash:ip,port,net", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); /* Type specific function prefix */ -#define TYPE hash_ipportnet - -static bool -hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_ipportnet4_same_set hash_ipportnet_same_set -#define hash_ipportnet6_same_set hash_ipportnet_same_set - -/* The type variant functions: IPv4 */ +#define HTYPE hash_ipportnet /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr - 1, * dancing back and forth. 
*/ #define IP_SET_HASH_WITH_NETS_PACKED +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS + +/* IPv4 variants */ -/* Member elements without timeout */ +/* Member elements */ struct hash_ipportnet4_elem { __be32 ip; __be32 ip2; @@ -62,8 +58,7 @@ struct hash_ipportnet4_elem { u8 proto; }; -/* Member elements with timeout support */ -struct hash_ipportnet4_telem { +struct hash_ipportnet4t_elem { __be32 ip; __be32 ip2; __be16 port; @@ -73,6 +68,29 @@ struct hash_ipportnet4_telem { unsigned long timeout; }; +struct hash_ipportnet4c_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; + struct ip_set_counter counter; +}; + +struct hash_ipportnet4ct_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; + struct ip_set_counter counter; + unsigned long timeout; +}; + +/* Common functions */ + static inline bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, @@ -85,29 +103,22 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipportnet4_data_isnull(const struct hash_ipportnet4_elem *elem) +static inline int +hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { - return elem->proto == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_ipportnet4_data_copy(struct hash_ipportnet4_elem *dst, - const struct hash_ipportnet4_elem *src) +hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { - memcpy(dst, src, sizeof(*dst)); + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -hash_ipportnet4_data_flags(struct hash_ipportnet4_elem *dst, u32 flags) +hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { - dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); -} - -static inline int -hash_ipportnet4_data_match(const struct hash_ipportnet4_elem *elem) -{ - return elem->nomatch ? -ENOTEMPTY : 1; + swap(*flags, elem->nomatch); } static inline void @@ -117,12 +128,6 @@ hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) elem->cidr = cidr - 1; } -static inline void -hash_ipportnet4_data_zero_out(struct hash_ipportnet4_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipportnet4_data_list(struct sk_buff *skb, const struct hash_ipportnet4_elem *data) @@ -143,81 +148,56 @@ nla_put_failure: return 1; } -static bool -hash_ipportnet4_data_tlist(struct sk_buff *skb, - const struct hash_ipportnet4_elem *data) +static inline void +hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, + const struct hash_ipportnet4_elem *d) { - const struct hash_ipportnet4_telem *tdata = - (const struct hash_ipportnet4_telem *)data; - u32 flags = data->nomatch ? 
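
The comment above explains the packing: a zero cidr is not supported, so the element stores cidr - 1 in a 7-bit field (1..128 still fits) and the spare bit carries the nomatch flag. A minimal userspace sketch of that convention, with hypothetical names (packed_net, pack, unpack_cidr), not kernel code:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct packed_net {
	uint8_t cidr:7;		/* stores cidr - 1, so 1..128 fits in 7 bits */
	uint8_t nomatch:1;	/* the bit freed up by the cidr - 1 trick */
};

static void pack(struct packed_net *e, unsigned int cidr, int nomatch)
{
	assert(cidr >= 1 && cidr <= 128);
	e->cidr = cidr - 1;
	e->nomatch = !!nomatch;
}

static unsigned int unpack_cidr(const struct packed_net *e)
{
	return e->cidr + 1;	/* the "dancing back and forth" */
}

int main(void)
{
	struct packed_net e;

	pack(&e, 128, 1);
	printf("cidr=%u nomatch=%u\n", unpack_cidr(&e), (unsigned int)e.nomatch);
	return 0;
}
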
IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_ipaddr4(skb, IPSET_ATTR_IP2, tdata->ip2) || - nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) || - nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; + next->port = d->port; + next->ip2 = d->ip2; } -#define IP_SET_HASH_WITH_PROTO -#define IP_SET_HASH_WITH_NETS - +#define MTYPE hash_ipportnet4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportnet4_data_next(struct ip_set_hash *h, - const struct hash_ipportnet4_elem *d) -{ - h->next.ip = d->ip; - h->next.port = d->port; - h->next.ip2 = d->ip2; -} +#include "ip_set_hash_gen.h" static int hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet4_elem data = { + struct hash_ipportnet4_elem e = { .cidr = h->nets[0].cidr ? h->nets[0].cidr - 1 : HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (adt == IPSET_TEST) - data.cidr = HOST_MASK - 1; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2); - data.ip2 &= ip_set_netmask(data.cidr + 1); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); + e.ip2 &= ip_set_netmask(e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet4_elem data = { .cidr = HOST_MASK - 1 }; + struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 ip, ip_to, p = 0, port, port_to; u32 ip2_from, ip2_to, ip2_last, ip2; - u32 timeout = h->timeout; bool with_ports = false; u8 cidr; int ret; @@ -226,13 +206,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -244,46 +227,41 @@ 
hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - data.cidr = cidr - 1; + e.cidr = cidr - 1; } if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); + flags |= (IPSET_FLAG_NOMATCH << 16); } with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { - data.ip = htonl(ip); - data.ip2 = htonl(ip2_from & ip_set_hostmask(data.cidr + 1)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + e.ip = htonl(ip); + e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip; @@ -301,7 +279,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } - port_to = port = ntohs(data.port); + port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) @@ -317,28 +295,27 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; - } else { - ip_set_mask_from_to(ip2_from, ip2_to, data.cidr + 1); - } + } else + ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); if (retried) ip = ntohl(h->next.ip); for (; !before(ip_to, ip); ip++) { - data.ip = htonl(ip); + e.ip = htonl(ip); p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { - data.port = htons(p); + e.port = htons(p); ip2 = retried && ip == ntohl(h->next.ip) && p == ntohs(h->next.port) ? 
ntohl(h->next.ip2) : ip2_from; while (!after(ip2, ip2_to)) { - data.ip2 = htonl(ip2); + e.ip2 = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, &cidr); - data.cidr = cidr - 1; - ret = adtfn(set, &data, timeout, flags); + e.cidr = cidr - 1; + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -351,38 +328,50 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} +struct hash_ipportnet6_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; +}; -/* The type variant functions: IPv6 */ +struct hash_ipportnet6t_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; + unsigned long timeout; +}; -struct hash_ipportnet6_elem { +struct hash_ipportnet6c_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; + struct ip_set_counter counter; }; -struct hash_ipportnet6_telem { +struct hash_ipportnet6ct_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, @@ -395,44 +384,22 @@ hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipportnet6_data_isnull(const struct hash_ipportnet6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipportnet6_data_copy(struct hash_ipportnet6_elem *dst, - const struct hash_ipportnet6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipportnet6_data_flags(struct hash_ipportnet6_elem *dst, u32 flags) -{ - dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); -} - static inline int -hash_ipportnet6_data_match(const struct hash_ipportnet6_elem *elem) +hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_ipportnet6_data_zero_out(struct hash_ipportnet6_elem *elem) +hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { - elem->proto = 0; + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void @@ -462,78 +429,58 @@ nla_put_failure: return 1; } -static bool -hash_ipportnet6_data_tlist(struct sk_buff *skb, - const struct hash_ipportnet6_elem *data) +static inline void +hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next, + const struct hash_ipportnet6_elem *d) { - const struct hash_ipportnet6_telem *e = - (const struct hash_ipportnet6_telem *)data; - u32 flags = data->nomatch ? 
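
The range-add loops above restart from h->next when retried, so an add interrupted partway through an (ip, port) range resumes at the last attempted pair rather than the beginning. A rough userspace sketch of that resume-on-retry pattern, with a toy cursor and a fake add function standing in for adtfn (all names here are hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cursor { uint32_t ip; uint16_t port; };

/* Toy "adtfn": fail once mid-range to stand in for a retry-triggering add. */
static bool add_elem(uint32_t ip, uint16_t port, bool *need_retry)
{
	static int calls;

	if (++calls == 3) {
		*need_retry = true;
		return false;
	}
	printf("added 0x%08x:%u\n", ip, port);
	return true;
}

static int add_range(uint32_t ip_from, uint32_t ip_to,
		     uint16_t port_from, uint16_t port_to,
		     struct cursor *next, bool retried)
{
	uint32_t ip = retried ? next->ip : ip_from;

	for (; ip <= ip_to; ip++) {
		/* On retry, resume the inner range only for the saved ip. */
		uint32_t port = (retried && ip == next->ip) ? next->port : port_from;

		for (; port <= port_to; port++) {
			bool need_retry = false;

			if (!add_elem(ip, (uint16_t)port, &need_retry)) {
				next->ip = ip;	/* remember where we stopped */
				next->port = (uint16_t)port;
				return need_retry ? 1 : -1;
			}
		}
	}
	return 0;
}

int main(void)
{
	struct cursor next = { 0, 0 };

	if (add_range(0x0a000001, 0x0a000002, 80, 81, &next, false) == 1)
		add_range(0x0a000001, 0x0a000002, 80, 81, &next, true);
	return 0;
}
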
IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || - nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || - nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_ipportnet6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportnet6_data_next(struct ip_set_hash *h, - const struct hash_ipportnet6_elem *d) -{ - h->next.port = d->port; -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet6_elem data = { + struct hash_ipportnet6_elem e = { .cidr = h->nets[0].cidr ? h->nets[0].cidr - 1 : HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (adt == IPSET_TEST) - data.cidr = HOST_MASK - 1; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6); - ip6_netmask(&data.ip2, data.cidr + 1); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); + ip6_netmask(&e.ip2, e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet6_elem data = { .cidr = HOST_MASK - 1 }; + struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; u8 cidr; int ret; @@ -543,6 +490,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -552,11 +501,12 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; @@ -564,46 
+514,41 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - data.cidr = cidr - 1; + e.cidr = cidr - 1; } - ip6_netmask(&data.ip2, data.cidr + 1); + ip6_netmask(&e.ip2, e.cidr + 1); if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); @@ -611,8 +556,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -622,81 +567,6 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_ipportnet_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == NFPROTO_IPV4 ? 
32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_ipportnet4_tvariant - : &hash_ipportnet6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_ipportnet4_gc_init(set); - else - hash_ipportnet6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? &hash_ipportnet4_variant : &hash_ipportnet6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ipportnet_type __read_mostly = { .name = "hash:ip,port,net", .protocol = IPSET_PROTOCOL, @@ -713,6 +583,7 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -727,6 +598,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 4b677cf6bf7..da740ceb56a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -20,12 +20,12 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 /* 1 Range as input support for IPv4 added */ -#define REVISION_MAX 2 /* nomatch flag support added */ +/* 2 nomatch flag support added */ +#define REVISION_MAX 3 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -33,33 +33,46 @@ IP_SET_MODULE_DESC("hash:net", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:net"); /* Type specific function prefix */ -#define TYPE hash_net +#define HTYPE hash_net +#define IP_SET_HASH_WITH_NETS -static bool -hash_net_same_set(const struct ip_set *a, const struct ip_set *b); +/* IPv4 variants */ -#define hash_net4_same_set hash_net_same_set -#define hash_net6_same_set hash_net_same_set +/* Member elements */ +struct hash_net4_elem { + __be32 ip; + u16 padding0; + u8 nomatch; + u8 cidr; +}; -/* The type variant functions: IPv4 */ +struct hash_net4t_elem { + __be32 ip; + u16 padding0; + u8 nomatch; + u8 cidr; + unsigned long timeout; +}; -/* Member 
elements without timeout */ -struct hash_net4_elem { +struct hash_net4c_elem { __be32 ip; u16 padding0; u8 nomatch; u8 cidr; + struct ip_set_counter counter; }; -/* Member elements with timeout support */ -struct hash_net4_telem { +struct hash_net4ct_elem { __be32 ip; u16 padding0; u8 nomatch; u8 cidr; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_net4_data_equal(const struct hash_net4_elem *ip1, const struct hash_net4_elem *ip2, @@ -69,31 +82,22 @@ hash_net4_data_equal(const struct hash_net4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline bool -hash_net4_data_isnull(const struct hash_net4_elem *elem) +static inline int +hash_net4_do_data_match(const struct hash_net4_elem *elem) { - return elem->cidr == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_net4_data_copy(struct hash_net4_elem *dst, - const struct hash_net4_elem *src) +hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) { - dst->ip = src->ip; - dst->cidr = src->cidr; - dst->nomatch = src->nomatch; + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -hash_net4_data_flags(struct hash_net4_elem *dst, u32 flags) -{ - dst->nomatch = flags & IPSET_FLAG_NOMATCH; -} - -static inline int -hash_net4_data_match(const struct hash_net4_elem *elem) +hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) { - return elem->nomatch ? -ENOTEMPTY : 1; + swap(*flags, elem->nomatch); } static inline void @@ -103,13 +107,6 @@ hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) elem->cidr = cidr; } -/* Zero CIDR values cannot be stored */ -static inline void -hash_net4_data_zero_out(struct hash_net4_elem *elem) -{ - elem->cidr = 0; -} - static bool hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) { @@ -126,106 +123,84 @@ nla_put_failure: return 1; } -static bool -hash_net4_data_tlist(struct sk_buff *skb, const struct hash_net4_elem *data) +static inline void +hash_net4_data_next(struct hash_net4_elem *next, + const struct hash_net4_elem *d) { - const struct hash_net4_telem *tdata = - (const struct hash_net4_telem *)data; - u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_u8(skb, IPSET_ATTR_CIDR, tdata->cidr) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; } -#define IP_SET_HASH_WITH_NETS - +#define MTYPE hash_net4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_net4_data_next(struct ip_set_hash *h, - const struct hash_net4_elem *d) -{ - h->next.ip = d->ip; -} +#include "ip_set_hash_gen.h" static int hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net4_elem data = { + struct hash_net4_elem e = { .cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - data.ip &= ip_set_netmask(data.cidr); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net4_elem data = { .cidr = HOST_MASK }; - u32 timeout = h->timeout; + struct hash_net4_elem e = { .cidr = HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 ip = 0, ip_to, last; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!data.cidr || data.cidr > HOST_MASK) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr || e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { - data.ip = htonl(ip & ip_set_hostmask(data.cidr)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + e.ip = htonl(ip & ip_set_hostmask(e.cidr)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 
0 : ret; } ip_to = ip; @@ -241,9 +216,9 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); while (!after(ip, ip_to)) { - data.ip = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &data.cidr); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; else @@ -253,34 +228,42 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_net_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} +struct hash_net6_elem { + union nf_inet_addr ip; + u16 padding0; + u8 nomatch; + u8 cidr; +}; -/* The type variant functions: IPv6 */ +struct hash_net6t_elem { + union nf_inet_addr ip; + u16 padding0; + u8 nomatch; + u8 cidr; + unsigned long timeout; +}; -struct hash_net6_elem { +struct hash_net6c_elem { union nf_inet_addr ip; u16 padding0; u8 nomatch; u8 cidr; + struct ip_set_counter counter; }; -struct hash_net6_telem { +struct hash_net6ct_elem { union nf_inet_addr ip; u16 padding0; u8 nomatch; u8 cidr; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_net6_data_equal(const struct hash_net6_elem *ip1, const struct hash_net6_elem *ip2, @@ -290,46 +273,22 @@ hash_net6_data_equal(const struct hash_net6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline bool -hash_net6_data_isnull(const struct hash_net6_elem *elem) -{ - return elem->cidr == 0; -} - -static inline void -hash_net6_data_copy(struct hash_net6_elem *dst, - const struct hash_net6_elem *src) -{ - dst->ip.in6 = src->ip.in6; - dst->cidr = src->cidr; - dst->nomatch = src->nomatch; -} - -static inline void -hash_net6_data_flags(struct hash_net6_elem *dst, u32 flags) -{ - dst->nomatch = flags & IPSET_FLAG_NOMATCH; -} - static inline int -hash_net6_data_match(const struct hash_net6_elem *elem) +hash_net6_do_data_match(const struct hash_net6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_net6_data_zero_out(struct hash_net6_elem *elem) +hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) { - elem->cidr = 0; + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void @@ -355,74 +314,60 @@ nla_put_failure: return 1; } -static bool -hash_net6_data_tlist(struct sk_buff *skb, const struct hash_net6_elem *data) +static inline void +hash_net6_data_next(struct hash_net4_elem *next, + const struct hash_net6_elem *d) { - const struct hash_net6_telem *e = - (const struct hash_net6_telem *)data; - u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_u8(skb, IPSET_ATTR_CIDR, e->cidr) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_net6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_net6_data_next(struct ip_set_hash *h, - const struct hash_net6_elem *d) -{ -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net6_elem data = { + struct hash_net6_elem e = { .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6_netmask(&data.ip, data.cidr); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net6_elem data = { .cidr = HOST_MASK }; - u32 timeout = h->timeout; + struct hash_net6_elem e = { .cidr = HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -430,107 +375,29 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!data.cidr || data.cidr > HOST_MASK) + if (!e.cidr || e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - ip6_netmask(&data.ip, data.cidr); - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + ip6_netmask(&e.ip, e.cidr); - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); - } - - ret = adtfn(set, &data, timeout, flags); - - return 
ip_set_eexist(ret, flags) ? 0 : ret; -} - -/* Create hash:ip type of sets */ - -static int -hash_net_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - struct ip_set_hash *h; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; + flags |= (IPSET_FLAG_NOMATCH << 16); } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + ret = adtfn(set, &e, &ext, &ext, flags); - set->variant = set->family == NFPROTO_IPV4 - ? &hash_net4_tvariant : &hash_net6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_net4_gc_init(set); - else - hash_net6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? &hash_net4_variant : &hash_net6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 
0 : ret; } static struct ip_set_type hash_net_type __read_mostly = { @@ -548,6 +415,7 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -555,6 +423,8 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 6ba985f1c96..84ae6f6ce62 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,12 +21,12 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 /* 1 nomatch flag support added */ -#define REVISION_MAX 2 /* /0 support added */ +/* 2 /0 support added */ +#define REVISION_MAX 3 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -127,17 +127,14 @@ iface_add(struct rb_root *root, const char **iface) } /* Type specific function prefix */ -#define TYPE hash_netiface - -static bool -hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_netiface4_same_set hash_netiface_same_set -#define hash_netiface6_same_set hash_netiface_same_set +#define HTYPE hash_netiface +#define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_RBTREE +#define IP_SET_HASH_WITH_MULTI #define STREQ(a, b) (strcmp(a, b) == 0) -/* The type variant functions: IPv4 */ +/* IPv4 variants */ struct hash_netiface4_elem_hashed { __be32 ip; @@ -147,8 +144,6 @@ struct hash_netiface4_elem_hashed { u8 elem; }; -#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) - /* Member elements without timeout */ struct hash_netiface4_elem { __be32 ip; @@ -159,17 +154,39 @@ struct hash_netiface4_elem { const char *iface; }; -/* Member elements with timeout support */ -struct hash_netiface4_telem { +struct hash_netiface4t_elem { + __be32 ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; + const char *iface; + unsigned long timeout; +}; + +struct hash_netiface4c_elem { + __be32 ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; + const char *iface; + struct ip_set_counter counter; +}; + +struct hash_netiface4ct_elem { __be32 ip; u8 physdev; u8 cidr; u8 nomatch; u8 elem; const char *iface; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, const struct hash_netiface4_elem *ip2, @@ -182,29 +199,22 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->iface == ip2->iface; } -static inline bool -hash_netiface4_data_isnull(const struct hash_netiface4_elem *elem) +static inline int 
+hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) { - return elem->elem == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_netiface4_data_copy(struct hash_netiface4_elem *dst, - const struct hash_netiface4_elem *src) +hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) { - memcpy(dst, src, sizeof(*dst)); + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -hash_netiface4_data_flags(struct hash_netiface4_elem *dst, u32 flags) -{ - dst->nomatch = flags & IPSET_FLAG_NOMATCH; -} - -static inline int -hash_netiface4_data_match(const struct hash_netiface4_elem *elem) +hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) { - return elem->nomatch ? -ENOTEMPTY : 1; + swap(*flags, elem->nomatch); } static inline void @@ -214,12 +224,6 @@ hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) elem->cidr = cidr; } -static inline void -hash_netiface4_data_zero_out(struct hash_netiface4_elem *elem) -{ - elem->elem = 0; -} - static bool hash_netiface4_data_list(struct sk_buff *skb, const struct hash_netiface4_elem *data) @@ -240,66 +244,40 @@ nla_put_failure: return 1; } -static bool -hash_netiface4_data_tlist(struct sk_buff *skb, - const struct hash_netiface4_elem *data) +static inline void +hash_netiface4_data_next(struct hash_netiface4_elem *next, + const struct hash_netiface4_elem *d) { - const struct hash_netiface4_telem *tdata = - (const struct hash_netiface4_telem *)data; - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; - - if (data->nomatch) - flags |= IPSET_FLAG_NOMATCH; - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || - nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || - nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout)))) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; } -#define IP_SET_HASH_WITH_NETS -#define IP_SET_HASH_WITH_RBTREE -#define IP_SET_HASH_WITH_MULTI - +#define MTYPE hash_netiface4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netiface4_data_next(struct ip_set_hash *h, - const struct hash_netiface4_elem *d) -{ - h->next.ip = d->ip; -} +#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) +#include "ip_set_hash_gen.h" static int hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface4_elem data = { + struct hash_netiface4_elem e = { .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK, .elem = 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); int ret; - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - data.ip &= ip_set_netmask(data.cidr); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr); #define IFACE(dir) (par->dir ? par->dir->name : NULL) #define PHYSDEV(dir) (nf_bridge->dir ? 
nf_bridge->dir->name : NULL) @@ -311,72 +289,69 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, if (!nf_bridge) return -EINVAL; - data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); - data.physdev = 1; + e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + e.physdev = 1; #else - data.iface = NULL; + e.iface = NULL; #endif } else - data.iface = SRCDIR ? IFACE(in) : IFACE(out); + e.iface = SRCDIR ? IFACE(in) : IFACE(out); - if (!data.iface) + if (!e.iface) return -EINVAL; - ret = iface_test(&h->rbtree, &data.iface); + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } } else if (!ret) return ret; - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface4_elem data = { .cidr = HOST_MASK, .elem = 1 }; + struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 ip = 0, ip_to, last; - u32 timeout = h->timeout; char iface[IFNAMSIZ]; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (data.cidr > HOST_MASK) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - data.iface = iface; - ret = iface_test(&h->rbtree, &data.iface); + e.iface = iface; + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } @@ -386,14 +361,15 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_PHYSDEV) - data.physdev = 1; - if (adt == IPSET_ADD && (cadt_flags & IPSET_FLAG_NOMATCH)) - flags |= (cadt_flags << 16); + e.physdev = 1; + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { - data.ip = htonl(ip & ip_set_hostmask(data.cidr)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + e.ip = htonl(ip & ip_set_hostmask(e.cidr)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 
0 : ret; } if (tb[IPSET_ATTR_IP_TO]) { @@ -404,16 +380,15 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else { - ip_set_mask_from_to(ip, ip_to, data.cidr); - } + } else + ip_set_mask_from_to(ip, ip_to, e.cidr); if (retried) ip = ntohl(h->next.ip); while (!after(ip, ip_to)) { - data.ip = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &data.cidr); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -424,18 +399,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; - - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} - -/* The type variant functions: IPv6 */ +/* IPv6 variants */ struct hash_netiface6_elem_hashed { union nf_inet_addr ip; @@ -445,8 +409,6 @@ struct hash_netiface6_elem_hashed { u8 elem; }; -#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) - struct hash_netiface6_elem { union nf_inet_addr ip; u8 physdev; @@ -456,16 +418,39 @@ struct hash_netiface6_elem { const char *iface; }; -struct hash_netiface6_telem { +struct hash_netiface6t_elem { + union nf_inet_addr ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; + const char *iface; + unsigned long timeout; +}; + +struct hash_netiface6c_elem { + union nf_inet_addr ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; + const char *iface; + struct ip_set_counter counter; +}; + +struct hash_netiface6ct_elem { union nf_inet_addr ip; u8 physdev; u8 cidr; u8 nomatch; u8 elem; const char *iface; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, const struct hash_netiface6_elem *ip2, @@ -478,44 +463,22 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, ip1->iface == ip2->iface; } -static inline bool -hash_netiface6_data_isnull(const struct hash_netiface6_elem *elem) -{ - return elem->elem == 0; -} - -static inline void -hash_netiface6_data_copy(struct hash_netiface6_elem *dst, - const struct hash_netiface6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_netiface6_data_flags(struct hash_netiface6_elem *dst, u32 flags) -{ - dst->nomatch = flags & IPSET_FLAG_NOMATCH; -} - static inline int -hash_netiface6_data_match(const struct hash_netiface6_elem *elem) +hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } static inline void -hash_netiface6_data_zero_out(struct hash_netiface6_elem *elem) +hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) { - elem->elem = 0; + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void @@ -545,63 +508,45 @@ nla_put_failure: return 1; } -static bool -hash_netiface6_data_tlist(struct sk_buff *skb, - const struct hash_netiface6_elem *data) +static inline void +hash_netiface6_data_next(struct hash_netiface4_elem *next, + const struct hash_netiface6_elem *d) { - const struct hash_netiface6_telem *e = - (const struct hash_netiface6_telem *)data; - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; - - if (data->nomatch) - flags |= IPSET_FLAG_NOMATCH; - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || - nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; } +#undef MTYPE #undef PF #undef HOST_MASK +#undef HKEY_DATALEN +#define MTYPE hash_netiface6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netiface6_data_next(struct ip_set_hash *h, - const struct hash_netiface6_elem *d) -{ -} +#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface6_elem data = { + struct hash_netiface6_elem e = { .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK, .elem = 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); int ret; - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6_netmask(&data.ip, data.cidr); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr); if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #ifdef CONFIG_BRIDGE_NETFILTER @@ -609,44 +554,46 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (!nf_bridge) return -EINVAL; - data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); - data.physdev = 1; + e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + e.physdev = 1; #else - data.iface = NULL; + e.iface = NULL; #endif } else - data.iface = SRCDIR ? IFACE(in) : IFACE(out); + e.iface = SRCDIR ? 
IFACE(in) : IFACE(out); - if (!data.iface) + if (!e.iface) return -EINVAL; - ret = iface_test(&h->rbtree, &data.iface); + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } } else if (!ret) return ret; - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface6_elem data = { .cidr = HOST_MASK, .elem = 1 }; - u32 timeout = h->timeout; + struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); char iface[IFNAMSIZ]; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -654,28 +601,23 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (data.cidr > HOST_MASK) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - ip6_netmask(&data.ip, data.cidr); - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + ip6_netmask(&e.ip, e.cidr); strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - data.iface = iface; - ret = iface_test(&h->rbtree, &data.iface); + e.iface = iface; + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } @@ -685,90 +627,15 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_PHYSDEV) - data.physdev = 1; - if (adt == IPSET_ADD && (cadt_flags & IPSET_FLAG_NOMATCH)) - flags |= (cadt_flags << 16); + e.physdev = 1; + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_eexist(ret, flags) ? 
0 : ret; -} - -/* Create hash:ip type of sets */ - -static int -hash_netiface_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - h->ahash_max = AHASH_MAX_SIZE; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - h->rbtree = RB_ROOT; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_netiface4_tvariant : &hash_netiface6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_netiface4_gc_init(set); - else - hash_netiface6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? &hash_netiface4_variant : &hash_netiface6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 
0 : ret; } static struct ip_set_type hash_netiface_type __read_mostly = { @@ -788,6 +655,7 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -798,6 +666,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index af20c0c5ced..9a0869853be 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -20,14 +20,14 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ -#define REVISION_MAX 3 /* nomatch flag support added */ +/* 3 nomatch flag support added */ +#define REVISION_MAX 4 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -35,15 +35,9 @@ IP_SET_MODULE_DESC("hash:net,port", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:net,port"); /* Type specific function prefix */ -#define TYPE hash_netport - -static bool -hash_netport_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_netport4_same_set hash_netport_same_set -#define hash_netport6_same_set hash_netport_same_set - -/* The type variant functions: IPv4 */ +#define HTYPE hash_netport +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr - 1, @@ -51,7 +45,9 @@ hash_netport_same_set(const struct ip_set *a, const struct ip_set *b); */ #define IP_SET_HASH_WITH_NETS_PACKED -/* Member elements without timeout */ +/* IPv4 variants */ + +/* Member elements */ struct hash_netport4_elem { __be32 ip; __be16 port; @@ -60,8 +56,7 @@ struct hash_netport4_elem { u8 nomatch:1; }; -/* Member elements with timeout support */ -struct hash_netport4_telem { +struct hash_netport4t_elem { __be32 ip; __be16 port; u8 proto; @@ -70,6 +65,27 @@ struct hash_netport4_telem { unsigned long timeout; }; +struct hash_netport4c_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; + struct ip_set_counter counter; +}; + +struct hash_netport4ct_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; + struct ip_set_counter counter; + unsigned long timeout; +}; + +/* Common functions */ + static inline bool hash_netport4_data_equal(const struct hash_netport4_elem *ip1, const struct hash_netport4_elem *ip2, @@ -81,33 +97,22 @@ 
hash_netport4_data_equal(const struct hash_netport4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline bool -hash_netport4_data_isnull(const struct hash_netport4_elem *elem) +static inline int +hash_netport4_do_data_match(const struct hash_netport4_elem *elem) { - return elem->proto == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_netport4_data_copy(struct hash_netport4_elem *dst, - const struct hash_netport4_elem *src) +hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) { - dst->ip = src->ip; - dst->port = src->port; - dst->proto = src->proto; - dst->cidr = src->cidr; - dst->nomatch = src->nomatch; + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -hash_netport4_data_flags(struct hash_netport4_elem *dst, u32 flags) +hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) { - dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); -} - -static inline int -hash_netport4_data_match(const struct hash_netport4_elem *elem) -{ - return elem->nomatch ? -ENOTEMPTY : 1; + swap(*flags, elem->nomatch); } static inline void @@ -117,12 +122,6 @@ hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) elem->cidr = cidr - 1; } -static inline void -hash_netport4_data_zero_out(struct hash_netport4_elem *elem) -{ - elem->proto = 0; -} - static bool hash_netport4_data_list(struct sk_buff *skb, const struct hash_netport4_elem *data) @@ -142,77 +141,53 @@ nla_put_failure: return 1; } -static bool -hash_netport4_data_tlist(struct sk_buff *skb, - const struct hash_netport4_elem *data) +static inline void +hash_netport4_data_next(struct hash_netport4_elem *next, + const struct hash_netport4_elem *d) { - const struct hash_netport4_telem *tdata = - (const struct hash_netport4_telem *)data; - u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) || - nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; + next->port = d->port; } -#define IP_SET_HASH_WITH_PROTO -#define IP_SET_HASH_WITH_NETS - +#define MTYPE hash_netport4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netport4_data_next(struct ip_set_hash *h, - const struct hash_netport4_elem *d) -{ - h->next.ip = d->ip; - h->next.port = d->port; -} +#include "ip_set_hash_gen.h" static int hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport4_elem data = { + struct hash_netport4_elem e = { .cidr = h->nets[0].cidr ? 
h->nets[0].cidr - 1 : HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (adt == IPSET_TEST) - data.cidr = HOST_MASK - 1; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - data.ip &= ip_set_netmask(data.cidr + 1); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport4_elem data = { .cidr = HOST_MASK - 1 }; + struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 port, port_to, p = 0, ip = 0, ip_to, last; - u32 timeout = h->timeout; bool with_ports = false; u8 cidr; int ret; @@ -221,13 +196,16 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -235,47 +213,42 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - data.cidr = cidr - 1; + e.cidr = cidr - 1; } if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) { - data.ip = htonl(ip & ip_set_hostmask(data.cidr + 1)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 
0 : ret; + e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 0 : ret; } - port = port_to = ntohs(data.port); + port = port_to = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port_to < port) @@ -289,21 +262,20 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else { - ip_set_mask_from_to(ip, ip_to, data.cidr + 1); - } + } else + ip_set_mask_from_to(ip, ip_to, e.cidr + 1); if (retried) ip = ntohl(h->next.ip); while (!after(ip, ip_to)) { - data.ip = htonl(ip); + e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &cidr); - data.cidr = cidr - 1; + e.cidr = cidr - 1; p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { - data.port = htons(p); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -315,36 +287,46 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_netport_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} +struct hash_netport6_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; +}; -/* The type variant functions: IPv6 */ +struct hash_netport6t_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; + unsigned long timeout; +}; -struct hash_netport6_elem { +struct hash_netport6c_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 cidr:7; u8 nomatch:1; + struct ip_set_counter counter; }; -struct hash_netport6_telem { +struct hash_netport6ct_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 cidr:7; u8 nomatch:1; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_netport6_data_equal(const struct hash_netport6_elem *ip1, const struct hash_netport6_elem *ip2, @@ -356,44 +338,22 @@ hash_netport6_data_equal(const struct hash_netport6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline bool -hash_netport6_data_isnull(const struct hash_netport6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_netport6_data_copy(struct hash_netport6_elem *dst, - const struct hash_netport6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_netport6_data_flags(struct hash_netport6_elem *dst, u32 flags) -{ - dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); -} - static inline int -hash_netport6_data_match(const struct hash_netport6_elem *elem) +hash_netport6_do_data_match(const struct hash_netport6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } static inline void -hash_netport6_data_zero_out(struct hash_netport6_elem *elem) +hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) { - elem->proto = 0; + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void @@ -422,76 +382,57 @@ nla_put_failure: return 1; } -static bool -hash_netport6_data_tlist(struct sk_buff *skb, - const struct hash_netport6_elem *data) +static inline void +hash_netport6_data_next(struct hash_netport4_elem *next, + const struct hash_netport6_elem *d) { - const struct hash_netport6_telem *e = - (const struct hash_netport6_telem *)data; - u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || - nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_netport6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netport6_data_next(struct ip_set_hash *h, - const struct hash_netport6_elem *d) -{ - h->next.port = d->port; -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport6_elem data = { + struct hash_netport6_elem e = { .cidr = h->nets[0].cidr ? 
h->nets[0].cidr - 1 : HOST_MASK - 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (adt == IPSET_TEST) - data.cidr = HOST_MASK - 1; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6_netmask(&data.ip, data.cidr + 1); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport6_elem data = { .cidr = HOST_MASK - 1 }; + struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; u8 cidr; int ret; @@ -500,7 +441,9 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -508,7 +451,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -516,45 +460,40 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - data.cidr = cidr - 1; + e.cidr = cidr - 1; } - ip6_netmask(&data.ip, data.cidr + 1); + ip6_netmask(&e.ip, e.cidr + 1); if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 
0 : ret; + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); @@ -562,8 +501,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -573,80 +512,6 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_netport_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_netport4_tvariant : &hash_netport6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_netport4_gc_init(set); - else - hash_netport6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? 
&hash_netport4_variant : &hash_netport6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_netport_type __read_mostly = { .name = "hash:net,port", .protocol = IPSET_PROTOCOL, @@ -663,6 +528,7 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -674,6 +540,8 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 8371c2bac2e..979b8c90e42 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -13,30 +13,53 @@ #include <linux/errno.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_list.h> #define REVISION_MIN 0 -#define REVISION_MAX 0 +#define REVISION_MAX 1 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("list:set", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_list:set"); -/* Member elements without and with timeout */ +/* Member elements */ struct set_elem { ip_set_id_t id; }; -struct set_telem { - ip_set_id_t id; +struct sett_elem { + struct { + ip_set_id_t id; + } __attribute__ ((aligned)); + unsigned long timeout; +}; + +struct setc_elem { + struct { + ip_set_id_t id; + } __attribute__ ((aligned)); + struct ip_set_counter counter; +}; + +struct setct_elem { + struct { + ip_set_id_t id; + } __attribute__ ((aligned)); + struct ip_set_counter counter; unsigned long timeout; }; +struct set_adt_elem { + ip_set_id_t id; + ip_set_id_t refid; + int before; +}; + /* Type structure */ struct list_set { size_t dsize; /* element size */ + size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ u32 size; /* size of set list array */ u32 timeout; /* timeout value */ struct timer_list gc; /* garbage collection */ @@ -49,175 +72,311 @@ list_set_elem(const struct list_set *map, u32 id) return (struct set_elem *)((void *)map->members + id * map->dsize); } -static inline struct set_telem * -list_set_telem(const struct list_set *map, u32 id) -{ - return (struct set_telem *)((void *)map->members + id * map->dsize); -} +#define ext_timeout(e, m) \ +(unsigned long *)((void *)(e) + (m)->offset[IPSET_OFFSET_TIMEOUT]) +#define ext_counter(e, m) \ +(struct ip_set_counter *)((void *)(e) + (m)->offset[IPSET_OFFSET_COUNTER]) -static inline bool -list_set_timeout(const struct list_set *map, u32 id) +static int +list_set_ktest(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + struct 
ip_set_adt_opt *opt, const struct ip_set_ext *ext) { - const struct set_telem *elem = list_set_telem(map, id); + struct list_set *map = set->data; + struct set_elem *e; + u32 i, cmdflags = opt->cmdflags; + int ret; - return ip_set_timeout_test(elem->timeout); + /* Don't lookup sub-counters at all */ + opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; + if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) + opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; + for (i = 0; i < map->size; i++) { + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + return 0; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) + continue; + ret = ip_set_test(e->id, skb, par, opt); + if (ret > 0) { + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(e, map), + ext, &opt->ext, + cmdflags); + return ret; + } + } + return 0; } -static inline bool -list_set_expired(const struct list_set *map, u32 id) +static int +list_set_kadd(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { - const struct set_telem *elem = list_set_telem(map, id); + struct list_set *map = set->data; + struct set_elem *e; + u32 i; + int ret; - return ip_set_timeout_expired(elem->timeout); + for (i = 0; i < map->size; i++) { + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + return 0; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) + continue; + ret = ip_set_add(e->id, skb, par, opt); + if (ret == 0) + return ret; + } + return 0; } -/* Set list without and with timeout */ - static int -list_set_kadt(struct ip_set *set, const struct sk_buff *skb, +list_set_kdel(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { struct list_set *map = set->data; - struct set_elem *elem; + struct set_elem *e; u32 i; int ret; for (i = 0; i < map->size; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID) + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) return 0; - if (with_timeout(map->timeout) && list_set_expired(map, i)) + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) continue; - switch (adt) { - case IPSET_TEST: - ret = ip_set_test(elem->id, skb, par, opt); - if (ret > 0) - return ret; - break; - case IPSET_ADD: - ret = ip_set_add(elem->id, skb, par, opt); - if (ret == 0) - return ret; - break; - case IPSET_DEL: - ret = ip_set_del(elem->id, skb, par, opt); - if (ret == 0) - return ret; - break; - default: - break; - } + ret = ip_set_del(e->id, skb, par, opt); + if (ret == 0) + return ret; + } + return 0; +} + +static int +list_set_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + struct list_set *map = set->data; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); + + switch (adt) { + case IPSET_TEST: + return list_set_ktest(set, skb, par, opt, &ext); + case IPSET_ADD: + return list_set_kadd(set, skb, par, opt, &ext); + case IPSET_DEL: + return list_set_kdel(set, skb, par, opt, &ext); + default: + break; } return -EINVAL; } static bool -id_eq(const struct list_set *map, u32 i, ip_set_id_t id) +id_eq(const struct ip_set *set, u32 i, ip_set_id_t id) { - const struct set_elem *elem; + const struct list_set *map = set->data; + const struct set_elem *e; + + if (i >= map->size) + 
return 0; - if (i < map->size) { - elem = list_set_elem(map, i); - return elem->id == id; + e = list_set_elem(map, i); + return !!(e->id == id && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map)))); +} + +static int +list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, + const struct ip_set_ext *ext) +{ + struct list_set *map = set->data; + struct set_elem *e = list_set_elem(map, i); + + if (e->id != IPSET_INVALID_ID) { + if (i == map->size - 1) + /* Last element replaced: e.g. add new,before,last */ + ip_set_put_byindex(e->id); + else { + struct set_elem *x = list_set_elem(map, map->size - 1); + + /* Last element pushed off */ + if (x->id != IPSET_INVALID_ID) + ip_set_put_byindex(x->id); + memmove(list_set_elem(map, i + 1), e, + map->dsize * (map->size - (i + 1))); + } } + e->id = d->id; + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, map), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, map), ext); return 0; } -static bool -id_eq_timeout(const struct list_set *map, u32 i, ip_set_id_t id) +static int +list_set_del(struct ip_set *set, u32 i) { - const struct set_elem *elem; + struct list_set *map = set->data; + struct set_elem *e = list_set_elem(map, i); - if (i < map->size) { - elem = list_set_elem(map, i); - return !!(elem->id == id && - !(with_timeout(map->timeout) && - list_set_expired(map, i))); - } + ip_set_put_byindex(e->id); + + if (i < map->size - 1) + memmove(e, list_set_elem(map, i + 1), + map->dsize * (map->size - (i + 1))); + /* Last element */ + e = list_set_elem(map, map->size - 1); + e->id = IPSET_INVALID_ID; return 0; } static void -list_elem_add(struct list_set *map, u32 i, ip_set_id_t id) +set_cleanup_entries(struct ip_set *set) { + struct list_set *map = set->data; struct set_elem *e; + u32 i; - for (; i < map->size; i++) { + for (i = 0; i < map->size; i++) { e = list_set_elem(map, i); - swap(e->id, id); - if (e->id == IPSET_INVALID_ID) - break; + if (e->id != IPSET_INVALID_ID && + ip_set_timeout_expired(ext_timeout(e, map))) + list_set_del(set, i); } } -static void -list_elem_tadd(struct list_set *map, u32 i, ip_set_id_t id, - unsigned long timeout) +static int +list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) { - struct set_telem *e; + struct list_set *map = set->data; + struct set_adt_elem *d = value; + struct set_elem *e; + u32 i; + int ret; - for (; i < map->size; i++) { - e = list_set_telem(map, i); - swap(e->id, id); - swap(e->timeout, timeout); + for (i = 0; i < map->size; i++) { + e = list_set_elem(map, i); if (e->id == IPSET_INVALID_ID) - break; + return 0; + else if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) + continue; + else if (e->id != d->id) + continue; + + if (d->before == 0) + return 1; + else if (d->before > 0) + ret = id_eq(set, i + 1, d->refid); + else + ret = i > 0 && id_eq(set, i - 1, d->refid); + return ret; } + return 0; } + static int -list_set_add(struct list_set *map, u32 i, ip_set_id_t id, - unsigned long timeout) +list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) { - const struct set_elem *e = list_set_elem(map, i); - - if (i == map->size - 1 && e->id != IPSET_INVALID_ID) - /* Last element replaced: e.g. 
add new,before,last */ - ip_set_put_byindex(e->id); - if (with_timeout(map->timeout)) - list_elem_tadd(map, i, id, ip_set_timeout_set(timeout)); - else - list_elem_add(map, i, id); + struct list_set *map = set->data; + struct set_adt_elem *d = value; + struct set_elem *e; + bool flag_exist = flags & IPSET_FLAG_EXIST; + u32 i, ret = 0; - return 0; -} + /* Check already added element */ + for (i = 0; i < map->size; i++) { + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + goto insert; + else if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) + continue; + else if (e->id != d->id) + continue; -static int -list_set_del(struct list_set *map, u32 i) -{ - struct set_elem *a = list_set_elem(map, i), *b; - - ip_set_put_byindex(a->id); - - for (; i < map->size - 1; i++) { - b = list_set_elem(map, i + 1); - a->id = b->id; - if (with_timeout(map->timeout)) - ((struct set_telem *)a)->timeout = - ((struct set_telem *)b)->timeout; - a = b; - if (a->id == IPSET_INVALID_ID) - break; + if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) || + (d->before < 0 && + (i == 0 || !id_eq(set, i - 1, d->refid)))) + /* Before/after doesn't match */ + return -IPSET_ERR_REF_EXIST; + if (!flag_exist) + /* Can't re-add */ + return -IPSET_ERR_EXIST; + /* Update extensions */ + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, map), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, map), ext); + /* Set is already added to the list */ + ip_set_put_byindex(d->id); + return 0; } - /* Last element */ - a->id = IPSET_INVALID_ID; - return 0; +insert: + ret = -IPSET_ERR_LIST_FULL; + for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + ret = d->before != 0 ? -IPSET_ERR_REF_EXIST + : list_set_add(set, i, d, ext); + else if (e->id != d->refid) + continue; + else if (d->before > 0) + ret = list_set_add(set, i, d, ext); + else if (i + 1 < map->size) + ret = list_set_add(set, i + 1, d, ext); + } + + return ret; } -static void -cleanup_entries(struct list_set *map) +static int +list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) { - struct set_telem *e; + struct list_set *map = set->data; + struct set_adt_elem *d = value; + struct set_elem *e; u32 i; for (i = 0; i < map->size; i++) { - e = list_set_telem(map, i); - if (e->id != IPSET_INVALID_ID && list_set_expired(map, i)) - list_set_del(map, i); + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + return d->before != 0 ? 
-IPSET_ERR_REF_EXIST + : -IPSET_ERR_EXIST; + else if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) + continue; + else if (e->id != d->id) + continue; + + if (d->before == 0) + return list_set_del(set, i); + else if (d->before > 0) { + if (!id_eq(set, i + 1, d->refid)) + return -IPSET_ERR_REF_EXIST; + return list_set_del(set, i); + } else if (i == 0 || !id_eq(set, i - 1, d->refid)) + return -IPSET_ERR_REF_EXIST; + else + return list_set_del(set, i); } + return -IPSET_ERR_EXIST; } static int @@ -225,26 +384,27 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct list_set *map = set->data; - bool with_timeout = with_timeout(map->timeout); - bool flag_exist = flags & IPSET_FLAG_EXIST; - int before = 0; - u32 timeout = map->timeout; - ip_set_id_t id, refid = IPSET_INVALID_ID; - const struct set_elem *elem; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct set_adt_elem e = { .refid = IPSET_INVALID_ID }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(map); struct ip_set *s; - u32 i; int ret = 0; if (unlikely(!tb[IPSET_ATTR_NAME] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s); - if (id == IPSET_INVALID_ID) + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + e.id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s); + if (e.id == IPSET_INVALID_ID) return -IPSET_ERR_NAME; /* "Loop detection" */ if (s->type->features & IPSET_TYPE_NAME) { @@ -254,115 +414,34 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); - before = f & IPSET_FLAG_BEFORE; + e.before = f & IPSET_FLAG_BEFORE; } - if (before && !tb[IPSET_ATTR_NAMEREF]) { + if (e.before && !tb[IPSET_ATTR_NAMEREF]) { ret = -IPSET_ERR_BEFORE; goto finish; } if (tb[IPSET_ATTR_NAMEREF]) { - refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]), - &s); - if (refid == IPSET_INVALID_ID) { + e.refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]), + &s); + if (e.refid == IPSET_INVALID_ID) { ret = -IPSET_ERR_NAMEREF; goto finish; } - if (!before) - before = -1; - } - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout) { - ret = -IPSET_ERR_TIMEOUT; - goto finish; - } - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + if (!e.before) + e.before = -1; } - if (with_timeout && adt != IPSET_TEST) - cleanup_entries(map); + if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set)) + set_cleanup_entries(set); - switch (adt) { - case IPSET_TEST: - for (i = 0; i < map->size && !ret; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID || - (before != 0 && i + 1 >= map->size)) - break; - else if (with_timeout && list_set_expired(map, i)) - continue; - else if (before > 0 && elem->id == id) - ret = id_eq_timeout(map, i + 1, refid); - else if (before < 0 && elem->id == refid) - ret = id_eq_timeout(map, i + 1, id); - else if (before == 0 && elem->id == id) - ret = 1; - } - break; - case IPSET_ADD: - for (i = 0; i < map->size; i++) { - elem = list_set_elem(map, i); - if (elem->id != id) - continue; - if (!(with_timeout && flag_exist)) { - ret = 
-IPSET_ERR_EXIST; - goto finish; - } else { - struct set_telem *e = list_set_telem(map, i); - - if ((before > 1 && - !id_eq(map, i + 1, refid)) || - (before < 0 && - (i == 0 || !id_eq(map, i - 1, refid)))) { - ret = -IPSET_ERR_EXIST; - goto finish; - } - e->timeout = ip_set_timeout_set(timeout); - ip_set_put_byindex(id); - ret = 0; - goto finish; - } - } - ret = -IPSET_ERR_LIST_FULL; - for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID) - ret = before != 0 ? -IPSET_ERR_REF_EXIST - : list_set_add(map, i, id, timeout); - else if (elem->id != refid) - continue; - else if (before > 0) - ret = list_set_add(map, i, id, timeout); - else if (i + 1 < map->size) - ret = list_set_add(map, i + 1, id, timeout); - } - break; - case IPSET_DEL: - ret = -IPSET_ERR_EXIST; - for (i = 0; i < map->size && ret == -IPSET_ERR_EXIST; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID) { - ret = before != 0 ? -IPSET_ERR_REF_EXIST - : -IPSET_ERR_EXIST; - break; - } else if (elem->id == id && - (before == 0 || - (before > 0 && id_eq(map, i + 1, refid)))) - ret = list_set_del(map, i); - else if (elem->id == refid && - before < 0 && id_eq(map, i + 1, id)) - ret = list_set_del(map, i + 1); - } - break; - default: - break; - } + ret = adtfn(set, &e, &ext, &ext, flags); finish: - if (refid != IPSET_INVALID_ID) - ip_set_put_byindex(refid); + if (e.refid != IPSET_INVALID_ID) + ip_set_put_byindex(e.refid); if (adt != IPSET_ADD || ret) - ip_set_put_byindex(id); + ip_set_put_byindex(e.id); return ip_set_eexist(ret, flags) ? 0 : ret; } @@ -371,14 +450,14 @@ static void list_set_flush(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *elem; + struct set_elem *e; u32 i; for (i = 0; i < map->size; i++) { - elem = list_set_elem(map, i); - if (elem->id != IPSET_INVALID_ID) { - ip_set_put_byindex(elem->id); - elem->id = IPSET_INVALID_ID; + e = list_set_elem(map, i); + if (e->id != IPSET_INVALID_ID) { + ip_set_put_byindex(e->id); + e->id = IPSET_INVALID_ID; } } } @@ -388,7 +467,7 @@ list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; - if (with_timeout(map->timeout)) + if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); list_set_flush(set); kfree(map); @@ -406,8 +485,11 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || - (with_timeout(map->timeout) && + (SET_WITH_TIMEOUT(set) && nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout))) || + (SET_WITH_COUNTER(set) && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, + htonl(IPSET_FLAG_WITH_COUNTERS))) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(sizeof(*map) + map->size * map->dsize))) @@ -436,7 +518,8 @@ list_set_list(const struct ip_set *set, e = list_set_elem(map, i); if (e->id == IPSET_INVALID_ID) goto finish; - if (with_timeout(map->timeout) && list_set_expired(map, i)) + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) continue; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) { @@ -449,13 +532,14 @@ list_set_list(const struct ip_set *set, if (nla_put_string(skb, IPSET_ATTR_NAME, ip_set_name_byindex(e->id))) goto nla_put_failure; - if (with_timeout(map->timeout)) { - const struct set_telem *te = - (const struct set_telem *) e; - __be32 to = htonl(ip_set_timeout_get(te->timeout)); - if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, 
to)) - goto nla_put_failure; - } + if (SET_WITH_TIMEOUT(set) && + nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get( + ext_timeout(e, map))))) + goto nla_put_failure; + if (SET_WITH_COUNTER(set) && + ip_set_put_counter(skb, ext_counter(e, map))) + goto nla_put_failure; ipset_nest_end(skb, nested); } finish: @@ -481,12 +565,18 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b) const struct list_set *y = b->data; return x->size == y->size && - x->timeout == y->timeout; + x->timeout == y->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant list_set = { +static const struct ip_set_type_variant set_variant = { .kadt = list_set_kadt, .uadt = list_set_uadt, + .adt = { + [IPSET_ADD] = list_set_uadd, + [IPSET_DEL] = list_set_udel, + [IPSET_TEST] = list_set_utest, + }, .destroy = list_set_destroy, .flush = list_set_flush, .head = list_set_head, @@ -501,7 +591,7 @@ list_set_gc(unsigned long ul_set) struct list_set *map = set->data; write_lock_bh(&set->lock); - cleanup_entries(map); + set_cleanup_entries(set); write_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; @@ -509,20 +599,20 @@ list_set_gc(unsigned long ul_set) } static void -list_set_gc_init(struct ip_set *set) +list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) { struct list_set *map = set->data; init_timer(&map->gc); map->gc.data = (unsigned long) set; - map->gc.function = list_set_gc; + map->gc.function = gc; map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; add_timer(&map->gc); } /* Create list:set type of sets */ -static bool +static struct list_set * init_list_set(struct ip_set *set, u32 size, size_t dsize, unsigned long timeout) { @@ -532,7 +622,7 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize, map = kzalloc(sizeof(*map) + size * dsize, GFP_KERNEL); if (!map) - return false; + return NULL; map->size = size; map->dsize = dsize; @@ -544,16 +634,19 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize, e->id = IPSET_INVALID_ID; } - return true; + return map; } static int list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags) { - u32 size = IP_SET_LIST_DEFAULT_SIZE; + struct list_set *map; + u32 size = IP_SET_LIST_DEFAULT_SIZE, cadt_flags = 0; + unsigned long timeout = 0; if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_SIZE]) @@ -561,18 +654,46 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags) if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!init_list_set(set, size, sizeof(struct set_telem), - ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]))) + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (tb[IPSET_ATTR_TIMEOUT]) + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->variant = &set_variant; + if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { + set->extensions |= IPSET_EXT_COUNTER; + if (tb[IPSET_ATTR_TIMEOUT]) { + map = init_list_set(set, size, + sizeof(struct setct_elem), timeout); + if (!map) + return -ENOMEM; + set->extensions |= IPSET_EXT_TIMEOUT; + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct setct_elem, timeout); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct setct_elem, counter); + list_set_gc_init(set, 
list_set_gc); + } else { + map = init_list_set(set, size, + sizeof(struct setc_elem), 0); + if (!map) + return -ENOMEM; + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct setc_elem, counter); + } + } else if (tb[IPSET_ATTR_TIMEOUT]) { + map = init_list_set(set, size, + sizeof(struct sett_elem), timeout); + if (!map) return -ENOMEM; - - list_set_gc_init(set); + set->extensions |= IPSET_EXT_TIMEOUT; + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct sett_elem, timeout); + list_set_gc_init(set, list_set_gc); } else { - if (!init_list_set(set, size, sizeof(struct set_elem), - IPSET_NO_TIMEOUT)) + map = init_list_set(set, size, sizeof(struct set_elem), 0); + if (!map) return -ENOMEM; } - set->variant = &list_set; return 0; } @@ -588,6 +709,7 @@ static struct ip_set_type list_set_type __read_mostly = { .create_policy = { [IPSET_ATTR_SIZE] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_NAME] = { .type = NLA_STRING, @@ -597,6 +719,8 @@ static struct ip_set_type list_set_type __read_mostly = { [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 0b779d7df88..dfd7b65b3d2 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -58,6 +58,18 @@ static inline void ip_vs_app_put(struct ip_vs_app *app) module_put(app->module); } +static void ip_vs_app_inc_destroy(struct ip_vs_app *inc) +{ + kfree(inc->timeout_table); + kfree(inc); +} + +static void ip_vs_app_inc_rcu_free(struct rcu_head *head) +{ + struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head); + + ip_vs_app_inc_destroy(inc); +} /* * Allocate/initialize app incarnation and register it in proto apps. 
@@ -106,8 +118,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, return 0; out: - kfree(inc->timeout_table); - kfree(inc); + ip_vs_app_inc_destroy(inc); return ret; } @@ -131,8 +142,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) list_del(&inc->a_list); - kfree(inc->timeout_table); - kfree(inc); + call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free); } @@ -144,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc) { int result; - atomic_inc(&inc->usecnt); - if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) - atomic_dec(&inc->usecnt); + result = ip_vs_app_get(inc->app); + if (result) + atomic_inc(&inc->usecnt); return result; } @@ -156,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc) */ void ip_vs_app_inc_put(struct ip_vs_app *inc) { - ip_vs_app_put(inc->app); atomic_dec(&inc->usecnt); + ip_vs_app_put(inc->app); } @@ -218,6 +228,7 @@ out_unlock: /* * ip_vs_app unregistration routine * We are sure there are no app incarnations attached to services + * Caller should use synchronize_rcu() or rcu_barrier() */ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { @@ -341,14 +352,14 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, unsigned int flag, __u32 seq, int diff) { /* spinlock is to keep updating cp->flags atomic */ - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { vseq->previous_delta = vseq->delta; vseq->delta += diff; vseq->init_seq = seq; cp->flags |= flag; } - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 704e514e02a..a083bda322b 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -79,51 +79,21 @@ static unsigned int ip_vs_conn_rnd __read_mostly; struct ip_vs_aligned_lock { - rwlock_t l; + spinlock_t l; } __attribute__((__aligned__(SMP_CACHE_BYTES))); /* lock array for conn table */ static struct ip_vs_aligned_lock __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; -static inline void ct_read_lock(unsigned int key) -{ - read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock(unsigned int key) -{ - read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock(unsigned int key) -{ - write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock(unsigned int key) -{ - write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_lock_bh(unsigned int key) -{ - read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock_bh(unsigned int key) -{ - read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - static inline void ct_write_lock_bh(unsigned int key) { - write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } static inline void ct_write_unlock_bh(unsigned int key) { - write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } @@ -197,13 +167,13 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) /* Hash by protocol, client address and port */ hash = ip_vs_conn_hashkey_conn(cp); - 
ct_write_lock(hash); + ct_write_lock_bh(hash); spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]); cp->flags |= IP_VS_CONN_F_HASHED; atomic_inc(&cp->refcnt); + hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); ret = 1; } else { pr_err("%s(): request for already hashed, called from %pF\n", @@ -212,7 +182,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) } spin_unlock(&cp->lock); - ct_write_unlock(hash); + ct_write_unlock_bh(hash); return ret; } @@ -220,7 +190,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) /* * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. + * returns bool success. Caller should hold conn reference. */ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) { @@ -230,11 +200,11 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) /* unhash it and decrease its reference counter */ hash = ip_vs_conn_hashkey_conn(cp); - ct_write_lock(hash); + ct_write_lock_bh(hash); spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { - hlist_del(&cp->c_list); + hlist_del_rcu(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; atomic_dec(&cp->refcnt); ret = 1; @@ -242,7 +212,37 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) ret = 0; spin_unlock(&cp->lock); - ct_write_unlock(hash); + ct_write_unlock_bh(hash); + + return ret; +} + +/* Try to unlink ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) +{ + unsigned int hash; + bool ret; + + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + ret = false; + /* Decrease refcnt and unlink conn only if we are last user */ + if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { + hlist_del_rcu(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + ret = true; + } + } else + ret = atomic_read(&cp->refcnt) ? 
false : true; + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); return ret; } @@ -262,24 +262,25 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) hash = ip_vs_conn_hashkey_param(p, false); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == p->af && - p->cport == cp->cport && p->vport == cp->vport && + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->cport == cp->cport && p->vport == cp->vport && + cp->af == p->af && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && p->protocol == cp->protocol && ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; /* HIT */ - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); + rcu_read_unlock(); return cp; } } - ct_read_unlock(hash); + rcu_read_unlock(); return NULL; } @@ -346,14 +347,16 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) hash = ip_vs_conn_hashkey_param(p, false); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (!ip_vs_conn_net_eq(cp, p->net)) - continue; - if (p->pe_data && p->pe->ct_match) { - if (p->pe == cp->pe && p->pe->ct_match(p, cp)) - goto out; + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (!ip_vs_conn_net_eq(cp, p->net)) + continue; + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { + if (__ip_vs_conn_get(cp)) + goto out; + } continue; } @@ -363,17 +366,18 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) * p->vaddr is a fwmark */ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, p->vaddr, &cp->vaddr) && - p->cport == cp->cport && p->vport == cp->vport && + p->vport == cp->vport && p->cport == cp->cport && cp->flags & IP_VS_CONN_F_TEMPLATE && - p->protocol == cp->protocol) - goto out; + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + if (__ip_vs_conn_get(cp)) + goto out; + } } cp = NULL; out: - if (cp) - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), @@ -398,23 +402,24 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) */ hash = ip_vs_conn_hashkey_param(p, true); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == p->af && - p->vport == cp->cport && p->cport == cp->dport && + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->vport == cp->cport && p->cport == cp->dport && + cp->af == p->af && ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && p->protocol == cp->protocol && ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; /* HIT */ - atomic_inc(&cp->refcnt); ret = cp; break; } } - ct_read_unlock(hash); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), @@ -457,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { if (ip_vs_conn_unhash(cp)) { - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (cp->flags & IP_VS_CONN_F_NO_CPORT) { atomic_dec(&ip_vs_conn_no_cport_cnt); cp->flags &= ~IP_VS_CONN_F_NO_CPORT; cp->cport = cport; } - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); /* hash on new 
dport */ ip_vs_conn_hash(cp); @@ -549,7 +554,7 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) return; /* Increase the refcnt counter of the dest */ - atomic_inc(&dest->refcnt); + ip_vs_dest_hold(dest); conn_flags = atomic_read(&dest->conn_flags); if (cp->protocol != IPPROTO_UDP) @@ -606,20 +611,22 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) * Check if there is a destination for the connection, if so * bind the connection to the destination. */ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +void ip_vs_try_bind_dest(struct ip_vs_conn *cp) { struct ip_vs_dest *dest; + rcu_read_lock(); dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark, cp->flags); if (dest) { struct ip_vs_proto_data *pd; - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (cp->dest) { - spin_unlock(&cp->lock); - return dest; + spin_unlock_bh(&cp->lock); + rcu_read_unlock(); + return; } /* Applications work depending on the forwarding method @@ -628,7 +635,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) ip_vs_unbind_app(cp); ip_vs_bind_dest(cp, dest); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); /* Update its packet transmitter */ cp->packet_xmit = NULL; @@ -643,7 +650,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) if (pd && atomic_read(&pd->appcnt)) ip_vs_bind_app(cp, pd->pp); } - return dest; + rcu_read_unlock(); } @@ -695,12 +702,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) dest->flags &= ~IP_VS_DEST_F_OVERLOAD; } - /* - * Simply decrease the refcnt of the dest, because the - * dest will be either in service's destination list - * or in the trash. - */ - atomic_dec(&dest->refcnt); + ip_vs_dest_put(dest); } static int expire_quiescent_template(struct netns_ipvs *ipvs, @@ -757,41 +759,36 @@ int ip_vs_check_template(struct ip_vs_conn *ct) * Simply decrease the refcnt of the template, * don't restart its timer. */ - atomic_dec(&ct->refcnt); + __ip_vs_conn_put(ct); return 0; } return 1; } +static void ip_vs_conn_rcu_free(struct rcu_head *head) +{ + struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, + rcu_head); + + ip_vs_pe_put(cp->pe); + kfree(cp->pe_data); + kmem_cache_free(ip_vs_conn_cachep, cp); +} + static void ip_vs_conn_expire(unsigned long data) { struct ip_vs_conn *cp = (struct ip_vs_conn *)data; struct net *net = ip_vs_conn_net(cp); struct netns_ipvs *ipvs = net_ipvs(net); - cp->timeout = 60*HZ; - - /* - * hey, I'm using it - */ - atomic_inc(&cp->refcnt); - /* * do I control anybody? 
*/ if (atomic_read(&cp->n_control)) goto expire_later; - /* - * unhash it if it is hashed in the conn table - */ - if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) - goto expire_later; - - /* - * refcnt==1 implies I'm the only one referrer - */ - if (likely(atomic_read(&cp->refcnt) == 1)) { + /* Unlink conn if not referenced anymore */ + if (likely(ip_vs_conn_unlink(cp))) { /* delete the timer if it is activated by other users */ del_timer(&cp->timer); @@ -810,38 +807,41 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_conn_drop_conntrack(cp); } - ip_vs_pe_put(cp->pe); - kfree(cp->pe_data); if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); atomic_dec(&ipvs->conn_count); - - kmem_cache_free(ip_vs_conn_cachep, cp); return; } - /* hash it back to the table */ - ip_vs_conn_hash(cp); - expire_later: - IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt)-1, + IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt), atomic_read(&cp->n_control)); + atomic_inc(&cp->refcnt); + cp->timeout = 60*HZ; + if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); ip_vs_conn_put(cp); } - +/* Modify timer, so that it expires as soon as possible. + * Can be called without reference only if under RCU lock. + */ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) - mod_timer(&cp->timer, jiffies); + /* Using mod_timer_pending will ensure the timer is not + * modified after the final del_timer in ip_vs_conn_expire. + */ + if (timer_pending(&cp->timer) && + time_after(cp->timer.expires, jiffies)) + mod_timer_pending(&cp->timer, jiffies); } @@ -858,7 +858,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, p->protocol); - cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { IP_VS_ERR_RL("%s(): no memory\n", __func__); return NULL; @@ -869,13 +869,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, ip_vs_conn_net_set(cp, p->net); cp->af = p->af; cp->protocol = p->protocol; - ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); + ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; - ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr); + ip_vs_addr_set(p->af, &cp->vaddr, p->vaddr); cp->vport = p->vport; /* proto should only be IPPROTO_IP if d_addr is a fwmark */ - ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, - &cp->daddr, daddr); + ip_vs_addr_set(p->protocol == IPPROTO_IP ? 
AF_UNSPEC : p->af, + &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; @@ -884,6 +884,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, cp->pe = p->pe; cp->pe_data = p->pe_data; cp->pe_data_len = p->pe_data_len; + } else { + cp->pe = NULL; + cp->pe_data = NULL; + cp->pe_data_len = 0; } spin_lock_init(&cp->lock); @@ -894,18 +898,28 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, */ atomic_set(&cp->refcnt, 1); + cp->control = NULL; atomic_set(&cp->n_control, 0); atomic_set(&cp->in_pkts, 0); + cp->packet_xmit = NULL; + cp->app = NULL; + cp->app_data = NULL; + /* reset struct ip_vs_seq */ + cp->in_seq.delta = 0; + cp->out_seq.delta = 0; + atomic_inc(&ipvs->conn_count); if (flags & IP_VS_CONN_F_NO_CPORT) atomic_inc(&ip_vs_conn_no_cport_cnt); /* Bind the connection with a destination server */ + cp->dest = NULL; ip_vs_bind_dest(cp, dest); /* Set its state and timeout */ cp->state = 0; + cp->old_state = 0; cp->timeout = 3*HZ; cp->sync_endtime = jiffies & ~3UL; @@ -952,24 +966,29 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) struct ip_vs_iter_state *iter = seq->private; for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - ct_read_lock_bh(idx); - hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + /* __ip_vs_conn_get() is not needed by + * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show + */ if (pos-- == 0) { iter->l = &ip_vs_conn_tab[idx]; return cp; } } - ct_read_unlock_bh(idx); + rcu_read_unlock(); + rcu_read_lock(); } return NULL; } static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { struct ip_vs_iter_state *iter = seq->private; iter->l = NULL; + rcu_read_lock(); return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; } @@ -977,6 +996,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_vs_conn *cp = v; struct ip_vs_iter_state *iter = seq->private; + struct hlist_node *e; struct hlist_head *l = iter->l; int idx; @@ -985,31 +1005,27 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) return ip_vs_conn_array(seq, 0); /* more on same hash chain? */ - if (cp->c_list.next) - return hlist_entry(cp->c_list.next, struct ip_vs_conn, c_list); + e = rcu_dereference(hlist_next_rcu(&cp->c_list)); + if (e) + return hlist_entry(e, struct ip_vs_conn, c_list); idx = l - ip_vs_conn_tab; - ct_read_unlock_bh(idx); - while (++idx < ip_vs_conn_tab_size) { - ct_read_lock_bh(idx); - hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { iter->l = &ip_vs_conn_tab[idx]; return cp; } - ct_read_unlock_bh(idx); + rcu_read_unlock(); + rcu_read_lock(); } iter->l = NULL; return NULL; } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { - struct ip_vs_iter_state *iter = seq->private; - struct hlist_head *l = iter->l; - - if (l) - ct_read_unlock_bh(l - ip_vs_conn_tab); + rcu_read_unlock(); } static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) @@ -1188,7 +1204,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp) void ip_vs_random_dropentry(struct net *net) { int idx; - struct ip_vs_conn *cp; + struct ip_vs_conn *cp, *cp_c; /* * Randomly scan 1/32 of the whole table every second @@ -1199,9 +1215,9 @@ void ip_vs_random_dropentry(struct net *net) /* * Lock is actually needed in this loop. 
*/ - ct_write_lock_bh(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; @@ -1228,12 +1244,15 @@ void ip_vs_random_dropentry(struct net *net) IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (cp->control) { + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); } } - ct_write_unlock_bh(hash); + rcu_read_unlock(); } } @@ -1244,7 +1263,7 @@ void ip_vs_random_dropentry(struct net *net) static void ip_vs_conn_flush(struct net *net) { int idx; - struct ip_vs_conn *cp; + struct ip_vs_conn *cp, *cp_c; struct netns_ipvs *ipvs = net_ipvs(net); flush_again: @@ -1252,19 +1271,22 @@ flush_again: /* * Lock is actually needed in this loop. */ - ct_write_lock_bh(idx); + rcu_read_lock(); - hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (!ip_vs_conn_net_eq(cp, net)) continue; IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (cp->control) { + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); } } - ct_write_unlock_bh(idx); + rcu_read_unlock(); } /* the counter may be not NULL, because maybe some conn entries @@ -1331,7 +1353,7 @@ int __init ip_vs_conn_init(void) INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); } /* calculate the random value for connection hash */ @@ -1342,6 +1364,8 @@ int __init ip_vs_conn_init(void) void ip_vs_conn_cleanup(void) { + /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ + rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); vfree(ip_vs_conn_tab); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 61f49d24171..085b5880ab0 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -69,10 +69,7 @@ EXPORT_SYMBOL(ip_vs_conn_put); EXPORT_SYMBOL(ip_vs_get_debug_level); #endif -int ip_vs_net_id __read_mostly; -#ifdef IP_VS_GENERIC_NETNS -EXPORT_SYMBOL(ip_vs_net_id); -#endif +static int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); @@ -206,7 +203,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, { ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, vport, p); - p->pe = svc->pe; + p->pe = rcu_dereference(svc->pe); if (p->pe && p->pe->fill_param) return p->pe->fill_param(p, skb); @@ -238,7 +235,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* Mask saddr with the netmask to adjust template granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, svc->netmask); + ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, + (__force __u32) svc->netmask); else #endif snet.ip = iph->saddr.ip & svc->netmask; @@ -299,12 +297,15 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* Check if a template already exists */ ct = ip_vs_ct_in_get(&param); if (!ct ||
!ip_vs_check_template(ct)) { + struct ip_vs_scheduler *sched; + /* * No template found or the dest of the connection * template is not available. * return *ignored=0 i.e. ICMP and NF_DROP */ - dest = svc->scheduler->schedule(svc, skb); + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb); if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); @@ -394,6 +395,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, { struct ip_vs_protocol *pp = pd->pp; struct ip_vs_conn *cp = NULL; + struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; __be16 _ports[2], *pptr; unsigned int flags; @@ -449,7 +451,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, return NULL; } - dest = svc->scheduler->schedule(svc, skb); + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; @@ -507,7 +510,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) { - ip_vs_service_put(svc); return NF_DROP; } @@ -533,8 +535,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_CONN_F_ONE_PACKET : 0; union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; - ip_vs_service_put(svc); - /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { @@ -571,12 +571,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { - ip_vs_service_put(svc); + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) return NF_ACCEPT; - } - - ip_vs_service_put(svc); /* * Notify the client that the destination is unreachable, and @@ -588,9 +584,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net_ = dev_net(skb_dst(skb)->dev); - skb->dev = net->loopback_dev; + skb->dev = net_->loopback_dev; } icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else @@ -643,8 +639,11 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) { - int err = ip_defrag(skb, user); + int err; + local_bh_disable(); + err = ip_defrag(skb, user); + local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); @@ -1164,9 +1163,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) sizeof(_ports), _ports, &iph); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(net, af, iph.protocol, - &iph.saddr, - pptr[0])) { + if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, + pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST @@ -1181,9 +1179,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) iph.len)))) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - struct net *net = - dev_net(skb_dst(skb)->dev); - if (!skb->dev) skb->dev = net->loopback_dev; icmpv6_send(skb, @@ -1226,13 +1221,7 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - 
local_bh_disable(); - verdict = ip_vs_out(hooknum, skb, AF_INET); - local_bh_enable(); - return verdict; + return ip_vs_out(hooknum, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1259,13 +1248,7 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_out(hooknum, skb, AF_INET6); - local_bh_enable(); - return verdict; + return ip_vs_out(hooknum, skb, AF_INET6); } #endif @@ -1401,10 +1384,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) goto ignore_ipip; /* Prefer the resulting PMTU */ if (dest) { - spin_lock(&dest->dst_lock); - if (dest->dst_cache) - mtu = dst_mtu(dest->dst_cache); - spin_unlock(&dest->dst_lock); + struct ip_vs_dest_dst *dest_dst; + + rcu_read_lock(); + dest_dst = rcu_dereference(dest->dest_dst); + if (dest_dst) + mtu = dst_mtu(dest_dst->dst_cache); + rcu_read_unlock(); } if (mtu > 68 + sizeof(struct iphdr)) mtu -= sizeof(struct iphdr); @@ -1720,13 +1706,7 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_in(hooknum, skb, AF_INET); - local_bh_enable(); - return verdict; + return ip_vs_in(hooknum, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1785,13 +1765,7 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_in(hooknum, skb, AF_INET6); - local_bh_enable(); - return verdict; + return ip_vs_in(hooknum, skb, AF_INET6); } #endif diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 9e2d1cccd1e..5b142fb1648 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -55,9 +55,6 @@ /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. 
*/ static DEFINE_MUTEX(__ip_vs_mutex); -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - /* sysctl variables */ #ifdef CONFIG_IP_VS_DEBUG @@ -71,7 +68,7 @@ int ip_vs_get_debug_level(void) /* Protos */ -static void __ip_vs_del_service(struct ip_vs_service *svc); +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); #ifdef CONFIG_IP_VS_IPV6 @@ -257,9 +254,9 @@ ip_vs_use_count_dec(void) #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* the service table hashed by <protocol, addr, port> */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; +static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* @@ -271,16 +268,18 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, { register unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; + __u32 ahash; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - addr_fold ^= ((size_t)net>>8); + ahash = ntohl(addr_fold); + ahash ^= ((size_t) net >> 8); - return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) - & IP_VS_SVC_TAB_MASK; + return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & + IP_VS_SVC_TAB_MASK; } /* @@ -312,13 +311,13 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) */ hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, &svc->addr, svc->port); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); + hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } svc->flags |= IP_VS_SVC_F_HASHED; @@ -342,10 +341,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) if (svc->fwmark == 0) { /* Remove it from the svc_table table */ - list_del(&svc->s_list); + hlist_del_rcu(&svc->s_list); } else { /* Remove it from the svc_fwm_table table */ - list_del(&svc->f_list); + hlist_del_rcu(&svc->f_list); } svc->flags &= ~IP_VS_SVC_F_HASHED; @@ -367,7 +366,7 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol, /* Check for "full" addressed entries */ hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) @@ -394,7 +393,7 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(net, fwmark); - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af && net_eq(svc->net, net)) { /* HIT */ @@ -405,15 +404,14 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) return NULL; } +/* Find service, called under RCU lock */ struct ip_vs_service * -ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, - const union nf_inet_addr *vaddr, __be16 vport) +ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; struct netns_ipvs *ipvs = net_ipvs(net); - 
read_lock(&__ip_vs_svc_lock); - /* * Check the table hashed by fwmark first */ @@ -449,10 +447,6 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, } out: - if (svc) - atomic_inc(&svc->usecnt); - read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", fwmark, ip_vs_proto_name(protocol), IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), @@ -469,6 +463,13 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) dest->svc = svc; } +static void ip_vs_service_free(struct ip_vs_service *svc) +{ + if (svc->stats.cpustats) + free_percpu(svc->stats.cpustats); + kfree(svc); +} + static void __ip_vs_unbind_svc(struct ip_vs_dest *dest) { @@ -476,12 +477,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest) dest->svc = NULL; if (atomic_dec_and_test(&svc->refcnt)) { - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port), atomic_read(&svc->usecnt)); - free_percpu(svc->stats.cpustats); - kfree(svc); + ntohs(svc->port)); + ip_vs_service_free(svc); } } @@ -506,17 +506,13 @@ static inline unsigned int ip_vs_rs_hashkey(int af, & IP_VS_RTAB_MASK; } -/* - * Hashes ip_vs_dest in rs_table by <proto,addr,port>. - * should be called with locked tables. - */ -static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) +/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ +static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned int hash; - if (!list_empty(&dest->d_list)) { - return 0; - } + if (dest->in_rs_table) + return; /* * Hash by proto,addr,port, @@ -524,64 +520,51 @@ static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - list_add(&dest->d_list, &ipvs->rs_table[hash]); - - return 1; + hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); + dest->in_rs_table = 1; } -/* - * UNhashes ip_vs_dest from rs_table. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +/* Unhash ip_vs_dest from rs_table. */ +static void ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* * Remove it from the rs_table table. */ - if (!list_empty(&dest->d_list)) { - list_del_init(&dest->d_list); + if (dest->in_rs_table) { + hlist_del_rcu(&dest->d_list); + dest->in_rs_table = 0; } - - return 1; } -/* - * Lookup real service by <proto,addr,port> in the real service table. 
- */ -struct ip_vs_dest * -ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, - const union nf_inet_addr *daddr, - __be16 dport) +/* Check if real service by <proto,addr,port> is present */ +bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *daddr, __be16 dport) { struct netns_ipvs *ipvs = net_ipvs(net); unsigned int hash; struct ip_vs_dest *dest; - /* - * Check for "full" addressed entries - * Return the first found entry - */ + /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); - read_lock(&ipvs->rs_lock); - list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) { - if ((dest->af == af) - && ip_vs_addr_equal(af, &dest->addr, daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || - dest->vfwmark)) { + rcu_read_lock(); + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { /* HIT */ - read_unlock(&ipvs->rs_lock); - return dest; + rcu_read_unlock(); + return true; } } - read_unlock(&ipvs->rs_lock); + rcu_read_unlock(); - return NULL; + return false; } -/* - * Lookup destination by {addr,port} in the given service +/* Lookup destination by {addr,port} in the given service + * Called under RCU lock. */ static struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, @@ -592,7 +575,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, /* * Find the destination for the given service */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->af == svc->af) && ip_vs_addr_equal(svc->af, &dest->addr, daddr) && (dest->port == dport)) { @@ -606,13 +589,11 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, /* * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in + * Created to be used in ip_vs_process_message() in * the backup synchronization daemon. It finds the * destination to be bound to the received connection * on the backup. - * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. + * Called under RCU lock, no refcnt is returned. 
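The hunk above drops the rs_lock reader/writer lock and makes the real-server lookup a pure RCU read side: ip_vs_has_real_service() only reports presence, so nothing has to leave the RCU section. A minimal sketch of that lookup pattern, using invented names (rs_entry, rs_table, rs_present) rather than the IPVS ones:

	#include <linux/types.h>
	#include <linux/rculist.h>
	#include <linux/rcupdate.h>

	#define RS_TAB_SIZE 16

	struct rs_entry {
		struct hlist_node node;
		__be16 port;
		/* address, protocol, ... */
	};

	static struct hlist_head rs_table[RS_TAB_SIZE];

	/* Read side: no lock taken, safe against concurrent add/del;
	 * the entry must not be used after rcu_read_unlock(). */
	static bool rs_present(unsigned int hash, __be16 port)
	{
		struct rs_entry *e;

		rcu_read_lock();
		hlist_for_each_entry_rcu(e, &rs_table[hash & (RS_TAB_SIZE - 1)], node) {
			if (e->port == port) {
				rcu_read_unlock();
				return true;
			}
		}
		rcu_read_unlock();
		return false;
	}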
*/ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, @@ -625,7 +606,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, struct ip_vs_service *svc; __be16 port = dport; - svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) @@ -633,12 +614,31 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, dest = ip_vs_lookup_dest(svc, daddr, port); if (!dest) dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); - if (dest) - atomic_inc(&dest->refcnt); - ip_vs_service_put(svc); return dest; } +void ip_vs_dest_dst_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_dst *dest_dst = container_of(head, + struct ip_vs_dest_dst, + rcu_head); + + dst_release(dest_dst->dst_cache); + kfree(dest_dst); +} + +/* Release dest_dst and dst_cache for dest in user context */ +static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) +{ + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, 1); + if (old) { + RCU_INIT_POINTER(dest->dest_dst, NULL); + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); + } +} + /* * Lookup dest by {svc,addr,port} in the destination trash. * The destination trash is used to hold the destinations that are removed @@ -653,19 +653,25 @@ static struct ip_vs_dest * ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, __be16 dport) { - struct ip_vs_dest *dest, *nxt; + struct ip_vs_dest *dest; struct netns_ipvs *ipvs = net_ipvs(svc->net); /* * Find the destination in trash */ - list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); + /* We can not reuse dest while in grace period + * because conns still can use dest->svc + */ + if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state)) + continue; if (dest->af == svc->af && ip_vs_addr_equal(svc->af, &dest->addr, daddr) && dest->port == dport && @@ -675,29 +681,27 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ - return dest; - } - - /* - * Try to purge the destination from trash if not referenced - */ - if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " - "from trash\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port)); - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - free_percpu(dest->stats.cpustats); - kfree(dest); + list_del(&dest->t_list); + ip_vs_dest_hold(dest); + goto out; } } - return NULL; + dest = NULL; + +out: + spin_unlock_bh(&ipvs->dest_trash_lock); + + return dest; } +static void ip_vs_dest_free(struct ip_vs_dest *dest) +{ + __ip_vs_dst_cache_reset(dest); + __ip_vs_unbind_svc(dest); + free_percpu(dest->stats.cpustats); + kfree(dest); +} /* * Clean up all the destinations in the trash @@ -706,19 +710,18 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * When the ip_vs_control_clearup is activated by ipvs module exit, * the service tables must have been flushed and all the connections * are expired, 
and the refcnt of each destination in the trash must - * be 1, so we simply release them here. + * be 0, so we simply release them here. */ static void ip_vs_trash_cleanup(struct net *net) { struct ip_vs_dest *dest, *nxt; struct netns_ipvs *ipvs = net_ipvs(net); - list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - free_percpu(dest->stats.cpustats); - kfree(dest); + del_timer_sync(&ipvs->dest_trash_timer); + /* No need to use dest_trash_lock */ + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { + list_del(&dest->t_list); + ip_vs_dest_free(dest); } } @@ -768,6 +771,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_scheduler *sched; int conn_flags; /* set the weight and the flags */ @@ -783,9 +787,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, * Put the real service in rs_table if not present. * For now only for NAT! */ - write_lock_bh(&ipvs->rs_lock); ip_vs_rs_hash(ipvs, dest); - write_unlock_bh(&ipvs->rs_lock); } atomic_set(&dest->conn_flags, conn_flags); @@ -809,27 +811,20 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->l_threshold = udest->l_threshold; spin_lock_bh(&dest->dst_lock); - ip_vs_dst_reset(dest); + __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); - if (add) - ip_vs_start_estimator(svc->net, &dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - + sched = rcu_dereference_protected(svc->scheduler, 1); if (add) { - list_add(&dest->n_list, &svc->destinations); + ip_vs_start_estimator(svc->net, &dest->stats); + list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; + if (sched->add_dest) + sched->add_dest(svc, dest); + } else { + if (sched->upd_dest) + sched->upd_dest(svc, dest); } - - /* call the update_service, because server weight may be changed */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); } @@ -881,7 +876,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atomic_set(&dest->persistconns, 0); atomic_set(&dest->refcnt, 1); - INIT_LIST_HEAD(&dest->d_list); + INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); spin_lock_init(&dest->stats.lock); __ip_vs_update_dest(svc, dest, udest, 1); @@ -923,10 +918,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - /* - * Check if the dest already exists in the list - */ + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); if (dest != NULL) { IP_VS_DBG(1, "%s(): dest already exists\n", __func__); @@ -948,11 +943,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); - __ip_vs_update_dest(svc, dest, udest, 1); ret = 0; } else { @@ -992,10 +982,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - /* - * Lookup the destination list - */ + /* We use function that requires RCU lock */ + rcu_read_lock(); 
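In the update paths above, svc->scheduler is read with rcu_dereference_protected(..., 1) because the caller already holds the configuration mutex, so no RCU read lock is needed on the updater side. A hedged sketch of that idiom; the mutex name and ops structure are illustrative, not the IPVS ones:

	#include <linux/mutex.h>
	#include <linux/rcupdate.h>

	struct sched_ops {
		int (*add_dest)(void *svc, void *dest);
	};

	struct my_service {
		struct sched_ops __rcu *scheduler;
	};

	static DEFINE_MUTEX(cfg_mutex);		/* serializes all updaters */

	static void add_dest_locked(struct my_service *svc, void *dest)
	{
		struct sched_ops *sched;

		/* Updater-side dereference: cfg_mutex is held, so the
		 * pointer cannot change and rcu_read_lock() is not needed. */
		sched = rcu_dereference_protected(svc->scheduler,
						  lockdep_is_held(&cfg_mutex));
		if (sched && sched->add_dest)
			sched->add_dest(svc, dest);
	}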
dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); @@ -1008,11 +998,21 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return 0; } +static void ip_vs_dest_wait_readers(struct rcu_head *head) +{ + struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest, + rcu_head); + + /* End of grace period after unlinking */ + clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state); +} + /* * Delete a destination (must be already unlinked from the service) */ -static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, + bool cleanup) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -1021,38 +1021,24 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) /* * Remove it from the d-linked list with the real services. */ - write_lock_bh(&ipvs->rs_lock); ip_vs_rs_unhash(dest); - write_unlock_bh(&ipvs->rs_lock); - /* - * Decrease the refcnt of the dest, and free the dest - * if nobody refers to it (refcnt=0). Otherwise, throw - * the destination into the trash. - */ - if (atomic_dec_and_test(&dest->refcnt)) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n", - dest->vfwmark, - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port)); - ip_vs_dst_reset(dest); - /* simply decrease svc->refcnt here, let the caller check - and release the service if nobody refers to it. - Only user context can release destination and service, - and only one user context can update virtual service at a - time, so the operation here is OK */ - atomic_dec(&dest->svc->refcnt); - free_percpu(dest->stats.cpustats); - kfree(dest); - } else { - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " - "dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ipvs->dest_trash); - atomic_inc(&dest->refcnt); + if (!cleanup) { + set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state); + call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers); } + + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + IP_VS_DEST_TRASH_PERIOD); + /* dest lives in trash without reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + spin_unlock_bh(&ipvs->dest_trash_lock); + ip_vs_dest_put(dest); } @@ -1068,14 +1054,16 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, /* * Remove it from the d-linked destination list. 
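Deleted destinations now sit in the trash list without a reference; a "removing" bit is set at unlink time and cleared from a call_rcu() callback, so the expiry timer can tell when the grace period after unlinking has ended. A sketch of that marker pattern with made-up names:

	#include <linux/bitops.h>
	#include <linux/jiffies.h>
	#include <linux/list.h>
	#include <linux/rcupdate.h>
	#include <linux/timer.h>

	#define OBJ_REMOVING	0	/* bit: grace period still running */

	struct obj {
		struct list_head t_list;
		unsigned long state;
		struct rcu_head rcu_head;
	};

	static void obj_grace_ended(struct rcu_head *head)
	{
		struct obj *o = container_of(head, struct obj, rcu_head);

		/* All pre-existing RCU readers are now done with *o. */
		clear_bit(OBJ_REMOVING, &o->state);
	}

	static void obj_move_to_trash(struct obj *o, struct list_head *trash,
				      struct timer_list *trash_timer)
	{
		set_bit(OBJ_REMOVING, &o->state);
		call_rcu(&o->rcu_head, obj_grace_ended);
		list_add(&o->t_list, trash);
		/* a timer later frees entries whose bit is clear */
		mod_timer(trash_timer, jiffies + 2 * HZ);
	}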
*/ - list_del(&dest->n_list); + list_del_rcu(&dest->n_list); svc->num_dests--; - /* - * Call the update_service function of its scheduler - */ - if (svcupd && svc->scheduler->update_service) - svc->scheduler->update_service(svc); + if (svcupd) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched->del_dest) + sched->del_dest(svc, dest); + } } @@ -1090,37 +1078,56 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) EnterFunction(2); + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): destination not found!\n", __func__); return -ENOENT; } - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - /* * Unlink dest from the service */ __ip_vs_unlink_dest(svc, dest, 1); - write_unlock_bh(&__ip_vs_svc_lock); - /* * Delete the destination */ - __ip_vs_del_dest(svc->net, dest); + __ip_vs_del_dest(svc->net, dest, false); LeaveFunction(2); return 0; } +static void ip_vs_dest_trash_expire(unsigned long data) +{ + struct net *net = (struct net *) data; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest *dest, *next; + + spin_lock(&ipvs->dest_trash_lock); + list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { + /* Skip if dest is in grace period */ + if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state)) + continue; + if (atomic_read(&dest->refcnt) > 0) + continue; + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->svc->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->t_list); + ip_vs_dest_free(dest); + } + if (!list_empty(&ipvs->dest_trash)) + mod_timer(&ipvs->dest_trash_timer, + jiffies + IP_VS_DEST_TRASH_PERIOD); + spin_unlock(&ipvs->dest_trash_lock); +} /* * Add a service into the service hash table @@ -1157,9 +1164,13 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, } #ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; - goto out_err; + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out_err; + } } #endif @@ -1176,7 +1187,6 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, } /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 0); atomic_set(&svc->refcnt, 0); svc->af = u->af; @@ -1190,7 +1200,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, svc->net = net; INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->sched_lock); spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ @@ -1200,7 +1210,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, sched = NULL; /* Bind the ct retriever */ - ip_vs_bind_pe(svc, pe); + RCU_INIT_POINTER(svc->pe, pe); pe = NULL; /* Update the virtual service counters */ @@ -1216,9 +1226,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, ipvs->num_services++; /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); *svc_p = svc; /* Now there is a service - full throttle */ @@ -1228,15 +1236,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, out_err: if (svc != NULL) { - 
ip_vs_unbind_scheduler(svc); - if (svc->inc) { - local_bh_disable(); - ip_vs_app_inc_put(svc->inc); - local_bh_enable(); - } - if (svc->stats.cpustats) - free_percpu(svc->stats.cpustats); - kfree(svc); + ip_vs_unbind_scheduler(svc, sched); + ip_vs_service_free(svc); } ip_vs_scheduler_put(sched); ip_vs_pe_put(pe); @@ -1280,18 +1281,27 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } #ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; - goto out; + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out; + } } #endif - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + old_sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched != old_sched) { + /* Bind the new scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + old_sched = sched; + goto out; + } + /* Unbind the old scheduler on success */ + ip_vs_unbind_scheduler(svc, old_sched); + } /* * Set the flags and timeout value @@ -1300,57 +1310,30 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out_unlock; - } + old_pe = rcu_dereference_protected(svc->pe, 1); + if (pe != old_pe) + rcu_assign_pointer(svc->pe, pe); - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { - /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. - */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out_unlock; - } - } - - old_pe = svc->pe; - if (pe != old_pe) { - ip_vs_unbind_pe(svc); - ip_vs_bind_pe(svc, pe); - } - -out_unlock: - write_unlock_bh(&__ip_vs_svc_lock); out: ip_vs_scheduler_put(old_sched); ip_vs_pe_put(old_pe); return ret; } +static void ip_vs_service_rcu_free(struct rcu_head *head) +{ + struct ip_vs_service *svc; + + svc = container_of(head, struct ip_vs_service, rcu_head); + ip_vs_service_free(svc); +} /* * Delete a service from the service list * - The service must be unlinked, unlocked and not referenced! 
* - We are called under _bh lock */ -static void __ip_vs_del_service(struct ip_vs_service *svc) +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) { struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; @@ -1366,27 +1349,20 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) ip_vs_stop_estimator(svc->net, &svc->stats); /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); + old_sched = rcu_dereference_protected(svc->scheduler, 1); + ip_vs_unbind_scheduler(svc, old_sched); ip_vs_scheduler_put(old_sched); - /* Unbind persistence engine */ - old_pe = svc->pe; - ip_vs_unbind_pe(svc); + /* Unbind persistence engine, keep svc->pe */ + old_pe = rcu_dereference_protected(svc->pe, 1); ip_vs_pe_put(old_pe); - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } - /* * Unlink the whole destination list */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(svc->net, dest); + __ip_vs_del_dest(svc->net, dest, cleanup); } /* @@ -1400,13 +1376,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* * Free the service if nobody refers to it */ - if (atomic_read(&svc->refcnt) == 0) { - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + if (atomic_dec_and_test(&svc->refcnt)) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port), atomic_read(&svc->usecnt)); - free_percpu(svc->stats.cpustats); - kfree(svc); + ntohs(svc->port)); + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } /* decrease the module use count */ @@ -1416,23 +1391,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* * Unlink a service from list and try to delete it if its refcnt reached 0 */ -static void ip_vs_unlink_service(struct ip_vs_service *svc) +static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) { + /* Hold svc to avoid double release from dest_trash */ + atomic_inc(&svc->refcnt); /* * Unhash it from the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. 
- */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - - __ip_vs_del_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); + __ip_vs_del_service(svc, cleanup); } /* @@ -1442,7 +1410,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc) { if (svc == NULL) return -EEXIST; - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, false); return 0; } @@ -1451,19 +1419,20 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Flush all the virtual services */ -static int ip_vs_flush(struct net *net) +static int ip_vs_flush(struct net *net, bool cleanup) { int idx; - struct ip_vs_service *svc, *nxt; + struct ip_vs_service *svc; + struct hlist_node *n; /* * Flush the service table hashed by <netns,protocol,addr,port> */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], - s_list) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], + s_list) { if (net_eq(svc->net, net)) - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, cleanup); } } @@ -1471,10 +1440,10 @@ static int ip_vs_flush(struct net *net) * Flush the service table hashed by fwmark */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], + f_list) { if (net_eq(svc->net, net)) - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, cleanup); } } @@ -1490,32 +1459,32 @@ void ip_vs_service_net_cleanup(struct net *net) EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(net); + ip_vs_flush(net, true); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } -/* - * Release dst hold by dst_cache - */ + +/* Put all references for device (dst_cache) */ static inline void -__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev) +ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) { + struct ip_vs_dest_dst *dest_dst; + spin_lock_bh(&dest->dst_lock); - if (dest->dst_cache && dest->dst_cache->dev == dev) { + dest_dst = rcu_dereference_protected(dest->dest_dst, 1); + if (dest_dst && dest_dst->dst_cache->dev == dev) { IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", dev->name, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - ip_vs_dst_reset(dest); + __ip_vs_dst_cache_reset(dest); } spin_unlock_bh(&dest->dst_lock); } -/* - * Netdev event receiver - * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to - * a device that is "unregister" it must be released. 
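The service itself is now freed through call_rcu() once its refcount drops to zero, so readers that picked the service up from the hash table keep seeing valid memory until their read section ends. An illustrative-only sketch of that deferred-free pattern (svc, svc_put and svc_rcu_free are invented names):

	#include <linux/atomic.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct svc {
		atomic_t refcnt;
		struct rcu_head rcu_head;
		/* ... */
	};

	static void svc_rcu_free(struct rcu_head *head)
	{
		struct svc *s = container_of(head, struct svc, rcu_head);

		kfree(s);	/* real code also frees per-cpu stats first */
	}

	static void svc_put(struct svc *s)
	{
		/* unpublishing (hlist_del_rcu) happened before the last put */
		if (atomic_dec_and_test(&s->refcnt))
			call_rcu(&s->rcu_head, svc_rcu_free);
	}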
+/* Netdev event receiver + * Currently only NETDEV_DOWN is handled to release refs to cached dsts */ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -1527,35 +1496,37 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, struct ip_vs_dest *dest; unsigned int idx; - if (event != NETDEV_UNREGISTER || !ipvs) + if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); EnterFunction(2); mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net)) { list_for_each_entry(dest, &svc->destinations, n_list) { - __ip_vs_dev_reset(dest, dev); + ip_vs_forget_dev(dest, dev); } } } - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (net_eq(svc->net, net)) { list_for_each_entry(dest, &svc->destinations, n_list) { - __ip_vs_dev_reset(dest, dev); + ip_vs_forget_dev(dest, dev); } } } } - list_for_each_entry(dest, &ipvs->dest_trash, n_list) { - __ip_vs_dev_reset(dest, dev); + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { + ip_vs_forget_dev(dest, dev); } + spin_unlock_bh(&ipvs->dest_trash_lock); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); return NOTIFY_DONE; @@ -1568,12 +1539,10 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest; - write_lock_bh(&__ip_vs_svc_lock); list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_zero_stats(&dest->stats); } ip_vs_zero_stats(&svc->stats); - write_unlock_bh(&__ip_vs_svc_lock); return 0; } @@ -1583,14 +1552,14 @@ static int ip_vs_zero_all(struct net *net) struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net)) ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (net_eq(svc->net, net)) ip_vs_zero_service(svc); } @@ -1918,7 +1887,7 @@ static struct ctl_table vs_vars[] = { struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ - struct list_head *table; + struct hlist_head *table; int bucket; }; @@ -1951,7 +1920,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; @@ -1962,7 +1931,8 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], + f_list) { if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; @@ -1975,17 +1945,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) } static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) 
-__acquires(__ip_vs_svc_lock) + __acquires(RCU) { - - read_lock_bh(&__ip_vs_svc_lock); + rcu_read_lock(); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct list_head *e; + struct hlist_node *e; struct ip_vs_iter *iter; struct ip_vs_service *svc; @@ -1998,13 +1967,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (iter->table == ip_vs_svc_table) { /* next service in table hashed by protocol */ - if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, s_list); - + e = rcu_dereference(hlist_next_rcu(&svc->s_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, s_list); while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], - s_list) { + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_table[iter->bucket], + s_list) { return svc; } } @@ -2015,13 +1985,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } /* next service in hashed by fwmark */ - if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, f_list); + e = rcu_dereference(hlist_next_rcu(&svc->f_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, f_list); scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], - f_list) + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_fwm_table[iter->bucket], + f_list) return svc; } @@ -2029,9 +2001,9 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -__releases(__ip_vs_svc_lock) + __releases(RCU) { - read_unlock_bh(&__ip_vs_svc_lock); + rcu_read_unlock(); } @@ -2049,6 +2021,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_service *svc = v; const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 @@ -2057,18 +2030,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), - svc->scheduler->name); + sched->name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - svc->scheduler->name, + sched->name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", - svc->fwmark, svc->scheduler->name, + svc->fwmark, sched->name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } @@ -2079,7 +2052,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) else seq_putc(seq, '\n'); - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { #ifdef CONFIG_IP_VS_IPV6 if (dest->af == AF_INET6) seq_printf(seq, @@ -2173,7 +2146,7 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; - struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats; + struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_stats_user rates; int i; @@ -2389,7 +2362,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush 
the virtual service */ - ret = ip_vs_flush(net); + ret = ip_vs_flush(net, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ @@ -2424,11 +2397,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) } /* Lookup the exact service by <protocol, addr, port> or fwmark */ + rcu_read_lock(); if (usvc.fwmark == 0) svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { @@ -2480,11 +2455,14 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(src->scheduler, 1); dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); + strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2503,7 +2481,7 @@ __ip_vs_get_service_entries(struct net *net, int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; @@ -2522,7 +2500,7 @@ __ip_vs_get_service_entries(struct net *net, } for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; @@ -2551,11 +2529,13 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; + rcu_read_lock(); if (get->fwmark) svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); else svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, get->port); + rcu_read_unlock(); if (svc) { int count = 0; @@ -2738,12 +2718,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; + rcu_read_lock(); if (entry->fwmark) svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); else svc = __ip_vs_service_find(net, AF_INET, entry->protocol, &addr, entry->port); + rcu_read_unlock(); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) @@ -2900,6 +2882,8 @@ nla_put_failure: static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_service *svc) { + struct ip_vs_scheduler *sched; + struct ip_vs_pe *pe; struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; @@ -2916,16 +2900,17 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, } else { if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || - nla_put_u16(skb, IPVS_SVC_ATTR_PORT, svc->port)) + nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) goto nla_put_failure; } - if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) || - (svc->pe && - nla_put_string(skb, 
IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) || + sched = rcu_dereference_protected(svc->scheduler, 1); + pe = rcu_dereference_protected(svc->pe, 1); + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || - nla_put_u32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) + nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) goto nla_put_failure; @@ -2971,7 +2956,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -2982,7 +2967,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -3038,15 +3023,17 @@ static int ip_vs_genl_parse_service(struct net *net, } else { usvc->protocol = nla_get_u16(nla_protocol); nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); - usvc->port = nla_get_u16(nla_port); + usvc->port = nla_get_be16(nla_port); usvc->fwmark = 0; } + rcu_read_lock(); if (usvc->fwmark) svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); else svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, &usvc->addr, usvc->port); + rcu_read_unlock(); *ret_svc = svc; /* If a full entry was requested, check for the additional fields */ @@ -3076,7 +3063,7 @@ static int ip_vs_genl_parse_service(struct net *net, usvc->sched_name = nla_data(nla_sched); usvc->pe_name = nla_pe ? 
nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); - usvc->netmask = nla_get_u32(nla_netmask); + usvc->netmask = nla_get_be32(nla_netmask); } return 0; @@ -3102,7 +3089,7 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) return -EMSGSIZE; if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || - nla_put_u16(skb, IPVS_DEST_ATTR_PORT, dest->port) || + nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, (atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK)) || @@ -3211,7 +3198,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, memset(udest, 0, sizeof(*udest)); nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); - udest->port = nla_get_u16(nla_port); + udest->port = nla_get_be16(nla_port); /* If a full entry was requested, check for the additional fields */ if (full_entry) { @@ -3236,8 +3223,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, return 0; } -static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid) +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid) { struct nlattr *nl_daemon; @@ -3258,8 +3245,8 @@ nla_put_failure: return -EMSGSIZE; } -static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid, +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid, struct netlink_callback *cb) { void *hdr; @@ -3398,7 +3385,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(net); + ret = ip_vs_flush(net, false); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { ret = ip_vs_genl_set_config(net, info->attrs); @@ -3790,13 +3777,14 @@ int __net_init ip_vs_control_net_init(struct net *net) int idx; struct netns_ipvs *ipvs = net_ipvs(net); - rwlock_init(&ipvs->rs_lock); - /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) - INIT_LIST_HEAD(&ipvs->rs_table[idx]); + INIT_HLIST_HEAD(&ipvs->rs_table[idx]); INIT_LIST_HEAD(&ipvs->dest_trash); + spin_lock_init(&ipvs->dest_trash_lock); + setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, + (unsigned long) net); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); @@ -3826,6 +3814,10 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + /* Some dest can be in grace period even before cleanup, we have to + * defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called. + */ + rcu_barrier(); ip_vs_trash_cleanup(net); ip_vs_stop_estimator(net, &ipvs->tot_stats); ip_vs_control_net_cleanup_sysctl(net); @@ -3871,10 +3863,10 @@ int __init ip_vs_control_init(void) EnterFunction(2); - /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ + /* Initialize svc_table, ip_vs_svc_fwm_table */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); + INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); } smp_wmb(); /* Do we really need it now ? 
*/ diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 7f3b0cc00b7..ccab120df45 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -51,7 +51,7 @@ * IPVS DH bucket */ struct ip_vs_dh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* @@ -64,6 +64,10 @@ struct ip_vs_dh_bucket { #define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) #define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) +struct ip_vs_dh_state { + struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE]; + struct rcu_head rcu_head; +}; /* * Returns hash value for IPVS DH entry @@ -85,10 +89,9 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, - const union nf_inet_addr *addr) +ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr) { - return (tbl[ip_vs_dh_hashkey(af, addr)]).dest; + return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest); } @@ -96,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, * Assign all the hash buckets of the specified table with the service. */ static int -ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_dh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; + bool empty; - b = tbl; + b = &s->buckets[0]; p = &svc->destinations; + empty = list_empty(p); for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); p = p->next; } @@ -127,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) /* * Flush all the hash buckets of the specified table. 
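The DH scheduler's bucket table now holds __rcu pointers: the packet path reads them with rcu_dereference() under rcu_read_lock(), while reassignment runs from scheduler callbacks that are already serialized and therefore uses rcu_dereference_protected() plus a pointer publish. A reduced sketch; the bucket count and helper names are placeholders:

	#include <linux/atomic.h>
	#include <linux/rcupdate.h>

	#define NBUCKETS 256

	struct dest {
		atomic_t refcnt;
	};

	struct bucket {
		struct dest __rcu *dest;
	};

	static struct bucket buckets[NBUCKETS];

	/* Reader: called under rcu_read_lock() by the packet path. */
	static struct dest *bucket_get(unsigned int hash)
	{
		return rcu_dereference(buckets[hash & (NBUCKETS - 1)].dest);
	}

	/* Updater: table access serialized by the caller. */
	static void bucket_set(unsigned int i, struct dest *new)
	{
		struct dest *old = rcu_dereference_protected(buckets[i].dest, 1);

		if (old)
			atomic_dec(&old->refcnt);	/* drop old reference */
		if (new)
			atomic_inc(&new->refcnt);	/* pin new target */
		rcu_assign_pointer(buckets[i].dest, new);
	}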
*/ -static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +static void ip_vs_dh_flush(struct ip_vs_dh_state *s) { int i; struct ip_vs_dh_bucket *b; + struct ip_vs_dest *dest; - b = tbl; + b = &s->buckets[0]; for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); } b++; } @@ -145,51 +155,46 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) static int ip_vs_dh_init_svc(struct ip_vs_service *svc) { - struct ip_vs_dh_bucket *tbl; + struct ip_vs_dh_state *s; /* allocate the DH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_KERNEL); - if (tbl == NULL) + s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); + if (s == NULL) return -ENOMEM; - svc->sched_data = tbl; + svc->sched_data = s; IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); + /* assign the hash buckets with current dests */ + ip_vs_dh_reassign(s, svc); return 0; } -static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +static void ip_vs_dh_done_svc(struct ip_vs_service *svc) { - struct ip_vs_dh_bucket *tbl = svc->sched_data; + struct ip_vs_dh_state *s = svc->sched_data; /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); + ip_vs_dh_flush(s); /* release the table itself */ - kfree(svc->sched_data); + kfree_rcu(s, rcu_head); IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - return 0; } -static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); + struct ip_vs_dh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); + ip_vs_dh_reassign(s, svc); return 0; } @@ -212,19 +217,20 @@ static struct ip_vs_dest * ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_dest *dest; - struct ip_vs_dh_bucket *tbl; + struct ip_vs_dh_state *s; struct ip_vs_iphdr iph; ip_vs_fill_iph_addr_only(svc->af, skb, &iph); IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - tbl = (struct ip_vs_dh_bucket *)svc->sched_data; - dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr); + s = (struct ip_vs_dh_state *) svc->sched_data; + dest = ip_vs_dh_get(svc->af, s, &iph.daddr); if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest)) { + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } @@ -248,7 +254,8 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), .init_service = ip_vs_dh_init_svc, .done_service = ip_vs_dh_done_svc, - .update_service = ip_vs_dh_update_svc, + .add_dest = ip_vs_dh_dest_changed, + .del_dest = ip_vs_dh_dest_changed, .schedule = ip_vs_dh_schedule, }; @@ -262,6 +269,7 @@ static int __init ip_vs_dh_init(void) static void __exit ip_vs_dh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 0fac6017b6f..6bee6d0c73a 100644 --- a/net/netfilter/ipvs/ip_vs_est.c 
+++ b/net/netfilter/ipvs/ip_vs_est.c @@ -56,7 +56,7 @@ * Make a summary from each cpu */ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, - struct ip_vs_cpu_stats *stats) + struct ip_vs_cpu_stats __percpu *stats) { int i; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 4f53a5f0443..77c173282f3 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -267,10 +267,12 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * hopefully it will succeed on the retransmitted * packet. */ + rcu_read_lock(); ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, iph->ihl * 4, start-data, end-start, buf, buf_len); + rcu_read_unlock(); if (ret) { ip_vs_nfct_expect_related(skb, ct, n_cp, IPPROTO_TCP, 0, 0); @@ -480,6 +482,7 @@ static int __init ip_vs_ftp_init(void) int rv; rv = register_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns on error */ return rv; } @@ -489,6 +492,7 @@ static int __init ip_vs_ftp_init(void) static void __exit ip_vs_ftp_exit(void) { unregister_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns */ } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index fdd89b9564e..5ea26bd8774 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -90,11 +90,12 @@ * IP address and its destination server */ struct ip_vs_lblc_entry { - struct list_head list; + struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; }; @@ -102,12 +103,14 @@ struct ip_vs_lblc_entry { * IPVS lblc hash table */ struct ip_vs_lblc_table { - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct timer_list periodic_timer; /* collect stale entries */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ + bool dead; }; @@ -129,13 +132,16 @@ static ctl_table vs_vars_table[] = { static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) { - list_del(&en->list); + struct ip_vs_dest *dest; + + hlist_del_rcu(&en->list); /* * We don't kfree dest because it is referred either by its service * or the trash dest list. */ - atomic_dec(&en->dest->refcnt); - kfree(en); + dest = rcu_dereference_protected(en->dest, 1); + ip_vs_dest_put(dest); + kfree_rcu(en, rcu_head); } @@ -165,15 +171,12 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) { unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); - list_add(&en->list, &tbl->bucket[hash]); + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } -/* - * Get ip_vs_lblc_entry associated with supplied parameters. Called under read - * lock - */ +/* Get ip_vs_lblc_entry associated with supplied parameters. 
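LBLC cache entries are unhashed with hlist_del_rcu() and released with kfree_rcu(), and the module exit paths above add synchronize_rcu() so no reader can still be running scheduler code when the module text disappears. A sketch under those assumptions, with hypothetical names:

	#include <linux/init.h>
	#include <linux/rculist.h>
	#include <linux/slab.h>

	struct cache_entry {
		struct hlist_node list;
		struct rcu_head rcu_head;
		/* key, destination pointer, ... */
	};

	/* Updater side, under the per-service spinlock. */
	static void cache_entry_free(struct cache_entry *en)
	{
		hlist_del_rcu(&en->list);	/* unpublish */
		kfree_rcu(en, rcu_head);	/* free after a grace period */
	}

	static void __exit sketch_module_exit(void)
	{
		/* unregister the scheduler here */
		synchronize_rcu();	/* wait out readers before unload */
	}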
*/ static inline struct ip_vs_lblc_entry * ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, const union nf_inet_addr *addr) @@ -181,7 +184,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, unsigned int hash = ip_vs_lblc_hashkey(af, addr); struct ip_vs_lblc_entry *en; - list_for_each_entry(en, &tbl->bucket[hash], list) + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; @@ -191,7 +194,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, /* * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP - * address to a server. Called under write lock. + * address to a server. Called under spin lock. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, @@ -209,14 +212,20 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, ip_vs_addr_copy(dest->af, &en->addr, daddr); en->lastuse = jiffies; - atomic_inc(&dest->refcnt); - en->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(en->dest, dest); ip_vs_lblc_hash(tbl, en); - } else if (en->dest != dest) { - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; + } else { + struct ip_vs_dest *old_dest; + + old_dest = rcu_dereference_protected(en->dest, 1); + if (old_dest != dest) { + ip_vs_dest_put(old_dest); + ip_vs_dest_hold(dest); + /* No ordering constraints for refcnt */ + RCU_INIT_POINTER(en->dest, dest); + } } return en; @@ -226,17 +235,22 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, /* * Flush all the entries of the specified table. */ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +static void ip_vs_lblc_flush(struct ip_vs_service *svc) { - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; int i; + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblc_free(en); atomic_dec(&tbl->entries); } } + spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblc_expiration(struct ip_vs_service *svc) @@ -252,15 +266,16 @@ static int sysctl_lblc_expiration(struct ip_vs_service *svc) static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; unsigned long now = jiffies; int i, j; for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + sysctl_lblc_expiration(svc))) @@ -269,7 +284,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) ip_vs_lblc_free(en); atomic_dec(&tbl->entries); } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -293,7 +308,8 @@ static void ip_vs_lblc_check_expire(unsigned long data) unsigned long now = jiffies; int goal; int i, j; - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ @@ -314,8 +330,8 @@ static void 
ip_vs_lblc_check_expire(unsigned long data) for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) continue; @@ -323,7 +339,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -354,11 +370,12 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) * Initialize the hash buckets */ for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); + INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; + tbl->dead = 0; /* * Hook periodic timer for garbage collection @@ -371,7 +388,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) } -static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; @@ -379,14 +396,12 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ - ip_vs_lblc_flush(tbl); + ip_vs_lblc_flush(svc); /* release the table itself */ - kfree(tbl); + kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", sizeof(*tbl)); - - return 0; } @@ -408,7 +423,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) * The server with weight=0 is quiesced and will not receive any * new connection. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; if (atomic_read(&dest->weight) > 0) { @@ -423,7 +438,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -457,7 +472,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; - list_for_each_entry(d, &svc->destinations, n_list) { + list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; @@ -484,7 +499,6 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ - read_lock(&svc->sched_lock); en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr); if (en) { /* We only hold a read lock, but this is atomic */ @@ -499,14 +513,11 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * free up entries from the trash at any time. 
*/ - if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) - dest = en->dest; + dest = rcu_dereference(en->dest); + if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && + atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; } - read_unlock(&svc->sched_lock); - - /* If the destination has a weight and is not overloaded, use it */ - if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) - goto out; /* No cache entry or it is invalid, time to schedule */ dest = __ip_vs_lblc_schedule(svc); @@ -516,9 +527,10 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblc_new(tbl, &iph.daddr, dest); - write_unlock(&svc->sched_lock); + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblc_new(tbl, &iph.daddr, dest); + spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", @@ -621,6 +633,7 @@ static void __exit ip_vs_lblc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); unregister_pernet_subsys(&ip_vs_lblc_ops); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index c03b6a3ade2..50123c2ab48 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -89,40 +89,44 @@ */ struct ip_vs_dest_set_elem { struct list_head list; /* list link */ - struct ip_vs_dest *dest; /* destination server */ + struct ip_vs_dest __rcu *dest; /* destination server */ + struct rcu_head rcu_head; }; struct ip_vs_dest_set { atomic_t size; /* set size */ unsigned long lastmod; /* last modified time */ struct list_head list; /* destination list */ - rwlock_t lock; /* lock for this list */ }; -static struct ip_vs_dest_set_elem * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, + struct ip_vs_dest *dest, bool check) { struct ip_vs_dest_set_elem *e; - list_for_each_entry(e, &set->list, list) { - if (e->dest == dest) - /* already existed */ - return NULL; + if (check) { + list_for_each_entry(e, &set->list, list) { + struct ip_vs_dest *d; + + d = rcu_dereference_protected(e->dest, 1); + if (d == dest) + /* already existed */ + return; + } } e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e == NULL) - return NULL; + return; - atomic_inc(&dest->refcnt); - e->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(e->dest, dest); - list_add(&e->list, &set->list); + list_add_rcu(&e->list, &set->list); atomic_inc(&set->size); set->lastmod = jiffies; - return e; } static void @@ -131,13 +135,16 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) struct ip_vs_dest_set_elem *e; list_for_each_entry(e, &set->list, list) { - if (e->dest == dest) { + struct ip_vs_dest *d; + + d = rcu_dereference_protected(e->dest, 1); + if (d == dest) { /* HIT */ atomic_dec(&set->size); set->lastmod = jiffies; - atomic_dec(&e->dest->refcnt); - list_del(&e->list); - kfree(e); + ip_vs_dest_put(dest); + list_del_rcu(&e->list); + kfree_rcu(e, rcu_head); break; } } @@ -147,17 +154,18 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) { struct ip_vs_dest_set_elem *e, *ep; - write_lock(&set->lock); list_for_each_entry_safe(e, ep, &set->list, list) { + struct ip_vs_dest *d; + + d = rcu_dereference_protected(e->dest, 1); /* * We don't kfree dest because it is referred either * by its service or by the trash dest list. 
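Because lookups no longer take the service lock, a "dead" flag is set under svc->sched_lock when the table is flushed, and new cache entries are only inserted while the flag is clear; that stops a racing packet from repopulating a table that done_svc() is about to kfree_rcu(). A sketch of that guard, names illustrative:

	#include <linux/spinlock.h>
	#include <linux/types.h>

	struct cache_tbl {
		bool dead;
		/* buckets ... */
	};

	static void cache_flush(struct cache_tbl *tbl, spinlock_t *lock)
	{
		spin_lock_bh(lock);
		tbl->dead = true;	/* no new entries may be added */
		/* free all existing entries here */
		spin_unlock_bh(lock);
	}

	static void cache_maybe_insert(struct cache_tbl *tbl, spinlock_t *lock)
	{
		spin_lock_bh(lock);
		if (!tbl->dead) {
			/* safe: the flush cannot run while the lock is held */
			/* allocate and hash a new entry here */
		}
		spin_unlock_bh(lock);
	}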
*/ - atomic_dec(&e->dest->refcnt); - list_del(&e->list); - kfree(e); + ip_vs_dest_put(d); + list_del_rcu(&e->list); + kfree_rcu(e, rcu_head); } - write_unlock(&set->lock); } /* get weighted least-connection node in the destination set */ @@ -171,8 +179,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) return NULL; /* select the first destination server, whose weight > 0 */ - list_for_each_entry(e, &set->list, list) { - least = e->dest; + list_for_each_entry_rcu(e, &set->list, list) { + least = rcu_dereference(e->dest); if (least->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -186,8 +194,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) /* find the destination with the weighted least load */ nextstage: - list_for_each_entry(e, &set->list, list) { - dest = e->dest; + list_for_each_entry_continue_rcu(e, &set->list, list) { + dest = rcu_dereference(e->dest); if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -224,7 +232,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) /* select the first destination server, whose weight > 0 */ list_for_each_entry(e, &set->list, list) { - most = e->dest; + most = rcu_dereference_protected(e->dest, 1); if (atomic_read(&most->weight) > 0) { moh = ip_vs_dest_conn_overhead(most); goto nextstage; @@ -234,8 +242,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) /* find the destination with the weighted most load */ nextstage: - list_for_each_entry(e, &set->list, list) { - dest = e->dest; + list_for_each_entry_continue(e, &set->list, list) { + dest = rcu_dereference_protected(e->dest, 1); doh = ip_vs_dest_conn_overhead(dest); /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ if ((moh * atomic_read(&dest->weight) < @@ -262,11 +270,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) * IP address and its destination server set */ struct ip_vs_lblcr_entry { - struct list_head list; + struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ struct ip_vs_dest_set set; /* destination server set */ unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; }; @@ -274,12 +283,14 @@ struct ip_vs_lblcr_entry { * IPVS lblcr hash table */ struct ip_vs_lblcr_table { - struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ struct timer_list periodic_timer; /* collect stale entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ + bool dead; }; @@ -302,9 +313,9 @@ static ctl_table vs_vars_table[] = { static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { - list_del(&en->list); + hlist_del_rcu(&en->list); ip_vs_dest_set_eraseall(&en->set); - kfree(en); + kfree_rcu(en, rcu_head); } @@ -334,15 +345,12 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) { unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); - list_add(&en->list, &tbl->bucket[hash]); + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } -/* - * Get ip_vs_lblcr_entry associated with supplied parameters. Called under - * read lock. - */ +/* Get ip_vs_lblcr_entry associated with supplied parameters. 
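ip_vs_dest_set_min() and ip_vs_dest_set_max() above keep the classic weighted-load comparison while the traversal switches to the RCU list iterators. The comparison itself avoids division: for positive weights, moh/mw < doh/dw is equivalent to moh*dw < doh*mw, as noted in the inline comment. A tiny stand-alone illustration of that identity (plain userspace C, hypothetical numbers):

#include <stdio.h>

/* True when the first server's overhead-per-weight is smaller, without dividing. */
static int less_loaded(long loh, long lw, long doh, long dw)
{
        return loh * dw < doh * lw;     /* loh/lw < doh/dw, valid for lw, dw > 0 */
}

int main(void)
{
        /* overhead 30 at weight 3 (10 per unit) vs overhead 25 at weight 2 (12.5 per unit) */
        printf("first server is %s loaded\n",
               less_loaded(30, 3, 25, 2) ? "less" : "not less");
        return 0;
}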
*/ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *addr) @@ -350,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, unsigned int hash = ip_vs_lblcr_hashkey(af, addr); struct ip_vs_lblcr_entry *en; - list_for_each_entry(en, &tbl->bucket[hash], list) + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; @@ -360,7 +368,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, /* * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. Called under write lock. + * IP address to a server. Called under spin lock. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, @@ -381,14 +389,14 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, /* initialize its dest set */ atomic_set(&(en->set.size), 0); INIT_LIST_HEAD(&en->set.list); - rwlock_init(&en->set.lock); + + ip_vs_dest_set_insert(&en->set, dest, false); ip_vs_lblcr_hash(tbl, en); + return en; } - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest, true); return en; } @@ -397,17 +405,21 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, /* * Flush all the entries of the specified table. */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +static void ip_vs_lblcr_flush(struct ip_vs_service *svc) { + struct ip_vs_lblcr_table *tbl = svc->sched_data; int i; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; - /* No locking required, only called during cleanup. 
*/ + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); } } + spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblcr_expiration(struct ip_vs_service *svc) @@ -425,13 +437,14 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int i, j; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_after(en->lastuse + sysctl_lblcr_expiration(svc), now)) continue; @@ -439,7 +452,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -463,7 +476,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data) unsigned long now = jiffies; int goal; int i, j; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ @@ -484,8 +498,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data) for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) continue; @@ -493,7 +507,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -523,11 +537,12 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) * Initialize the hash buckets */ for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); + INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; + tbl->dead = 0; /* * Hook periodic timer for garbage collection @@ -540,7 +555,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; @@ -548,14 +563,12 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ - ip_vs_lblcr_flush(tbl); + ip_vs_lblcr_flush(svc); /* release the table itself */ - kfree(tbl); + kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", sizeof(*tbl)); - - return 0; } @@ -577,7 +590,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) * The server with weight=0 is quiesced and will not receive any * new connection. 
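The flush and done_svc hunks above show why the new tbl->dead flag exists: the table is now freed with kfree_rcu(), so a scheduler still running under rcu_read_lock() may reach it after the service has started tearing down; setting dead under the service lock before flushing lets such a late caller notice the flag and refrain from inserting into a table that is on its way out. A sketch of the idea with invented names:

#include <linux/types.h>
#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_table {
        bool dead;
        struct hlist_head bucket[16];
        struct rcu_head rcu_head;
};

/* Scheduling path: runs under rcu_read_lock(), takes the lock only to insert. */
static void demo_maybe_insert(spinlock_t *lock, struct demo_table *tbl,
                              struct hlist_node *node, unsigned int hash)
{
        spin_lock_bh(lock);
        if (!tbl->dead)                 /* table may already be dying */
                hlist_add_head_rcu(node, &tbl->bucket[hash]);
        spin_unlock_bh(lock);
}

/* Teardown: mark dead first, then let a grace period pass before the free. */
static void demo_destroy(spinlock_t *lock, struct demo_table *tbl)
{
        spin_lock_bh(lock);
        tbl->dead = true;
        /* ... unlink and kfree_rcu() every remaining entry here ... */
        spin_unlock_bh(lock);
        kfree_rcu(tbl, rcu_head);
}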
*/ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -593,7 +606,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -627,7 +640,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; - list_for_each_entry(d, &svc->destinations, n_list) { + list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; @@ -646,7 +659,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_lblcr_table *tbl = svc->sched_data; struct ip_vs_iphdr iph; - struct ip_vs_dest *dest = NULL; + struct ip_vs_dest *dest; struct ip_vs_lblcr_entry *en; ip_vs_fill_iph_addr_only(svc->af, skb, &iph); @@ -654,53 +667,46 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ - read_lock(&svc->sched_lock); en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); if (en) { - /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; /* Get the least loaded destination */ - read_lock(&en->set.lock); dest = ip_vs_dest_set_min(&en->set); - read_unlock(&en->set.lock); /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && - time_after(jiffies, en->set.lastmod + + time_after(jiffies, en->set.lastmod + sysctl_lblcr_expiration(svc))) { - struct ip_vs_dest *m; + spin_lock_bh(&svc->sched_lock); + if (atomic_read(&en->set.size) > 1) { + struct ip_vs_dest *m; - write_lock(&en->set.lock); - m = ip_vs_dest_set_max(&en->set); - if (m) - ip_vs_dest_set_erase(&en->set, m); - write_unlock(&en->set.lock); + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + spin_unlock_bh(&svc->sched_lock); } /* If the destination is not overloaded, use it */ - if (dest && !is_overloaded(dest, svc)) { - read_unlock(&svc->sched_lock); + if (dest && !is_overloaded(dest, svc)) goto out; - } /* The cache entry is invalid, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); - read_unlock(&svc->sched_lock); return NULL; } /* Update our cache entry */ - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - } - read_unlock(&svc->sched_lock); - - if (dest) + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_dest_set_insert(&en->set, dest, true); + spin_unlock_bh(&svc->sched_lock); goto out; + } /* No cache entry, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); @@ -710,9 +716,10 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblcr_new(tbl, &iph.daddr, dest); - write_unlock(&svc->sched_lock); + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblcr_new(tbl, &iph.daddr, dest); + spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", @@ -814,6 +821,7 @@ static void __exit 
ip_vs_lblcr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); unregister_pernet_subsys(&ip_vs_lblcr_ops); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index f391819c0cc..5128e338a74 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -42,7 +42,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * served, but no new connection is assigned to the server. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || atomic_read(&dest->weight) == 0) continue; @@ -84,6 +84,7 @@ static int __init ip_vs_lc_init(void) static void __exit ip_vs_lc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); + synchronize_rcu(); } module_init(ip_vs_lc_init); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index 984d9c137d8..646cfd4baa7 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -75,7 +75,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD || !atomic_read(&dest->weight)) @@ -133,6 +133,7 @@ static int __init ip_vs_nq_init(void) static void __exit ip_vs_nq_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); + synchronize_rcu(); } module_init(ip_vs_nq_init); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 5cf859ccb31..1a82b29ce8e 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -13,20 +13,8 @@ /* IPVS pe list */ static LIST_HEAD(ip_vs_pe); -/* lock for service table */ -static DEFINE_SPINLOCK(ip_vs_pe_lock); - -/* Bind a service with a pe */ -void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe) -{ - svc->pe = pe; -} - -/* Unbind a service from its pe */ -void ip_vs_unbind_pe(struct ip_vs_service *svc) -{ - svc->pe = NULL; -} +/* semaphore for IPVS PEs. */ +static DEFINE_MUTEX(ip_vs_pe_mutex); /* Get pe in the pe list by name */ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) @@ -36,9 +24,8 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, pe_name); - spin_lock_bh(&ip_vs_pe_lock); - - list_for_each_entry(pe, &ip_vs_pe, n_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) { /* Test and get the modules atomically */ if (pe->module && !try_module_get(pe->module)) { @@ -47,14 +34,14 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) } if (strcmp(pe_name, pe->name)==0) { /* HIT */ - spin_unlock_bh(&ip_vs_pe_lock); + rcu_read_unlock(); return pe; } if (pe->module) module_put(pe->module); } + rcu_read_unlock(); - spin_unlock_bh(&ip_vs_pe_lock); return NULL; } @@ -83,22 +70,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) /* increase the module use count */ ip_vs_use_count_inc(); - spin_lock_bh(&ip_vs_pe_lock); - - if (!list_empty(&pe->n_list)) { - spin_unlock_bh(&ip_vs_pe_lock); - ip_vs_use_count_dec(); - pr_err("%s(): [%s] pe already linked\n", - __func__, pe->name); - return -EINVAL; - } - + mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist * in the pe list. 
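Nearly every scheduler and PE module above gains a synchronize_rcu() in its __exit routine, and ip_vs_pe.c trades its spinlock for a mutex on the writer side while lookups switch to list_for_each_entry_rcu(). The extra grace period matters because list_del_rcu() only unlinks the entry: a reader may still be walking the registration list, so the module must not finish unloading until those readers are done. A distilled, buildable module skeleton of that pattern, with hypothetical names:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rculist.h>

struct demo_pe {
        struct list_head n_list;
        const char *name;
};

static LIST_HEAD(demo_pe_list);
static DEFINE_MUTEX(demo_pe_mutex);     /* writers only; readers rely on RCU */

static struct demo_pe demo_pe_instance = {
        .n_list = LIST_HEAD_INIT(demo_pe_instance.n_list),
        .name   = "demo",
};

static int __init demo_init(void)
{
        mutex_lock(&demo_pe_mutex);
        list_add_rcu(&demo_pe_instance.n_list, &demo_pe_list);
        mutex_unlock(&demo_pe_mutex);
        return 0;
}

static void __exit demo_exit(void)
{
        mutex_lock(&demo_pe_mutex);
        list_del_rcu(&demo_pe_instance.n_list);
        mutex_unlock(&demo_pe_mutex);
        /* A reader iterating with list_for_each_entry_rcu() may still hold a
         * pointer to the entry; wait it out before the module goes away. */
        synchronize_rcu();
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");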
*/ list_for_each_entry(tmp, &ip_vs_pe, n_list) { if (strcmp(tmp->name, pe->name) == 0) { - spin_unlock_bh(&ip_vs_pe_lock); + mutex_unlock(&ip_vs_pe_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] pe already existed " "in the system\n", __func__, pe->name); @@ -106,8 +84,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) } } /* Add it into the d-linked pe list */ - list_add(&pe->n_list, &ip_vs_pe); - spin_unlock_bh(&ip_vs_pe_lock); + list_add_rcu(&pe->n_list, &ip_vs_pe); + mutex_unlock(&ip_vs_pe_mutex); pr_info("[%s] pe registered.\n", pe->name); @@ -118,17 +96,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe); /* Unregister a pe from the pe list */ int unregister_ip_vs_pe(struct ip_vs_pe *pe) { - spin_lock_bh(&ip_vs_pe_lock); - if (list_empty(&pe->n_list)) { - spin_unlock_bh(&ip_vs_pe_lock); - pr_err("%s(): [%s] pe is not in the list. failed\n", - __func__, pe->name); - return -EINVAL; - } - + mutex_lock(&ip_vs_pe_mutex); /* Remove it from the d-linked pe list */ - list_del(&pe->n_list); - spin_unlock_bh(&ip_vs_pe_lock); + list_del_rcu(&pe->n_list); + mutex_unlock(&ip_vs_pe_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 12475ef88da..9ef22bdce9f 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -13,7 +13,8 @@ static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, const char *callid, size_t callid_len, int *idx) { - size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1); + size_t max_len = 64; + size_t len = min3(max_len, callid_len, buf_len - *idx - 1); memcpy(buf + *idx, callid, len); buf[*idx+len] = '\0'; *idx += len + 1; @@ -37,14 +38,10 @@ static int get_callid(const char *dptr, unsigned int dataoff, if (ret > 0) break; if (!ret) - return 0; + return -EINVAL; dataoff += *matchoff; } - /* Empty callid is useless */ - if (!*matchlen) - return -EINVAL; - /* Too large is useless */ if (*matchlen > IP_VS_PEDATA_MAXLEN) return -EINVAL; @@ -172,6 +169,7 @@ static int __init ip_vs_sip_init(void) static void __exit ip_vs_sip_cleanup(void) { unregister_ip_vs_pe(&ip_vs_sip_pe); + synchronize_rcu(); } module_init(ip_vs_sip_init); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index cd1d7298f7b..86464881cd2 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -27,9 +27,10 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, if (sch == NULL) return 0; net = skb_net(skb); + rcu_read_lock(); if ((sch->type == SCTP_CID_INIT) && - (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol, - &iph->daddr, sh->dest))) { + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, sh->dest))) { int ignored; if (ip_vs_todrop(net_ipvs(net))) { @@ -37,7 +38,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * It seems that we are very loaded. 
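The sctp_conn_schedule() hunk above (and the matching TCP and UDP ones further down) drops the ip_vs_service_get()/ip_vs_service_put() reference dance: the service is only needed while the packet is being scheduled, so an rcu_read_lock()/rcu_read_unlock() pair around ip_vs_service_find() keeps it alive, provided every exit path leaves the critical section exactly once. A reduced sketch of that shape; the lookup and scheduling helpers are assumed, not real IPVS functions:

#include <linux/rcupdate.h>

struct demo_service;

/* Assumed helpers: a lockless lookup valid only under rcu_read_lock(), and a
 * scheduler that must run inside the same read-side critical section. */
struct demo_service *demo_service_find(int key);
int demo_schedule(struct demo_service *svc);

int demo_conn_schedule(int key, int *verdict)
{
        struct demo_service *svc;
        int ret = 1;                    /* accept by default */

        rcu_read_lock();
        svc = demo_service_find(key);   /* no reference is taken */
        if (svc && demo_schedule(svc) < 0) {
                *verdict = 0;           /* e.g. drop */
                ret = 0;
        }
        rcu_read_unlock();              /* single unlock on every path */
        return ret;
}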
* We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -49,14 +50,13 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); - else { - ip_vs_service_put(svc); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -208,7 +208,7 @@ enum ipvs_sctp_event_t { IP_VS_SCTP_EVE_LAST }; -static enum ipvs_sctp_event_t sctp_events[255] = { +static enum ipvs_sctp_event_t sctp_events[256] = { IP_VS_SCTP_EVE_DATA_CLI, IP_VS_SCTP_EVE_INIT_CLI, IP_VS_SCTP_EVE_INIT_ACK_CLI, @@ -994,9 +994,9 @@ static void sctp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); set_sctp_state(pd, cp, direction, skb); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline __u16 sctp_app_hashkey(__be16 port) @@ -1016,30 +1016,25 @@ static int sctp_register_app(struct net *net, struct ip_vs_app *inc) hash = sctp_app_hashkey(port); - spin_lock_bh(&ipvs->sctp_app_lock); list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->sctp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->sctp_app_lock); return ret; } static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); - spin_lock_bh(&ipvs->sctp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->sctp_app_lock); + list_del_rcu(&inc->p_list); } static int sctp_app_conn_bind(struct ip_vs_conn *cp) @@ -1055,12 +1050,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = sctp_app_hashkey(cp->vport); - spin_lock(&ipvs->sctp_app_lock); - list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&ipvs->sctp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -1076,7 +1071,7 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->sctp_app_lock); + rcu_read_unlock(); out: return result; } @@ -1090,7 +1085,6 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); - spin_lock_init(&ipvs->sctp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); if (!pd->timeout_table) diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 9af653a7582..50a15944c6c 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -47,9 +47,10 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, } net = skb_net(skb); /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ + rcu_read_lock(); if (th->syn && - (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol, - &iph->daddr, th->dest))) { + (svc = 
ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, th->dest))) { int ignored; if (ip_vs_todrop(net_ipvs(net))) { @@ -57,7 +58,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -70,14 +71,13 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); - else { - ip_vs_service_put(svc); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -557,9 +557,9 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, if (th == NULL) return; - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); set_tcp_state(pd, cp, direction, th); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline __u16 tcp_app_hashkey(__be16 port) @@ -580,18 +580,16 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) hash = tcp_app_hashkey(port); - spin_lock_bh(&ipvs->tcp_app_lock); list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->tcp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->tcp_app_lock); return ret; } @@ -599,13 +597,10 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) static void tcp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); - spin_lock_bh(&ipvs->tcp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->tcp_app_lock); + list_del_rcu(&inc->p_list); } @@ -624,12 +619,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); - spin_lock(&ipvs->tcp_app_lock); - list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&ipvs->tcp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -646,7 +641,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->tcp_app_lock); + rcu_read_unlock(); out: return result; @@ -660,11 +655,11 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; cp->timeout = (pd ? 
pd->timeout_table[IP_VS_TCP_S_LISTEN] : tcp_timeouts[IP_VS_TCP_S_LISTEN]); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } /* --------------------------------------------- @@ -676,7 +671,6 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); - spin_lock_init(&ipvs->tcp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); if (!pd->timeout_table) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 503a842c90d..b62a3c0ff9b 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -44,8 +44,9 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, return 0; } net = skb_net(skb); - svc = ip_vs_service_get(net, af, skb->mark, iph->protocol, - &iph->daddr, uh->dest); + rcu_read_lock(); + svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, uh->dest); if (svc) { int ignored; @@ -54,7 +55,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -67,14 +68,13 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); - else { - ip_vs_service_put(svc); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -359,19 +359,16 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc) hash = udp_app_hashkey(port); - - spin_lock_bh(&ipvs->udp_app_lock); list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->udp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->udp_app_lock); return ret; } @@ -380,12 +377,9 @@ static void udp_unregister_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); - struct netns_ipvs *ipvs = net_ipvs(net); - spin_lock_bh(&ipvs->udp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->udp_app_lock); + list_del_rcu(&inc->p_list); } @@ -403,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = udp_app_hashkey(cp->vport); - spin_lock(&ipvs->udp_app_lock); - list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&ipvs->udp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -425,7 +419,7 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->udp_app_lock); + rcu_read_unlock(); out: return result; @@ -467,7 +461,6 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); - spin_lock_init(&ipvs->udp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); if 
(!pd->timeout_table) diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index c49b388d108..c35986c793d 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -35,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest) { - svc->sched_data = &svc->destinations; + struct list_head *p; + + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + /* dest is already unlinked, so p->prev is not valid but + * p->next is valid, use it to reach previous entry. + */ + if (p == &dest->n_list) + svc->sched_data = p->next->prev; + spin_unlock_bh(&svc->sched_lock); return 0; } @@ -48,36 +57,41 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc) static struct ip_vs_dest * ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { - struct list_head *p, *q; - struct ip_vs_dest *dest; + struct list_head *p; + struct ip_vs_dest *dest, *last; + int pass = 0; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - write_lock(&svc->sched_lock); - p = (struct list_head *)svc->sched_data; - p = p->next; - q = p; + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + last = dest = list_entry(p, struct ip_vs_dest, n_list); + do { - /* skip list head */ - if (q == &svc->destinations) { - q = q->next; - continue; + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + if (dest == last) + goto stop; } - - dest = list_entry(q, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) - /* HIT */ - goto out; - q = q->next; - } while (q != p); - write_unlock(&svc->sched_lock); + pass++; + /* Previous dest could be unlinked, do not loop forever. + * If we stay at head there is no need for 2nd pass. 
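The new ip_vs_rr_del_dest() above handles a subtle consequence of list_del_rcu(): the unlinked destination keeps a usable ->next pointer (readers may still be traversing through it) while its ->prev is left poisoned, so when the round-robin cursor happens to sit on that destination it is repointed via p->next->prev, i.e. to the entry that preceded it. The scheduling loop, continued just below, then restarts its RCU walk from the saved cursor and bounds itself so a vanished cursor cannot make it spin forever. A compressed sketch of the cursor fix-up, with invented names:

#include <linux/rculist.h>
#include <linux/spinlock.h>

struct demo_dest {
        struct list_head n_list;
};

struct demo_svc {
        spinlock_t sched_lock;
        struct list_head destinations;
        struct list_head *cursor;       /* saved round-robin position */
};

/* Called after @dest has been removed from ->destinations with list_del_rcu(). */
static void demo_rr_del_dest(struct demo_svc *svc, struct demo_dest *dest)
{
        spin_lock_bh(&svc->sched_lock);
        if (svc->cursor == &dest->n_list)
                /* dest->n_list.prev is no longer valid, but .next still points
                 * into the list and that entry's ->prev is the previous node. */
                svc->cursor = dest->n_list.next->prev;
        spin_unlock_bh(&svc->sched_lock);
}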
+ */ + } while (pass < 2 && p != &svc->destinations); + +stop: + spin_unlock_bh(&svc->sched_lock); ip_vs_scheduler_err(svc, "no destination available"); return NULL; out: - svc->sched_data = q; - write_unlock(&svc->sched_lock); + svc->sched_data = &dest->n_list; + spin_unlock_bh(&svc->sched_lock); IP_VS_DBG_BUF(6, "RR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), @@ -94,7 +108,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = { .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), .init_service = ip_vs_rr_init_svc, - .update_service = ip_vs_rr_update_svc, + .add_dest = NULL, + .del_dest = ip_vs_rr_del_dest, .schedule = ip_vs_rr_schedule, }; @@ -106,6 +121,7 @@ static int __init ip_vs_rr_init(void) static void __exit ip_vs_rr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); + synchronize_rcu(); } module_init(ip_vs_rr_init); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index d6bf20d6cdb..4dbcda6258b 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -35,8 +35,8 @@ EXPORT_SYMBOL(ip_vs_scheduler_err); */ static LIST_HEAD(ip_vs_schedulers); -/* lock for service table */ -static DEFINE_SPINLOCK(ip_vs_sched_lock); +/* semaphore for schedulers */ +static DEFINE_MUTEX(ip_vs_sched_mutex); /* @@ -47,8 +47,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, { int ret; - svc->scheduler = scheduler; - if (scheduler->init_service) { ret = scheduler->init_service(svc); if (ret) { @@ -56,7 +54,7 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, return ret; } } - + rcu_assign_pointer(svc->scheduler, scheduler); return 0; } @@ -64,22 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, /* * Unbind a service with its scheduler */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +void ip_vs_unbind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *sched) { - struct ip_vs_scheduler *sched = svc->scheduler; + struct ip_vs_scheduler *cur_sched; - if (!sched) - return 0; + cur_sched = rcu_dereference_protected(svc->scheduler, 1); + /* This check proves that old 'sched' was installed */ + if (!cur_sched) + return; - if (sched->done_service) { - if (sched->done_service(svc) != 0) { - pr_err("%s(): done error\n", __func__); - return -EINVAL; - } - } - - svc->scheduler = NULL; - return 0; + if (sched->done_service) + sched->done_service(svc); + /* svc->scheduler can not be set to NULL */ } @@ -92,7 +87,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); list_for_each_entry(sched, &ip_vs_schedulers, n_list) { /* @@ -106,14 +101,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) } if (strcmp(sched_name, sched->name)==0) { /* HIT */ - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); return sched; } if (sched->module) module_put(sched->module); } - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); return NULL; } @@ -153,21 +148,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference(svc->scheduler); if (svc->fwmark) { IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", - svc->scheduler->name, svc->fwmark, - svc->fwmark, msg); + sched->name, 
svc->fwmark, svc->fwmark, msg); #ifdef CONFIG_IP_VS_IPV6 } else if (svc->af == AF_INET6) { IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", - svc->scheduler->name, - ip_vs_proto_name(svc->protocol), + sched->name, ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), msg); #endif } else { IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", - svc->scheduler->name, - ip_vs_proto_name(svc->protocol), + sched->name, ip_vs_proto_name(svc->protocol), &svc->addr.ip, ntohs(svc->port), msg); } } @@ -192,10 +187,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) /* increase the module use count */ ip_vs_use_count_inc(); - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); if (!list_empty(&scheduler->n_list)) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already linked\n", __func__, scheduler->name); @@ -208,7 +203,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) */ list_for_each_entry(sched, &ip_vs_schedulers, n_list) { if (strcmp(scheduler->name, sched->name) == 0) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already existed " "in the system\n", __func__, scheduler->name); @@ -219,7 +214,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Add it into the d-linked scheduler list */ list_add(&scheduler->n_list, &ip_vs_schedulers); - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); pr_info("[%s] scheduler registered.\n", scheduler->name); @@ -237,9 +232,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) return -EINVAL; } - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); if (list_empty(&scheduler->n_list)) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); pr_err("%s(): [%s] scheduler is not in the list. failed\n", __func__, scheduler->name); return -EINVAL; @@ -249,7 +244,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Remove it from the d-linked scheduler list */ list_del(&scheduler->n_list); - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 89ead246ed3..f3205925359 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -79,7 +79,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; @@ -94,7 +94,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * Find the destination with the least load. 
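The ip_vs_sched.c hunks above turn the scheduler registration lock into a mutex and, more importantly, make svc->scheduler an RCU-published pointer: ip_vs_bind_scheduler() now runs init_service() first and only then installs the scheduler with rcu_assign_pointer(), so a reader that rcu_dereference()s the pointer (as ip_vs_scheduler_err() does) never sees a half-initialized binding. The publish/subscribe shape in isolation, names hypothetical:

#include <linux/rcupdate.h>

struct demo_sched {
        const char *name;
        int (*init_service)(void *sched_data);
};

struct demo_svc {
        struct demo_sched __rcu *scheduler;
        void *sched_data;
};

int demo_bind_scheduler(struct demo_svc *svc, struct demo_sched *sched)
{
        int ret;

        if (sched->init_service) {
                ret = sched->init_service(svc->sched_data);
                if (ret)
                        return ret;     /* nothing is published on failure */
        }
        /* Publish only after the private state is fully set up. */
        rcu_assign_pointer(svc->scheduler, sched);
        return 0;
}

const char *demo_sched_name(struct demo_svc *svc)
{
        /* Reader side, called under rcu_read_lock(). */
        return rcu_dereference(svc->scheduler)->name;
}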
*/ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_sed_dest_overhead(dest); @@ -134,6 +134,7 @@ static int __init ip_vs_sed_init(void) static void __exit ip_vs_sed_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); + synchronize_rcu(); } module_init(ip_vs_sed_init); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index e3312699462..0df269d7c99 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -53,7 +53,7 @@ * IPVS SH bucket */ struct ip_vs_sh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* @@ -66,6 +66,10 @@ struct ip_vs_sh_bucket { #define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) +struct ip_vs_sh_state { + struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; + struct rcu_head rcu_head; +}; /* * Returns hash value for IPVS SH entry @@ -87,10 +91,9 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl, - const union nf_inet_addr *addr) +ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr) { - return (tbl[ip_vs_sh_hashkey(af, addr)]).dest; + return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest); } @@ -98,27 +101,32 @@ ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl, * Assign all the hash buckets of the specified table with the service. */ static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_sh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; int d_count; + bool empty; - b = tbl; + b = &s->buckets[0]; p = &svc->destinations; + empty = list_empty(p); d_count = 0; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", i, IP_VS_DBG_ADDR(svc->af, &dest->addr), @@ -140,16 +148,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) /* * Flush all the hash buckets of the specified table. 
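The source-hash hunks above wrap the per-bucket destination pointers in __rcu: ip_vs_sh_reassign() rewrites the bucket array on the writer side (destination add, update or delete), dropping the reference of whatever was stored and pinning the new destination before publishing it, while the fast path simply dereferences the bucket selected by the hash. A minimal version of one bucket update and lookup, with made-up types:

#include <linux/atomic.h>
#include <linux/rcupdate.h>

struct demo_dest {
        atomic_t refcnt;
};

struct demo_bucket {
        struct demo_dest __rcu *dest;
};

/* Writer: runs while the table is being rebuilt, serialized by the caller. */
static void demo_bucket_set(struct demo_bucket *b, struct demo_dest *new)
{
        struct demo_dest *old = rcu_dereference_protected(b->dest, 1);

        if (old)
                atomic_dec(&old->refcnt);       /* drop the old reference */
        if (new)
                atomic_inc(&new->refcnt);       /* pin before publishing */
        rcu_assign_pointer(b->dest, new);
}

/* Reader: fast path, under rcu_read_lock(). */
static struct demo_dest *demo_bucket_get(struct demo_bucket *b)
{
        return rcu_dereference(b->dest);
}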
*/ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +static void ip_vs_sh_flush(struct ip_vs_sh_state *s) { int i; struct ip_vs_sh_bucket *b; + struct ip_vs_dest *dest; - b = tbl; + b = &s->buckets[0]; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); } b++; } @@ -158,51 +168,46 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) static int ip_vs_sh_init_svc(struct ip_vs_service *svc) { - struct ip_vs_sh_bucket *tbl; + struct ip_vs_sh_state *s; /* allocate the SH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_KERNEL); - if (tbl == NULL) + s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); + if (s == NULL) return -ENOMEM; - svc->sched_data = tbl; + svc->sched_data = s; IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); + /* assign the hash buckets with current dests */ + ip_vs_sh_reassign(s, svc); return 0; } -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +static void ip_vs_sh_done_svc(struct ip_vs_service *svc) { - struct ip_vs_sh_bucket *tbl = svc->sched_data; + struct ip_vs_sh_state *s = svc->sched_data; /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); + ip_vs_sh_flush(s); /* release the table itself */ - kfree(svc->sched_data); + kfree_rcu(s, rcu_head); IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - return 0; } -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); + struct ip_vs_sh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); + ip_vs_sh_reassign(s, svc); return 0; } @@ -225,15 +230,15 @@ static struct ip_vs_dest * ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_dest *dest; - struct ip_vs_sh_bucket *tbl; + struct ip_vs_sh_state *s; struct ip_vs_iphdr iph; ip_vs_fill_iph_addr_only(svc->af, skb, &iph); IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); - tbl = (struct ip_vs_sh_bucket *)svc->sched_data; - dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr); + s = (struct ip_vs_sh_state *) svc->sched_data; + dest = ip_vs_sh_get(svc->af, s, &iph.saddr); if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 @@ -262,7 +267,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), .init_service = ip_vs_sh_init_svc, .done_service = ip_vs_sh_done_svc, - .update_service = ip_vs_sh_update_svc, + .add_dest = ip_vs_sh_dest_changed, + .del_dest = ip_vs_sh_dest_changed, + .upd_dest = ip_vs_sh_dest_changed, .schedule = ip_vs_sh_schedule, }; @@ -276,6 +283,7 @@ static int __init ip_vs_sh_init(void) static void __exit ip_vs_sh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 44fd10c539a..f6046d9af8d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -246,7 
+246,7 @@ struct ip_vs_sync_thread_data { struct ip_vs_sync_mesg_v0 { __u8 nr_conns; __u8 syncid; - __u16 size; + __be16 size; /* ip_vs_sync_conn entries start here */ }; @@ -255,7 +255,7 @@ struct ip_vs_sync_mesg_v0 { struct ip_vs_sync_mesg { __u8 reserved; /* must be zero */ __u8 syncid; - __u16 size; + __be16 size; __u8 nr_conns; __s8 version; /* SYNC_PROTO_VER */ __u16 spare; @@ -335,7 +335,7 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs) sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; sb->mesg->syncid = ipvs->master_syncid; - sb->mesg->size = sizeof(struct ip_vs_sync_mesg); + sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); sb->mesg->nr_conns = 0; sb->mesg->spare = 0; sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); @@ -418,7 +418,7 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; mesg->nr_conns = 0; mesg->syncid = ipvs->master_syncid; - mesg->size = sizeof(struct ip_vs_sync_mesg_v0); + mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; sb->firstuse = jiffies; @@ -531,9 +531,9 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) return; - spin_lock(&ipvs->sync_buff_lock); + spin_lock_bh(&ipvs->sync_buff_lock); if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); return; } @@ -552,7 +552,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, if (!buff) { buff = ip_vs_sync_buff_create_v0(ipvs); if (!buff) { - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } @@ -582,7 +582,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, } m->nr_conns++; - m->size += len; + m->size = htons(ntohs(m->size) + len); buff->head += len; /* check if there is a space for next one */ @@ -590,7 +590,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, sb_queue_tail(ipvs, ms); ms->sync_buff = NULL; } - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ cp = cp->control; @@ -641,9 +641,9 @@ sloop: pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); } - spin_lock(&ipvs->sync_buff_lock); + spin_lock_bh(&ipvs->sync_buff_lock); if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); return; } @@ -683,7 +683,7 @@ sloop: if (!buff) { buff = ip_vs_sync_buff_create(ipvs); if (!buff) { - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } @@ -693,7 +693,7 @@ sloop: p = buff->head; buff->head += pad + len; - m->size += pad + len; + m->size = htons(ntohs(m->size) + pad + len); /* Add ev. padding from prev. 
sync_conn */ while (pad--) *(p++) = 0; @@ -750,7 +750,7 @@ sloop: } } - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); control: /* synchronize its controller if it has */ @@ -843,7 +843,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, kfree(param->pe_data); dest = cp->dest; - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { if (flags & IP_VS_CONN_F_INACTIVE) { @@ -857,24 +857,21 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; cp->flags = flags; - spin_unlock(&cp->lock); - if (!dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } + spin_unlock_bh(&cp->lock); + if (!dest) + ip_vs_try_bind_dest(cp); } else { /* * Find the appropriate destination for the connection. * If it is not found the connection will remain unbound * but still handled. */ + rcu_read_lock(); dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, param->vport, protocol, fwmark, flags); cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); - if (dest) - atomic_dec(&dest->refcnt); + rcu_read_unlock(); if (!cp) { if (param->pe_data) kfree(param->pe_data); @@ -1178,10 +1175,8 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, IP_VS_DBG(2, "BACKUP, message header too short\n"); return; } - /* Convert size back to host byte order */ - m2->size = ntohs(m2->size); - if (buflen != m2->size) { + if (buflen != ntohs(m2->size)) { IP_VS_DBG(2, "BACKUP, bogus message size\n"); return; } @@ -1547,10 +1542,7 @@ ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) int msize; int ret; - msize = msg->size; - - /* Put size in network byte order */ - msg->size = htons(msg->size); + msize = ntohs(msg->size); ret = ip_vs_send_async(sock, (char *)msg, msize); if (ret >= 0 || ret == -EAGAIN) @@ -1692,11 +1684,7 @@ static int sync_thread_backup(void *data) break; } - /* disable bottom half, because it accesses the data - shared by softirq while getting/creating conns */ - local_bh_disable(); ip_vs_process_message(tinfo->net, tinfo->buf, len); - local_bh_enable(); } } diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index bc1bfc48a17..c60a81c4ce9 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -51,7 +51,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; @@ -66,7 +66,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * Find the destination with the least load. 
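In the ip_vs_sync.c hunks above the sync message size field becomes __be16 and stays in network byte order for its whole life: it is initialized with htons(), grown with htons(ntohs(size) + len) as connection records are appended, and read back with ntohs() on the send and receive paths, which removes the old trick of converting it in place just before transmission. A small stand-alone illustration of that bookkeeping (plain C, userspace byte-order helpers, invented field sizes):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct demo_mesg {
        uint8_t  nr_conns;
        uint8_t  syncid;
        uint16_t size;          /* always kept in network byte order */
};

int main(void)
{
        struct demo_mesg m = { .size = htons(sizeof(m)) };

        /* Append a 24-byte connection record: convert, add, convert back. */
        m.size = htons(ntohs(m.size) + 24);
        m.nr_conns++;

        printf("payload on the wire: %u bytes, %u conns\n",
               ntohs(m.size), m.nr_conns);
        return 0;
}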
*/ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); @@ -106,6 +106,7 @@ static int __init ip_vs_wlc_init(void) static void __exit ip_vs_wlc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); + synchronize_rcu(); } module_init(ip_vs_wlc_init); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 231be7dd547..0e68555bceb 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -29,14 +29,45 @@ #include <net/ip_vs.h> +/* The WRR algorithm depends on some caclulations: + * - mw: maximum weight + * - di: weight step, greatest common divisor from all weights + * - cw: current required weight + * As result, all weights are in the [di..mw] range with a step=di. + * + * First, we start with cw = mw and select dests with weight >= cw. + * Then cw is reduced with di and all dests are checked again. + * Last pass should be with cw = di. We have mw/di passes in total: + * + * pass 1: cw = max weight + * pass 2: cw = max weight - di + * pass 3: cw = max weight - 2 * di + * ... + * last pass: cw = di + * + * Weights are supposed to be >= di but we run in parallel with + * weight changes, it is possible some dest weight to be reduced + * below di, bad if it is the only available dest. + * + * So, we modify how mw is calculated, now it is reduced with (di - 1), + * so that last cw is 1 to catch such dests with weight below di: + * pass 1: cw = max weight - (di - 1) + * pass 2: cw = max weight - di - (di - 1) + * pass 3: cw = max weight - 2 * di - (di - 1) + * ... + * last pass: cw = 1 + * + */ + /* * current destination pointer for weighted round-robin scheduling */ struct ip_vs_wrr_mark { - struct list_head *cl; /* current list head */ + struct ip_vs_dest *cl; /* current dest or head */ int cw; /* current weight */ int mw; /* maximum weight */ int di; /* decreasing interval */ + struct rcu_head rcu_head; }; @@ -88,36 +119,41 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) if (mark == NULL) return -ENOMEM; - mark->cl = &svc->destinations; - mark->cw = 0; - mark->mw = ip_vs_wrr_max_weight(svc); + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + mark->cw = mark->mw; svc->sched_data = mark; return 0; } -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) { + struct ip_vs_wrr_mark *mark = svc->sched_data; + /* * Release the mark variable */ - kfree(svc->sched_data); - - return 0; + kfree_rcu(mark, rcu_head); } -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { struct ip_vs_wrr_mark *mark = svc->sched_data; - mark->cl = &svc->destinations; - mark->mw = ip_vs_wrr_max_weight(svc); + spin_lock_bh(&svc->sched_lock); + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); - if (mark->cw > mark->mw) - mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + if (mark->cw > mark->mw || !mark->cw) + mark->cw = mark->mw; + else if (mark->di > 1) + mark->cw = (mark->cw / mark->di) * mark->di + 1; + spin_unlock_bh(&svc->sched_lock); return 0; } @@ -128,80 +164,79 @@ static int ip_vs_wrr_update_svc(struct 
ip_vs_service *svc) static struct ip_vs_dest * ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { - struct ip_vs_dest *dest; + struct ip_vs_dest *dest, *last, *stop = NULL; struct ip_vs_wrr_mark *mark = svc->sched_data; - struct list_head *p; + bool last_pass = false, restarted = false; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - /* - * This loop will always terminate, because mark->cw in (0, max_weight] - * and at least one server has its weight equal to max_weight. - */ - write_lock(&svc->sched_lock); - p = mark->cl; + spin_lock_bh(&svc->sched_lock); + dest = mark->cl; + /* No available dests? */ + if (mark->mw == 0) + goto err_noavail; + last = dest; + /* Stop only after all dests were checked for weight >= 1 (last pass) */ while (1) { - if (mark->cl == &svc->destinations) { - /* it is at the head of the destination list */ - - if (mark->cl == mark->cl->next) { - /* no dest entry */ - ip_vs_scheduler_err(svc, - "no destination available: " - "no destinations present"); - dest = NULL; - goto out; - } - - mark->cl = svc->destinations.next; - mark->cw -= mark->di; - if (mark->cw <= 0) { - mark->cw = mark->mw; - /* - * Still zero, which means no available servers. - */ - if (mark->cw == 0) { - mark->cl = &svc->destinations; - ip_vs_scheduler_err(svc, - "no destination available"); - dest = NULL; - goto out; - } - } - } else - mark->cl = mark->cl->next; - - if (mark->cl != &svc->destinations) { - /* not at the head of the list */ - dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) >= mark->cw) { - /* got it */ - break; - } + atomic_read(&dest->weight) >= mark->cw) + goto found; + if (dest == stop) + goto err_over; } - - if (mark->cl == p && mark->cw == mark->di) { - /* back to the start, and no dest is found. - It is only possible when all dests are OVERLOADED */ - dest = NULL; - ip_vs_scheduler_err(svc, - "no destination available: " - "all destinations are overloaded"); - goto out; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* Stop if we tried last pass from first dest: + * 1. last_pass: we started checks when cw > di but + * then all dests were checked for w >= 1 + * 2. last was head: the first and only traversal + * was for weight >= 1, for all dests. + */ + if (last_pass || + &last->n_list == &svc->destinations) + goto err_over; + restarted = true; + } + last_pass = mark->cw <= mark->di; + if (last_pass && restarted && + &last->n_list != &svc->destinations) { + /* First traversal was for w >= 1 but only + * for dests after 'last', now do the same + * for all dests up to 'last'. 
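The comment block and scheduler rewrite above change how the current weight cw steps down: mw is pre-reduced by (di - 1) so that the final pass always runs with cw = 1 and can still pick a destination whose weight has dropped below the common divisor di. For example, with weights {4, 2}: di = gcd = 2, mw = 4 - (2 - 1) = 3, and the passes use cw = 3, then cw = 1. A tiny stand-alone program printing that sequence (illustrative values only):

#include <stdio.h>

static int gcd(int a, int b)
{
        while (b) {
                int t = a % b;
                a = b;
                b = t;
        }
        return a;
}

int main(void)
{
        int weights[] = { 4, 2 };
        int di = gcd(weights[0], weights[1]);
        int max = weights[0] > weights[1] ? weights[0] : weights[1];
        int mw = max - (di - 1);        /* ensures the last pass ends at cw = 1 */
        int cw;

        for (cw = mw; cw > 0; cw -= di)
                printf("pass with cw = %d: accept dests with weight >= %d\n",
                       cw, cw);
        return 0;
}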
+ */ + stop = last; } } +found: IP_VS_DBG_BUF(6, "WRR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + mark->cl = dest; out: - write_unlock(&svc->sched_lock); + spin_unlock_bh(&svc->sched_lock); return dest; + +err_noavail: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available"); + goto out; + +err_over: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available: " + "all destinations are overloaded"); + goto out; } @@ -212,7 +247,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = { .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), .init_service = ip_vs_wrr_init_svc, .done_service = ip_vs_wrr_done_svc, - .update_service = ip_vs_wrr_update_svc, + .add_dest = ip_vs_wrr_dest_changed, + .del_dest = ip_vs_wrr_dest_changed, + .upd_dest = ip_vs_wrr_dest_changed, .schedule = ip_vs_wrr_schedule, }; @@ -224,6 +261,7 @@ static int __init ip_vs_wrr_init(void) static void __exit ip_vs_wrr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); + synchronize_rcu(); } module_init(ip_vs_wrr_init); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index ee6b7a9f1ec..b75ff6429a0 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -17,6 +17,8 @@ * - not all connections have destination server, for example, * connections in backup server when fwmark is used * - bypass connections use daddr from packet + * - we can use dst without ref while sending in RCU section, we use + * ref when returning NF_ACCEPT for NAT-ed packet via loopback * LOCAL_OUT rules: * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) * - skb->pkt_type is not set yet @@ -51,39 +53,54 @@ enum { */ IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */ + IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */ }; +static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) +{ + return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); +} + +static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst) +{ + kfree(dest_dst); +} + /* * Destination cache to speed up outgoing route lookup */ static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst, - u32 dst_cookie) +__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst, + struct dst_entry *dst, u32 dst_cookie) { - struct dst_entry *old_dst; + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, + lockdep_is_held(&dest->dst_lock)); - old_dst = dest->dst_cache; - dest->dst_cache = dst; - dest->dst_rtos = rtos; - dest->dst_cookie = dst_cookie; - dst_release(old_dst); + if (dest_dst) { + dest_dst->dst_cache = dst; + dest_dst->dst_cookie = dst_cookie; + } + rcu_assign_pointer(dest->dest_dst, dest_dst); + + if (old) + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); } -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) +static inline struct ip_vs_dest_dst * +__ip_vs_dst_check(struct ip_vs_dest *dest) { - struct dst_entry *dst = dest->dst_cache; + struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst); + struct dst_entry *dst; - if (!dst) + if (!dest_dst) return NULL; - if ((dst->obsolete || rtos != dest->dst_rtos) && - dst->ops->check(dst, dest->dst_cookie) == NULL) { - dest->dst_cache = NULL; 
- dst_release(dst); + dst = dest_dst->dst_cache; + if (dst->obsolete && + dst->ops->check(dst, dest_dst->dst_cookie) == NULL) return NULL; - } - dst_hold(dst); - return dst; + return dest_dst; } static inline bool @@ -104,7 +121,7 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) /* Get route to daddr, update *saddr, optionally bind route to saddr */ static struct rtable *do_output_route4(struct net *net, __be32 daddr, - u32 rtos, int rt_mode, __be32 *saddr) + int rt_mode, __be32 *saddr) { struct flowi4 fl4; struct rtable *rt; @@ -113,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; - fl4.flowi4_tos = rtos; fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? FLOWI_FLAG_KNOWN_NH : 0; @@ -124,7 +140,7 @@ retry: if (PTR_ERR(rt) == -EINVAL && *saddr && rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { *saddr = 0; - flowi4_update_output(&fl4, 0, rtos, daddr, 0); + flowi4_update_output(&fl4, 0, 0, daddr, 0); goto retry; } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); @@ -132,7 +148,7 @@ retry: } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); *saddr = fl4.saddr; - flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr); + flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); loop++; goto retry; } @@ -141,113 +157,140 @@ retry: } /* Get route to destination or remote server */ -static struct rtable * +static int __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, - __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr) + __be32 daddr, int rt_mode, __be32 *ret_saddr) { struct net *net = dev_net(skb_dst(skb)->dev); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ struct rtable *ort; /* Original route */ - int local; + struct iphdr *iph; + __be16 df; + int mtu; + int local, noref = 1; if (dest) { - spin_lock(&dest->dst_lock); - if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos))) { - rt = do_output_route4(net, dest->addr.ip, rtos, - rt_mode, &dest->dst_saddr.ip); + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rtable *) dest_dst->dst_cache; + else { + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } + rt = do_output_route4(net, dest->addr.ip, rt_mode, + &dest_dst->dst_saddr.ip); if (!rt) { - spin_unlock(&dest->dst_lock); - return NULL; + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); - IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " - "rtos=%X\n", - &dest->addr.ip, &dest->dst_saddr.ip, - atomic_read(&rt->dst.__refcnt), rtos); + __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", + &dest->addr.ip, &dest_dst->dst_saddr.ip, + atomic_read(&rt->dst.__refcnt)); } daddr = dest->addr.ip; if (ret_saddr) - *ret_saddr = dest->dst_saddr.ip; - spin_unlock(&dest->dst_lock); + *ret_saddr = dest_dst->dst_saddr.ip; } else { __be32 saddr = htonl(INADDR_ANY); + noref = 0; + /* For such unconfigured boxes avoid many route lookups * for performance reasons because we do not remember saddr */ rt_mode &= ~IP_VS_RT_MODE_CONNECT; - rt = 
do_output_route4(net, daddr, rtos, rt_mode, &saddr); + rt = do_output_route4(net, daddr, rt_mode, &saddr); if (!rt) - return NULL; + goto err_unreach; if (ret_saddr) *ret_saddr = saddr; } - local = rt->rt_flags & RTCF_LOCAL; + local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & rt_mode)) { IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", (rt->rt_flags & RTCF_LOCAL) ? "local":"non-local", &daddr); - ip_rt_put(rt); - return NULL; - } - if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && - !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) { - IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " - "requires NAT method, dest: %pI4\n", - &ip_hdr(skb)->daddr, &daddr); - ip_rt_put(rt); - return NULL; + goto err_put; } - if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) { - IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 " - "to non-local address, dest: %pI4\n", - &ip_hdr(skb)->saddr, &daddr); - ip_rt_put(rt); - return NULL; + iph = ip_hdr(skb); + if (likely(!local)) { + if (unlikely(ipv4_is_loopback(iph->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI4 to non-local address, dest: %pI4\n", + &iph->saddr, &daddr); + goto err_put; + } + } else { + ort = skb_rtable(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !(ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to " + "local requires NAT method, dest: %pI4\n", + &iph->daddr, &daddr); + goto err_put; + } + /* skb to local stack, preserve old route */ + if (!noref) + ip_rt_put(rt); + return local; } - return rt; -} - -/* Reroute packet to local IPv4 stack after DNAT */ -static int -__ip_vs_reroute_locally(struct sk_buff *skb) -{ - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->dst.dev; - struct net *net = dev_net(dev); - struct iphdr *iph = ip_hdr(skb); - - if (rt_is_input_route(rt)) { - unsigned long orefdst = skb->_skb_refdst; - - if (ip_route_input(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) - return 0; - refdst_drop(orefdst); + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { + mtu = dst_mtu(&rt->dst); + df = iph->frag_off & htons(IP_DF); } else { - struct flowi4 fl4 = { - .daddr = iph->daddr, - .saddr = iph->saddr, - .flowi4_tos = RT_TOS(iph->tos), - .flowi4_mark = skb->mark, - }; - - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return 0; - if (!(rt->rt_flags & RTCF_LOCAL)) { - ip_rt_put(rt); - return 0; + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (mtu < 68) { + IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); + goto err_put; } - /* Drop old route. */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); + ort = skb_rtable(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + /* MTU check allowed? */ + df = sysctl_pmtu_disc(ipvs) ? 
iph->frag_off & htons(IP_DF) : 0; } - return 1; + + /* MTU checking */ + if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); + goto err_put; + } + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + ip_rt_put(rt); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; } #ifdef CONFIG_IP_VS_IPV6 @@ -294,44 +337,57 @@ out_err: /* * Get route to destination or remote server */ -static struct rt6_info * +static int __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, - int do_xfrm, int rt_mode) + struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { struct net *net = dev_net(skb_dst(skb)->dev); + struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ struct rt6_info *ort; /* Original route */ struct dst_entry *dst; - int local; + int mtu; + int local, noref = 1; if (dest) { - spin_lock(&dest->dst_lock); - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0); - if (!rt) { + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rt6_info *) dest_dst->dst_cache; + else { u32 cookie; + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } dst = __ip_vs_route_output_v6(net, &dest->addr.in6, - &dest->dst_saddr.in6, + &dest_dst->dst_saddr.in6, do_xfrm); if (!dst) { - spin_unlock(&dest->dst_lock); - return NULL; + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; } rt = (struct rt6_info *) dst; cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; - __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie); + __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", - &dest->addr.in6, &dest->dst_saddr.in6, + &dest->addr.in6, &dest_dst->dst_saddr.in6, atomic_read(&rt->dst.__refcnt)); } if (ret_saddr) - *ret_saddr = dest->dst_saddr.in6; - spin_unlock(&dest->dst_lock); + *ret_saddr = dest_dst->dst_saddr.in6; } else { + noref = 0; dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); if (!dst) - return NULL; + goto err_unreach; rt = (struct rt6_info *) dst; } @@ -340,86 +396,137 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, rt_mode)) { IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n", local ? 
"local":"non-local", daddr); - dst_release(&rt->dst); - return NULL; + goto err_put; } - if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && - !((ort = (struct rt6_info *) skb_dst(skb)) && - __ip_vs_is_local_route6(ort))) { - IP_VS_DBG_RL("Redirect from non-local address %pI6c to local " - "requires NAT method, dest: %pI6c\n", - &ipv6_hdr(skb)->daddr, daddr); - dst_release(&rt->dst); - return NULL; + if (likely(!local)) { + if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI6c to non-local address, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->saddr, daddr); + goto err_put; + } + } else { + ort = (struct rt6_info *) skb_dst(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !__ip_vs_is_local_route6(ort)) { + IP_VS_DBG_RL("Redirect from non-local address %pI6c " + "to local requires NAT method, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->daddr, daddr); + goto err_put; + } + /* skb to local stack, preserve old route */ + if (!noref) + dst_release(&rt->dst); + return local; } - if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&ipv6_hdr(skb)->saddr) & - IPV6_ADDR_LOOPBACK)) { - IP_VS_DBG_RL("Stopping traffic from loopback address %pI6c " - "to non-local address, dest: %pI6c\n", - &ipv6_hdr(skb)->saddr, daddr); - dst_release(&rt->dst); - return NULL; + + /* MTU checking */ + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) + mtu = dst_mtu(&rt->dst); + else { + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (mtu < IPV6_MIN_MTU) { + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); + goto err_put; + } + ort = (struct rt6_info *) skb_dst(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } - return rt; + if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { + if (!skb->dev) + skb->dev = net->loopback_dev; + /* only send ICMP too big on first fragment */ + if (!ipvsh->fragoffs) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); + goto err_put; + } + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + dst_release(&rt->dst); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; } #endif -/* - * Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) +/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ +static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, + struct ip_vs_conn *cp) { - struct dst_entry *old_dst; + int ret = NF_ACCEPT; + + skb->ipvs_property = 1; + if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) + ret = ip_vs_confirm_conntrack(skb); + if (ret == NF_ACCEPT) { + nf_reset(skb); + skb_forward_csum(skb); + } + return ret; +} + +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; - old_dst = dest->dst_cache; - dest->dst_cache = NULL; - dst_release(old_dst); - dest->dst_saddr.ip = 0; + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 1); + if (!local) { + skb_forward_csum(skb); + 
NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; } -#define IP_VS_XMIT_TUNNEL(skb, cp) \ -({ \ - int __ret = NF_ACCEPT; \ - \ - (skb)->ipvs_property = 1; \ - if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ - __ret = ip_vs_confirm_conntrack(skb); \ - if (__ret == NF_ACCEPT) { \ - nf_reset(skb); \ - skb_forward_csum(skb); \ - } \ - __ret; \ -}) - -#define IP_VS_XMIT_NAT(pf, skb, cp, local) \ -do { \ - (skb)->ipvs_property = 1; \ - if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - ip_vs_notrack(skb); \ - else \ - ip_vs_update_conntrack(skb, cp, 1); \ - if (local) \ - return NF_ACCEPT; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - skb_dst(skb)->dev, dst_output); \ -} while (0) - -#define IP_VS_XMIT(pf, skb, cp, local) \ -do { \ - (skb)->ipvs_property = 1; \ - if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - ip_vs_notrack(skb); \ - if (local) \ - return NF_ACCEPT; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - skb_dst(skb)->dev, dst_output); \ -} while (0) +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + if (!local) { + skb_forward_csum(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; +} /* @@ -430,7 +537,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { /* we do not touch skb and do not need pskb ptr */ - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } @@ -443,52 +550,29 @@ int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); - int mtu; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos), - IP_VS_RT_MODE_NON_LOCAL, NULL))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, + NULL) < 0) goto tx_error; - } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? 
- */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); + ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -496,60 +580,27 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rt6_info *rt; /* Route to the other host */ - int mtu; - EnterFunction(10); - rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr.in6, NULL, 0, - IP_VS_RT_MODE_NON_LOCAL); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (__mtu_check_toobig_v6(skb, mtu)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - /* only send ICMP too big on first fragment */ - if (!iph->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL, + ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->dst); - return NF_STOLEN; - } - - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -564,29 +615,30 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rtable *rt; /* Route to the other host */ - int mtu; - struct iphdr *iph = ip_hdr(skb); - int local; + int local, rc, was_input; EnterFunction(10); + rcu_read_lock(); /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { __be16 _pt, *p; - p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(iph->tos), - IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR, NULL))) - goto tx_error_icmp; - local = rt->rt_flags & RTCF_LOCAL; + was_input = rt_is_input_route(skb_rtable(skb)); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -600,57 +652,31 @@ 
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to local address"); - goto tx_error_put; + goto tx_error; } } #endif /* From world but DNAT to loopback address? */ - if (local && ipv4_is_loopback(cp->daddr.ip) && - rt_is_input_route(skb_rtable(skb))) { + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to loopback address"); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, - "ip_vs_nat_xmit(): frag needed for"); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct iphdr))) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) - goto tx_error_put; + goto tx_error; ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); - if (!local) { - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - ip_rt_put(rt); - /* - * Some IPv4 replies get local address from routes, - * not from iph, so while we DNAT after routing - * we need this second input/output route. - */ - if (!__ip_vs_reroute_locally(skb)) - goto tx_error; - } - IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length @@ -660,49 +686,48 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); LeaveFunction(10); - return NF_STOLEN; + return rc; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ - int mtu; - int local; + int local, rc; EnterFunction(10); + rcu_read_lock(); /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !iph->fragoffs)) { + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { __be16 _pt, *p; - p = skb_header_pointer(skb, iph->len, sizeof(_pt), &_pt); + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR)))) - goto tx_error_icmp; - local = __ip_vs_is_local_route6(rt); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is 
sync-ed @@ -716,7 +741,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); - goto tx_error_put; + goto tx_error; } } #endif @@ -727,46 +752,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (__mtu_check_toobig_v6(skb, mtu)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - /* only send ICMP too big on first fragment */ - if (!iph->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0, - "ip_vs_nat_xmit_v6(): frag needed for"); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, iph)) + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) goto tx_error; ipv6_hdr(skb)->daddr = cp->daddr.in6; - if (!local || !skb->dev) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - /* destined to loopback, do we need to change route? */ - dst_release(&rt->dst); - } - IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length @@ -776,20 +776,17 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + rcu_read_unlock(); LeaveFunction(10); - return NF_STOLEN; + return rc; -tx_error_icmp: - dst_link_failure(skb); tx_error: LeaveFunction(10); kfree_skb(skb); + rcu_read_unlock(); return NF_STOLEN; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif @@ -826,56 +823,40 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, __be16 df; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ - int mtu; - int ret; + int ret, local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(tos), IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_CONNECT, - &saddr))) - goto tx_error_icmp; - if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_CONNECT | + IP_VS_RT_MODE_TUNNEL, &saddr); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } + rt = skb_rtable(skb); tdev = rt->dst.dev; - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); - if (mtu < 68) { - IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); - goto tx_error_put; - } - if (rt_is_output_route(skb_rtable(skb))) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - /* Copy DF, reset fragment offset and MF */ df = sysctl_pmtu_disc(ipvs) ? 
old_iph->frag_off & htons(IP_DF) : 0; - if (df && mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; - } - /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - kfree_skb(skb); - IP_VS_ERR_RL("%s(): no memory\n", __func__); - return NF_STOLEN; - } + + if (!new_skb) + goto tx_error; consume_skb(skb); skb = new_skb; old_iph = ip_hdr(skb); @@ -890,10 +871,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* * Push down and install the IPIP header. */ @@ -911,25 +888,22 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - ret = IP_VS_XMIT_TUNNEL(skb, cp); + ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) ip_local_out(skb); else if (ret == NF_DROP) kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 @@ -943,60 +917,37 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ipv6hdr *old_iph = ipv6_hdr(skb); struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ - int mtu; - int ret; + int ret, local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, - &saddr, 1, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL)))) - goto tx_error_icmp; - if (__ip_vs_is_local_route6(rt)) { - dst_release(&rt->dst); - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, ipvsh, 1, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_TUNNEL); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); } + rt = (struct rt6_info *) skb_dst(skb); tdev = rt->dst.dev; - mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); - if (mtu < IPV6_MIN_MTU) { - IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, - IPV6_MIN_MTU); - goto tx_error_put; - } - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - - /* MTU checking: Notice that 'mtu' have been adjusted before hand */ - if (__mtu_check_toobig_v6(skb, mtu)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - /* only send ICMP too big on first fragment */ - if (!ipvsh->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; - } - /* * Okay, now see if we can stuff it in the buffer as-is. 
*/ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - dst_release(&rt->dst); - kfree_skb(skb); - IP_VS_ERR_RL("%s(): no memory\n", __func__); - return NF_STOLEN; - } + + if (!new_skb) + goto tx_error; consume_skb(skb); skb = new_skb; old_iph = ipv6_hdr(skb); @@ -1008,10 +959,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* * Push down and install the IPIP header. */ @@ -1029,25 +976,22 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - ret = IP_VS_XMIT_TUNNEL(skb, cp); + ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) ip6_local_out(skb); else if (ret == NF_DROP) kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif @@ -1060,59 +1004,36 @@ int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - int mtu; + int local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(iph->tos), - IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_KNOWN_NH, NULL))) - goto tx_error_icmp; - if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH, NULL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? 
- */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } ip_send_check(ip_hdr(skb)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1120,64 +1041,36 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rt6_info *rt; /* Route to the other host */ - int mtu; + int local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL)))) - goto tx_error_icmp; - if (__ip_vs_is_local_route6(rt)) { - dst_release(&rt->dst); - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (__mtu_check_toobig_v6(skb, mtu)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - /* only send ICMP too big on first fragment */ - if (!iph->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->dst); - return NF_STOLEN; - } - - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1194,10 +1087,9 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct rtable *rt; /* Route to the other host */ - int mtu; int rc; int local; - int rt_mode; + int rt_mode, was_input; EnterFunction(10); @@ -1217,16 +1109,17 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* * mangle and send the packet here (only for VS/NAT) */ + was_input = rt_is_input_route(skb_rtable(skb)); /* LOCALNODE from FORWARD hook is not supported */ rt_mode = (hooknum != NF_INET_FORWARD) ? 
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(ip_hdr(skb)->tos), - rt_mode, NULL))) - goto tx_error_icmp; - local = rt->rt_flags & RTCF_LOCAL; + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic @@ -1241,82 +1134,51 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI4\n", __func__, &cp->daddr.ip); - goto tx_error_put; + goto tx_error; } } #endif /* From world but DNAT to loopback address? */ - if (local && ipv4_is_loopback(cp->daddr.ip) && - rt_is_input_route(skb_rtable(skb))) { + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI4\n", __func__, &cp->daddr.ip); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; ip_vs_nat_icmp(skb, pp, cp, 0); - if (!local) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - ip_rt_put(rt); - /* - * Some IPv4 replies get local address from routes, - * not from iph, so while we DNAT after routing - * we need this second input/output route. - */ - if (!__ip_vs_reroute_locally(skb)) - goto tx_error; - } - /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); - - rc = NF_STOLEN; + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); goto out; - tx_error_icmp: - dst_link_failure(skb); tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); + rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); return rc; - tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset, unsigned int hooknum, - struct ip_vs_iphdr *iph) + struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ - int mtu; int rc; int local; int rt_mode; @@ -1328,7 +1190,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, translate address/port back */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp, iph); + rc = cp->packet_xmit(skb, cp, pp, ipvsh); else rc = NF_ACCEPT; /* do not touch skb anymore */ @@ -1344,11 +1206,12 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, rt_mode = (hooknum != NF_INET_FORWARD) ? 
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, rt_mode))) - goto tx_error_icmp; - - local = __ip_vs_is_local_route6(rt); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, rt_mode); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -1362,7 +1225,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI6\n", __func__, &cp->daddr.in6); - goto tx_error_put; + goto tx_error; } } #endif @@ -1373,60 +1236,31 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI6\n", __func__, &cp->daddr.in6); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (__mtu_check_toobig_v6(skb, mtu)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - /* only send ICMP too big on first fragment */ - if (!iph->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; ip_vs_nat_icmp_v6(skb, pp, cp, 0); - if (!local || !skb->dev) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - /* destined to loopback, do we need to change route? */ - dst_release(&rt->dst); - } - /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); - - rc = NF_STOLEN; + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + rcu_read_unlock(); goto out; -tx_error_icmp: - dst_link_failure(skb); tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); + rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); return rc; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index dbdaa114926..b8b95f4027c 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -2,6 +2,7 @@ * * (C) 2002 by Brian J. 
Murrell <netfilter@interlinx.bc.ca> * based on HW's ip_conntrack_irc.c as well as other modules + * (C) 2006 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c8e001a9c45..0283baedcdf 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -5,6 +5,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -48,6 +49,7 @@ #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h> #define NF_CONNTRACK_VERSION "0.5.0" @@ -264,7 +266,7 @@ static void death_by_event(unsigned long ul_conntrack) if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { /* bad luck, let's retry again */ ecache->timeout.expires = jiffies + - (random32() % net->ct.sysctl_events_retry_timeout); + (prandom_u32() % net->ct.sysctl_events_retry_timeout); add_timer(&ecache->timeout); return; } @@ -283,7 +285,7 @@ void nf_ct_dying_timeout(struct nf_conn *ct) /* set a new timer to retry event delivery */ setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); ecache->timeout.expires = jiffies + - (random32() % net->ct.sysctl_events_retry_timeout); + (prandom_u32() % net->ct.sysctl_events_retry_timeout); add_timer(&ecache->timeout); } EXPORT_SYMBOL_GPL(nf_ct_dying_timeout); @@ -1259,7 +1261,7 @@ void nf_ct_iterate_cleanup(struct net *net, EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); struct __nf_ct_flush_report { - u32 pid; + u32 portid; int report; }; @@ -1274,7 +1276,7 @@ static int kill_report(struct nf_conn *i, void *data) /* If we fail to deliver the event, death_by_timeout() will retry */ if (nf_conntrack_event_report(IPCT_DESTROY, i, - fr->pid, fr->report) < 0) + fr->portid, fr->report) < 0) return 1; /* Avoid the delivery of the destroy event in death_by_timeout(). */ @@ -1297,10 +1299,10 @@ void nf_ct_free_hashtable(void *hash, unsigned int size) } EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -void nf_conntrack_flush_report(struct net *net, u32 pid, int report) +void nf_conntrack_flush_report(struct net *net, u32 portid, int report) { struct __nf_ct_flush_report fr = { - .pid = pid, + .portid = portid, .report = report, }; nf_ct_iterate_cleanup(net, kill_report, &fr); @@ -1364,30 +1366,48 @@ void nf_conntrack_cleanup_end(void) */ void nf_conntrack_cleanup_net(struct net *net) { + LIST_HEAD(single); + + list_add(&net->exit_list, &single); + nf_conntrack_cleanup_net_list(&single); +} + +void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) +{ + int busy; + struct net *net; + /* * This makes sure all current packets have passed through * netfilter framework. Roll on, two-stage module * delete... 
*/ synchronize_net(); - i_see_dead_people: - nf_ct_iterate_cleanup(net, kill_all, NULL); - nf_ct_release_dying_list(net); - if (atomic_read(&net->ct.count) != 0) { +i_see_dead_people: + busy = 0; + list_for_each_entry(net, net_exit_list, exit_list) { + nf_ct_iterate_cleanup(net, kill_all, NULL); + nf_ct_release_dying_list(net); + if (atomic_read(&net->ct.count) != 0) + busy = 1; + } + if (busy) { schedule(); goto i_see_dead_people; } - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); - nf_conntrack_proto_pernet_fini(net); - nf_conntrack_helper_pernet_fini(net); - nf_conntrack_ecache_pernet_fini(net); - nf_conntrack_tstamp_pernet_fini(net); - nf_conntrack_acct_pernet_fini(net); - nf_conntrack_expect_pernet_fini(net); - kmem_cache_destroy(net->ct.nf_conntrack_cachep); - kfree(net->ct.slabname); - free_percpu(net->ct.stat); + list_for_each_entry(net, net_exit_list, exit_list) { + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); + nf_conntrack_proto_pernet_fini(net); + nf_conntrack_helper_pernet_fini(net); + nf_conntrack_ecache_pernet_fini(net); + nf_conntrack_tstamp_pernet_fini(net); + nf_conntrack_acct_pernet_fini(net); + nf_conntrack_expect_pernet_fini(net); + kmem_cache_destroy(net->ct.nf_conntrack_cachep); + kfree(net->ct.slabname); + free_percpu(net->ct.stat); + } } void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index b5d2eb8bf0d..1df17614656 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -1,8 +1,10 @@ /* Event cache for netfilter. */ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> +/* + * (C) 2005 Harald Welte <laforge@gnumonks.org> + * (C) 2005 Patrick McHardy <kaber@trash.net> + * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 8c10e3db3d9..c63b618cd61 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (c) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -40,7 +41,7 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, - u32 pid, int report) + u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); @@ -54,7 +55,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, hlist_del(&exp->lnode); master_help->expecting[exp->class]--; - nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); + nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); @@ -412,7 +413,7 @@ out: } int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, - u32 pid, int report) + u32 portid, int report) { int 
ret; @@ -425,7 +426,7 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, if (ret < 0) goto out; spin_unlock_bh(&nf_conntrack_lock); - nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report); + nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return ret; out: spin_unlock_bh(&nf_conntrack_lock); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 62fb8faedb8..6b217074237 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 7df7b36d2e2..bdebd03bc8c 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -2,6 +2,7 @@ * H.323 connection tracking helper * * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * This source code is licensed under General Public License version 2. * diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 94b4b9853f6..974a2a4adef 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -353,7 +354,7 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, /* rcu_read_lock()ed by nf_hook_slow */ helper = rcu_dereference(help->helper); - nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, + nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf); va_end(args); diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 70985c5d0ff..0fd2976db7e 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -1,6 +1,7 @@ /* IRC extension for IP connection tracking, Version 1.21 * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org> * based on RR's ip_conntrack_ftp.c + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9904b15f600..6d0f8a17c5b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2409,6 +2409,92 @@ out: return skb->len; } +static int +ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conntrack_expect *exp, *last; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nf_conn *ct = cb->data; + struct nf_conn_help *help = nfct_help(ct); + u_int8_t l3proto = nfmsg->nfgen_family; + + if (cb->args[0]) + return 0; + + rcu_read_lock(); + last = (struct nf_conntrack_expect *)cb->args[1]; +restart: + 
hlist_for_each_entry(exp, &help->expectations, lnode) { + if (l3proto && exp->tuple.src.l3num != l3proto) + continue; + if (cb->args[1]) { + if (exp != last) + continue; + cb->args[1] = 0; + } + if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + exp) < 0) { + if (!atomic_inc_not_zero(&exp->use)) + continue; + cb->args[1] = (unsigned long)exp; + goto out; + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } + cb->args[0] = 1; +out: + rcu_read_unlock(); + if (last) + nf_ct_expect_put(last); + + return skb->len; +} + +static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + int err; + struct net *net = sock_net(ctnl); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + u16 zone = 0; + struct netlink_dump_control c = { + .dump = ctnetlink_exp_ct_dump_table, + .done = ctnetlink_exp_done, + }; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_ZONE]) { + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + } + + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return -ENOENT; + + ct = nf_ct_tuplehash_to_ctrack(h); + c.data = ct; + + err = netlink_dump_start(ctnl, skb, nlh, &c); + nf_ct_put(ct); + + return err; +} + static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, @@ -2439,11 +2525,15 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .dump = ctnetlink_exp_dump_table, - .done = ctnetlink_exp_done, - }; - return netlink_dump_start(ctnl, skb, nlh, &c); + if (cda[CTA_EXPECT_MASTER]) + return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda); + else { + struct netlink_dump_control c = { + .dump = ctnetlink_exp_dump_table, + .done = ctnetlink_exp_done, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } } err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index e6678d2b624..7bd03decd36 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -11,6 +11,8 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * * Limitations: * - We blindly assume that control connections are always * established in PNS->PAC direction. 
This is a violation
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 58ab4050830..0ab9636ac57 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -3,6 +3,7 @@
 /* (C) 1999-2001 Paul `Rusty' Russell
  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index ba65b2041eb..a99b6c3427b 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -456,7 +456,8 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
 out_invalid:
 	if (LOG_INVALID(net, IPPROTO_DCCP))
-		nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg);
+		nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL,
+			      NULL, msg);
 	return false;
 }
@@ -542,13 +543,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_DCCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_dccp: invalid packet ignored ");
 		return NF_ACCEPT;
 	case CT_DCCP_INVALID:
 		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_DCCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_dccp: invalid state transition ");
 		return -NF_ACCEPT;
 	}
@@ -613,7 +614,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,
 out_invalid:
 	if (LOG_INVALID(net, IPPROTO_DCCP))
-		nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg);
+		nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, msg);
 	return -NF_ACCEPT;
 }
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 155ce9f8a0d..9d9c0dade60 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -21,6 +21,7 @@
  *
  * Development of this code funded by Astaro AG (http://www.astaro.com/)
  *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
  */
 #include <linux/module.h>
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index ec83536def9..1314d33f6bc 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -1,6 +1,9 @@
 /*
  * Connection tracking protocol helper module for SCTP.
  *
+ * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com>
+ * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net>
+ *
  * SCTP is defined in RFC 2960. References to various sections in this code
  * are to this RFC.
* diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 83876e9877f..4d4d8f1d01f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1,5 +1,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -720,7 +722,7 @@ static bool tcp_in_window(const struct nf_conn *ct, tn->tcp_be_liberal) res = true; if (!res && LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? after(end, sender->td_end - receiver->td_maxwin - 1) ? @@ -772,7 +774,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: short packet "); return -NF_ACCEPT; } @@ -780,7 +782,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -793,7 +795,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; } @@ -802,7 +804,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; } @@ -949,7 +951,7 @@ static int tcp_packet(struct nf_conn *ct, } spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid packet ignored in " "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; @@ -959,7 +961,7 @@ static int tcp_packet(struct nf_conn *ct, dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_CLOSE: @@ -969,8 +971,8 @@ static int tcp_packet(struct nf_conn *ct, /* Invalid RST */ spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid RST "); + nf_log_packet(net, pf, 0, skb, NULL, NULL, + NULL, "nf_ct_tcp: invalid RST "); return -NF_ACCEPT; } if (index == TCP_RST_SET diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 
59623cc56e8..9d7721cbce4 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -1,5 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -119,7 +120,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: short packet "); return -NF_ACCEPT; } @@ -127,7 +128,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -143,7 +144,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: bad UDP checksum "); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index ca969f6273f..2750e6c69f8 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -131,7 +131,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: short packet "); return -NF_ACCEPT; } @@ -141,7 +141,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, cscov = udplen; else if (cscov < sizeof(*hdr) || cscov > udplen) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: invalid checksum coverage "); return -NF_ACCEPT; } @@ -149,7 +149,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, /* UDPLITE mandates checksums */ if (!hdr->check) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: checksum missing "); return -NF_ACCEPT; } @@ -159,7 +159,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: bad UDPLite checksum "); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 0e7d423324c..e0c4373b474 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1593,10 +1593,8 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, end += strlen("\r\n\r\n") + clen; msglen = 
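The conntrack hunks above are one mechanical change: nf_log_packet() now takes the packet's struct net * as its first argument so the logger can be resolved per network namespace instead of from a single global array. A minimal caller-side sketch under the new signature (the helper name and log message are illustrative, not part of the patch):

#include <linux/netfilter.h>
#include <net/netfilter/nf_log.h>

static int example_error(struct net *net, struct sk_buff *skb, u_int8_t pf)
{
	/* the logger is looked up in net->nf.nf_loggers[pf] */
	nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
		      "nf_ct_example: malformed packet ");
	return -NF_ACCEPT;
}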
origlen = end - dptr; - if (msglen > datalen) { - nf_ct_helper_log(skb, ct, "incomplete/bad SIP message"); - return NF_DROP; - } + if (msglen > datalen) + return NF_ACCEPT; ret = process_sip_msg(skb, ct, protoff, dataoff, &dptr, &msglen); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index fedee394366..bd700b4013c 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1,5 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -545,16 +546,20 @@ out_init: return ret; } -static void nf_conntrack_pernet_exit(struct net *net) +static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) { - nf_conntrack_standalone_fini_sysctl(net); - nf_conntrack_standalone_fini_proc(net); - nf_conntrack_cleanup_net(net); + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) { + nf_conntrack_standalone_fini_sysctl(net); + nf_conntrack_standalone_fini_proc(net); + } + nf_conntrack_cleanup_net_list(net_exit_list); } static struct pernet_operations nf_conntrack_net_ops = { - .init = nf_conntrack_pernet_init, - .exit = nf_conntrack_pernet_exit, + .init = nf_conntrack_pernet_init, + .exit_batch = nf_conntrack_pernet_exit, }; static int __init nf_conntrack_standalone_init(void) diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index e9936c83020..e68ab4fbd71 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -1,5 +1,5 @@ /* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 9e312695c81..388656d5a9e 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -16,7 +16,6 @@ #define NF_LOG_PREFIXLEN 128 #define NFLOGGER_NAME_LEN 64 -static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); @@ -32,13 +31,46 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger) return NULL; } +void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) +{ + const struct nf_logger *log; + + if (pf == NFPROTO_UNSPEC) + return; + + mutex_lock(&nf_log_mutex); + log = rcu_dereference_protected(net->nf.nf_loggers[pf], + lockdep_is_held(&nf_log_mutex)); + if (log == NULL) + rcu_assign_pointer(net->nf.nf_loggers[pf], logger); + + mutex_unlock(&nf_log_mutex); +} +EXPORT_SYMBOL(nf_log_set); + +void nf_log_unset(struct net *net, const struct nf_logger *logger) +{ + int i; + const struct nf_logger *log; + + mutex_lock(&nf_log_mutex); + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + log = rcu_dereference_protected(net->nf.nf_loggers[i], + lockdep_is_held(&nf_log_mutex)); + if (log == logger) + RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL); + } + mutex_unlock(&nf_log_mutex); + synchronize_rcu(); +} +EXPORT_SYMBOL(nf_log_unset); + /* return EEXIST if the same logger is registered, 0 on success. 
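nf_log_set() and nf_log_unset() introduced above make the default logger an explicit per-namespace assignment instead of a side effect of registration order: nf_log_set() only fills a slot that is still empty, and nf_log_unset() clears every slot that still points at the given logger. A hedged sketch of how a logger backend could wire them into its own pernet init/exit (the backend and ops names are illustrative):

#include <linux/netfilter.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_log.h>

static struct nf_logger example_logger;	/* .name, .logfn, .me set elsewhere */

static int __net_init example_log_net_init(struct net *net)
{
	/* claim the IPv4 slot for this namespace unless already taken */
	nf_log_set(net, NFPROTO_IPV4, &example_logger);
	return 0;
}

static void __net_exit example_log_net_exit(struct net *net)
{
	/* clear every per-protocol slot still pointing at this logger */
	nf_log_unset(net, &example_logger);
}

static struct pernet_operations example_log_net_ops = {
	.init = example_log_net_init,
	.exit = example_log_net_exit,
};

Registering example_log_net_ops with register_pernet_subsys() would then apply the assignment to every existing and future namespace.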
*/ int nf_log_register(u_int8_t pf, struct nf_logger *logger) { - const struct nf_logger *llog; int i; - if (pf >= ARRAY_SIZE(nf_loggers)) + if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(logger->list); i++) @@ -52,10 +84,6 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger) } else { /* register at end of list to honor first register win */ list_add_tail(&logger->list[pf], &nf_loggers_l[pf]); - llog = rcu_dereference_protected(nf_loggers[pf], - lockdep_is_held(&nf_log_mutex)); - if (llog == NULL) - rcu_assign_pointer(nf_loggers[pf], logger); } mutex_unlock(&nf_log_mutex); @@ -66,49 +94,43 @@ EXPORT_SYMBOL(nf_log_register); void nf_log_unregister(struct nf_logger *logger) { - const struct nf_logger *c_logger; int i; mutex_lock(&nf_log_mutex); - for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) { - c_logger = rcu_dereference_protected(nf_loggers[i], - lockdep_is_held(&nf_log_mutex)); - if (c_logger == logger) - RCU_INIT_POINTER(nf_loggers[i], NULL); + for (i = 0; i < NFPROTO_NUMPROTO; i++) list_del(&logger->list[i]); - } mutex_unlock(&nf_log_mutex); - - synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unregister); -int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger) +int nf_log_bind_pf(struct net *net, u_int8_t pf, + const struct nf_logger *logger) { - if (pf >= ARRAY_SIZE(nf_loggers)) + if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) return -EINVAL; mutex_lock(&nf_log_mutex); if (__find_logger(pf, logger->name) == NULL) { mutex_unlock(&nf_log_mutex); return -ENOENT; } - rcu_assign_pointer(nf_loggers[pf], logger); + rcu_assign_pointer(net->nf.nf_loggers[pf], logger); mutex_unlock(&nf_log_mutex); return 0; } EXPORT_SYMBOL(nf_log_bind_pf); -void nf_log_unbind_pf(u_int8_t pf) +void nf_log_unbind_pf(struct net *net, u_int8_t pf) { - if (pf >= ARRAY_SIZE(nf_loggers)) + if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) return; mutex_lock(&nf_log_mutex); - RCU_INIT_POINTER(nf_loggers[pf], NULL); + RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL); mutex_unlock(&nf_log_mutex); } EXPORT_SYMBOL(nf_log_unbind_pf); -void nf_log_packet(u_int8_t pf, +void nf_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -121,7 +143,7 @@ void nf_log_packet(u_int8_t pf, const struct nf_logger *logger; rcu_read_lock(); - logger = rcu_dereference(nf_loggers[pf]); + logger = rcu_dereference(net->nf.nf_loggers[pf]); if (logger) { va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); @@ -135,9 +157,11 @@ EXPORT_SYMBOL(nf_log_packet); #ifdef CONFIG_PROC_FS static void *seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq_file_net(seq); + mutex_lock(&nf_log_mutex); - if (*pos >= ARRAY_SIZE(nf_loggers)) + if (*pos >= ARRAY_SIZE(net->nf.nf_loggers)) return NULL; return pos; @@ -145,9 +169,11 @@ static void *seq_start(struct seq_file *seq, loff_t *pos) static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { + struct net *net = seq_file_net(s); + (*pos)++; - if (*pos >= ARRAY_SIZE(nf_loggers)) + if (*pos >= ARRAY_SIZE(net->nf.nf_loggers)) return NULL; return pos; @@ -164,8 +190,9 @@ static int seq_show(struct seq_file *s, void *v) const struct nf_logger *logger; struct nf_logger *t; int ret; + struct net *net = seq_file_net(s); - logger = rcu_dereference_protected(nf_loggers[*pos], + logger = rcu_dereference_protected(net->nf.nf_loggers[*pos], lockdep_is_held(&nf_log_mutex)); if (!logger) @@ -199,7 +226,8 @@ static const struct seq_operations nflog_seq_ops = { static int 
nflog_open(struct inode *inode, struct file *file) { - return seq_open(file, &nflog_seq_ops); + return seq_open_net(inode, file, &nflog_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations nflog_file_ops = { @@ -207,7 +235,7 @@ static const struct file_operations nflog_file_ops = { .open = nflog_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; @@ -216,7 +244,6 @@ static const struct file_operations nflog_file_ops = { #ifdef CONFIG_SYSCTL static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; -static struct ctl_table_header *nf_log_dir_header; static int nf_log_proc_dostring(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -226,6 +253,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, size_t size = *lenp; int r = 0; int tindex = (unsigned long)table->extra1; + struct net *net = current->nsproxy->net_ns; if (write) { if (size > sizeof(buf)) @@ -234,7 +262,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, return -EFAULT; if (!strcmp(buf, "NONE")) { - nf_log_unbind_pf(tindex); + nf_log_unbind_pf(net, tindex); return 0; } mutex_lock(&nf_log_mutex); @@ -243,11 +271,11 @@ static int nf_log_proc_dostring(ctl_table *table, int write, mutex_unlock(&nf_log_mutex); return -ENOENT; } - rcu_assign_pointer(nf_loggers[tindex], logger); + rcu_assign_pointer(net->nf.nf_loggers[tindex], logger); mutex_unlock(&nf_log_mutex); } else { mutex_lock(&nf_log_mutex); - logger = rcu_dereference_protected(nf_loggers[tindex], + logger = rcu_dereference_protected(net->nf.nf_loggers[tindex], lockdep_is_held(&nf_log_mutex)); if (!logger) table->data = "NONE"; @@ -260,49 +288,111 @@ static int nf_log_proc_dostring(ctl_table *table, int write, return r; } -static __init int netfilter_log_sysctl_init(void) +static int netfilter_log_sysctl_init(struct net *net) { int i; - - for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { - snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i); - nf_log_sysctl_table[i].procname = - nf_log_sysctl_fnames[i-NFPROTO_UNSPEC]; - nf_log_sysctl_table[i].data = NULL; - nf_log_sysctl_table[i].maxlen = - NFLOGGER_NAME_LEN * sizeof(char); - nf_log_sysctl_table[i].mode = 0644; - nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring; - nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i; + struct ctl_table *table; + + table = nf_log_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(nf_log_sysctl_table, + sizeof(nf_log_sysctl_table), + GFP_KERNEL); + if (!table) + goto err_alloc; + } else { + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { + snprintf(nf_log_sysctl_fnames[i], + 3, "%d", i); + nf_log_sysctl_table[i].procname = + nf_log_sysctl_fnames[i]; + nf_log_sysctl_table[i].data = NULL; + nf_log_sysctl_table[i].maxlen = + NFLOGGER_NAME_LEN * sizeof(char); + nf_log_sysctl_table[i].mode = 0644; + nf_log_sysctl_table[i].proc_handler = + nf_log_proc_dostring; + nf_log_sysctl_table[i].extra1 = + (void *)(unsigned long) i; + } } - nf_log_dir_header = register_net_sysctl(&init_net, "net/netfilter/nf_log", - nf_log_sysctl_table); - if (!nf_log_dir_header) - return -ENOMEM; + net->nf.nf_log_dir_header = register_net_sysctl(net, + "net/netfilter/nf_log", + table); + if (!net->nf.nf_log_dir_header) + goto err_reg; return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void 
netfilter_log_sysctl_exit(struct net *net) +{ + struct ctl_table *table; + + table = net->nf.nf_log_dir_header->ctl_table_arg; + unregister_net_sysctl_table(net->nf.nf_log_dir_header); + if (!net_eq(net, &init_net)) + kfree(table); } #else -static __init int netfilter_log_sysctl_init(void) +static int netfilter_log_sysctl_init(struct net *net) { return 0; } + +static void netfilter_log_sysctl_exit(struct net *net) +{ +} #endif /* CONFIG_SYSCTL */ -int __init netfilter_log_init(void) +static int __net_init nf_log_net_init(struct net *net) { - int i, r; + int ret = -ENOMEM; + #ifdef CONFIG_PROC_FS if (!proc_create("nf_log", S_IRUGO, - proc_net_netfilter, &nflog_file_ops)) - return -1; + net->nf.proc_netfilter, &nflog_file_ops)) + return ret; #endif + ret = netfilter_log_sysctl_init(net); + if (ret < 0) + goto out_sysctl; + + return 0; - /* Errors will trigger panic, unroll on error is unnecessary. */ - r = netfilter_log_sysctl_init(); - if (r < 0) - return r; +out_sysctl: + /* For init_net: errors will trigger panic, don't unroll on error. */ + if (!net_eq(net, &init_net)) + remove_proc_entry("nf_log", net->nf.proc_netfilter); + + return ret; +} + +static void __net_exit nf_log_net_exit(struct net *net) +{ + netfilter_log_sysctl_exit(net); + remove_proc_entry("nf_log", net->nf.proc_netfilter); +} + +static struct pernet_operations nf_log_net_ops = { + .init = nf_log_net_init, + .exit = nf_log_net_exit, +}; + +int __init netfilter_log_init(void) +{ + int i, ret; + + ret = register_pernet_subsys(&nf_log_net_ops); + if (ret < 0) + return ret; for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) INIT_LIST_HEAD(&(nf_loggers_l[i])); diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c index 3b67c9d1127..eb772380a20 100644 --- a/net/netfilter/nf_nat_amanda.c +++ b/net/netfilter/nf_nat_amanda.c @@ -1,6 +1,7 @@ /* Amanda extension for TCP NAT alteration. * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> * based on a copy of HW's ip_nat_irc.c as well as other modules + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 8d5769c6d16..038eee5c8f8 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -87,9 +87,11 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) struct flowi fl; unsigned int hh_len; struct dst_entry *dst; + int err; - if (xfrm_decode_session(skb, &fl, family) < 0) - return -1; + err = xfrm_decode_session(skb, &fl, family); + if (err < 0) + return err; dst = skb_dst(skb); if (dst->xfrm) @@ -98,7 +100,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); if (IS_ERR(dst)) - return -1; + return PTR_ERR(dst); skb_dst_drop(skb); skb_dst_set(skb, dst); @@ -107,7 +109,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) - return -1; + return -ENOMEM; return 0; } EXPORT_SYMBOL(nf_xfrm_me_harder); @@ -467,33 +469,22 @@ EXPORT_SYMBOL_GPL(nf_nat_packet); struct nf_nat_proto_clean { u8 l3proto; u8 l4proto; - bool hash; }; -/* Clear NAT section of all conntracks, in case we're loaded again. 
*/ -static int nf_nat_proto_clean(struct nf_conn *i, void *data) +/* kill conntracks with affected NAT section */ +static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; struct nf_conn_nat *nat = nfct_nat(i); if (!nat) return 0; - if (!(i->status & IPS_SRC_NAT_DONE)) - return 0; + if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) return 0; - if (clean->hash) { - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&nat->bysource); - spin_unlock_bh(&nf_nat_lock); - } else { - memset(nat, 0, sizeof(*nat)); - i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | - IPS_SEQ_ADJUST); - } - return 0; + return i->status & IPS_NAT_MASK ? 1 : 0; } static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) @@ -505,16 +496,8 @@ static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) struct net *net; rtnl_lock(); - /* Step 1 - remove from bysource hash */ - clean.hash = true; for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean); - synchronize_rcu(); - - /* Step 2 - clean NAT section */ - clean.hash = false; - for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean); + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean); rtnl_unlock(); } @@ -526,16 +509,9 @@ static void nf_nat_l3proto_clean(u8 l3proto) struct net *net; rtnl_lock(); - /* Step 1 - remove from bysource hash */ - clean.hash = true; - for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean); - synchronize_rcu(); - /* Step 2 - clean NAT section */ - clean.hash = false; for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean); + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean); rtnl_unlock(); } @@ -773,7 +749,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; - nf_ct_iterate_cleanup(net, &nf_nat_proto_clean, &clean); + nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean); synchronize_rcu(); nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 23c2b38676a..5fea563afe3 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -2,6 +2,7 @@ * * (C) 2000-2002 Harald Welte <laforge@netfilter.org> * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2007-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index e64faa5ca89..396e55d46f9 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -36,7 +36,7 @@ sctp_manip_pkt(struct sk_buff *skb, { struct sk_buff *frag; sctp_sctphdr_t *hdr; - __be32 crc32; + __u32 crc32; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) return false; @@ -55,8 +55,7 @@ sctp_manip_pkt(struct sk_buff *skb, skb_walk_frags(skb, frag) crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag), crc32); - crc32 = sctp_end_cksum(crc32); - hdr->checksum = crc32; + hdr->checksum = sctp_end_cksum(crc32); return true; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index d812c1235b3..5d24b1fdb59 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -1,3 +1,8 @@ +/* + * Rusty Russell (C)2000 -- This code is GPL. 
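The two-pass NAT cleanup above collapses into a single pass because of the nf_ct_iterate_cleanup() contract: the callback runs for every conntrack in a namespace, and a nonzero return value tells the core to remove that entry. A reduced sketch of the pattern (function names are illustrative; nf_nat_proto_clean is the structure defined in the hunk above):

#include <linux/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack.h>

static int example_kill_nat(struct nf_conn *ct, void *data)
{
	const struct nf_nat_proto_clean *clean = data;

	if (clean->l3proto && nf_ct_l3num(ct) != clean->l3proto)
		return 0;
	/* nonzero: nf_ct_iterate_cleanup() deletes this conntrack */
	return ct->status & IPS_NAT_MASK ? 1 : 0;
}

static void example_clean(u8 l3proto)
{
	struct nf_nat_proto_clean clean = { .l3proto = l3proto };
	struct net *net;

	rtnl_lock();
	for_each_net(net)
		nf_ct_iterate_cleanup(net, example_kill_nat, &clean);
	rtnl_unlock();
}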
+ * Patrick McHardy (c) 2006-2012 + */ + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> @@ -40,7 +45,7 @@ void nf_unregister_queue_handler(void) } EXPORT_SYMBOL(nf_unregister_queue_handler); -static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) +void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { /* Release those devices we held, or Alexey will kill me. */ if (entry->indev) @@ -60,12 +65,41 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) /* Drop reference to owner of hook which queued us. */ module_put(entry->elem->owner); } +EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); + +/* Bump dev refs so they don't vanish while packet is out */ +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) +{ + if (!try_module_get(entry->elem->owner)) + return false; + + if (entry->indev) + dev_hold(entry->indev); + if (entry->outdev) + dev_hold(entry->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; + struct net_device *physdev; + + physdev = nf_bridge->physindev; + if (physdev) + dev_hold(physdev); + physdev = nf_bridge->physoutdev; + if (physdev) + dev_hold(physdev); + } +#endif + + return true; +} +EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); /* * Any packet that leaves via this function must come back * through nf_reinject(). */ -static int __nf_queue(struct sk_buff *skb, +int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, u_int8_t pf, unsigned int hook, struct net_device *indev, @@ -75,10 +109,6 @@ static int __nf_queue(struct sk_buff *skb, { int status = -ENOENT; struct nf_queue_entry *entry = NULL; -#ifdef CONFIG_BRIDGE_NETFILTER - struct net_device *physindev; - struct net_device *physoutdev; -#endif const struct nf_afinfo *afinfo; const struct nf_queue_handler *qh; @@ -109,28 +139,13 @@ static int __nf_queue(struct sk_buff *skb, .indev = indev, .outdev = outdev, .okfn = okfn, + .size = sizeof(*entry) + afinfo->route_key_size, }; - /* If it's going away, ignore hook. */ - if (!try_module_get(entry->elem->owner)) { + if (!nf_queue_entry_get_refs(entry)) { status = -ECANCELED; goto err_unlock; } - /* Bump dev refs so they don't vanish while packet is out */ - if (indev) - dev_hold(indev); - if (outdev) - dev_hold(outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - physindev = skb->nf_bridge->physindev; - if (physindev) - dev_hold(physindev); - physoutdev = skb->nf_bridge->physoutdev; - if (physoutdev) - dev_hold(physoutdev); - } -#endif skb_dst_force(skb); afinfo->saveroute(skb, entry); status = qh->outfn(entry, queuenum); @@ -151,87 +166,6 @@ err: return status; } -#ifdef CONFIG_BRIDGE_NETFILTER -/* When called from bridge netfilter, skb->data must point to MAC header - * before calling skb_gso_segment(). Else, original MAC header is lost - * and segmented skbs will be sent to wrong destination. 
- */ -static void nf_bridge_adjust_skb_data(struct sk_buff *skb) -{ - if (skb->nf_bridge) - __skb_push(skb, skb->network_header - skb->mac_header); -} - -static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) -{ - if (skb->nf_bridge) - __skb_pull(skb, skb->network_header - skb->mac_header); -} -#else -#define nf_bridge_adjust_skb_data(s) do {} while (0) -#define nf_bridge_adjust_segmented_data(s) do {} while (0) -#endif - -int nf_queue(struct sk_buff *skb, - struct nf_hook_ops *elem, - u_int8_t pf, unsigned int hook, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - unsigned int queuenum) -{ - struct sk_buff *segs; - int err = -EINVAL; - unsigned int queued; - - if (!skb_is_gso(skb)) - return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, - queuenum); - - switch (pf) { - case NFPROTO_IPV4: - skb->protocol = htons(ETH_P_IP); - break; - case NFPROTO_IPV6: - skb->protocol = htons(ETH_P_IPV6); - break; - } - - nf_bridge_adjust_skb_data(skb); - segs = skb_gso_segment(skb, 0); - /* Does not use PTR_ERR to limit the number of error codes that can be - * returned by nf_queue. For instance, callers rely on -ECANCELED to mean - * 'ignore this hook'. - */ - if (IS_ERR(segs)) - goto out_err; - queued = 0; - err = 0; - do { - struct sk_buff *nskb = segs->next; - - segs->next = NULL; - if (err == 0) { - nf_bridge_adjust_segmented_data(segs); - err = __nf_queue(segs, elem, pf, hook, indev, - outdev, okfn, queuenum); - } - if (err == 0) - queued++; - else - kfree_skb(segs); - segs = nskb; - } while (segs); - - if (queued) { - kfree_skb(skb); - return 0; - } - out_err: - nf_bridge_adjust_segmented_data(skb); - return err; -} - void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { struct sk_buff *skb = entry->skb; @@ -271,9 +205,9 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) local_bh_enable(); break; case NF_QUEUE: - err = __nf_queue(skb, elem, entry->pf, entry->hook, - entry->indev, entry->outdev, entry->okfn, - verdict >> NF_VERDICT_QBITS); + err = nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, + verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ECANCELED) goto next_hook; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 0b1b32cda30..572d87dc116 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -24,10 +24,9 @@ #include <linux/skbuff.h> #include <asm/uaccess.h> #include <net/sock.h> -#include <net/netlink.h> #include <linux/init.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> MODULE_LICENSE("GPL"); @@ -113,22 +112,30 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group) } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, +struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, + u32 dst_portid, gfp_t gfp_mask) +{ + return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); +} +EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); + +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { - return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); + return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); -int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) +int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) { - return 
netlink_set_err(net->nfnl, pid, group, error); + return netlink_set_err(net->nfnl, portid, group, error); } EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, + int flags) { - return netlink_unicast(net->nfnl, skb, pid, flags); + return netlink_unicast(net->nfnl, skb, portid, flags); } EXPORT_SYMBOL_GPL(nfnetlink_unicast); @@ -144,7 +151,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EPERM; /* All the messages must at least contain nfgenmsg */ - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nfgenmsg))) + if (nlmsg_len(nlh) < sizeof(struct nfgenmsg)) return 0; type = nlh->nlmsg_type; @@ -172,7 +179,7 @@ replay: } { - int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; struct nlattr *attr = (void *)nlh + min_len; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index f248db57297..faf1e9300d8 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -3,6 +3,7 @@ * nfetlink. * * (C) 2005 by Harald Welte <laforge@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ipt_ULOG.c: * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> @@ -19,7 +20,7 @@ #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/netfilter.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_log.h> #include <linux/spinlock.h> @@ -32,6 +33,7 @@ #include <linux/slab.h> #include <net/sock.h> #include <net/netfilter/nf_log.h> +#include <net/netns/generic.h> #include <net/netfilter/nfnetlink_log.h> #include <linux/atomic.h> @@ -56,6 +58,7 @@ struct nfulnl_instance { unsigned int qlen; /* number of nlmsgs in skb */ struct sk_buff *skb; /* pre-allocatd skb */ struct timer_list timer; + struct net *net; struct user_namespace *peer_user_ns; /* User namespace of the peer process */ int peer_portid; /* PORTID of the peer process */ @@ -71,25 +74,34 @@ struct nfulnl_instance { struct rcu_head rcu; }; -static DEFINE_SPINLOCK(instances_lock); -static atomic_t global_seq; - #define INSTANCE_BUCKETS 16 -static struct hlist_head instance_table[INSTANCE_BUCKETS]; static unsigned int hash_init; +static int nfnl_log_net_id __read_mostly; + +struct nfnl_log_net { + spinlock_t instances_lock; + struct hlist_head instance_table[INSTANCE_BUCKETS]; + atomic_t global_seq; +}; + +static struct nfnl_log_net *nfnl_log_pernet(struct net *net) +{ + return net_generic(net, nfnl_log_net_id); +} + static inline u_int8_t instance_hashfn(u_int16_t group_num) { return ((group_num & 0xff) % INSTANCE_BUCKETS); } static struct nfulnl_instance * -__instance_lookup(u_int16_t group_num) +__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num) { struct hlist_head *head; struct nfulnl_instance *inst; - head = &instance_table[instance_hashfn(group_num)]; + head = &log->instance_table[instance_hashfn(group_num)]; hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->group_num == group_num) return inst; @@ -104,12 +116,12 @@ instance_get(struct nfulnl_instance *inst) } static struct nfulnl_instance * -instance_lookup_get(u_int16_t group_num) +instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num) { struct nfulnl_instance 
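The nfnetlink.c changes above settle on the portid naming and add nfnetlink_alloc_skb(), which allocates a message skb already targeted at a specific peer. A hedged sketch of a subsystem using these helpers to send a single message to a bound listener (the subsystem id, message layout and sizing are illustrative):

#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>

static int example_send(struct net *net, u32 peer_portid)
{
	struct sk_buff *skb;
	struct nlmsghdr *nlh;
	struct nfgenmsg *nfmsg;

	skb = nfnetlink_alloc_skb(net, NLMSG_GOODSIZE, peer_portid, GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	nlh = nlmsg_put(skb, 0, 0, NFNL_SUBSYS_NONE << 8, sizeof(*nfmsg), 0);
	if (!nlh) {
		kfree_skb(skb);
		return -EMSGSIZE;
	}
	nfmsg = nlmsg_data(nlh);
	nfmsg->nfgen_family = AF_UNSPEC;
	nfmsg->version = NFNETLINK_V0;
	nfmsg->res_id = 0;

	/* the skb is either delivered or freed by the netlink core */
	return nfnetlink_unicast(skb, net, peer_portid, MSG_DONTWAIT);
}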
*inst; rcu_read_lock_bh(); - inst = __instance_lookup(group_num); + inst = __instance_lookup(log, group_num); if (inst && !atomic_inc_not_zero(&inst->use)) inst = NULL; rcu_read_unlock_bh(); @@ -119,7 +131,11 @@ instance_lookup_get(u_int16_t group_num) static void nfulnl_instance_free_rcu(struct rcu_head *head) { - kfree(container_of(head, struct nfulnl_instance, rcu)); + struct nfulnl_instance *inst = + container_of(head, struct nfulnl_instance, rcu); + + put_net(inst->net); + kfree(inst); module_put(THIS_MODULE); } @@ -133,13 +149,15 @@ instance_put(struct nfulnl_instance *inst) static void nfulnl_timer(unsigned long data); static struct nfulnl_instance * -instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns) +instance_create(struct net *net, u_int16_t group_num, + int portid, struct user_namespace *user_ns) { struct nfulnl_instance *inst; + struct nfnl_log_net *log = nfnl_log_pernet(net); int err; - spin_lock_bh(&instances_lock); - if (__instance_lookup(group_num)) { + spin_lock_bh(&log->instances_lock); + if (__instance_lookup(log, group_num)) { err = -EEXIST; goto out_unlock; } @@ -163,6 +181,7 @@ instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns) setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst); + inst->net = get_net(net); inst->peer_user_ns = user_ns; inst->peer_portid = portid; inst->group_num = group_num; @@ -174,14 +193,15 @@ instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns) inst->copy_range = NFULNL_COPY_RANGE_MAX; hlist_add_head_rcu(&inst->hlist, - &instance_table[instance_hashfn(group_num)]); + &log->instance_table[instance_hashfn(group_num)]); + - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); return inst; out_unlock: - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); return ERR_PTR(err); } @@ -210,11 +230,12 @@ __instance_destroy(struct nfulnl_instance *inst) } static inline void -instance_destroy(struct nfulnl_instance *inst) +instance_destroy(struct nfnl_log_net *log, + struct nfulnl_instance *inst) { - spin_lock_bh(&instances_lock); + spin_lock_bh(&log->instances_lock); __instance_destroy(inst); - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); } static int @@ -298,7 +319,7 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) } static struct sk_buff * -nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) +nfulnl_alloc_skb(u32 peer_portid, unsigned int inst_size, unsigned int pkt_size) { struct sk_buff *skb; unsigned int n; @@ -307,13 +328,14 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) * message. 
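nfnetlink_log's global instance table becomes per-namespace state through the net_generic machinery: the pernet_operations carry an .id and a .size, the core allocates that much storage for every namespace, and net_generic(net, id) returns it. The same pattern stripped to its skeleton (names are illustrative):

#include <linux/spinlock.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static int example_net_id __read_mostly;

struct example_net {
	spinlock_t lock;	/* guards this namespace's private state */
};

static struct example_net *example_pernet(struct net *net)
{
	return net_generic(net, example_net_id);
}

static int __net_init example_net_init(struct net *net)
{
	spin_lock_init(&example_pernet(net)->lock);
	return 0;
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.id   = &example_net_id,
	.size = sizeof(struct example_net),	/* allocated per netns by the core */
};

register_pernet_subsys(&example_net_ops) at module init and unregister_pernet_subsys() at exit then replace the old global table plus global spinlock, which is what nfnl_log_net_ops does below.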
WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); - skb = alloc_skb(n, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(&init_net, n, peer_portid, GFP_ATOMIC); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ - skb = alloc_skb(pkt_size, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(&init_net, pkt_size, + peer_portid, GFP_ATOMIC); if (!skb) pr_err("nfnetlink_log: can't even alloc %u bytes\n", pkt_size); @@ -336,7 +358,7 @@ __nfulnl_send(struct nfulnl_instance *inst) if (!nlh) goto out; } - status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_portid, + status = nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid, MSG_DONTWAIT); inst->qlen = 0; @@ -370,7 +392,8 @@ nfulnl_timer(unsigned long data) /* This is an inline function, we don't really care about a long * list of arguments */ static inline int -__build_packet_message(struct nfulnl_instance *inst, +__build_packet_message(struct nfnl_log_net *log, + struct nfulnl_instance *inst, const struct sk_buff *skb, unsigned int data_len, u_int8_t pf, @@ -536,7 +559,7 @@ __build_packet_message(struct nfulnl_instance *inst, /* global sequence number */ if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) && nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL, - htonl(atomic_inc_return(&global_seq)))) + htonl(atomic_inc_return(&log->global_seq)))) goto nla_put_failure; if (data_len) { @@ -592,13 +615,15 @@ nfulnl_log_packet(u_int8_t pf, const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen; + struct net *net = dev_net(in ? in : out); + struct nfnl_log_net *log = nfnl_log_pernet(net); if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; else li = &default_loginfo; - inst = instance_lookup_get(li->u.ulog.group); + inst = instance_lookup_get(log, li->u.ulog.group); if (!inst) return; @@ -609,7 +634,7 @@ nfulnl_log_packet(u_int8_t pf, /* FIXME: do we want to make the size calculation conditional based on * what is actually present? way more branches and checks, but more * memory efficient... 
*/ - size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ @@ -673,14 +698,15 @@ nfulnl_log_packet(u_int8_t pf, } if (!inst->skb) { - inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); + inst->skb = nfulnl_alloc_skb(inst->peer_portid, inst->nlbufsiz, + size); if (!inst->skb) goto alloc_failure; } inst->qlen++; - __build_packet_message(inst, skb, data_len, pf, + __build_packet_message(log, inst, skb, data_len, pf, hooknum, in, out, prefix, plen); if (inst->qlen >= qthreshold) @@ -709,24 +735,24 @@ nfulnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; + struct nfnl_log_net *log = nfnl_log_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ - spin_lock_bh(&instances_lock); + spin_lock_bh(&log->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *t2; struct nfulnl_instance *inst; - struct hlist_head *head = &instance_table[i]; + struct hlist_head *head = &log->instance_table[i]; hlist_for_each_entry_safe(inst, t2, head, hlist) { - if ((net_eq(n->net, &init_net)) && - (n->portid == inst->peer_portid)) + if (n->portid == inst->peer_portid) __instance_destroy(inst); } } - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); } return NOTIFY_DONE; } @@ -767,6 +793,8 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, u_int16_t group_num = ntohs(nfmsg->res_id); struct nfulnl_instance *inst; struct nfulnl_msg_config_cmd *cmd = NULL; + struct net *net = sock_net(ctnl); + struct nfnl_log_net *log = nfnl_log_pernet(net); int ret = 0; if (nfula[NFULA_CFG_CMD]) { @@ -776,14 +804,14 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, /* Commands without queue context */ switch (cmd->command) { case NFULNL_CFG_CMD_PF_BIND: - return nf_log_bind_pf(pf, &nfulnl_logger); + return nf_log_bind_pf(net, pf, &nfulnl_logger); case NFULNL_CFG_CMD_PF_UNBIND: - nf_log_unbind_pf(pf); + nf_log_unbind_pf(net, pf); return 0; } } - inst = instance_lookup_get(group_num); + inst = instance_lookup_get(log, group_num); if (inst && inst->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto out_put; @@ -797,9 +825,9 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out_put; } - inst = instance_create(group_num, + inst = instance_create(net, group_num, NETLINK_CB(skb).portid, - sk_user_ns(NETLINK_CB(skb).ssk)); + sk_user_ns(NETLINK_CB(skb).sk)); if (IS_ERR(inst)) { ret = PTR_ERR(inst); goto out; @@ -811,7 +839,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out; } - instance_destroy(inst); + instance_destroy(log, inst); goto out_put; default: ret = -ENOTSUPP; @@ -894,55 +922,68 @@ static const struct nfnetlink_subsystem nfulnl_subsys = { #ifdef CONFIG_PROC_FS struct iter_state { + struct seq_net_private p; unsigned int bucket; }; -static struct hlist_node *get_first(struct iter_state *st) +static struct hlist_node *get_first(struct net *net, struct iter_state *st) { + struct nfnl_log_net *log; if (!st) return NULL; + log = nfnl_log_pernet(net); + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { - if (!hlist_empty(&instance_table[st->bucket])) - return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); + struct hlist_head *head = 
&log->instance_table[st->bucket]; + + if (!hlist_empty(head)) + return rcu_dereference_bh(hlist_first_rcu(head)); } return NULL; } -static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) +static struct hlist_node *get_next(struct net *net, struct iter_state *st, + struct hlist_node *h) { h = rcu_dereference_bh(hlist_next_rcu(h)); while (!h) { + struct nfnl_log_net *log; + struct hlist_head *head; + if (++st->bucket >= INSTANCE_BUCKETS) return NULL; - h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); + log = nfnl_log_pernet(net); + head = &log->instance_table[st->bucket]; + h = rcu_dereference_bh(hlist_first_rcu(head)); } return h; } -static struct hlist_node *get_idx(struct iter_state *st, loff_t pos) +static struct hlist_node *get_idx(struct net *net, struct iter_state *st, + loff_t pos) { struct hlist_node *head; - head = get_first(st); + head = get_first(net, st); if (head) - while (pos && (head = get_next(st, head))) + while (pos && (head = get_next(net, st, head))) pos--; return pos ? NULL : head; } -static void *seq_start(struct seq_file *seq, loff_t *pos) +static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(rcu_bh) { rcu_read_lock_bh(); - return get_idx(seq->private, *pos); + return get_idx(seq_file_net(s), s->private, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; - return get_next(s->private, v); + return get_next(seq_file_net(s), s->private, v); } static void seq_stop(struct seq_file *s, void *v) @@ -971,8 +1012,8 @@ static const struct seq_operations nful_seq_ops = { static int nful_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &nful_seq_ops, - sizeof(struct iter_state)); + return seq_open_net(inode, file, &nful_seq_ops, + sizeof(struct iter_state)); } static const struct file_operations nful_file_ops = { @@ -980,17 +1021,43 @@ static const struct file_operations nful_file_ops = { .open = nful_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif /* PROC_FS */ -static int __init nfnetlink_log_init(void) +static int __net_init nfnl_log_net_init(struct net *net) { - int i, status = -ENOMEM; + unsigned int i; + struct nfnl_log_net *log = nfnl_log_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) - INIT_HLIST_HEAD(&instance_table[i]); + INIT_HLIST_HEAD(&log->instance_table[i]); + spin_lock_init(&log->instances_lock); + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_log", 0440, + net->nf.proc_netfilter, &nful_file_ops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit nfnl_log_net_exit(struct net *net) +{ + remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); +} + +static struct pernet_operations nfnl_log_net_ops = { + .init = nfnl_log_net_init, + .exit = nfnl_log_net_exit, + .id = &nfnl_log_net_id, + .size = sizeof(struct nfnl_log_net), +}; + +static int __init nfnetlink_log_init(void) +{ + int status = -ENOMEM; /* it's not really all that important to have a random value, so * we can do this from the init function, even if there hasn't @@ -1000,29 +1067,25 @@ static int __init nfnetlink_log_init(void) netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); if (status < 0) { - printk(KERN_ERR "log: failed to create netlink socket\n"); + pr_err("log: failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger); if (status < 0) { - printk(KERN_ERR 
"log: failed to register logger\n"); + pr_err("log: failed to register logger\n"); goto cleanup_subsys; } -#ifdef CONFIG_PROC_FS - if (!proc_create("nfnetlink_log", 0440, - proc_net_netfilter, &nful_file_ops)) { - status = -ENOMEM; + status = register_pernet_subsys(&nfnl_log_net_ops); + if (status < 0) { + pr_err("log: failed to register pernet ops\n"); goto cleanup_logger; } -#endif return status; -#ifdef CONFIG_PROC_FS cleanup_logger: nf_log_unregister(&nfulnl_logger); -#endif cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); cleanup_netlink_notifier: @@ -1032,10 +1095,8 @@ cleanup_netlink_notifier: static void __exit nfnetlink_log_fini(void) { + unregister_pernet_subsys(&nfnl_log_net_ops); nf_log_unregister(&nfulnl_logger); -#ifdef CONFIG_PROC_FS - remove_proc_entry("nfnetlink_log", proc_net_netfilter); -#endif nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); } diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 42680b2baa1..2e0e835baf7 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -30,6 +30,7 @@ #include <linux/list.h> #include <net/sock.h> #include <net/netfilter/nf_queue.h> +#include <net/netns/generic.h> #include <net/netfilter/nfnetlink_queue.h> #include <linux/atomic.h> @@ -66,23 +67,31 @@ struct nfqnl_instance { typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); -static DEFINE_SPINLOCK(instances_lock); +static int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 -static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly; +struct nfnl_queue_net { + spinlock_t instances_lock; + struct hlist_head instance_table[INSTANCE_BUCKETS]; +}; + +static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) +{ + return net_generic(net, nfnl_queue_net_id); +} static inline u_int8_t instance_hashfn(u_int16_t queue_num) { - return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; + return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } static struct nfqnl_instance * -instance_lookup(u_int16_t queue_num) +instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) { struct hlist_head *head; struct nfqnl_instance *inst; - head = &instance_table[instance_hashfn(queue_num)]; + head = &q->instance_table[instance_hashfn(queue_num)]; hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->queue_num == queue_num) return inst; @@ -91,14 +100,15 @@ instance_lookup(u_int16_t queue_num) } static struct nfqnl_instance * -instance_create(u_int16_t queue_num, int portid) +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, + int portid) { struct nfqnl_instance *inst; unsigned int h; int err; - spin_lock(&instances_lock); - if (instance_lookup(queue_num)) { + spin_lock(&q->instances_lock); + if (instance_lookup(q, queue_num)) { err = -EEXIST; goto out_unlock; } @@ -123,16 +133,16 @@ instance_create(u_int16_t queue_num, int portid) } h = instance_hashfn(queue_num); - hlist_add_head_rcu(&inst->hlist, &instance_table[h]); + hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); return inst; out_free: kfree(inst); out_unlock: - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); return ERR_PTR(err); } @@ -158,11 +168,11 @@ __instance_destroy(struct nfqnl_instance *inst) } static void -instance_destroy(struct nfqnl_instance *inst) +instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) { - 
spin_lock(&instances_lock); + spin_lock(&q->instances_lock); __instance_destroy(inst); - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); } static inline void @@ -217,14 +227,71 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) spin_unlock_bh(&queue->lock); } +static void +nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) +{ + int i, j = 0; + int plen = 0; /* length of skb->head fragment */ + struct page *page; + unsigned int offset; + + /* dont bother with small payloads */ + if (len <= skb_tailroom(to)) { + skb_copy_bits(from, 0, skb_put(to, len), len); + return; + } + + if (hlen) { + skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + len -= hlen; + } else { + plen = min_t(int, skb_headlen(from), len); + if (plen) { + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + __skb_fill_page_desc(to, 0, page, offset, plen); + get_page(page); + j = 1; + len -= plen; + } + } + + to->truesize += len + plen; + to->len += len + plen; + to->data_len += len + plen; + + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + if (!len) + break; + skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; + skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); + len -= skb_shinfo(to)->frags[j].size; + skb_frag_ref(to, j); + j++; + } + skb_shinfo(to)->nr_frags = j; +} + +static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet) +{ + __u32 flags = 0; + + if (packet->ip_summed == CHECKSUM_PARTIAL) + flags = NFQA_SKB_CSUMNOTREADY; + if (skb_is_gso(packet)) + flags |= NFQA_SKB_GSO; + + return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; +} + static struct sk_buff * nfqnl_build_packet_message(struct nfqnl_instance *queue, struct nf_queue_entry *entry, __be32 **packet_id_ptr) { - sk_buff_data_t old_tail; size_t size; size_t data_len = 0, cap_len = 0; + int hlen = 0; struct sk_buff *skb; struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; @@ -236,7 +303,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, struct nf_conn *ct = NULL; enum ip_conntrack_info uninitialized_var(ctinfo); - size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ @@ -246,8 +313,11 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) - + nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp) - + nla_total_size(sizeof(u_int32_t))); /* cap_len */ + + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ + + nla_total_size(sizeof(u_int32_t)); /* cap_len */ + + if (entskb->tstamp.tv64) + size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); outdev = entry->outdev; @@ -257,7 +327,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, break; case NFQNL_COPY_PACKET: - if (entskb->ip_summed == CHECKSUM_PARTIAL && + if (!(queue->flags & NFQA_CFG_F_GSO) && + entskb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(entskb)) return NULL; @@ -265,7 +336,16 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (data_len == 0 || data_len > entskb->len) data_len = entskb->len; - size += nla_total_size(data_len); + + if (!entskb->head_frag || + skb_headlen(entskb) < L1_CACHE_BYTES || + skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS) + 
hlen = skb_headlen(entskb); + + if (skb_has_frag_list(entskb)) + hlen = entskb->len; + hlen = min_t(int, data_len, hlen); + size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; break; } @@ -273,11 +353,11 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (queue->flags & NFQA_CFG_F_CONNTRACK) ct = nfqnl_ct_get(entskb, &size, &ctinfo); - skb = alloc_skb(size, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(&init_net, size, queue->peer_portid, + GFP_ATOMIC); if (!skb) return NULL; - old_tail = skb->tail; nlh = nlmsg_put(skb, 0, 0, NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, sizeof(struct nfgenmsg), 0); @@ -382,31 +462,29 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, goto nla_put_failure; } + if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) + goto nla_put_failure; + + if (cap_len > 0 && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) + goto nla_put_failure; + + if (nfqnl_put_packet_info(skb, entskb)) + goto nla_put_failure; + if (data_len) { struct nlattr *nla; - int sz = nla_attr_size(data_len); - if (skb_tailroom(skb) < nla_total_size(data_len)) { - printk(KERN_WARNING "nf_queue: no tailroom!\n"); - kfree_skb(skb); - return NULL; - } + if (skb_tailroom(skb) < sizeof(*nla) + hlen) + goto nla_put_failure; - nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len)); + nla = (struct nlattr *)skb_put(skb, sizeof(*nla)); nla->nla_type = NFQA_PAYLOAD; - nla->nla_len = sz; + nla->nla_len = nla_attr_size(data_len); - if (skb_copy_bits(entskb, 0, nla_data(nla), data_len)) - BUG(); + nfqnl_zcopy(skb, entskb, data_len, hlen); } - if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) - goto nla_put_failure; - - if (cap_len > 0 && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) - goto nla_put_failure; - - nlh->nlmsg_len = skb->tail - old_tail; + nlh->nlmsg_len = skb->len; return skb; nla_put_failure: @@ -416,26 +494,14 @@ nla_put_failure: } static int -nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, + struct nf_queue_entry *entry) { struct sk_buff *nskb; - struct nfqnl_instance *queue; int err = -ENOBUFS; __be32 *packet_id_ptr; int failopen = 0; - /* rcu_read_lock()ed by nf_hook_slow() */ - queue = instance_lookup(queuenum); - if (!queue) { - err = -ESRCH; - goto err_out; - } - - if (queue->copy_mode == NFQNL_COPY_NONE) { - err = -EINVAL; - goto err_out; - } - nskb = nfqnl_build_packet_message(queue, entry, &packet_id_ptr); if (nskb == NULL) { err = -ENOMEM; @@ -462,7 +528,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, &init_net, queue->peer_portid, MSG_DONTWAIT); + err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); if (err < 0) { queue->queue_user_dropped++; goto err_out_unlock; @@ -483,6 +549,141 @@ err_out: return err; } +static struct nf_queue_entry * +nf_queue_entry_dup(struct nf_queue_entry *e) +{ + struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); + if (entry) { + if (nf_queue_entry_get_refs(entry)) + return entry; + kfree(entry); + } + return NULL; +} + +#ifdef CONFIG_BRIDGE_NETFILTER +/* When called from bridge netfilter, skb->data must point to MAC header + * before calling skb_gso_segment(). Else, original MAC header is lost + * and segmented skbs will be sent to wrong destination. 
+ */ +static void nf_bridge_adjust_skb_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_push(skb, skb->network_header - skb->mac_header); +} + +static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_pull(skb, skb->network_header - skb->mac_header); +} +#else +#define nf_bridge_adjust_skb_data(s) do {} while (0) +#define nf_bridge_adjust_segmented_data(s) do {} while (0) +#endif + +static void free_entry(struct nf_queue_entry *entry) +{ + nf_queue_entry_release_refs(entry); + kfree(entry); +} + +static int +__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, + struct sk_buff *skb, struct nf_queue_entry *entry) +{ + int ret = -ENOMEM; + struct nf_queue_entry *entry_seg; + + nf_bridge_adjust_segmented_data(skb); + + if (skb->next == NULL) { /* last packet, no need to copy entry */ + struct sk_buff *gso_skb = entry->skb; + entry->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry); + if (ret) + entry->skb = gso_skb; + return ret; + } + + skb->next = NULL; + + entry_seg = nf_queue_entry_dup(entry); + if (entry_seg) { + entry_seg->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry_seg); + if (ret) + free_entry(entry_seg); + } + return ret; +} + +static int +nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + unsigned int queued; + struct nfqnl_instance *queue; + struct sk_buff *skb, *segs; + int err = -ENOBUFS; + struct net *net = dev_net(entry->indev ? + entry->indev : entry->outdev); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + /* rcu_read_lock()ed by nf_hook_slow() */ + queue = instance_lookup(q, queuenum); + if (!queue) + return -ESRCH; + + if (queue->copy_mode == NFQNL_COPY_NONE) + return -EINVAL; + + if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(entry->skb)) + return __nfqnl_enqueue_packet(net, queue, entry); + + skb = entry->skb; + + switch (entry->pf) { + case NFPROTO_IPV4: + skb->protocol = htons(ETH_P_IP); + break; + case NFPROTO_IPV6: + skb->protocol = htons(ETH_P_IPV6); + break; + } + + nf_bridge_adjust_skb_data(skb); + segs = skb_gso_segment(skb, 0); + /* Does not use PTR_ERR to limit the number of error codes that can be + * returned by nf_queue. For instance, callers rely on -ECANCELED to + * mean 'ignore this hook'. 
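Queueing every GSO segment as its own entry only works because a duplicated nf_queue_entry has to hold its own references: nf_queue_entry_get_refs(), exported further above, pins the hook owner module and the involved net devices, and nf_queue_entry_release_refs() drops them again. A sketch of that duplicate/free contract, mirroring the helpers used by the code above (names are illustrative):

#include <linux/slab.h>
#include <net/netfilter/nf_queue.h>

static struct nf_queue_entry *example_entry_dup(struct nf_queue_entry *e)
{
	/* e->size also covers the route data appended by the address family */
	struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);

	if (!entry)
		return NULL;
	if (!nf_queue_entry_get_refs(entry)) {	/* module + netdev refs */
		kfree(entry);
		return NULL;
	}
	return entry;
}

static void example_entry_free(struct nf_queue_entry *entry)
{
	nf_queue_entry_release_refs(entry);
	kfree(entry);
}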
+ */ + if (IS_ERR(segs)) + goto out_err; + queued = 0; + err = 0; + do { + struct sk_buff *nskb = segs->next; + if (err == 0) + err = __nfqnl_enqueue_packet_gso(net, queue, + segs, entry); + if (err == 0) + queued++; + else + kfree_skb(segs); + segs = nskb; + } while (segs); + + if (queued) { + if (err) /* some segments are already queued */ + free_entry(entry); + kfree_skb(skb); + return 0; + } + out_err: + nf_bridge_adjust_segmented_data(skb); + return err; +} + static int nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) { @@ -575,15 +776,16 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) /* drop all packets with either indev or outdev == ifindex from all queue * instances */ static void -nfqnl_dev_drop(int ifindex) +nfqnl_dev_drop(struct net *net, int ifindex) { int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); rcu_read_lock(); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; - struct hlist_head *head = &instance_table[i]; + struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, dev_cmp, ifindex); @@ -600,12 +802,9 @@ nfqnl_rcv_dev_event(struct notifier_block *this, { struct net_device *dev = ptr; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) - nfqnl_dev_drop(dev->ifindex); + nfqnl_dev_drop(dev_net(dev), dev->ifindex); return NOTIFY_DONE; } @@ -618,24 +817,24 @@ nfqnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; + struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ - spin_lock(&instances_lock); + spin_lock(&q->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *t2; struct nfqnl_instance *inst; - struct hlist_head *head = &instance_table[i]; + struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_safe(inst, t2, head, hlist) { - if ((n->net == &init_net) && - (n->portid == inst->peer_portid)) + if (n->portid == inst->peer_portid) __instance_destroy(inst); } } - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); } return NOTIFY_DONE; } @@ -656,11 +855,12 @@ static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { [NFQA_MARK] = { .type = NLA_U32 }, }; -static struct nfqnl_instance *verdict_instance_lookup(u16 queue_num, int nlportid) +static struct nfqnl_instance * +verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid) { struct nfqnl_instance *queue; - queue = instance_lookup(queue_num); + queue = instance_lookup(q, queue_num); if (!queue) return ERR_PTR(-ENODEV); @@ -704,7 +904,11 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, LIST_HEAD(batch_list); u16 queue_num = ntohs(nfmsg->res_id); - queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).portid); + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); @@ -752,10 +956,13 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, enum ip_conntrack_info uninitialized_var(ctinfo); struct nf_conn *ct = NULL; - queue = instance_lookup(queue_num); - if (!queue) + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); - queue = 
verdict_instance_lookup(queue_num, NETLINK_CB(skb).portid); + queue = instance_lookup(q, queue_num); + if (!queue) + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); @@ -819,6 +1026,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_instance *queue; struct nfqnl_msg_config_cmd *cmd = NULL; + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); int ret = 0; if (nfqa[NFQA_CFG_CMD]) { @@ -832,7 +1041,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, } rcu_read_lock(); - queue = instance_lookup(queue_num); + queue = instance_lookup(q, queue_num); if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto err_out_unlock; @@ -845,7 +1054,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ret = -EBUSY; goto err_out_unlock; } - queue = instance_create(queue_num, NETLINK_CB(skb).portid); + queue = instance_create(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) { ret = PTR_ERR(queue); goto err_out_unlock; @@ -856,7 +1066,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ret = -ENODEV; goto err_out_unlock; } - instance_destroy(queue); + instance_destroy(q, queue); break; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: @@ -950,19 +1160,24 @@ static const struct nfnetlink_subsystem nfqnl_subsys = { #ifdef CONFIG_PROC_FS struct iter_state { + struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct seq_file *seq) { struct iter_state *st = seq->private; + struct net *net; + struct nfnl_queue_net *q; if (!st) return NULL; + net = seq_file_net(seq); + q = nfnl_queue_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { - if (!hlist_empty(&instance_table[st->bucket])) - return instance_table[st->bucket].first; + if (!hlist_empty(&q->instance_table[st->bucket])) + return q->instance_table[st->bucket].first; } return NULL; } @@ -970,13 +1185,17 @@ static struct hlist_node *get_first(struct seq_file *seq) static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) { struct iter_state *st = seq->private; + struct net *net = seq_file_net(seq); h = h->next; while (!h) { + struct nfnl_queue_net *q; + if (++st->bucket >= INSTANCE_BUCKETS) return NULL; - h = instance_table[st->bucket].first; + q = nfnl_queue_pernet(net); + h = q->instance_table[st->bucket].first; } return h; } @@ -992,11 +1211,11 @@ static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) return pos ? 
NULL : head; } -static void *seq_start(struct seq_file *seq, loff_t *pos) - __acquires(instances_lock) +static void *seq_start(struct seq_file *s, loff_t *pos) + __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { - spin_lock(&instances_lock); - return get_idx(seq, *pos); + spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); + return get_idx(s, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) @@ -1006,9 +1225,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos) } static void seq_stop(struct seq_file *s, void *v) - __releases(instances_lock) + __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { - spin_unlock(&instances_lock); + spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); } static int seq_show(struct seq_file *s, void *v) @@ -1032,7 +1251,7 @@ static const struct seq_operations nfqnl_seq_ops = { static int nfqnl_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &nfqnl_seq_ops, + return seq_open_net(inode, file, &nfqnl_seq_ops, sizeof(struct iter_state)); } @@ -1041,41 +1260,63 @@ static const struct file_operations nfqnl_file_ops = { .open = nfqnl_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif /* PROC_FS */ -static int __init nfnetlink_queue_init(void) +static int __net_init nfnl_queue_net_init(struct net *net) { - int i, status = -ENOMEM; + unsigned int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) - INIT_HLIST_HEAD(&instance_table[i]); + INIT_HLIST_HEAD(&q->instance_table[i]); + + spin_lock_init(&q->instances_lock); + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_queue", 0440, + net->nf.proc_netfilter, &nfqnl_file_ops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit nfnl_queue_net_exit(struct net *net) +{ + remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); +} + +static struct pernet_operations nfnl_queue_net_ops = { + .init = nfnl_queue_net_init, + .exit = nfnl_queue_net_exit, + .id = &nfnl_queue_net_id, + .size = sizeof(struct nfnl_queue_net), +}; + +static int __init nfnetlink_queue_init(void) +{ + int status = -ENOMEM; netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { - printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); + pr_err("nf_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; } -#ifdef CONFIG_PROC_FS - if (!proc_create("nfnetlink_queue", 0440, - proc_net_netfilter, &nfqnl_file_ops)) { - status = -ENOMEM; + status = register_pernet_subsys(&nfnl_queue_net_ops); + if (status < 0) { + pr_err("nf_queue: failed to register pernet ops\n"); goto cleanup_subsys; } -#endif - register_netdevice_notifier(&nfqnl_dev_notifier); nf_register_queue_handler(&nfqh); return status; -#ifdef CONFIG_PROC_FS cleanup_subsys: nfnetlink_subsys_unregister(&nfqnl_subsys); -#endif cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); return status; @@ -1085,9 +1326,7 @@ static void __exit nfnetlink_queue_fini(void) { nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); -#ifdef CONFIG_PROC_FS - remove_proc_entry("nfnetlink_queue", proc_net_netfilter); -#endif + unregister_pernet_subsys(&nfnl_queue_net_ops); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 
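The hunk above is the core of the conversion: nfnetlink_queue stops keeping global state and instead registers pernet_operations whose .id/.size fields let the core allocate a struct nfnl_queue_net for every network namespace, which nfnl_queue_pernet() then retrieves (typically via net_generic()). A stripped-down module following the same pattern might look like the sketch below; example_net_id, struct example_net and the counter it stores are placeholders, and the sketch assumes the ~3.10-era per-net API used in this diff.

#include <linux/module.h>
#include <linux/kernel.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static int example_net_id;		/* slot filled in by register_pernet_subsys() */

struct example_net {			/* one instance per network namespace */
	unsigned long packets_seen;
};

static struct example_net *example_pernet(struct net *net)
{
	return net_generic(net, example_net_id);
}

static int __net_init example_net_init(struct net *net)
{
	struct example_net *e = example_pernet(net);

	e->packets_seen = 0;		/* storage is allocated by the core (.size) */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	pr_debug("namespace exiting, saw %lu packets\n",
		 example_pernet(net)->packets_seen);
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
	.id   = &example_net_id,
	.size = sizeof(struct example_net),
};

static int __init example_init(void)
{
	return register_pernet_subsys(&example_net_ops);
}

static void __exit example_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");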
686c7715d77..8b03028cca6 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -2,6 +2,7 @@ * x_tables core - Backend for {ip,ip6,arp}_tables * * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org> + * Copyright (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on existing ip_tables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling @@ -999,7 +1000,7 @@ static int xt_table_open(struct inode *inode, struct file *file) sizeof(struct xt_names_priv)); if (!ret) { priv = ((struct seq_file *)file->private_data)->private; - priv->af = (unsigned long)PDE(inode)->data; + priv->af = (unsigned long)PDE_DATA(inode); } return ret; } @@ -1147,7 +1148,7 @@ static int xt_match_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = trav; - trav->nfproto = (unsigned long)PDE(inode)->data; + trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } @@ -1211,7 +1212,7 @@ static int xt_target_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = trav; - trav->nfproto = (unsigned long)PDE(inode)->data; + trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index fa40096940a..fe573f6c9e9 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -474,7 +474,14 @@ ipt_log_packet(u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { - struct sbuff *m = sb_open(); + struct sbuff *m; + struct net *net = dev_net(in ? in : out); + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = sb_open(); if (!loginfo) loginfo = &default_loginfo; @@ -798,7 +805,14 @@ ip6t_log_packet(u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { - struct sbuff *m = sb_open(); + struct sbuff *m; + struct net *net = dev_net(in ? 
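/*
 * The PDE(inode)->data accesses replaced above by PDE_DATA(inode) reflect
 * struct proc_dir_entry being made opaque to procfs users. A minimal,
 * hypothetical proc file using that pattern is sketched below (assuming the
 * 3.10-era procfs and seq_file APIs; "example_stats" and the counter it
 * exposes are made up for illustration):
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static unsigned long example_counter;

static int example_show(struct seq_file *m, void *v)
{
	unsigned long *ctr = m->private;	/* set from PDE_DATA() in open */

	seq_printf(m, "%lu\n", *ctr);
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	/* recover the pointer that was passed to proc_create_data() */
	return single_open(file, example_show, PDE_DATA(inode));
}

static const struct file_operations example_fops = {
	.owner   = THIS_MODULE,
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init example_proc_init(void)
{
	if (!proc_create_data("example_stats", 0444, NULL,
			      &example_fops, &example_counter))
		return -ENOMEM;
	return 0;
}

static void __exit example_proc_exit(void)
{
	remove_proc_entry("example_stats", NULL);
}

module_init(example_proc_init);
module_exit(example_proc_exit);
MODULE_LICENSE("GPL");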
in : out); + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = sb_open(); if (!loginfo) loginfo = &default_loginfo; @@ -893,23 +907,55 @@ static struct nf_logger ip6t_log_logger __read_mostly = { }; #endif +static int __net_init log_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger); +#endif + return 0; +} + +static void __net_exit log_net_exit(struct net *net) +{ + nf_log_unset(net, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_unset(net, &ip6t_log_logger); +#endif +} + +static struct pernet_operations log_net_ops = { + .init = log_net_init, + .exit = log_net_exit, +}; + static int __init log_tg_init(void) { int ret; + ret = register_pernet_subsys(&log_net_ops); + if (ret < 0) + goto err_pernet; + ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); if (ret < 0) - return ret; + goto err_target; nf_log_register(NFPROTO_IPV4, &ipt_log_logger); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) nf_log_register(NFPROTO_IPV6, &ip6t_log_logger); #endif return 0; + +err_target: + unregister_pernet_subsys(&log_net_ops); +err_pernet: + return ret; } static void __exit log_tg_exit(void) { + unregister_pernet_subsys(&log_net_ops); nf_log_unregister(&ipt_log_logger); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) nf_log_unregister(&ip6t_log_logger); diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 817f9e9f2b1..1e2fae32f81 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -76,22 +76,31 @@ static u32 hash_v6(const struct sk_buff *skb) } #endif -static unsigned int -nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +static u32 +nfqueue_hash(const struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v1 *info = par->targinfo; u32 queue = info->queuenum; - if (info->queues_total > 1) { - if (par->family == NFPROTO_IPV4) - queue = (((u64) hash_v4(skb) * info->queues_total) >> - 32) + queue; + if (par->family == NFPROTO_IPV4) + queue += ((u64) hash_v4(skb) * info->queues_total) >> 32; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - else if (par->family == NFPROTO_IPV6) - queue = (((u64) hash_v6(skb) * info->queues_total) >> - 32) + queue; + else if (par->family == NFPROTO_IPV6) + queue += ((u64) hash_v6(skb) * info->queues_total) >> 32; #endif - } + + return queue; +} + +static unsigned int +nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v1 *info = par->targinfo; + u32 queue = info->queuenum; + + if (info->queues_total > 1) + queue = nfqueue_hash(skb, par); + return NF_QUEUE_NR(queue); } @@ -108,7 +117,7 @@ nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) static int nfqueue_tg_check(const struct xt_tgchk_param *par) { - const struct xt_NFQ_info_v2 *info = par->targinfo; + const struct xt_NFQ_info_v3 *info = par->targinfo; u32 maxid; if (unlikely(!rnd_inited)) { @@ -125,11 +134,32 @@ static int nfqueue_tg_check(const struct xt_tgchk_param *par) info->queues_total, maxid); return -ERANGE; } - if (par->target->revision == 2 && info->bypass > 1) + if (par->target->revision == 2 && info->flags > 1) return -EINVAL; + if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK) + return -EINVAL; + return 0; } +static unsigned int +nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v3 *info = 
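/*
 * The nfqueue_hash() helper factored out above spreads flows across
 * queues_total queues with a multiply-and-shift: ((u64)hash * n) >> 32 maps a
 * full 32-bit hash uniformly onto 0..n-1 without a modulo. A standalone
 * sketch of just that arithmetic (the hash values are arbitrary examples):
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t pick_queue(uint32_t base, uint32_t hash, uint32_t total)
{
	/* scale the 32-bit hash range down to [0, total) and offset by base */
	return base + (uint32_t)(((uint64_t)hash * total) >> 32);
}

int main(void)
{
	uint32_t hashes[] = { 0x00000000u, 0x40000000u, 0x80000000u, 0xffffffffu };
	unsigned int i;

	for (i = 0; i < sizeof(hashes) / sizeof(hashes[0]); i++)
		printf("hash %#010x -> queue %u\n", hashes[i],
		       pick_queue(0, hashes[i], 4));	/* 4 queues from base 0 */
	return 0;
}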
par->targinfo; + u32 queue = info->queuenum; + + if (info->queues_total > 1) { + if (info->flags & NFQ_FLAG_CPU_FANOUT) { + int cpu = smp_processor_id(); + + queue = info->queuenum + cpu % info->queues_total; + } else + queue = nfqueue_hash(skb, par); + } + + return NF_QUEUE_NR(queue); +} + static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", @@ -156,6 +186,15 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_NFQ_info_v2), .me = THIS_MODULE, }, + { + .name = "NFQUEUE", + .revision = 3, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v3, + .targetsize = sizeof(struct xt_NFQ_info_v3), + .me = THIS_MODULE, + }, }; static int __init nfqueue_tg_init(void) diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 71a266de5fb..a75240f0d42 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -2,6 +2,7 @@ * This is a module which is used for setting the MSS option in TCP packets. * * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> + * Copyright (C) 2007 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 61805d7b38a..188404b9b00 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -3,6 +3,7 @@ * information. (Superset of Rusty's minimalistic state match.) * * (C) 2001 Marc Boucher (marc@mbsi.ca). + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * * This program is free software; you can redistribute it and/or modify diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index f330e8beaf6..9ff035c7140 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -3,6 +3,7 @@ * separately for each hashbucket (sourceip/sourceport/dstip/dstport) * * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * * Development of this code was funded by Astaro AG, http://www.astaro.com/ @@ -107,6 +108,7 @@ struct xt_hashlimit_htable { /* seq_file stuff */ struct proc_dir_entry *pde; + const char *name; struct net *net; struct hlist_head hash[0]; /* hashtable itself */ @@ -253,6 +255,11 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, hinfo->count = 0; hinfo->family = family; hinfo->rnd_initialized = false; + hinfo->name = kstrdup(minfo->name, GFP_KERNEL); + if (!hinfo->name) { + vfree(hinfo); + return -ENOMEM; + } spin_lock_init(&hinfo->lock); hinfo->pde = proc_create_data(minfo->name, 0, @@ -260,6 +267,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, &dl_file_ops, hinfo); if (hinfo->pde == NULL) { + kfree(hinfo->name); vfree(hinfo); return -ENOMEM; } @@ -330,9 +338,10 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo) parent = hashlimit_net->ip6t_hashlimit; if(parent != NULL) - remove_proc_entry(hinfo->pde->name, parent); + remove_proc_entry(hinfo->name, parent); htable_selective_cleanup(hinfo, select_all); + kfree(hinfo->name); vfree(hinfo); } @@ -344,7 +353,7 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net, struct xt_hashlimit_htable *hinfo; hlist_for_each_entry(hinfo, 
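/*
 * Revision 3 above adds NFQ_FLAG_CPU_FANOUT: instead of hashing the flow, the
 * queue is chosen from the CPU handling the packet, so a multi-queue
 * userspace daemon can, for example, dedicate one thread per queue. The
 * selection itself is just base + cpu % total, as in this small sketch:
 */
#include <stdio.h>

static unsigned int fanout_queue(unsigned int base, unsigned int cpu,
				 unsigned int total)
{
	return base + cpu % total;	/* a given CPU always hits the same queue */
}

int main(void)
{
	unsigned int cpu;

	for (cpu = 0; cpu < 6; cpu++)	/* e.g. 6 CPUs fanned out over 4 queues */
		printf("cpu %u -> queue %u\n", cpu, fanout_queue(0, cpu, 4));
	return 0;
}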
&hashlimit_net->htables, node) { - if (!strcmp(name, hinfo->pde->name) && + if (!strcmp(name, hinfo->name) && hinfo->family == family) { hinfo->use++; return hinfo; @@ -841,7 +850,7 @@ static int dl_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - sf->private = PDE(inode)->data; + sf->private = PDE_DATA(inode); } return ret; } @@ -887,7 +896,7 @@ static void __net_exit hashlimit_proc_net_exit(struct net *net) pde = hashlimit_net->ip6t_hashlimit; hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) - remove_proc_entry(hinfo->pde->name, pde); + remove_proc_entry(hinfo->name, pde); hashlimit_net->ipt_hashlimit = NULL; hashlimit_net->ip6t_hashlimit = NULL; diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index a4c1e4528ca..bef85059655 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -1,5 +1,6 @@ /* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index a5e673d32bd..647d989a01e 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -201,6 +201,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) unsigned char opts[MAX_IPOPTLEN]; const struct xt_osf_finger *kf; const struct xt_osf_user_finger *f; + struct net *net = dev_net(p->in ? p->in : p->out); if (!info) return false; @@ -325,7 +326,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) fcount++; if (info->flags & XT_OSF_LOG) - nf_log_packet(p->family, p->hooknum, skb, + nf_log_packet(net, p->family, p->hooknum, skb, p->in, p->out, NULL, "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", f->genre, f->version, f->subtype, @@ -341,7 +342,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) rcu_read_unlock(); if (!fcount && (info->flags & XT_OSF_LOG)) - nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL, + nf_log_packet(net, p->family, p->hooknum, skb, p->in, + p->out, NULL, "Remote OS is not known: %pI4:%u -> %pI4:%u\n", &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest)); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index d9cad315229..1e657cf715c 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -401,8 +401,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par, ret = -ENOMEM; goto out; } - pde->uid = uid; - pde->gid = gid; + proc_set_user(pde, uid, gid); #endif spin_lock_bh(&recent_lock); list_add_tail(&t->list, &recent_net->tables); @@ -525,14 +524,13 @@ static const struct seq_operations recent_seq_ops = { static int recent_seq_open(struct inode *inode, struct file *file) { - struct proc_dir_entry *pde = PDE(inode); struct recent_iter_state *st; st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); if (st == NULL) return -ENOMEM; - st->table = pde->data; + st->table = PDE_DATA(inode); return 0; } @@ -540,8 +538,7 @@ static ssize_t recent_mt_proc_write(struct file *file, const char __user *input, size_t size, loff_t *loff) { - const struct proc_dir_entry *pde = PDE(file_inode(file)); - struct recent_table *t = pde->data; + struct recent_table *t = PDE_DATA(file_inode(file)); struct recent_entry *e; char 
buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; const char *c = buf; diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 865a9e54f3a..31790e789e2 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -30,7 +30,7 @@ MODULE_ALIAS("ip6t_SET"); static inline int match_set(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, - const struct ip_set_adt_opt *opt, int inv) + struct ip_set_adt_opt *opt, int inv) { if (ip_set_test(index, skb, par, opt)) inv = !inv; @@ -38,20 +38,12 @@ match_set(ip_set_id_t index, const struct sk_buff *skb, } #define ADT_OPT(n, f, d, fs, cfs, t) \ -const struct ip_set_adt_opt n = { \ - .family = f, \ - .dim = d, \ - .flags = fs, \ - .cmdflags = cfs, \ - .timeout = t, \ -} -#define ADT_MOPT(n, f, d, fs, cfs, t) \ struct ip_set_adt_opt n = { \ .family = f, \ .dim = d, \ .flags = fs, \ .cmdflags = cfs, \ - .timeout = t, \ + .ext.timeout = t, \ } /* Revision 0 interface: backward compatible with netfilter/iptables */ @@ -197,6 +189,9 @@ set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, 0, UINT_MAX); + if (opt.flags & IPSET_RETURN_NOMATCH) + opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH; + return match_set(info->match_set.index, skb, par, &opt, info->match_set.flags & IPSET_INV_MATCH); } @@ -305,15 +300,15 @@ static unsigned int set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v2 *info = par->targinfo; - ADT_MOPT(add_opt, par->family, info->add_set.dim, - info->add_set.flags, info->flags, info->timeout); + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, info->del_set.flags, 0, UINT_MAX); /* Normalize to fit into jiffies */ - if (add_opt.timeout != IPSET_NO_TIMEOUT && - add_opt.timeout > UINT_MAX/MSEC_PER_SEC) - add_opt.timeout = UINT_MAX/MSEC_PER_SEC; + if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && + add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -325,6 +320,52 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) #define set_target_v2_checkentry set_target_v1_checkentry #define set_target_v2_destroy set_target_v1_destroy +/* Revision 3 match */ + +static bool +match_counter(u64 counter, const struct ip_set_counter_match *info) +{ + switch (info->op) { + case IPSET_COUNTER_NONE: + return true; + case IPSET_COUNTER_EQ: + return counter == info->value; + case IPSET_COUNTER_NE: + return counter != info->value; + case IPSET_COUNTER_LT: + return counter < info->value; + case IPSET_COUNTER_GT: + return counter > info->value; + } + return false; +} + +static bool +set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v3 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, + 
info->match_set.flags, info->flags, UINT_MAX); + int ret; + + if (info->packets.op != IPSET_COUNTER_NONE || + info->bytes.op != IPSET_COUNTER_NONE) + opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS; + + ret = match_set(info->match_set.index, skb, par, &opt, + info->match_set.flags & IPSET_INV_MATCH); + + if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) + return ret; + + if (!match_counter(opt.ext.packets, &info->packets)) + return 0; + return match_counter(opt.ext.bytes, &info->bytes); +} + +#define set_match_v3_checkentry set_match_v1_checkentry +#define set_match_v3_destroy set_match_v1_destroy + static struct xt_match set_matches[] __read_mostly = { { .name = "set", @@ -377,6 +418,27 @@ static struct xt_match set_matches[] __read_mostly = { .destroy = set_match_v1_destroy, .me = THIS_MODULE }, + /* counters support: update, match */ + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 3, + .match = set_match_v3, + .matchsize = sizeof(struct xt_set_info_match_v3), + .checkentry = set_match_v3_checkentry, + .destroy = set_match_v3_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV6, + .revision = 3, + .match = set_match_v3, + .matchsize = sizeof(struct xt_set_info_match_v3), + .checkentry = set_match_v3_checkentry, + .destroy = set_match_v3_destroy, + .me = THIS_MODULE + }, }; static struct xt_target set_targets[] __read_mostly = { |
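Revision 3 of the set match, added above, lets a rule require not only that the packet matches an ip_set entry but also that the entry's packet and byte counters satisfy a comparison (none, ==, !=, < or >). The sketch below mirrors the match_counter() decision table in plain userspace C; the enum values and struct layout are simplified stand-ins for the ip_set uapi types, not copies of them.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum counter_op { OP_NONE, OP_EQ, OP_NE, OP_LT, OP_GT };

struct counter_match {
	enum counter_op op;
	uint64_t value;
};

static bool counter_matches(uint64_t counter, const struct counter_match *m)
{
	switch (m->op) {
	case OP_NONE:	return true;		/* no constraint on this counter */
	case OP_EQ:	return counter == m->value;
	case OP_NE:	return counter != m->value;
	case OP_LT:	return counter < m->value;
	case OP_GT:	return counter > m->value;
	}
	return false;
}

int main(void)
{
	struct counter_match more_than_1k = { OP_GT, 1000 };

	/* an entry that has seen 1500 bytes satisfies "bytes > 1000" */
	printf("%s\n", counter_matches(1500, &more_than_1k) ? "match" : "no match");
	return 0;
}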