From ea781f197d6a835cbb93a0bf88ee1696296ed8aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Mar 2009 21:05:46 +0100 Subject: netfilter: nf_conntrack: use SLAB_DESTROY_BY_RCU and get rid of call_rcu() Use the "hlist_nulls" infrastructure we added in 2.6.29 for RCUification of UDP & TCP. This permits an easy conversion from call_rcu() based hash lists to a SLAB_DESTROY_BY_RCU one. Avoiding call_rcu() delay at nf_conn freeing time has numerous gains. First, it doesn't fill RCU queues (up to 10000 elements per cpu). This reduces OOM possibility, if queued elements are not taken into account. This reduces latency problems when RCU queue size hits hilimit and triggers emergency mode. - It allows fast reuse of just freed elements, permitting better use of CPU cache. - We delete rcu_head from "struct nf_conn", shrinking size of this structure by 8 or 16 bytes. This patch only takes care of "struct nf_conn". call_rcu() is still used for less critical conntrack parts that may be converted later if necessary. 
Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 63 ++++++++++++---------- net/ipv4/netfilter/nf_nat_core.c | 2 +- 2 files changed, 37 insertions(+), 28 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 6ba5c557690..8668a3defda 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -25,40 +25,42 @@ struct ct_iter_state { unsigned int bucket; }; -static struct hlist_node *ct_get_first(struct seq_file *seq) +static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; - struct hlist_node *n; + struct hlist_nulls_node *n; for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { n = rcu_dereference(net->ct.hash[st->bucket].first); - if (n) + if (!is_a_nulls(n)) return n; } return NULL; } -static struct hlist_node *ct_get_next(struct seq_file *seq, - struct hlist_node *head) +static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, + struct hlist_nulls_node *head) { struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(head->next); - while (head == NULL) { - if (++st->bucket >= nf_conntrack_htable_size) - return NULL; + while (is_a_nulls(head)) { + if (likely(get_nulls_value(head) == st->bucket)) { + if (++st->bucket >= nf_conntrack_htable_size) + return NULL; + } head = rcu_dereference(net->ct.hash[st->bucket].first); } return head; } -static struct hlist_node *ct_get_idx(struct seq_file *seq, loff_t pos) +static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) { - struct hlist_node *head = ct_get_first(seq); + struct hlist_nulls_node *head = ct_get_first(seq); if (head) while (pos && (head = ct_get_next(seq, head))) @@ -87,69 +89,76 @@ 
static void ct_seq_stop(struct seq_file *s, void *v) static int ct_seq_show(struct seq_file *s, void *v) { - const struct nf_conntrack_tuple_hash *hash = v; - const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); + struct nf_conntrack_tuple_hash *hash = v; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; + int ret = 0; NF_CT_ASSERT(ct); + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + return 0; + /* we only want to print DIR_ORIGINAL */ if (NF_CT_DIRECTION(hash)) - return 0; + goto release; if (nf_ct_l3num(ct) != AF_INET) - return 0; + goto release; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); NF_CT_ASSERT(l3proto); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); NF_CT_ASSERT(l4proto); + ret = -ENOSPC; if (seq_printf(s, "%-8s %u %ld ", l4proto->name, nf_ct_protonum(ct), timer_pending(&ct->timeout) ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) - return -ENOSPC; + goto release; if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct)) - return -ENOSPC; + goto release; if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, l3proto, l4proto)) - return -ENOSPC; + goto release; if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) - return -ENOSPC; + goto release; if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) if (seq_printf(s, "[UNREPLIED] ")) - return -ENOSPC; + goto release; if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l3proto, l4proto)) - return -ENOSPC; + goto release; if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) - return -ENOSPC; + goto release; if (test_bit(IPS_ASSURED_BIT, &ct->status)) if (seq_printf(s, "[ASSURED] ")) - return -ENOSPC; + goto release; #ifdef CONFIG_NF_CONNTRACK_MARK if (seq_printf(s, "mark=%u ", ct->mark)) - return -ENOSPC; + goto release; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK if (seq_printf(s, "secmark=%u ", ct->secmark)) - return -ENOSPC; + goto release; #endif if 
(seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) - return -ENOSPC; - - return 0; + goto release; + ret = 0; +release: + nf_ct_put(ct); + return ret; } static const struct seq_operations ct_seq_ops = { diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index a65cf692359..fe65187810f 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -679,7 +679,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, static int __net_init nf_nat_net_init(struct net *net) { net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, - &net->ipv4.nat_vmalloced); + &net->ipv4.nat_vmalloced, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; -- cgit v1.2.3-70-g09d2