From d7997fe1f4584da12e9c29fb682c18e9bdc13b73 Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Wed, 31 Mar 2010 00:17:06 +0000 Subject: flow: structurize flow cache Group all per-cpu data to one structure instead of having many globals. Also prepare the internals so that we can have multiple instances of the flow cache if needed. Only the kmem_cache is left as a global as all flow caches share the same element size, and benefit from using a common cache. Signed-off-by: Timo Teras Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/flow.c | 223 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 119 insertions(+), 104 deletions(-) (limited to 'net/core/flow.c') diff --git a/net/core/flow.c b/net/core/flow.c index 96015871ece..1d27ca6b421 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -35,104 +35,105 @@ struct flow_cache_entry { atomic_t *object_ref; }; -atomic_t flow_cache_genid = ATOMIC_INIT(0); - -static u32 flow_hash_shift; -#define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; - -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) - -static struct kmem_cache *flow_cachep __read_mostly; - -static int flow_lwm, flow_hwm; - -struct flow_percpu_info { - int hash_rnd_recalc; - u32 hash_rnd; - int count; +struct flow_cache_percpu { + struct flow_cache_entry ** hash_table; + int hash_count; + u32 hash_rnd; + int hash_rnd_recalc; + struct tasklet_struct flush_tasklet; }; -static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 }; - -#define flow_hash_rnd_recalc(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) -#define flow_hash_rnd(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd) -#define flow_count(cpu) \ - (per_cpu(flow_hash_info, cpu).count) - -static struct timer_list flow_hash_rnd_timer; - -#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) struct flow_flush_info { - atomic_t cpuleft; - struct completion completion; + struct flow_cache * cache; + atomic_t cpuleft; + struct completion completion; }; -static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL }; -#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) +struct flow_cache { + u32 hash_shift; + unsigned long order; + struct flow_cache_percpu * percpu; + struct notifier_block hotcpu_notifier; + int low_watermark; + int high_watermark; + struct timer_list rnd_timer; +}; + +atomic_t flow_cache_genid = ATOMIC_INIT(0); +static struct flow_cache flow_cache_global; +static struct kmem_cache *flow_cachep; + +#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) +#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) static void flow_cache_new_hashrnd(unsigned long arg) { + struct flow_cache *fc = (void *) arg; int i; for_each_possible_cpu(i) - flow_hash_rnd_recalc(i) = 1; + per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; - flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&flow_hash_rnd_timer); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); } -static void flow_entry_kill(int cpu, struct flow_cache_entry *fle) +static void flow_entry_kill(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + struct flow_cache_entry *fle) { if (fle->object) atomic_dec(fle->object_ref); kmem_cache_free(flow_cachep, fle); - flow_count(cpu)--; + fcp->hash_count--; } -static void __flow_cache_shrink(int cpu, int shrink_to) +static void __flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + int shrink_to) { struct 
flow_cache_entry *fle, **flp; int i; - for (i = 0; i < flow_hash_size; i++) { + for (i = 0; i < flow_cache_hash_size(fc); i++) { int k = 0; - flp = &flow_table(cpu)[i]; + flp = &fcp->hash_table[i]; while ((fle = *flp) != NULL && k < shrink_to) { k++; flp = &fle->next; } while ((fle = *flp) != NULL) { *flp = fle->next; - flow_entry_kill(cpu, fle); + flow_entry_kill(fc, fcp, fle); } } } -static void flow_cache_shrink(int cpu) +static void flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - int shrink_to = flow_lwm / flow_hash_size; + int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); - __flow_cache_shrink(cpu, shrink_to); + __flow_cache_shrink(fc, fcp, shrink_to); } -static void flow_new_hash_rnd(int cpu) +static void flow_new_hash_rnd(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); - flow_hash_rnd_recalc(cpu) = 0; - - __flow_cache_shrink(cpu, 0); + get_random_bytes(&fcp->hash_rnd, sizeof(u32)); + fcp->hash_rnd_recalc = 0; + __flow_cache_shrink(fc, fcp, 0); } -static u32 flow_hash_code(struct flowi *key, int cpu) +static u32 flow_hash_code(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + struct flowi *key) { u32 *k = (u32 *) key; - return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & - (flow_hash_size - 1)); + return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) + & (flow_cache_hash_size(fc) - 1)); } #if (BITS_PER_LONG == 64) @@ -168,24 +169,25 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver) { + struct flow_cache *fc = &flow_cache_global; + struct flow_cache_percpu *fcp; struct flow_cache_entry *fle, **head; unsigned int hash; - int cpu; local_bh_disable(); - cpu = smp_processor_id(); + fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); fle = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. 
--RR */ - if (!flow_table(cpu)) + if (!fcp->hash_table) goto nocache; - if (flow_hash_rnd_recalc(cpu)) - flow_new_hash_rnd(cpu); - hash = flow_hash_code(key, cpu); + if (fcp->hash_rnd_recalc) + flow_new_hash_rnd(fc, fcp); + hash = flow_hash_code(fc, fcp, key); - head = &flow_table(cpu)[hash]; + head = &fcp->hash_table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && @@ -204,8 +206,8 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, } if (!fle) { - if (flow_count(cpu) > flow_hwm) - flow_cache_shrink(cpu); + if (fcp->hash_count > fc->high_watermark) + flow_cache_shrink(fc, fcp); fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); if (fle) { @@ -215,7 +217,7 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, fle->dir = dir; memcpy(&fle->key, key, sizeof(*key)); fle->object = NULL; - flow_count(cpu)++; + fcp->hash_count++; } } @@ -249,14 +251,15 @@ nocache: static void flow_cache_flush_tasklet(unsigned long data) { struct flow_flush_info *info = (void *)data; + struct flow_cache *fc = info->cache; + struct flow_cache_percpu *fcp; int i; - int cpu; - cpu = smp_processor_id(); - for (i = 0; i < flow_hash_size; i++) { + fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); + for (i = 0; i < flow_cache_hash_size(fc); i++) { struct flow_cache_entry *fle; - fle = flow_table(cpu)[i]; + fle = fcp->hash_table[i]; for (; fle; fle = fle->next) { unsigned genid = atomic_read(&flow_cache_genid); @@ -272,7 +275,6 @@ static void flow_cache_flush_tasklet(unsigned long data) complete(&info->completion); } -static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__)); static void flow_cache_flush_per_cpu(void *data) { struct flow_flush_info *info = data; @@ -280,8 +282,7 @@ static void flow_cache_flush_per_cpu(void *data) struct tasklet_struct *tasklet; cpu = smp_processor_id(); - - tasklet = flow_flush_tasklet(cpu); + tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; tasklet->data = (unsigned long)info; tasklet_schedule(tasklet); } @@ -294,6 +295,7 @@ void flow_cache_flush(void) /* Don't want cpus going down or up during this. 
*/ get_online_cpus(); mutex_lock(&flow_flush_sem); + info.cache = &flow_cache_global; atomic_set(&info.cpuleft, num_online_cpus()); init_completion(&info.completion); @@ -307,62 +309,75 @@ void flow_cache_flush(void) put_online_cpus(); } -static void __init flow_cache_cpu_prepare(int cpu) +static void __init flow_cache_cpu_prepare(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - struct tasklet_struct *tasklet; - unsigned long order; - - for (order = 0; - (PAGE_SIZE << order) < - (sizeof(struct flow_cache_entry *)*flow_hash_size); - order++) - /* NOTHING */; - - flow_table(cpu) = (struct flow_cache_entry **) - __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); - if (!flow_table(cpu)) - panic("NET: failed to allocate flow cache order %lu\n", order); - - flow_hash_rnd_recalc(cpu) = 1; - flow_count(cpu) = 0; - - tasklet = flow_flush_tasklet(cpu); - tasklet_init(tasklet, flow_cache_flush_tasklet, 0); + fcp->hash_table = (struct flow_cache_entry **) + __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); + if (!fcp->hash_table) + panic("NET: failed to allocate flow cache order %lu\n", fc->order); + + fcp->hash_rnd_recalc = 1; + fcp->hash_count = 0; + tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); } static int flow_cache_cpu(struct notifier_block *nfb, unsigned long action, void *hcpu) { + struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); + int cpu = (unsigned long) hcpu; + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - __flow_cache_shrink((unsigned long)hcpu, 0); + __flow_cache_shrink(fc, fcp, 0); return NOTIFY_OK; } -static int __init flow_cache_init(void) +static int flow_cache_init(struct flow_cache *fc) { + unsigned long order; int i; - flow_cachep = kmem_cache_create("flow_cache", - sizeof(struct flow_cache_entry), - 0, SLAB_PANIC, - NULL); - flow_hash_shift = 10; - flow_lwm = 2 * flow_hash_size; - flow_hwm = 4 * flow_hash_size; + fc->hash_shift = 10; + fc->low_watermark = 2 * flow_cache_hash_size(fc); + fc->high_watermark = 4 * flow_cache_hash_size(fc); + + for (order = 0; + (PAGE_SIZE << order) < + (sizeof(struct flow_cache_entry *)*flow_cache_hash_size(fc)); + order++) + /* NOTHING */; + fc->order = order; + fc->percpu = alloc_percpu(struct flow_cache_percpu); - setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0); - flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&flow_hash_rnd_timer); + setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, + (unsigned long) fc); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); for_each_possible_cpu(i) - flow_cache_cpu_prepare(i); + flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i)); + + fc->hotcpu_notifier = (struct notifier_block){ + .notifier_call = flow_cache_cpu, + }; + register_hotcpu_notifier(&fc->hotcpu_notifier); - hotcpu_notifier(flow_cache_cpu, 0); return 0; } -module_init(flow_cache_init); +static int __init flow_cache_init_global(void) +{ + flow_cachep = kmem_cache_create("flow_cache", + sizeof(struct flow_cache_entry), + 0, SLAB_PANIC, NULL); + + return flow_cache_init(&flow_cache_global); +} + +module_init(flow_cache_init_global); EXPORT_SYMBOL(flow_cache_genid); EXPORT_SYMBOL(flow_cache_lookup); -- cgit v1.2.3-70-g09d2 From fe1a5f031e76bd8761a7803d75b95ee96e84a574 Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Wed, 7 Apr 2010 00:30:04 +0000 Subject: flow: virtualize flow cache entry methods This allows to validate the cached object before 
returning it. It also allows to destruct object properly, if the last reference was held in flow cache. This is also a prepartion for caching bundles in the flow cache. In return for virtualizing the methods, we save on: - not having to regenerate the whole flow cache on policy removal: each flow matching a killed policy gets refreshed as the getter function notices it smartly. - we do not have to call flow_cache_flush from policy gc, since the flow cache now properly deletes the object if it had any references Signed-off-by: Timo Teras Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/flow.h | 23 +++++++-- include/net/xfrm.h | 2 + net/core/flow.c | 128 ++++++++++++++++++++++++++----------------------- net/xfrm/xfrm_policy.c | 112 ++++++++++++++++++++++++++++--------------- 4 files changed, 163 insertions(+), 102 deletions(-) (limited to 'net/core/flow.c') diff --git a/include/net/flow.h b/include/net/flow.h index 809970b7dfe..bb08692a20b 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -86,11 +86,26 @@ struct flowi { struct net; struct sock; -typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family, - u8 dir, void **objp, atomic_t **obj_refp); +struct flow_cache_ops; + +struct flow_cache_object { + const struct flow_cache_ops *ops; +}; + +struct flow_cache_ops { + struct flow_cache_object *(*get)(struct flow_cache_object *); + int (*check)(struct flow_cache_object *); + void (*delete)(struct flow_cache_object *); +}; + +typedef struct flow_cache_object *(*flow_resolve_t)( + struct net *net, struct flowi *key, u16 family, + u8 dir, struct flow_cache_object *oldobj, void *ctx); + +extern struct flow_cache_object *flow_cache_lookup( + struct net *net, struct flowi *key, u16 family, + u8 dir, flow_resolve_t resolver, void *ctx); -extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, - u8 dir, flow_resolve_t resolver); extern void flow_cache_flush(void); extern atomic_t flow_cache_genid; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d74e080ba6c..35396e2dd1d 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -481,6 +482,7 @@ struct xfrm_policy { atomic_t refcnt; struct timer_list timer; + struct flow_cache_object flo; u32 priority; u32 index; struct xfrm_mark mark; diff --git a/net/core/flow.c b/net/core/flow.c index 1d27ca6b421..521df52a77d 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -26,17 +26,16 @@ #include struct flow_cache_entry { - struct flow_cache_entry *next; - u16 family; - u8 dir; - u32 genid; - struct flowi key; - void *object; - atomic_t *object_ref; + struct flow_cache_entry *next; + u16 family; + u8 dir; + u32 genid; + struct flowi key; + struct flow_cache_object *object; }; struct flow_cache_percpu { - struct flow_cache_entry ** hash_table; + struct flow_cache_entry **hash_table; int hash_count; u32 hash_rnd; int hash_rnd_recalc; @@ -44,7 +43,7 @@ struct flow_cache_percpu { }; struct flow_flush_info { - struct flow_cache * cache; + struct flow_cache *cache; atomic_t cpuleft; struct completion completion; }; @@ -52,7 +51,7 @@ struct flow_flush_info { struct flow_cache { u32 hash_shift; unsigned long order; - struct flow_cache_percpu * percpu; + struct flow_cache_percpu *percpu; struct notifier_block hotcpu_notifier; int low_watermark; int high_watermark; @@ -78,12 +77,21 @@ static void flow_cache_new_hashrnd(unsigned long arg) add_timer(&fc->rnd_timer); } +static int flow_entry_valid(struct flow_cache_entry *fle) 
+{ + if (atomic_read(&flow_cache_genid) != fle->genid) + return 0; + if (fle->object && !fle->object->ops->check(fle->object)) + return 0; + return 1; +} + static void flow_entry_kill(struct flow_cache *fc, struct flow_cache_percpu *fcp, struct flow_cache_entry *fle) { if (fle->object) - atomic_dec(fle->object_ref); + fle->object->ops->delete(fle->object); kmem_cache_free(flow_cachep, fle); fcp->hash_count--; } @@ -96,16 +104,18 @@ static void __flow_cache_shrink(struct flow_cache *fc, int i; for (i = 0; i < flow_cache_hash_size(fc); i++) { - int k = 0; + int saved = 0; flp = &fcp->hash_table[i]; - while ((fle = *flp) != NULL && k < shrink_to) { - k++; - flp = &fle->next; - } while ((fle = *flp) != NULL) { - *flp = fle->next; - flow_entry_kill(fc, fcp, fle); + if (saved < shrink_to && + flow_entry_valid(fle)) { + saved++; + flp = &fle->next; + } else { + *flp = fle->next; + flow_entry_kill(fc, fcp, fle); + } } } } @@ -166,18 +176,21 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) return 0; } -void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver) +struct flow_cache_object * +flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver, void *ctx) { struct flow_cache *fc = &flow_cache_global; struct flow_cache_percpu *fcp; struct flow_cache_entry *fle, **head; + struct flow_cache_object *flo; unsigned int hash; local_bh_disable(); fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); fle = NULL; + flo = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. --RR */ if (!fcp->hash_table) @@ -185,27 +198,17 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, if (fcp->hash_rnd_recalc) flow_new_hash_rnd(fc, fcp); - hash = flow_hash_code(fc, fcp, key); + hash = flow_hash_code(fc, fcp, key); head = &fcp->hash_table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && - flow_key_compare(key, &fle->key) == 0) { - if (fle->genid == atomic_read(&flow_cache_genid)) { - void *ret = fle->object; - - if (ret) - atomic_inc(fle->object_ref); - local_bh_enable(); - - return ret; - } + flow_key_compare(key, &fle->key) == 0) break; - } } - if (!fle) { + if (unlikely(!fle)) { if (fcp->hash_count > fc->high_watermark) flow_cache_shrink(fc, fcp); @@ -219,33 +222,39 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, fle->object = NULL; fcp->hash_count++; } + } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { + flo = fle->object; + if (!flo) + goto ret_object; + flo = flo->ops->get(flo); + if (flo) + goto ret_object; + } else if (fle->object) { + flo = fle->object; + flo->ops->delete(flo); + fle->object = NULL; } nocache: - { - int err; - void *obj; - atomic_t *obj_ref; - - err = resolver(net, key, family, dir, &obj, &obj_ref); - - if (fle && !err) { - fle->genid = atomic_read(&flow_cache_genid); - - if (fle->object) - atomic_dec(fle->object_ref); - - fle->object = obj; - fle->object_ref = obj_ref; - if (obj) - atomic_inc(fle->object_ref); - } - local_bh_enable(); - - if (err) - obj = ERR_PTR(err); - return obj; + flo = NULL; + if (fle) { + flo = fle->object; + fle->object = NULL; + } + flo = resolver(net, key, family, dir, flo, ctx); + if (fle) { + fle->genid = atomic_read(&flow_cache_genid); + if (!IS_ERR(flo)) + fle->object = flo; + else + fle->genid--; + } else { + if (flo && !IS_ERR(flo)) + flo->ops->delete(flo); } +ret_object: + 
local_bh_enable(); + return flo; } static void flow_cache_flush_tasklet(unsigned long data) @@ -261,13 +270,12 @@ static void flow_cache_flush_tasklet(unsigned long data) fle = fcp->hash_table[i]; for (; fle; fle = fle->next) { - unsigned genid = atomic_read(&flow_cache_genid); - - if (!fle->object || fle->genid == genid) + if (flow_entry_valid(fle)) continue; + if (fle->object) + fle->object->ops->delete(fle->object); fle->object = NULL; - atomic_dec(fle->object_ref); } } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 82789cf1c63..7722baeb140 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -216,6 +216,35 @@ expired: xfrm_pol_put(xp); } +static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo) +{ + struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo); + + if (unlikely(pol->walk.dead)) + flo = NULL; + else + xfrm_pol_hold(pol); + + return flo; +} + +static int xfrm_policy_flo_check(struct flow_cache_object *flo) +{ + struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo); + + return !pol->walk.dead; +} + +static void xfrm_policy_flo_delete(struct flow_cache_object *flo) +{ + xfrm_pol_put(container_of(flo, struct xfrm_policy, flo)); +} + +static const struct flow_cache_ops xfrm_policy_fc_ops = { + .get = xfrm_policy_flo_get, + .check = xfrm_policy_flo_check, + .delete = xfrm_policy_flo_delete, +}; /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. @@ -236,6 +265,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) atomic_set(&policy->refcnt, 1); setup_timer(&policy->timer, xfrm_policy_timer, (unsigned long)policy); + policy->flo.ops = &xfrm_policy_fc_ops; } return policy; } @@ -269,9 +299,6 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy) if (del_timer(&policy->timer)) atomic_dec(&policy->refcnt); - if (atomic_read(&policy->refcnt) > 1) - flow_cache_flush(); - xfrm_pol_put(policy); } @@ -661,10 +688,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, } write_unlock_bh(&xfrm_policy_lock); - if (ret && delete) { - atomic_inc(&flow_cache_genid); + if (ret && delete) xfrm_policy_kill(ret); - } return ret; } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); @@ -703,10 +728,8 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, } write_unlock_bh(&xfrm_policy_lock); - if (ret && delete) { - atomic_inc(&flow_cache_genid); + if (ret && delete) xfrm_policy_kill(ret); - } return ret; } EXPORT_SYMBOL(xfrm_policy_byid); @@ -822,7 +845,6 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info) } if (!cnt) err = -ESRCH; - atomic_inc(&flow_cache_genid); out: write_unlock_bh(&xfrm_policy_lock); return err; @@ -976,32 +998,35 @@ fail: return ret; } -static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, - u8 dir, void **objp, atomic_t **obj_refp) +static struct flow_cache_object * +xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, + u8 dir, struct flow_cache_object *old_obj, void *ctx) { struct xfrm_policy *pol; - int err = 0; + + if (old_obj) + xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo)); #ifdef CONFIG_XFRM_SUB_POLICY pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); - if (IS_ERR(pol)) { - err = PTR_ERR(pol); - pol = NULL; - } - if (pol || err) - goto end; + if (IS_ERR(pol)) + return ERR_CAST(pol); + if (pol) + goto found; #endif pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, 
family, dir); - if (IS_ERR(pol)) { - err = PTR_ERR(pol); - pol = NULL; - } -#ifdef CONFIG_XFRM_SUB_POLICY -end: -#endif - if ((*objp = (void *) pol) != NULL) - *obj_refp = &pol->refcnt; - return err; + if (IS_ERR(pol)) + return ERR_CAST(pol); + if (pol) + goto found; + return NULL; + +found: + /* Resolver returns two references: + * one for cache and one for caller of flow_cache_lookup() */ + xfrm_pol_hold(pol); + + return &pol->flo; } static inline int policy_to_flow_dir(int dir) @@ -1091,8 +1116,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) pol = __xfrm_policy_unlink(pol, dir); write_unlock_bh(&xfrm_policy_lock); if (pol) { - if (dir < XFRM_POLICY_MAX) - atomic_inc(&flow_cache_genid); xfrm_policy_kill(pol); return 0; } @@ -1578,18 +1601,24 @@ restart: } if (!policy) { + struct flow_cache_object *flo; + /* To accelerate a bit... */ if ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; - policy = flow_cache_lookup(net, fl, dst_orig->ops->family, - dir, xfrm_policy_lookup); - err = PTR_ERR(policy); - if (IS_ERR(policy)) { + flo = flow_cache_lookup(net, fl, dst_orig->ops->family, + dir, xfrm_policy_lookup, NULL); + err = PTR_ERR(flo); + if (IS_ERR(flo)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); goto dropdst; } + if (flo) + policy = container_of(flo, struct xfrm_policy, flo); + else + policy = NULL; } if (!policy) @@ -1939,9 +1968,16 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } } - if (!pol) - pol = flow_cache_lookup(net, &fl, family, fl_dir, - xfrm_policy_lookup); + if (!pol) { + struct flow_cache_object *flo; + + flo = flow_cache_lookup(net, &fl, family, fl_dir, + xfrm_policy_lookup, NULL); + if (IS_ERR_OR_NULL(flo)) + pol = ERR_CAST(flo); + else + pol = container_of(flo, struct xfrm_policy, flo); + } if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); -- cgit v1.2.3-70-g09d2 From 8e4795605d1e1b39113818ad7c147b8a867a1f6a Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Wed, 7 Apr 2010 00:30:07 +0000 Subject: flow: delayed deletion of flow cache entries Speed up lookups by freeing flow cache entries later. After virtualizing flow cache entry operations, the flow cache may now end up calling policy or bundle destructor which can be slowish. As gc_list is more effective with double linked list, the flow cache is converted to use common hlist and list macroes where appropriate. Signed-off-by: Timo Teras Signed-off-by: David S. 
Miller --- net/core/flow.c | 100 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 31 deletions(-) (limited to 'net/core/flow.c') diff --git a/net/core/flow.c b/net/core/flow.c index 521df52a77d..16190067400 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -26,7 +26,10 @@ #include struct flow_cache_entry { - struct flow_cache_entry *next; + union { + struct hlist_node hlist; + struct list_head gc_list; + } u; u16 family; u8 dir; u32 genid; @@ -35,7 +38,7 @@ struct flow_cache_entry { }; struct flow_cache_percpu { - struct flow_cache_entry **hash_table; + struct hlist_head *hash_table; int hash_count; u32 hash_rnd; int hash_rnd_recalc; @@ -62,6 +65,9 @@ atomic_t flow_cache_genid = ATOMIC_INIT(0); static struct flow_cache flow_cache_global; static struct kmem_cache *flow_cachep; +static DEFINE_SPINLOCK(flow_cache_gc_lock); +static LIST_HEAD(flow_cache_gc_list); + #define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) #define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) @@ -86,38 +92,66 @@ static int flow_entry_valid(struct flow_cache_entry *fle) return 1; } -static void flow_entry_kill(struct flow_cache *fc, - struct flow_cache_percpu *fcp, - struct flow_cache_entry *fle) +static void flow_entry_kill(struct flow_cache_entry *fle) { if (fle->object) fle->object->ops->delete(fle->object); kmem_cache_free(flow_cachep, fle); - fcp->hash_count--; +} + +static void flow_cache_gc_task(struct work_struct *work) +{ + struct list_head gc_list; + struct flow_cache_entry *fce, *n; + + INIT_LIST_HEAD(&gc_list); + spin_lock_bh(&flow_cache_gc_lock); + list_splice_tail_init(&flow_cache_gc_list, &gc_list); + spin_unlock_bh(&flow_cache_gc_lock); + + list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) + flow_entry_kill(fce); +} +static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task); + +static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, + int deleted, struct list_head *gc_list) +{ + if (deleted) { + fcp->hash_count -= deleted; + spin_lock_bh(&flow_cache_gc_lock); + list_splice_tail(gc_list, &flow_cache_gc_list); + spin_unlock_bh(&flow_cache_gc_lock); + schedule_work(&flow_cache_gc_work); + } } static void __flow_cache_shrink(struct flow_cache *fc, struct flow_cache_percpu *fcp, int shrink_to) { - struct flow_cache_entry *fle, **flp; - int i; + struct flow_cache_entry *fle; + struct hlist_node *entry, *tmp; + LIST_HEAD(gc_list); + int i, deleted = 0; for (i = 0; i < flow_cache_hash_size(fc); i++) { int saved = 0; - flp = &fcp->hash_table[i]; - while ((fle = *flp) != NULL) { + hlist_for_each_entry_safe(fle, entry, tmp, + &fcp->hash_table[i], u.hlist) { if (saved < shrink_to && flow_entry_valid(fle)) { saved++; - flp = &fle->next; } else { - *flp = fle->next; - flow_entry_kill(fc, fcp, fle); + deleted++; + hlist_del(&fle->u.hlist); + list_add_tail(&fle->u.gc_list, &gc_list); } } } + + flow_cache_queue_garbage(fcp, deleted, &gc_list); } static void flow_cache_shrink(struct flow_cache *fc, @@ -182,7 +216,8 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, { struct flow_cache *fc = &flow_cache_global; struct flow_cache_percpu *fcp; - struct flow_cache_entry *fle, **head; + struct flow_cache_entry *fle, *tfle; + struct hlist_node *entry; struct flow_cache_object *flo; unsigned int hash; @@ -200,12 +235,13 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, flow_new_hash_rnd(fc, fcp); hash = flow_hash_code(fc, fcp, key); - head = &fcp->hash_table[hash]; - for (fle = *head; fle; fle = fle->next) { - 
if (fle->family == family && - fle->dir == dir && - flow_key_compare(key, &fle->key) == 0) + hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) { + if (tfle->family == family && + tfle->dir == dir && + flow_key_compare(key, &tfle->key) == 0) { + fle = tfle; break; + } } if (unlikely(!fle)) { @@ -214,12 +250,11 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); if (fle) { - fle->next = *head; - *head = fle; fle->family = family; fle->dir = dir; memcpy(&fle->key, key, sizeof(*key)); fle->object = NULL; + hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); fcp->hash_count++; } } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { @@ -262,23 +297,26 @@ static void flow_cache_flush_tasklet(unsigned long data) struct flow_flush_info *info = (void *)data; struct flow_cache *fc = info->cache; struct flow_cache_percpu *fcp; - int i; + struct flow_cache_entry *fle; + struct hlist_node *entry, *tmp; + LIST_HEAD(gc_list); + int i, deleted = 0; fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); for (i = 0; i < flow_cache_hash_size(fc); i++) { - struct flow_cache_entry *fle; - - fle = fcp->hash_table[i]; - for (; fle; fle = fle->next) { + hlist_for_each_entry_safe(fle, entry, tmp, + &fcp->hash_table[i], u.hlist) { if (flow_entry_valid(fle)) continue; - if (fle->object) - fle->object->ops->delete(fle->object); - fle->object = NULL; + deleted++; + hlist_del(&fle->u.hlist); + list_add_tail(&fle->u.gc_list, &gc_list); } } + flow_cache_queue_garbage(fcp, deleted, &gc_list); + if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); } @@ -320,7 +358,7 @@ void flow_cache_flush(void) static void __init flow_cache_cpu_prepare(struct flow_cache *fc, struct flow_cache_percpu *fcp) { - fcp->hash_table = (struct flow_cache_entry **) + fcp->hash_table = (struct hlist_head *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); if (!fcp->hash_table) panic("NET: failed to allocate flow cache order %lu\n", fc->order); @@ -354,7 +392,7 @@ static int flow_cache_init(struct flow_cache *fc) for (order = 0; (PAGE_SIZE << order) < - (sizeof(struct flow_cache_entry *)*flow_cache_hash_size(fc)); + (sizeof(struct hlist_head)*flow_cache_hash_size(fc)); order++) /* NOTHING */; fc->order = order; -- cgit v1.2.3-70-g09d2
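
Note on the interface introduced above: the second patch replaces the old (void *object, atomic_t *object_ref) pair with a small virtual interface, struct flow_cache_ops, whose get/check/delete methods let the flow cache validate and release cached objects itself. The sketch below shows how a cache user is expected to plug into it. It is illustrative only and not part of the patches: struct my_cached_obj, its dead flag, its refcounting and my_resolver are hypothetical names, while struct flow_cache_object, struct flow_cache_ops, flow_cache_lookup() and the "resolver returns two references" convention are taken from the patches above (the real in-tree user is the xfrm_policy conversion in the second patch).

#include <linux/slab.h>
#include <linux/err.h>
#include <net/flow.h>	/* flow_cache_object, flow_cache_ops; pulls in atomic ops */

struct my_cached_obj {
	atomic_t			refcnt;
	int				dead;
	struct flow_cache_object	flo;	/* embedded cache handle */
};

static struct flow_cache_object *my_flo_get(struct flow_cache_object *flo)
{
	struct my_cached_obj *obj = container_of(flo, struct my_cached_obj, flo);

	/* Returning NULL forces flow_cache_lookup() to re-resolve; the
	 * cache's reference is not dropped here, it is handed to the
	 * resolver as oldobj. */
	if (obj->dead)
		return NULL;
	atomic_inc(&obj->refcnt);
	return flo;
}

static int my_flo_check(struct flow_cache_object *flo)
{
	/* Non-zero: the cached entry is still valid and may be kept. */
	return !container_of(flo, struct my_cached_obj, flo)->dead;
}

static void my_flo_delete(struct flow_cache_object *flo)
{
	struct my_cached_obj *obj = container_of(flo, struct my_cached_obj, flo);

	/* Drop the reference the flow cache held; free on the last put. */
	if (atomic_dec_and_test(&obj->refcnt))
		kfree(obj);
}

static const struct flow_cache_ops my_flo_ops = {
	.get	= my_flo_get,
	.check	= my_flo_check,
	.delete	= my_flo_delete,
};

static struct flow_cache_object *my_resolver(struct net *net, struct flowi *key,
					     u16 family, u8 dir,
					     struct flow_cache_object *oldobj,
					     void *ctx)
{
	struct my_cached_obj *obj;

	/* A stale object (if any) arrives here together with the
	 * reference the cache held on it; the resolver must put it. */
	if (oldobj)
		my_flo_delete(oldobj);

	/* flow_cache_lookup() calls the resolver with BHs disabled. */
	obj = kzalloc(sizeof(*obj), GFP_ATOMIC);
	if (!obj)
		return ERR_PTR(-ENOMEM);

	obj->flo.ops = &my_flo_ops;
	/* Two references: one kept by the flow cache entry, one returned
	 * to the caller of flow_cache_lookup(). */
	atomic_set(&obj->refcnt, 2);
	return &obj->flo;
}

A caller would then do flo = flow_cache_lookup(net, fl, family, dir, my_resolver, NULL) and, when flo is neither NULL nor an ERR_PTR(), recover the object with container_of(flo, struct my_cached_obj, flo), exactly as __xfrm_policy_check() and xfrm_lookup() do after the second patch. Two conventions are worth noting, both visible in the xfrm_policy hooks: get() returning NULL deliberately leaves the cache's reference in place so the resolver receives and releases it via oldobj; and after the third patch, delete() may also run from the garbage-collection work item in process context, not only from the BH-disabled lookup path, which is what allows slow destructors to be taken off the fast path.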