From 6292d9aaf3047f1abd970bc64ab6d952eda258ac Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Wed, 1 Feb 2006 03:04:44 -0800 Subject: [PATCH] __cpuinit functions wrongly marked __meminit __meminit has overzelously been modified and crept its way into marking cpuup callbacks as __meminit. Signed-off-by: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df54e2fc8ee..44b4eb4202d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1799,7 +1799,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif -static int __meminit zone_batchsize(struct zone *zone) +static int __cpuinit zone_batchsize(struct zone *zone) { int batch; @@ -1893,7 +1893,7 @@ static struct per_cpu_pageset * Dynamically allocate memory for the * per cpu pageset array in struct zone. */ -static int __meminit process_zones(int cpu) +static int __cpuinit process_zones(int cpu) { struct zone *zone, *dzone; @@ -1934,7 +1934,7 @@ static inline void free_zone_pagesets(int cpu) } } -static int __meminit pageset_cpuup_callback(struct notifier_block *nfb, +static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { -- cgit v1.2.3-70-g09d2 From 8928862398fef04a137e5673ac5fa9e797960c87 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:25 -0800 Subject: [PATCH] Optimize off-node performance of zone reclaim Ensure that the performance of off node pages stays the same as before. Off node pagefault tests showed an 18% drop in performance without this patch. - Increase the timeout to 30 seconds to reduce the overhead. - Move all code possible out of the off node hot path for zone reclaim (Sorry Andrew, the struct initialization had to be sacrificed). The read_page_state() bit us there. - Check first for the timeout before any other checks. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2e34b61a70c..465bfa54dfd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1589,24 +1589,20 @@ int zone_reclaim_mode __read_mostly; /* * Mininum time between zone reclaim scans */ -#define ZONE_RECLAIM_INTERVAL HZ/2 +#define ZONE_RECLAIM_INTERVAL 30*HZ /* * Try to free up some pages from this zone through reclaim. 
*/ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - int nr_pages = 1 << order; + int nr_pages; struct task_struct *p = current; struct reclaim_state reclaim_state; - struct scan_control sc = { - .gfp_mask = gfp_mask, - .may_writepage = 0, - .may_swap = 0, - .nr_mapped = read_page_state(nr_mapped), - .nr_scanned = 0, - .nr_reclaimed = 0, - .priority = 0 - }; + struct scan_control sc; + + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + return 0; if (!(gfp_mask & __GFP_WAIT) || zone->zone_pgdat->node_id != numa_node_id() || @@ -1614,12 +1610,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) atomic_read(&zone->reclaim_in_progress) > 0) return 0; - if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) - return 0; + sc.may_writepage = 0; + sc.may_swap = 0; + sc.nr_scanned = 0; + sc.nr_reclaimed = 0; + sc.priority = 0; + sc.nr_mapped = read_page_state(nr_mapped); + sc.gfp_mask = gfp_mask; disable_swap_token(); + nr_pages = 1 << order; if (nr_pages > SWAP_CLUSTER_MAX) sc.swap_cluster_max = nr_pages; else -- cgit v1.2.3-70-g09d2 From 42c722d4cb4022e56ff200f3f5a58c0dfd7edac6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:26 -0800 Subject: [PATCH] zone_reclaim: reclaim on memory only node support Zone reclaim is usually only run on the local node. Headless nodes do not have any local processors. This patch checks for headless nodes and performs zone reclaim on them. Signed-off-by: Christoph Lameter Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 465bfa54dfd..0ca6007d655 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1599,17 +1599,23 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; struct scan_control sc; + cpumask_t mask; + int node_id; if (time_before(jiffies, zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) return 0; if (!(gfp_mask & __GFP_WAIT) || - zone->zone_pgdat->node_id != numa_node_id() || zone->all_unreclaimable || atomic_read(&zone->reclaim_in_progress) > 0) return 0; + node_id = zone->zone_pgdat->node_id; + mask = node_to_cpumask(node_id); + if (!cpus_empty(mask) && node_id != numa_node_id()) + return 0; + sc.may_writepage = 0; sc.may_swap = 0; sc.nr_scanned = 0; -- cgit v1.2.3-70-g09d2 From 52a8363eae3872af15880292ff4e06d0fab36986 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:28 -0800 Subject: [PATCH] mm: improve function of sc->may_writepage Make sc->may_writepage control the writeout behavior of shrink_list. Remove the laptop_mode trick from shrink_list and instead set may_writepage in try_to_free_pages properly. 
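Illustrative sketch of the may_writepage change described above (plain userspace C, not part of the patch; scan_control_model and should_write_dirty_page are invented names, while laptop_mode and may_writepage mirror the kernel symbols): the laptop_mode policy decision moves to the single place where the scan control is initialized, and the scanner only consults the flag.

#include <stdbool.h>
#include <stdio.h>

static bool laptop_mode;                /* stand-in for the kernel global */

struct scan_control_model {
        bool may_writepage;             /* may the scanner write out dirty pages? */
};

/* Consumer side (shrink_list in the patch): no direct laptop_mode check. */
static bool should_write_dirty_page(const struct scan_control_model *sc)
{
        return sc->may_writepage;
}

int main(void)
{
        struct scan_control_model sc = {
                /* Set once at the entry point (try_to_free_pages/balance_pgdat
                 * in the patch): write pages out unless running in laptop mode. */
                .may_writepage = !laptop_mode,
        };

        printf("write dirty pages during scan: %s\n",
               should_write_dirty_page(&sc) ? "yes" : "no");
        return 0;
}
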
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 0ca6007d655..a29efb2c06c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -492,7 +492,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) goto keep_locked; if (!may_enter_fs) goto keep_locked; - if (laptop_mode && !sc->may_writepage) + if (!sc->may_writepage) goto keep_locked; /* Page is dirty, try to write it out here */ @@ -1170,7 +1170,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) int i; sc.gfp_mask = gfp_mask; - sc.may_writepage = 0; + sc.may_writepage = !laptop_mode; sc.may_swap = 1; inc_page_state(allocstall); @@ -1273,7 +1273,7 @@ loop_again: total_scanned = 0; total_reclaimed = 0; sc.gfp_mask = GFP_KERNEL; - sc.may_writepage = 0; + sc.may_writepage = !laptop_mode; sc.may_swap = 1; sc.nr_mapped = read_page_state(nr_mapped); -- cgit v1.2.3-70-g09d2 From c84db23c6e587d3ab00a41c51fedf758e1f6ecd4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:29 -0800 Subject: [PATCH] zone_reclaim: minor fixes - If we only reclaim nr_pages then its okay to stay on node. Switch from > to >= for the comparison. - vm_table[] entry for zone_reclaim_mode is a bit screwed up. - Add empty lines around shrink_zone to show that this is the central function to be called. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 ++- mm/vmscan.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb99a42f8b3..c74f03bc014 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -878,7 +878,8 @@ static ctl_table vm_table[] = { .maxlen = sizeof(zone_reclaim_mode), .mode = 0644, .proc_handler = &proc_dointvec, - .strategy = &zero, + .strategy = &sysctl_intvec, + .extra1 = &zero, }, #endif { .ctl_name = 0 } diff --git a/mm/vmscan.c b/mm/vmscan.c index a29efb2c06c..61ca0097c83 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1636,14 +1636,16 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; + shrink_zone(zone, &sc); + p->reclaim_state = NULL; current->flags &= ~PF_MEMALLOC; if (sc.nr_reclaimed == 0) zone->last_unsuccessful_zone_reclaim = jiffies; - return sc.nr_reclaimed > nr_pages; + return sc.nr_reclaimed >= nr_pages; } #endif -- cgit v1.2.3-70-g09d2 From 9884fd8df195fe48d4e1be2279b419be96127cae Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Wed, 1 Feb 2006 03:05:30 -0800 Subject: [PATCH] Use 32 bit division in slab_put_obj() Improve the performance of slab_put_obj(). Without the cast, gcc considers ptrdiff_t a 64 bit signed integer and ends up emitting code to use a full signed 128 bit divide on EM64T, which is substantially slower than a 32 bit unsigned divide. I noticed this when looking at the profile of a case where the slab balance is just on edge and thrashes back and forth freeing a block. 
Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6f8495e2185..88082ae1573 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1398,7 +1398,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) struct slab *slabp = page_get_slab(virt_to_page(objp)); int objnr; - objnr = (objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; if (objnr) { objp = slabp->s_mem + (objnr - 1) * cachep->objsize; realobj = (char *)objp + obj_dbghead(cachep); @@ -2341,7 +2341,7 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = caller; - objnr = (objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; BUG_ON(objnr >= cachep->num); BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); @@ -2699,7 +2699,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, slabp = page_get_slab(virt_to_page(objp)); l3 = cachep->nodelists[node]; list_del(&slabp->list); - objnr = (objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; check_spinlock_acquired_node(cachep, node); check_slabp(cachep, slabp); -- cgit v1.2.3-70-g09d2 From aa3f18b3391ac305baa01faead3fdf9147daf54b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:32 -0800 Subject: [PATCH] zone_reclaim: do not unmap file backed pages zone_reclaim should leave that to the real swapper. We are only interested in evicting unmapped pages. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 61ca0097c83..8277f93148b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -477,6 +477,12 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { + /* + * No unmapping if we do not swap + */ + if (!sc->may_swap) + goto keep_locked; + switch (try_to_unmap(page)) { case SWAP_FAIL: goto activate_locked; -- cgit v1.2.3-70-g09d2 From a92f71263af9d0ab77c260f709c0c079656221aa Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:32 -0800 Subject: [PATCH] zone_reclaim: partial scans instead of full scan Instead of scanning all the pages in a zone, imitate real swap and scan only a portion of the pages and gradually scan more if we do not free up enough pages. This avoids a zone suddenly loosing all unused pagecache pages (we may after all access some of these again so they deserve another chance) but it still frees up large chunks of memory if a zone only contains unused pagecache pages. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8277f93148b..f8b94ea6f72 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1596,6 +1596,14 @@ int zone_reclaim_mode __read_mostly; * Mininum time between zone reclaim scans */ #define ZONE_RECLAIM_INTERVAL 30*HZ + +/* + * Priority for ZONE_RECLAIM. This determines the fraction of pages + * of a node considered for each zone_reclaim. 
4 scans 1/16th of + * a zone. + */ +#define ZONE_RECLAIM_PRIORITY 4 + /* * Try to free up some pages from this zone through reclaim. */ @@ -1626,7 +1634,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) sc.may_swap = 0; sc.nr_scanned = 0; sc.nr_reclaimed = 0; - sc.priority = 0; + sc.priority = ZONE_RECLAIM_PRIORITY + 1; sc.nr_mapped = read_page_state(nr_mapped); sc.gfp_mask = gfp_mask; @@ -1643,7 +1651,15 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - shrink_zone(zone, &sc); + /* + * Free memory by calling shrink zone with increasing priorities + * until we have enough memory freed. + */ + do { + sc.priority--; + shrink_zone(zone, &sc); + + } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); p->reclaim_state = NULL; current->flags &= ~PF_MEMALLOC; -- cgit v1.2.3-70-g09d2 From 2a11ff06d7d12be5d1bbcf592fff649b45ac2388 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:33 -0800 Subject: [PATCH] zone_reclaim: configurable off node allocation period. Currently the zone_reclaim code has a fixed window of 30 seconds of off node allocations should a local zone have no unused pagecache pages left. Reclaim will be attempted again after this timeout period to avoid repeated useless scans for memory. This is also useful to established sufficiently large off node allocation chunks to relieve the local node. It may be beneficial to adjust that time period for some special situations. For example if memory use was exceeding node capacity one may want to give up for longer periods of time. If memory spikes intermittendly then one may want to shorten the time period to reduce the number of off node allocations. This patch allows just that.... Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++++++++ include/linux/swap.h | 1 + include/linux/sysctl.h | 3 ++- kernel/sysctl.c | 9 +++++++++ mm/vmscan.c | 4 ++-- 5 files changed, 26 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 391dd64363e..44518c02394 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/vm: - block_dump - drop-caches - zone_reclaim_mode +- zone_reclaim_interval ============================================================== @@ -137,4 +138,15 @@ of memory should be used for caching files from disk. It may be beneficial to switch this on if one wants to do zone reclaim regardless of the numa distances in the system. +================================================================ + +zone_reclaim_interval: + +The time allowed for off node allocations after zone reclaim +has failed to reclaim enough pages to allow a local allocation. + +Time is set in seconds and set by default to 30 seconds. + +Reduce the interval if undesired off node allocations occur. However, too +frequent scans will have a negative impact onoff node allocation performance. 
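Usage sketch only, not part of the patch: on a NUMA kernel carrying this change, the interval is exposed as /proc/sys/vm/zone_reclaim_interval and, via proc_dointvec_jiffies, is read and written in seconds. A hypothetical userspace helper might look like:

#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/vm/zone_reclaim_interval";
        FILE *f = fopen(path, "r");
        int seconds;

        if (f) {
                if (fscanf(f, "%d", &seconds) == 1)
                        printf("current interval: %d seconds\n", seconds);
                fclose(f);
        }

        /* Back off for 60 seconds after a failed zone reclaim instead of
         * the default 30 (needs root, and a kernel with this patch). */
        f = fopen(path, "w");
        if (f) {
                fprintf(f, "60\n");
                fclose(f);
        }
        return 0;
}
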
diff --git a/include/linux/swap.h b/include/linux/swap.h index 4a99e4a7fbf..e53fef7051e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -178,6 +178,7 @@ extern int vm_swappiness; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; +extern int zone_reclaim_interval; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 8352a7ce589..32a4139c4ad 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -182,7 +182,8 @@ enum VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ - VM_ZONE_RECLAIM_MODE=31,/* reclaim local zone memory before going off node */ + VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ + VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c74f03bc014..71dd6f62efe 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -881,6 +881,15 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_ZONE_RECLAIM_INTERVAL, + .procname = "zone_reclaim_interval", + .data = &zone_reclaim_interval, + .maxlen = sizeof(zone_reclaim_interval), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, #endif { .ctl_name = 0 } }; diff --git a/mm/vmscan.c b/mm/vmscan.c index f8b94ea6f72..8760a4abfa1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1595,7 +1595,7 @@ int zone_reclaim_mode __read_mostly; /* * Mininum time between zone reclaim scans */ -#define ZONE_RECLAIM_INTERVAL 30*HZ +int zone_reclaim_interval __read_mostly = 30*HZ; /* * Priority for ZONE_RECLAIM. This determines the fraction of pages @@ -1617,7 +1617,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int node_id; if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) return 0; if (!(gfp_mask & __GFP_WAIT) || -- cgit v1.2.3-70-g09d2 From 1b2ffb7896ad46067f5b9ebf7de1891d74a4cdef Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:34 -0800 Subject: [PATCH] Zone reclaim: Allow modification of zone reclaim behavior In some situations one may want zone_reclaim to behave differently. For example a process writing large amounts of memory will spew unto other nodes to cache the writes if many pages in a zone become dirty. This may impact the performance of processes running on other nodes. Allowing writes during reclaim puts a stop to that behavior and throttles the process by restricting the pages to the local zone. Similarly one may want to contain processes to local memory by enabling regular swap behavior during zone_reclaim. Off node memory allocation can then be controlled through memory policies and cpusets. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 38 ++++++++++++++++++++++++++++++-------- mm/vmscan.c | 9 +++++++-- 2 files changed, 37 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 44518c02394..4bca2a3d917 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -127,17 +127,39 @@ the high water marks for each per cpu page list. 
zone_reclaim_mode: -This is set during bootup to 1 if it is determined that pages from -remote zones will cause a significant performance reduction. The +Zone_reclaim_mode allows to set more or less agressive approaches to +reclaim memory when a zone runs out of memory. If it is set to zero then no +zone reclaim occurs. Allocations will be satisfied from other zones / nodes +in the system. + +This is value ORed together of + +1 = Zone reclaim on +2 = Zone reclaim writes dirty pages out +4 = Zone reclaim swaps pages + +zone_reclaim_mode is set during bootup to 1 if it is determined that pages +from remote zones will cause a measurable performance reduction. The page allocator will then reclaim easily reusable pages (those page -cache pages that are currently not used) before going off node. +cache pages that are currently not used) before allocating off node pages. + +It may be beneficial to switch off zone reclaim if the system is +used for a file server and all of memory should be used for caching files +from disk. In that case the caching effect is more important than +data locality. + +Allowing zone reclaim to write out pages stops processes that are +writing large amounts of data from dirtying pages on other nodes. Zone +reclaim will write out dirty pages if a zone fills up and so effectively +throttle the process. This may decrease the performance of a single process +since it cannot use all of system memory to buffer the outgoing writes +anymore but it preserve the memory on other nodes so that the performance +of other processes running on other nodes will not be affected. -The user can override this setting. It may be beneficial to switch -off zone reclaim if the system is used for a file server and all -of memory should be used for caching files from disk. +Allowing regular swap effectively restricts allocations to the local +node unless explicitly overridden by memory policies or cpuset +configurations. -It may be beneficial to switch this on if one wants to do zone -reclaim regardless of the numa distances in the system. ================================================================ zone_reclaim_interval: diff --git a/mm/vmscan.c b/mm/vmscan.c index 8760a4abfa1..9e2ef3624d7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1592,6 +1592,11 @@ module_init(kswapd_init) */ int zone_reclaim_mode __read_mostly; +#define RECLAIM_OFF 0 +#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ + /* * Mininum time between zone reclaim scans */ @@ -1630,8 +1635,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) if (!cpus_empty(mask) && node_id != numa_node_id()) return 0; - sc.may_writepage = 0; - sc.may_swap = 0; + sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); + sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); sc.nr_scanned = 0; sc.nr_reclaimed = 0; sc.priority = ZONE_RECLAIM_PRIORITY + 1; -- cgit v1.2.3-70-g09d2 From 2a16e3f4b0c408b9e50297d2ec27e295d490267a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:35 -0800 Subject: [PATCH] Reclaim slab during zone reclaim If large amounts of zone memory are used by empty slabs then zone_reclaim becomes uneffective. This patch shakes the slab a bit. The problem with this patch is that the slab reclaim is not containable to a zone. Thus slab reclaim may affect the whole system and be extremely slow. 
This also means that we cannot determine how many pages were freed in this zone. Thus we need to go off node for at least one allocation. The functionality is disabled by default. We could modify the shrinkers to take a zone parameter but that would be quite invasive. Better ideas are welcome. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 6 ++++++ mm/vmscan.c | 14 ++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'mm') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 4bca2a3d917..a46c10fcddf 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -137,6 +137,7 @@ This is value ORed together of 1 = Zone reclaim on 2 = Zone reclaim writes dirty pages out 4 = Zone reclaim swaps pages +8 = Also do a global slab reclaim pass zone_reclaim_mode is set during bootup to 1 if it is determined that pages from remote zones will cause a measurable performance reduction. The @@ -160,6 +161,11 @@ Allowing regular swap effectively restricts allocations to the local node unless explicitly overridden by memory policies or cpuset configurations. +It may be advisable to allow slab reclaim if the system makes heavy +use of files and builds up large slab caches. However, the slab +shrink operation is global, may take a long time and free slabs +in all nodes of the system. + ================================================================ zone_reclaim_interval: diff --git a/mm/vmscan.c b/mm/vmscan.c index 9e2ef3624d7..aa4b80dbe3a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1596,6 +1596,7 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ /* * Mininum time between zone reclaim scans @@ -1666,6 +1667,19 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); + if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + /* + * shrink_slab does not currently allow us to determine + * how many pages were freed in the zone. So we just + * shake the slab and then go offnode for a single allocation. + * + * shrink_slab will free memory on all zones and may take + * a long time. + */ + shrink_slab(sc.nr_scanned, gfp_mask, order); + sc.nr_reclaimed = 1; /* Avoid getting the off node timeout */ + } + p->reclaim_state = NULL; current->flags &= ~PF_MEMALLOC; -- cgit v1.2.3-70-g09d2 From b16664e44c54525be89dc07ad15a13b4eeec5634 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:36 -0800 Subject: [PATCH] Direct Migration V9: PageSwapCache checks Check for PageSwapCache after looking up and locking a swap page. The page migration code may change a swap pte to point to a different page under lock_page(). If that happens then the vm must retry the lookup operation in the swap space to find the correct page number. There are a couple of locations in the VM where a lock_page() is done on a swap page. In these locations we need to check afterwards if the page was migrated. If the page was migrated then the old page that was looked up before was freed and no longer has the PageSwapCache bit set. 
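A simplified, single-threaded userspace model of the lock-and-revalidate pattern this patch adds (not kernel code; cache_entry, current_entry and use_entry are invented names): after taking the lock, re-check that the object is still valid; if migration invalidated it between lookup and lock, drop the lock and redo the lookup, just as the PageSwapCache recheck redoes the swap cache lookup.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct cache_entry {
        pthread_mutex_t lock;
        bool valid;             /* cleared when the entry is "migrated" away */
};

static struct cache_entry a = { PTHREAD_MUTEX_INITIALIZER, true };
static struct cache_entry *current_entry = &a;  /* what "lookup" returns */

static void use_entry(void)
{
        struct cache_entry *e;
again:
        e = current_entry;                /* the lookup_swap_cache analogue */
        pthread_mutex_lock(&e->lock);     /* the lock_page analogue */
        if (!e->valid) {                  /* entry migrated under us? */
                pthread_mutex_unlock(&e->lock);
                goto again;               /* redo the lookup from scratch */
        }
        /* ... the entry is stable while its lock is held ... */
        pthread_mutex_unlock(&e->lock);
}

int main(void)
{
        use_entry();
        puts("done");
        return 0;
}
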
Signed-off-by: Hirokazu Takahashi Signed-off-by: Dave Hansen Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 +++++++ mm/shmem.c | 8 ++++++++ mm/swapfile.c | 7 +++++++ 3 files changed, 22 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7a11ddd5060..2bee1f21aa8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1871,6 +1871,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); +again: page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1894,6 +1895,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, mark_page_accessed(page); lock_page(page); + if (!PageSwapCache(page)) { + /* Page migration has occured */ + unlock_page(page); + page_cache_release(page); + goto again; + } /* * Back out if somebody else already faulted in this pte. diff --git a/mm/shmem.c b/mm/shmem.c index ce501bce1c2..f7ac7b812f9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1028,6 +1028,14 @@ repeat: page_cache_release(swappage); goto repeat; } + if (!PageSwapCache(swappage)) { + /* Page migration has occured */ + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + unlock_page(swappage); + page_cache_release(swappage); + goto repeat; + } if (PageWriteback(swappage)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index f1e69c30d20..9678182e0ee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -646,6 +646,7 @@ static int try_to_unuse(unsigned int type) */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); +again: page = read_swap_cache_async(entry, NULL, 0); if (!page) { /* @@ -680,6 +681,12 @@ static int try_to_unuse(unsigned int type) wait_on_page_locked(page); wait_on_page_writeback(page); lock_page(page); + if (!PageSwapCache(page)) { + /* Page migration has occured */ + unlock_page(page); + page_cache_release(page); + goto again; + } wait_on_page_writeback(page); /* -- cgit v1.2.3-70-g09d2 From a48d07afdf18212de22b959715b16793c5a6e57a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:38 -0800 Subject: [PATCH] Direct Migration V9: migrate_pages() extension Add direct migration support with fall back to swap. Direct migration support on top of the swap based page migration facility. This allows the direct migration of anonymous pages and the migration of file backed pages by dropping the associated buffers (requires writeout). Fall back to swap out if necessary. The patch is based on lots of patches from the hotplug project but the code was restructured, documented and simplified as much as possible. Note that an additional patch that defines the migrate_page() method for filesystems is necessary in order to avoid writeback for anonymous and file backed pages. 
Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Mike Kravetz Signed-off-by: Christoph Lameter Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_migration | 129 +++++++++++++++++++++++ include/linux/rmap.h | 4 +- include/linux/swap.h | 2 + mm/rmap.c | 21 ++-- mm/vmscan.c | 226 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 360 insertions(+), 22 deletions(-) create mode 100644 Documentation/vm/page_migration (limited to 'mm') diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration new file mode 100644 index 00000000000..c52820fcf50 --- /dev/null +++ b/Documentation/vm/page_migration @@ -0,0 +1,129 @@ +Page migration +-------------- + +Page migration allows the moving of the physical location of pages between +nodes in a numa system while the process is running. This means that the +virtual addresses that the process sees do not change. However, the +system rearranges the physical location of those pages. + +The main intend of page migration is to reduce the latency of memory access +by moving pages near to the processor where the process accessing that memory +is running. + +Page migration allows a process to manually relocate the node on which its +pages are located through the MF_MOVE and MF_MOVE_ALL options while setting +a new memory policy. The pages of process can also be relocated +from another process using the sys_migrate_pages() function call. The +migrate_pages function call takes two sets of nodes and moves pages of a +process that are located on the from nodes to the destination nodes. + +Manual migration is very useful if for example the scheduler has relocated +a process to a processor on a distant node. A batch scheduler or an +administrator may detect the situation and move the pages of the process +nearer to the new processor. At some point in the future we may have +some mechanism in the scheduler that will automatically move the pages. + +Larger installations usually partition the system using cpusets into +sections of nodes. Paul Jackson has equipped cpusets with the ability to +move pages when a task is moved to another cpuset. This allows automatic +control over locality of a process. If a task is moved to a new cpuset +then also all its pages are moved with it so that the performance of the +process does not sink dramatically (as is the case today). + +Page migration allows the preservation of the relative location of pages +within a group of nodes for all migration techniques which will preserve a +particular memory allocation pattern generated even after migrating a +process. This is necessary in order to preserve the memory latencies. +Processes will run with similar performance after migration. + +Page migration occurs in several steps. First a high level +description for those trying to use migrate_pages() and then +a low level description of how the low level details work. + +A. Use of migrate_pages() +------------------------- + +1. Remove pages from the LRU. + + Lists of pages to be migrated are generated by scanning over + pages and moving them into lists. This is done by + calling isolate_lru_page() or __isolate_lru_page(). + Calling isolate_lru_page increases the references to the page + so that it cannot vanish under us. + +2. Generate a list of newly allocates page to move the contents + of the first list to. + +3. The migrate_pages() function is called which attempts + to do the migration. 
It returns the moved pages in the + list specified as the third parameter and the failed + migrations in the fourth parameter. The first parameter + will contain the pages that could still be retried. + +4. The leftover pages of various types are returned + to the LRU using putback_to_lru_pages() or otherwise + disposed of. The pages will still have the refcount as + increased by isolate_lru_pages()! + +B. Operation of migrate_pages() +-------------------------------- + +migrate_pages does several passes over its list of pages. A page is moved +if all references to a page are removable at the time. + +Steps: + +1. Lock the page to be migrated + +2. Insure that writeback is complete. + +3. Make sure that the page has assigned swap cache entry if + it is an anonyous page. The swap cache reference is necessary + to preserve the information contain in the page table maps. + +4. Prep the new page that we want to move to. It is locked + and set to not being uptodate so that all accesses to the new + page immediately lock while we are moving references. + +5. All the page table references to the page are either dropped (file backed) + or converted to swap references (anonymous pages). This should decrease the + reference count. + +6. The radix tree lock is taken + +7. The refcount of the page is examined and we back out if references remain + otherwise we know that we are the only one referencing this page. + +8. The radix tree is checked and if it does not contain the pointer to this + page then we back out. + +9. The mapping is checked. If the mapping is gone then a truncate action may + be in progress and we back out. + +10. The new page is prepped with some settings from the old page so that accesses + to the new page will be discovered to have the correct settings. + +11. The radix tree is changed to point to the new page. + +12. The reference count of the old page is dropped because the reference has now + been removed. + +13. The radix tree lock is dropped. + +14. The page contents are copied to the new page. + +15. The remaining page flags are copied to the new page. + +16. The old page flags are cleared to indicate that the page does + not use any information anymore. + +17. Queued up writeback on the new page is triggered. + +18. If swap pte's were generated for the page then remove them again. + +19. The locks are dropped from the old and new page. + +20. The new page is moved to the LRU. + +Christoph Lameter, December 19, 2005. 
+ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 9d6fbeef210..0f1ea2d6ed8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -91,7 +91,7 @@ static inline void page_dup_rmap(struct page *page) * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked); -int try_to_unmap(struct page *); +int try_to_unmap(struct page *, int ignore_refs); /* * Called from mm/filemap_xip.c to unmap empty zero page @@ -111,7 +111,7 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); #define anon_vma_link(vma) do {} while (0) #define page_referenced(page,l) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL +#define try_to_unmap(page, refs) SWAP_FAIL #endif /* CONFIG_MMU */ diff --git a/include/linux/swap.h b/include/linux/swap.h index e53fef7051e..d359fc02243 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -191,6 +191,8 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p); extern int putback_lru_pages(struct list_head *l); +extern int migrate_page(struct page *, struct page *); +extern void migrate_page_copy(struct page *, struct page *); extern int migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed); #else diff --git a/mm/rmap.c b/mm/rmap.c index d85a99d28c0..13fad5fcdf7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -52,6 +52,7 @@ #include #include #include +#include #include @@ -541,7 +542,8 @@ void page_remove_rmap(struct page *page) * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + int ignore_refs) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -564,7 +566,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) * skipped over this mm) then we should reactivate it. */ if ((vma->vm_flags & VM_LOCKED) || - ptep_clear_flush_young(vma, address, pte)) { + (ptep_clear_flush_young(vma, address, pte) + && !ignore_refs)) { ret = SWAP_FAIL; goto out_unmap; } @@ -698,7 +701,7 @@ static void try_to_unmap_cluster(unsigned long cursor, pte_unmap_unlock(pte - 1, ptl); } -static int try_to_unmap_anon(struct page *page) +static int try_to_unmap_anon(struct page *page, int ignore_refs) { struct anon_vma *anon_vma; struct vm_area_struct *vma; @@ -709,7 +712,7 @@ static int try_to_unmap_anon(struct page *page) return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - ret = try_to_unmap_one(page, vma); + ret = try_to_unmap_one(page, vma, ignore_refs); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -726,7 +729,7 @@ static int try_to_unmap_anon(struct page *page) * * This function is only called from try_to_unmap for object-based pages. 
*/ -static int try_to_unmap_file(struct page *page) +static int try_to_unmap_file(struct page *page, int ignore_refs) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -740,7 +743,7 @@ static int try_to_unmap_file(struct page *page) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma); + ret = try_to_unmap_one(page, vma, ignore_refs); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -825,16 +828,16 @@ out: * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable */ -int try_to_unmap(struct page *page) +int try_to_unmap(struct page *page, int ignore_refs) { int ret; BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page); + ret = try_to_unmap_anon(page, ignore_refs); else - ret = try_to_unmap_file(page); + ret = try_to_unmap_file(page, ignore_refs); if (!page_mapped(page)) ret = SWAP_SUCCESS; diff --git a/mm/vmscan.c b/mm/vmscan.c index aa4b80dbe3a..8f326ce2b69 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -483,7 +483,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) if (!sc->may_swap) goto keep_locked; - switch (try_to_unmap(page)) { + switch (try_to_unmap(page, 0)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -623,7 +623,7 @@ static int swap_page(struct page *page) struct address_space *mapping = page_mapping(page); if (page_mapped(page) && mapping) - if (try_to_unmap(page) != SWAP_SUCCESS) + if (try_to_unmap(page, 0) != SWAP_SUCCESS) goto unlock_retry; if (PageDirty(page)) { @@ -659,6 +659,154 @@ unlock_retry: retry: return -EAGAIN; } + +/* + * Page migration was first developed in the context of the memory hotplug + * project. The main authors of the migration code are: + * + * IWAMOTO Toshihiro + * Hirokazu Takahashi + * Dave Hansen + * Christoph Lameter + */ + +/* + * Remove references for a page and establish the new page with the correct + * basic settings to be able to stop accesses to the page. + */ +static int migrate_page_remove_references(struct page *newpage, + struct page *page, int nr_refs) +{ + struct address_space *mapping = page_mapping(page); + struct page **radix_pointer; + + /* + * Avoid doing any of the following work if the page count + * indicates that the page is in use or truncate has removed + * the page. + */ + if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + return 1; + + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + try_to_unmap(page, 1); + + /* + * Give up if we were unable to remove all mappings. 
+ */ + if (page_mapcount(page)) + return 1; + + write_lock_irq(&mapping->tree_lock); + + radix_pointer = (struct page **)radix_tree_lookup_slot( + &mapping->page_tree, + page_index(page)); + + if (!page_mapping(page) || page_count(page) != nr_refs || + *radix_pointer != page) { + write_unlock_irq(&mapping->tree_lock); + return 1; + } + + /* + * Now we know that no one else is looking at the page. + * + * Certain minimal information about a page must be available + * in order for other subsystems to properly handle the page if they + * find it through the radix tree update before we are finished + * copying the page. + */ + get_page(newpage); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + + *radix_pointer = newpage; + __put_page(page); + write_unlock_irq(&mapping->tree_lock); + + return 0; +} + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageActive(page)) + SetPageActive(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + + ClearPageSwapCache(page); + ClearPageActive(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page->mapping = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct page *newpage, struct page *page) +{ + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + if (migrate_page_remove_references(newpage, page, 2)) + return -EAGAIN; + + migrate_page_copy(newpage, page); + + return 0; +} + /* * migrate_pages * @@ -672,11 +820,6 @@ retry: * are movable anymore because t has become empty * or no retryable pages exist anymore. * - * SIMPLIFIED VERSION: This implementation of migrate_pages - * is only swapping out pages and never touches the second - * list. The direct migration patchset - * extends this function to avoid the use of swap. - * * Return: Number of pages not migrated when "to" ran empty. */ int migrate_pages(struct list_head *from, struct list_head *to, @@ -697,6 +840,9 @@ redo: retry = 0; list_for_each_entry_safe(page, page2, from, lru) { + struct page *newpage = NULL; + struct address_space *mapping; + cond_resched(); rc = 0; @@ -704,6 +850,9 @@ redo: /* page was freed from under us. So we are done. */ goto next; + if (to && list_empty(to)) + break; + /* * Skip locked pages during the first two passes to give the * functions holding the lock time to release the page. Later we @@ -740,12 +889,64 @@ redo: } } + if (!to) { + rc = swap_page(page); + goto next; + } + + newpage = lru_to_page(to); + lock_page(newpage); + /* - * Page is properly locked and writeback is complete. + * Pages are properly locked and writeback is complete. * Try to migrate the page. 
*/ - rc = swap_page(page); - goto next; + mapping = page_mapping(page); + if (!mapping) + goto unlock_both; + + /* + * Trigger writeout if page is dirty + */ + if (PageDirty(page)) { + switch (pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_both; + + case PAGE_SUCCESS: + unlock_page(newpage); + goto next; + + case PAGE_CLEAN: + ; /* try to migrate the page below */ + } + } + /* + * If we have no buffer or can release the buffer + * then do a simple migration. + */ + if (!page_has_buffers(page) || + try_to_release_page(page, GFP_KERNEL)) { + rc = migrate_page(newpage, page); + goto unlock_both; + } + + /* + * On early passes with mapped pages simply + * retry. There may be a lock held for some + * buffers that may go away. Later + * swap them out. + */ + if (pass > 4) { + unlock_page(newpage); + newpage = NULL; + rc = swap_page(page); + goto next; + } + +unlock_both: + unlock_page(newpage); unlock_page: unlock_page(page); @@ -758,7 +959,10 @@ next: list_move(&page->lru, failed); nr_failed++; } else { - /* Success */ + if (newpage) { + /* Successful migration. Return page to LRU */ + move_to_lru(newpage); + } list_move(&page->lru, moved); } } -- cgit v1.2.3-70-g09d2 From a3351e525e4768c29aa5d22ef59b5b38e0361e53 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:39 -0800 Subject: [PATCH] Direct Migration V9: remove_from_swap() to remove swap ptes Add remove_from_swap remove_from_swap() allows the restoration of the pte entries that existed before page migration occurred for anonymous pages by walking the reverse maps. This reduces swap use and establishes regular pte's without the need for page faults. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 + include/linux/swap.h | 1 + mm/rmap.c | 29 +++++++++++++++++++++++++++++ mm/swapfile.c | 9 +++++++++ mm/vmscan.c | 9 +++++++++ 5 files changed, 49 insertions(+) (limited to 'mm') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0f1ea2d6ed8..d6b9bcd1384 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -92,6 +92,7 @@ static inline void page_dup_rmap(struct page *page) */ int page_referenced(struct page *, int is_locked); int try_to_unmap(struct page *, int ignore_refs); +void remove_from_swap(struct page *page); /* * Called from mm/filemap_xip.c to unmap empty zero page diff --git a/include/linux/swap.h b/include/linux/swap.h index d359fc02243..229b6d04b4b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -248,6 +248,7 @@ extern int remove_exclusive_swap_page(struct page *); struct backing_dev_info; extern spinlock_t swap_lock; +extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page); /* linux/mm/thrash.c */ extern struct mm_struct * swap_token_mm; diff --git a/mm/rmap.c b/mm/rmap.c index 13fad5fcdf7..f4b91d7aa5c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -206,6 +206,35 @@ out: return anon_vma; } +#ifdef CONFIG_MIGRATION +/* + * Remove an anonymous page from swap replacing the swap pte's + * through real pte's pointing to valid pages and then releasing + * the page from the swap cache. + * + * Must hold page lock on page. 
+ */ +void remove_from_swap(struct page *page) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + + if (!PageAnon(page) || !PageSwapCache(page)) + return; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + remove_vma_swap(vma, page); + + spin_unlock(&anon_vma->lock); + + delete_from_swap_cache(page); +} +#endif + /* * At what user virtual address is page expected in vma? */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 9678182e0ee..1f9cf0d073b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -554,6 +554,15 @@ static int unuse_mm(struct mm_struct *mm, return 0; } +#ifdef CONFIG_MIGRATION +int remove_vma_swap(struct vm_area_struct *vma, struct page *page) +{ + swp_entry_t entry = { .val = page_private(page) }; + + return unuse_vma(vma, entry, page); +} +#endif + /* * Scan swap_map from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. diff --git a/mm/vmscan.c b/mm/vmscan.c index 8f326ce2b69..5e98b86feb7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -804,6 +804,15 @@ int migrate_page(struct page *newpage, struct page *page) migrate_page_copy(newpage, page); + /* + * Remove auxiliary swap entries and replace + * them with real ptes. + * + * Note that a real pte entry will allow processes that are not + * waiting on the page lock to use the new page via the page tables + * before the new page is unlocked. + */ + remove_from_swap(newpage); return 0; } -- cgit v1.2.3-70-g09d2 From 7e2ab150d1b3b286a4c864c60a549b2601777b63 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:40 -0800 Subject: [PATCH] Direct Migration V9: upgrade MPOL_MF_MOVE and sys_migrate_pages() Modify policy layer to support direct page migration - Add migrate_pages_to() allowing the migration of a list of pages to a a specified node or to vma with a specific allocation policy in sets of MIGRATE_CHUNK_SIZE pages - Modify do_migrate_pages() to do a staged move of pages from the source nodes to the target nodes. Signed-off-by: Paul Jackson Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 146 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 73790188b0e..27da6d5c77b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -95,6 +95,9 @@ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ +/* The number of pages to migrate per call to migrate_pages() */ +#define MIGRATE_CHUNK_SIZE 256 + static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -543,24 +546,91 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, } } -static int swap_pages(struct list_head *pagelist) +/* + * Migrate the list 'pagelist' of pages to a certain destination. 
+ * + * Specify destination with either non-NULL vma or dest_node >= 0 + * Return the number of pages not migrated or error code + */ +static int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest) { + LIST_HEAD(newlist); LIST_HEAD(moved); LIST_HEAD(failed); - int n; + int err = 0; + int nr_pages; + struct page *page; + struct list_head *p; - n = migrate_pages(pagelist, NULL, &moved, &failed); - putback_lru_pages(&failed); - putback_lru_pages(&moved); +redo: + nr_pages = 0; + list_for_each(p, pagelist) { + if (vma) + page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start); + else + page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - return n; + if (!page) { + err = -ENOMEM; + goto out; + } + list_add(&page->lru, &newlist); + nr_pages++; + if (nr_pages > MIGRATE_CHUNK_SIZE); + break; + } + err = migrate_pages(pagelist, &newlist, &moved, &failed); + + putback_lru_pages(&moved); /* Call release pages instead ?? */ + + if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) + goto redo; +out: + /* Return leftover allocated pages */ + while (!list_empty(&newlist)) { + page = list_entry(newlist.next, struct page, lru); + list_del(&page->lru); + __free_page(page); + } + list_splice(&failed, pagelist); + if (err < 0) + return err; + + /* Calculate number of leftover pages */ + nr_pages = 0; + list_for_each(p, pagelist) + nr_pages++; + return nr_pages; +} + +/* + * Migrate pages from one node to a target node. + * Returns error or the number of pages not migrated. + */ +int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) +{ + nodemask_t nmask; + LIST_HEAD(pagelist); + int err = 0; + + nodes_clear(nmask); + node_set(source, nmask); + + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + + if (!list_empty(&pagelist)) { + err = migrate_pages_to(&pagelist, NULL, dest); + if (!list_empty(&pagelist)) + putback_lru_pages(&pagelist); + } + return err; } /* - * For now migrate_pages simply swaps out the pages from nodes that are in - * the source set but not in the target set. In the future, we would - * want a function that moves pages between the two nodesets in such - * a way as to preserve the physical layout as much as possible. + * Move pages between the two nodesets so as to preserve the physical + * layout as much as possible. * * Returns the number of page that could not be moved. */ @@ -568,22 +638,76 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) { LIST_HEAD(pagelist); - int count = 0; - nodemask_t nodes; + int busy = 0; + int err = 0; + nodemask_t tmp; - nodes_andnot(nodes, *from_nodes, *to_nodes); + down_read(&mm->mmap_sem); - down_read(&mm->mmap_sem); - check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, - flags | MPOL_MF_DISCONTIG_OK, &pagelist); +/* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). 
+ * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ - if (!list_empty(&pagelist)) { - count = swap_pages(&pagelist); - putback_lru_pages(&pagelist); + tmp = *from_nodes; + while (!nodes_empty(tmp)) { + int s,d; + int source = -1; + int dest = 0; + + for_each_node_mask(s, tmp) { + d = node_remap(s, *from_nodes, *to_nodes); + if (s == d) + continue; + + source = s; /* Node moved. Memorize */ + dest = d; + + /* dest not in remaining from nodes? */ + if (!node_isset(dest, tmp)) + break; + } + if (source == -1) + break; + + node_clear(source, tmp); + err = migrate_to_node(mm, source, dest, flags); + if (err > 0) + busy += err; + if (err < 0) + break; } up_read(&mm->mmap_sem); - return count; + if (err < 0) + return err; + return busy; } long do_mbind(unsigned long start, unsigned long len, @@ -643,8 +767,9 @@ long do_mbind(unsigned long start, unsigned long len, int nr_failed = 0; err = mbind_range(vma, start, end, new); + if (!list_empty(&pagelist)) - nr_failed = swap_pages(&pagelist); + nr_failed = migrate_pages_to(&pagelist, vma, -1); if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; -- cgit v1.2.3-70-g09d2 From e965f9630c651fa4249039fd4b80c9392d07a856 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:41 -0800 Subject: [PATCH] Direct Migration V9: Avoid writeback / page_migrate() method Migrate a page with buffers without requiring writeback This introduces a new address space operation migratepage() that may be used by a filesystem to implement its own version of page migration. A version is provided that migrates buffers attached to pages. Some filesystems (ext2, ext3, xfs) are modified to utilize this feature. The swapper address space operation are modified so that a regular migrate_page() will occur for anonymous pages without writeback (migrate_pages forces every anonymous page to have a swap entry). Signed-off-by: Mike Kravetz Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext2/inode.c | 2 ++ fs/ext3/inode.c | 2 ++ fs/xfs/linux-2.6/xfs_aops.c | 1 + fs/xfs/linux-2.6/xfs_buf.c | 1 + include/linux/fs.h | 8 ++++++ include/linux/swap.h | 5 ++++ mm/rmap.c | 1 + mm/swap_state.c | 1 + mm/vmscan.c | 20 ++++++++++++++- 10 files changed, 100 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/fs/buffer.c b/fs/buffer.c index 3dc712f29d2..8bcbac87a28 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3049,6 +3049,66 @@ asmlinkage long sys_bdflush(int func, long data) return 0; } +/* + * Migration function for pages with buffers. 
This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +#ifdef CONFIG_MIGRATION +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + if (migrate_page_remove_references(newpage, page, 3)) + return -EAGAIN; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); +#endif + /* * Buffer-head allocation */ diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e7d3f0522d0..a717837f272 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -706,6 +706,7 @@ struct address_space_operations ext2_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, }; struct address_space_operations ext2_aops_xip = { @@ -723,6 +724,7 @@ struct address_space_operations ext2_nobh_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, }; /* diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 8824e84f8a5..3fc4238e970 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1559,6 +1559,7 @@ static struct address_space_operations ext3_ordered_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, }; static struct address_space_operations ext3_writeback_aops = { @@ -1572,6 +1573,7 @@ static struct address_space_operations ext3_writeback_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, }; static struct address_space_operations ext3_journalled_aops = { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 12062678940..9892268e300 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1462,4 +1462,5 @@ struct address_space_operations linvfs_aops = { .commit_write = generic_commit_write, .bmap = linvfs_bmap, .direct_IO = linvfs_direct_IO, + .migratepage = buffer_migrate_page, }; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index a36a8e3b703..bfb4f2917bb 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1521,6 +1521,7 @@ xfs_mapping_buftarg( struct address_space *mapping; static struct address_space_operations mapping_aops = { .sync_page = block_sync_page, + .migratepage = fail_migrate_page, }; inode = new_inode(bdev->bd_inode->i_sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index 84bb449b9b0..e059da94700 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -363,6 +363,8 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); struct page* (*get_xip_page)(struct address_space *, sector_t, int); + /* migrate the contents of a page to the 
specified target */ + int (*migratepage) (struct page *, struct page *); }; struct backing_dev_info; @@ -1719,6 +1721,12 @@ extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t); +#ifdef CONFIG_MIGRATION +extern int buffer_migrate_page(struct page *, struct page *); +#else +#define buffer_migrate_page NULL +#endif + extern int inode_change_ok(struct inode *, struct iattr *); extern int __must_check inode_setattr(struct inode *, struct iattr *); diff --git a/include/linux/swap.h b/include/linux/swap.h index 229b6d04b4b..f3e17d5963c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -193,13 +193,18 @@ extern int isolate_lru_page(struct page *p); extern int putback_lru_pages(struct list_head *l); extern int migrate_page(struct page *, struct page *); extern void migrate_page_copy(struct page *, struct page *); +extern int migrate_page_remove_references(struct page *, struct page *, int); extern int migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed); +extern int fail_migrate_page(struct page *, struct page *); #else static inline int isolate_lru_page(struct page *p) { return -ENOSYS; } static inline int putback_lru_pages(struct list_head *l) { return 0; } static inline int migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed) { return -ENOSYS; } +/* Possible settings for the migrate_page() method in address_operations */ +#define migrate_page NULL +#define fail_migrate_page NULL #endif #ifdef CONFIG_MMU diff --git a/mm/rmap.c b/mm/rmap.c index f4b91d7aa5c..df2c41c2a9a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -233,6 +233,7 @@ void remove_from_swap(struct page *page) delete_from_swap_cache(page); } +EXPORT_SYMBOL(remove_from_swap); #endif /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 7b09ac503fe..db8a3d3e163 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -27,6 +27,7 @@ static struct address_space_operations swap_aops = { .writepage = swap_writepage, .sync_page = block_sync_page, .set_page_dirty = __set_page_dirty_nobuffers, + .migratepage = migrate_page, }; static struct backing_dev_info swap_backing_dev_info = { diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e98b86feb7..5a610804cd0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -614,6 +614,15 @@ int putback_lru_pages(struct list_head *l) return count; } +/* + * Non migratable page + */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + /* * swapout a single page * page is locked upon entry, unlocked on exit @@ -659,6 +668,7 @@ unlock_retry: retry: return -EAGAIN; } +EXPORT_SYMBOL(swap_page); /* * Page migration was first developed in the context of the memory hotplug @@ -674,7 +684,7 @@ retry: * Remove references for a page and establish the new page with the correct * basic settings to be able to stop accesses to the page. 
*/ -static int migrate_page_remove_references(struct page *newpage, +int migrate_page_remove_references(struct page *newpage, struct page *page, int nr_refs) { struct address_space *mapping = page_mapping(page); @@ -749,6 +759,7 @@ static int migrate_page_remove_references(struct page *newpage, return 0; } +EXPORT_SYMBOL(migrate_page_remove_references); /* * Copy the page to its new location @@ -788,6 +799,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageWriteback(newpage)) end_page_writeback(newpage); } +EXPORT_SYMBOL(migrate_page_copy); /* * Common logic to directly migrate a single page suitable for @@ -815,6 +827,7 @@ int migrate_page(struct page *newpage, struct page *page) remove_from_swap(newpage); return 0; } +EXPORT_SYMBOL(migrate_page); /* * migrate_pages @@ -914,6 +927,11 @@ redo: if (!mapping) goto unlock_both; + if (mapping->a_ops->migratepage) { + rc = mapping->a_ops->migratepage(newpage, page); + goto unlock_both; + } + /* * Trigger writeout if page is dirty */ -- cgit v1.2.3-70-g09d2 From 3dafccf22751429e69b6266636cf3acf45b48075 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 1 Feb 2006 03:05:42 -0800 Subject: [PATCH] slab: distinguish between object and buffer size An object cache has two different object lengths: - the amount of memory available for the user (object size) - the amount of memory allocated internally (buffer size) This patch does some renames to make the code reflect that better. Signed-off-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 154 ++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 80 insertions(+), 74 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 88082ae1573..1a014aaf449 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -375,7 +375,7 @@ struct kmem_cache { unsigned int batchcount; unsigned int limit; unsigned int shared; - unsigned int objsize; + unsigned int buffer_size; /* 2) touched by every alloc & free from the backend */ struct kmem_list3 *nodelists[MAX_NUMNODES]; unsigned int flags; /* constant flags */ @@ -423,8 +423,14 @@ struct kmem_cache { atomic_t freemiss; #endif #if DEBUG - int dbghead; - int reallen; + /* + * If debugging is enabled, then the allocator can add additional + * fields and/or padding to every object. buffer_size contains the total + * object size including these internal fields, the following two + * variables contain the offset to the user object and its size. + */ + int obj_offset; + int obj_size; #endif }; @@ -495,50 +501,50 @@ struct kmem_cache { /* memory layout of objects: * 0 : objp - * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that + * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that * the end of an object is aligned with the end of the real * allocation. Catches writes behind the end of the allocation. - * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1: + * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: * redzone word. - * cachep->dbghead: The real object. - * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] + * cachep->obj_offset: The real object. 
+ * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] + * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] */ -static int obj_dbghead(kmem_cache_t *cachep) +static int obj_offset(kmem_cache_t *cachep) { - return cachep->dbghead; + return cachep->obj_offset; } -static int obj_reallen(kmem_cache_t *cachep) +static int obj_size(kmem_cache_t *cachep) { - return cachep->reallen; + return cachep->obj_size; } static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD); + return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); } static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) - return (unsigned long *)(objp + cachep->objsize - + return (unsigned long *)(objp + cachep->buffer_size - 2 * BYTES_PER_WORD); - return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD); + return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); } static void **dbg_userword(kmem_cache_t *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_STORE_USER)); - return (void **)(objp + cachep->objsize - BYTES_PER_WORD); + return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); } #else -#define obj_dbghead(x) 0 -#define obj_reallen(cachep) (cachep->objsize) +#define obj_offset(x) 0 +#define obj_size(cachep) (cachep->buffer_size) #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) @@ -623,12 +629,12 @@ static kmem_cache_t cache_cache = { .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, - .objsize = sizeof(kmem_cache_t), + .buffer_size = sizeof(kmem_cache_t), .flags = SLAB_NO_REAP, .spinlock = SPIN_LOCK_UNLOCKED, .name = "kmem_cache", #if DEBUG - .reallen = sizeof(kmem_cache_t), + .obj_size = sizeof(kmem_cache_t), #endif }; @@ -1057,9 +1063,9 @@ void __init kmem_cache_init(void) cache_cache.array[smp_processor_id()] = &initarray_cache.cache; cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; - cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); + cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); - cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, + cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0, &left_over, &cache_cache.num); if (!cache_cache.num) BUG(); @@ -1274,9 +1280,9 @@ static void kmem_rcu_free(struct rcu_head *head) static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, unsigned long caller) { - int size = obj_reallen(cachep); + int size = obj_size(cachep); - addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)]; + addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; if (size < 5 * sizeof(unsigned long)) return; @@ -1306,8 +1312,8 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) { - int size = obj_reallen(cachep); - addr = &((char *)addr)[obj_dbghead(cachep)]; + int size = obj_size(cachep); + addr = &((char *)addr)[obj_offset(cachep)]; memset(addr, val, size); *(unsigned char *)(addr + size - 1) = POISON_END; @@ -1344,8 +1350,8 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) (unsigned 
long)*dbg_userword(cachep, objp)); printk("\n"); } - realobj = (char *)objp + obj_dbghead(cachep); - size = obj_reallen(cachep); + realobj = (char *)objp + obj_offset(cachep); + size = obj_size(cachep); for (i = 0; i < size && lines; i += 16, lines--) { int limit; limit = 16; @@ -1361,8 +1367,8 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) int size, i; int lines = 0; - realobj = (char *)objp + obj_dbghead(cachep); - size = obj_reallen(cachep); + realobj = (char *)objp + obj_offset(cachep); + size = obj_size(cachep); for (i = 0; i < size; i++) { char exp = POISON_FREE; @@ -1398,17 +1404,17 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) struct slab *slabp = page_get_slab(virt_to_page(objp)); int objnr; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; if (objnr) { - objp = slabp->s_mem + (objnr - 1) * cachep->objsize; - realobj = (char *)objp + obj_dbghead(cachep); + objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; + realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = slabp->s_mem + (objnr + 1) * cachep->objsize; - realobj = (char *)objp + obj_dbghead(cachep); + objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; + realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); @@ -1428,14 +1434,14 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) #if DEBUG int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->objsize * i; + void *objp = slabp->s_mem + cachep->buffer_size * i; if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->objsize % PAGE_SIZE) == 0 + if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->objsize / PAGE_SIZE, + cachep->buffer_size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); @@ -1452,13 +1458,13 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) "was overwritten"); } if (cachep->dtor && !(cachep->flags & SLAB_POISON)) - (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0); + (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); } #else if (cachep->dtor) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->objsize * i; + void *objp = slabp->s_mem + cachep->buffer_size * i; (cachep->dtor) (objp, cachep, 0); } } @@ -1478,7 +1484,7 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) } } -/* For setting up all the kmem_list3s for cache whose objsize is same +/* For setting up all the kmem_list3s for cache whose buffer_size is same as size of kmem_list3. 
*/ static inline void set_up_list3s(kmem_cache_t *cachep, int index) { @@ -1611,7 +1617,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, set_fs(old_fs); if (res) { printk("SLAB: cache with size %d has lost its name\n", - pc->objsize); + pc->buffer_size); continue; } @@ -1702,14 +1708,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, memset(cachep, 0, sizeof(kmem_cache_t)); #if DEBUG - cachep->reallen = size; + cachep->obj_size = size; if (flags & SLAB_RED_ZONE) { /* redzoning only works with word aligned caches */ align = BYTES_PER_WORD; /* add space for red zone words */ - cachep->dbghead += BYTES_PER_WORD; + cachep->obj_offset += BYTES_PER_WORD; size += 2 * BYTES_PER_WORD; } if (flags & SLAB_STORE_USER) { @@ -1722,8 +1728,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { - cachep->dbghead += PAGE_SIZE - size; + && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - size; size = PAGE_SIZE; } #endif @@ -1786,7 +1792,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (flags & SLAB_CACHE_DMA) cachep->gfpflags |= GFP_DMA; spin_lock_init(&cachep->spinlock); - cachep->objsize = size; + cachep->buffer_size = size; if (flags & CFLGS_OFF_SLAB) cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); @@ -2118,7 +2124,7 @@ static void cache_init_objs(kmem_cache_t *cachep, int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->objsize * i; + void *objp = slabp->s_mem + cachep->buffer_size * i; #if DEBUG /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) @@ -2136,7 +2142,7 @@ static void cache_init_objs(kmem_cache_t *cachep, * Otherwise, deadlock. They must also be threaded. */ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(objp + obj_dbghead(cachep), cachep, + cachep->ctor(objp + obj_offset(cachep), cachep, ctor_flags); if (cachep->flags & SLAB_RED_ZONE) { @@ -2147,10 +2153,10 @@ static void cache_init_objs(kmem_cache_t *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) + if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) kernel_map_pages(virt_to_page(objp), - cachep->objsize / PAGE_SIZE, 0); + cachep->buffer_size / PAGE_SIZE, 0); #else if (cachep->ctor) cachep->ctor(objp, cachep, ctor_flags); @@ -2309,7 +2315,7 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, unsigned int objnr; struct slab *slabp; - objp -= obj_dbghead(cachep); + objp -= obj_offset(cachep); kfree_debugcheck(objp); page = virt_to_page(objp); @@ -2341,31 +2347,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = caller; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; BUG_ON(objnr >= cachep->num); - BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); + BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); if (cachep->flags & SLAB_DEBUG_INITIAL) { /* Need to call the slab's constructor so the * caller can perform a verify of its state (debugging). * Called without the cache-lock held. 
*/ - cachep->ctor(objp + obj_dbghead(cachep), + cachep->ctor(objp + obj_offset(cachep), cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); } if (cachep->flags & SLAB_POISON && cachep->dtor) { /* we want to cache poison the object, * call the destruction callback */ - cachep->dtor(objp + obj_dbghead(cachep), cachep, 0); + cachep->dtor(objp + obj_offset(cachep), cachep, 0); } if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { + if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, (unsigned long)caller); kernel_map_pages(virt_to_page(objp), - cachep->objsize / PAGE_SIZE, 0); + cachep->buffer_size / PAGE_SIZE, 0); } else { poison_obj(cachep, objp, POISON_FREE); } @@ -2468,7 +2474,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) /* get obj pointer */ ac->entry[ac->avail++] = slabp->s_mem + - slabp->free * cachep->objsize; + slabp->free * cachep->buffer_size; slabp->inuse++; next = slab_bufctl(slabp)[slabp->free]; @@ -2526,9 +2532,9 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, return objp; if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->objsize / PAGE_SIZE, 1); + cachep->buffer_size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -2553,7 +2559,7 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } - objp += obj_dbghead(cachep); + objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) { unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; @@ -2648,7 +2654,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) BUG_ON(slabp->inuse == cachep->num); /* get obj pointer */ - obj = slabp->s_mem + slabp->free * cachep->objsize; + obj = slabp->s_mem + slabp->free * cachep->buffer_size; slabp->inuse++; next = slab_bufctl(slabp)[slabp->free]; #if DEBUG @@ -2699,7 +2705,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, slabp = page_get_slab(virt_to_page(objp)); l3 = cachep->nodelists[node]; list_del(&slabp->list); - objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; check_spinlock_acquired_node(cachep, node); check_slabp(cachep, slabp); @@ -2881,7 +2887,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) unsigned long addr = (unsigned long)ptr; unsigned long min_addr = PAGE_OFFSET; unsigned long align_mask = BYTES_PER_WORD - 1; - unsigned long size = cachep->objsize; + unsigned long size = cachep->buffer_size; struct page *page; if (unlikely(addr < min_addr)) @@ -3083,7 +3089,7 @@ void kfree(const void *objp) local_irq_save(flags); kfree_debugcheck(objp); c = page_get_cache(virt_to_page(objp)); - mutex_debug_check_no_locks_freed(objp, obj_reallen(c)); + mutex_debug_check_no_locks_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); } @@ -3114,7 +3120,7 @@ EXPORT_SYMBOL(free_percpu); unsigned int kmem_cache_size(kmem_cache_t *cachep) { - return obj_reallen(cachep); + return obj_size(cachep); } EXPORT_SYMBOL(kmem_cache_size); @@ -3258,13 +3264,13 @@ static void enable_cpucache(kmem_cache_t *cachep) * The numbers are guessed, 
we should auto-tune as described by * Bonwick. */ - if (cachep->objsize > 131072) + if (cachep->buffer_size > 131072) limit = 1; - else if (cachep->objsize > PAGE_SIZE) + else if (cachep->buffer_size > PAGE_SIZE) limit = 8; - else if (cachep->objsize > 1024) + else if (cachep->buffer_size > 1024) limit = 24; - else if (cachep->objsize > 256) + else if (cachep->buffer_size > 256) limit = 54; else limit = 120; @@ -3279,7 +3285,7 @@ static void enable_cpucache(kmem_cache_t *cachep) */ shared = 0; #ifdef CONFIG_SMP - if (cachep->objsize <= PAGE_SIZE) + if (cachep->buffer_size <= PAGE_SIZE) shared = 8; #endif @@ -3528,7 +3534,7 @@ static int s_show(struct seq_file *m, void *p) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - name, active_objs, num_objs, cachep->objsize, + name, active_objs, num_objs, cachep->buffer_size, cachep->num, (1 << cachep->gfporder)); seq_printf(m, " : tunables %4u %4u %4u", cachep->limit, cachep->batchcount, cachep->shared); @@ -3656,5 +3662,5 @@ unsigned int ksize(const void *objp) if (unlikely(objp == NULL)) return 0; - return obj_reallen(page_get_cache(virt_to_page(objp))); + return obj_size(page_get_cache(virt_to_page(objp))); } -- cgit v1.2.3-70-g09d2 From 18f820f655ce93b1e4d9b48fc6fcafc64157c6bc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:43 -0800 Subject: [PATCH] slab: minor cleanup to kmem_cache_alloc_node Clean up kmem_cache_alloc_node a bit. Signed-off-by: Christoph Lameter Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 1a014aaf449..bb7a9837b94 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2928,27 +2928,18 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) unsigned long save_flags; void *ptr; - if (nodeid == -1) - return __cache_alloc(cachep, flags); - - if (unlikely(!cachep->nodelists[nodeid])) { - /* Fall back to __cache_alloc if we run into trouble */ - printk(KERN_WARNING - "slab: not allocating in inactive node %d for cache %s\n", - nodeid, cachep->name); - return __cache_alloc(cachep, flags); - } - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); - if (nodeid == numa_node_id()) + + if (nodeid == -1 || nodeid == numa_node_id() || + !cachep->nodelists[nodeid]) ptr = ____cache_alloc(cachep, flags); else ptr = __cache_alloc_node(cachep, flags, nodeid); local_irq_restore(save_flags); - ptr = - cache_alloc_debugcheck_after(cachep, flags, ptr, - __builtin_return_address(0)); + + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, + __builtin_return_address(0)); return ptr; } -- cgit v1.2.3-70-g09d2 From 5ec8a847bb8ae2ba6395cfb7cb4bfdc78ada82ed Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Feb 2006 03:05:44 -0800 Subject: [PATCH] slab: have index_of bug at compile time I noticed the code for index_of is a creative way of finding the cache index using the compiler to optimize to a single hard coded number. But I couldn't help noticing that it uses two methods to let you know that someone used it wrong. One is at compile time (the correct way), and the other is at run time (not good). 
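In sketch form, the compile-time variant of the check works like this (a minimal standalone illustration rather than the kernel's kmalloc_sizes.h machinery: bad_index() and the size thresholds are invented here, and, as in the kernel build, the link-time failure relies on the compiler constant-folding the inlined branches):

	#include <stddef.h>

	extern void bad_index(void);	/* intentionally never defined */

	static inline __attribute__((always_inline)) int index_of_size(size_t size)
	{
		if (__builtin_constant_p(size)) {
			if (size <= 32)
				return 0;
			if (size <= 64)
				return 1;
			if (size <= 128)
				return 2;
			bad_index();	/* constant size with no matching cache */
		} else
			bad_index();	/* non-constant sizes are rejected too */
		return 0;
	}

Any caller that passes an unsupported or non-constant size leaves a reference to the undefined bad_index() behind, so the build fails instead of hitting a BUG() at run time, which is the behaviour the patch makes uniform via __bad_size().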
Signed-off-by: Steven Rostedt Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index bb7a9837b94..613d385519f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -316,6 +316,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; */ static __always_inline int index_of(const size_t size) { + extern void __bad_size(void); + if (__builtin_constant_p(size)) { int i = 0; @@ -326,12 +328,9 @@ static __always_inline int index_of(const size_t size) i++; #include "linux/kmalloc_sizes.h" #undef CACHE - { - extern void __bad_size(void); - __bad_size(); - } + __bad_size(); } else - BUG(); + __bad_size(); return 0; } -- cgit v1.2.3-70-g09d2 From fbaccacff1f17c65ae0972085368a7ec75be6062 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Feb 2006 03:05:45 -0800 Subject: [PATCH] slab: cache_estimate cleanup Clean up cache_estimate() in mm/slab.c and improves the algorithm from O(n) to O(1). We first calculate the maximum number of objects a slab can hold after struct slab and kmem_bufctl_t for each object has been given enough space. After that, to respect alignment rules, we decrease the number of objects if necessary. As required padding is at most align-1 and memory of obj_size is at least align, it is always enough to decrease number of objects by one. The optimization was originally made by Balbir Singh with more improvements from Steven Rostedt. Manfred Spraul provider further modifications: no loop at all for the off-slab case and added comments to explain the background. Acked-by: Balbir Singh Signed-off-by: Manfred Spraul Signed-off-by: Steven Rostedt Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 613d385519f..e869400ea73 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -702,32 +702,69 @@ kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags) } EXPORT_SYMBOL(kmem_find_general_cachep); -/* Cal the num objs, wastage, and bytes left over for a given slab size. */ -static void cache_estimate(unsigned long gfporder, size_t size, size_t align, - int flags, size_t *left_over, unsigned int *num) +static size_t slab_mgmt_size(size_t nr_objs, size_t align) { - int i; - size_t wastage = PAGE_SIZE << gfporder; - size_t extra = 0; - size_t base = 0; + return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); +} - if (!(flags & CFLGS_OFF_SLAB)) { - base = sizeof(struct slab); - extra = sizeof(kmem_bufctl_t); - } - i = 0; - while (i * size + ALIGN(base + i * extra, align) <= wastage) - i++; - if (i > 0) - i--; +/* Calculate the number of objects and left-over bytes for a given + buffer size. */ +static void cache_estimate(unsigned long gfporder, size_t buffer_size, + size_t align, int flags, size_t *left_over, + unsigned int *num) +{ + int nr_objs; + size_t mgmt_size; + size_t slab_size = PAGE_SIZE << gfporder; - if (i > SLAB_LIMIT) - i = SLAB_LIMIT; + /* + * The slab management structure can be either off the slab or + * on it. 
For the latter case, the memory allocated for a + * slab is used for: + * + * - The struct slab + * - One kmem_bufctl_t for each object + * - Padding to respect alignment of @align + * - @buffer_size bytes for each object + * + * If the slab management structure is off the slab, then the + * alignment will already be calculated into the size. Because + * the slabs are all pages aligned, the objects will be at the + * correct alignment when allocated. + */ + if (flags & CFLGS_OFF_SLAB) { + mgmt_size = 0; + nr_objs = slab_size / buffer_size; - *num = i; - wastage -= i * size; - wastage -= ALIGN(base + i * extra, align); - *left_over = wastage; + if (nr_objs > SLAB_LIMIT) + nr_objs = SLAB_LIMIT; + } else { + /* + * Ignore padding for the initial guess. The padding + * is at most @align-1 bytes, and @buffer_size is at + * least @align. In the worst case, this result will + * be one greater than the number of objects that fit + * into the memory allocation when taking the padding + * into account. + */ + nr_objs = (slab_size - sizeof(struct slab)) / + (buffer_size + sizeof(kmem_bufctl_t)); + + /* + * This calculated number will be either the right + * amount, or one greater than what we want. + */ + if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size + > slab_size) + nr_objs--; + + if (nr_objs > SLAB_LIMIT) + nr_objs = SLAB_LIMIT; + + mgmt_size = slab_mgmt_size(nr_objs, align); + } + *num = nr_objs; + *left_over = slab_size - nr_objs*buffer_size - mgmt_size; } #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) -- cgit v1.2.3-70-g09d2 From 12dd36faec5d3bd96da84fa8f76efecc632930ab Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Wed, 1 Feb 2006 03:05:46 -0800 Subject: [PATCH] slab: extract slab_destroy_objs() Create a helper function, slab_destroy_objs() which called from slab_destroy(). This makes slab_destroy() smaller and more readable, and moves ifdefs outside the function body. Signed-off-by: Matthew Dobson Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e869400ea73..85adf099201 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1459,15 +1459,13 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) } #endif -/* Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. - * The cache-lock is not held/needed. +#if DEBUG +/** + * slab_destroy_objs - call the registered destructor for each object in + * a slab that is to be destroyed. 
*/ -static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) +static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) { - void *addr = slabp->s_mem - slabp->colouroff; - -#if DEBUG int i; for (i = 0; i < cachep->num; i++) { void *objp = slabp->s_mem + cachep->buffer_size * i; @@ -1496,7 +1494,10 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) if (cachep->dtor && !(cachep->flags & SLAB_POISON)) (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); } +} #else +static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) +{ if (cachep->dtor) { int i; for (i = 0; i < cachep->num; i++) { @@ -1504,8 +1505,19 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) (cachep->dtor) (objp, cachep, 0); } } +} #endif +/** + * Destroy all the objs in a slab, and release the mem back to the system. + * Before calling the slab must have been unlinked from the cache. + * The cache-lock is not held/needed. + */ +static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) +{ + void *addr = slabp->s_mem - slabp->colouroff; + + slab_destroy_objs(cachep, slabp); if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { struct slab_rcu *slab_rcu; -- cgit v1.2.3-70-g09d2 From 78d382d77c84229d031431931bf6490d5da6ab86 Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Wed, 1 Feb 2006 03:05:47 -0800 Subject: [PATCH] slab: extract slab_{put|get}_obj Create two helper functions slab_get_obj() and slab_put_obj() to replace duplicated code in mm/slab.c Signed-off-by: Matthew Dobson Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 77 +++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 85adf099201..594a9155c7d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2226,6 +2226,42 @@ static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) } } +static void *slab_get_obj(kmem_cache_t *cachep, struct slab *slabp, int nodeid) +{ + void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); + kmem_bufctl_t next; + + slabp->inuse++; + next = slab_bufctl(slabp)[slabp->free]; +#if DEBUG + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; + WARN_ON(slabp->nodeid != nodeid); +#endif + slabp->free = next; + + return objp; +} + +static void slab_put_obj(kmem_cache_t *cachep, struct slab *slabp, void *objp, + int nodeid) +{ + unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; + +#if DEBUG + /* Verify that the slab belongs to the intended node */ + WARN_ON(slabp->nodeid != nodeid); + + if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { + printk(KERN_ERR "slab: double free detected in cache " + "'%s', objp %p\n", cachep->name, objp); + BUG(); + } +#endif + slab_bufctl(slabp)[objnr] = slabp->free; + slabp->free = objnr; + slabp->inuse--; +} + static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) { int i; @@ -2515,22 +2551,12 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) check_slabp(cachep, slabp); check_spinlock_acquired(cachep); while (slabp->inuse < cachep->num && batchcount--) { - kmem_bufctl_t next; STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - /* get obj pointer */ - ac->entry[ac->avail++] = slabp->s_mem + - slabp->free * cachep->buffer_size; - - slabp->inuse++; - next = slab_bufctl(slabp)[slabp->free]; -#if DEBUG - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; - 
WARN_ON(numa_node_id() != slabp->nodeid); -#endif - slabp->free = next; + ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, + numa_node_id()); } check_slabp(cachep, slabp); @@ -2675,7 +2701,6 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) struct slab *slabp; struct kmem_list3 *l3; void *obj; - kmem_bufctl_t next; int x; l3 = cachep->nodelists[nodeid]; @@ -2701,14 +2726,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) BUG_ON(slabp->inuse == cachep->num); - /* get obj pointer */ - obj = slabp->s_mem + slabp->free * cachep->buffer_size; - slabp->inuse++; - next = slab_bufctl(slabp)[slabp->free]; -#if DEBUG - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; -#endif - slabp->free = next; + obj = slab_get_obj(cachep, slabp, nodeid); check_slabp(cachep, slabp); l3->free_objects--; /* move slabp to correct slabp list: */ @@ -2748,29 +2766,14 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, for (i = 0; i < nr_objects; i++) { void *objp = objpp[i]; struct slab *slabp; - unsigned int objnr; slabp = page_get_slab(virt_to_page(objp)); l3 = cachep->nodelists[node]; list_del(&slabp->list); - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; check_spinlock_acquired_node(cachep, node); check_slabp(cachep, slabp); - -#if DEBUG - /* Verify that the slab belongs to the intended node */ - WARN_ON(slabp->nodeid != node); - - if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); - BUG(); - } -#endif - slab_bufctl(slabp)[objnr] = slabp->free; - slabp->free = objnr; + slab_put_obj(cachep, slabp, objp, node); STATS_DEC_ACTIVE(cachep); - slabp->inuse--; l3->free_objects++; check_slabp(cachep, slabp); -- cgit v1.2.3-70-g09d2 From 5295a74cc0bcf1291686eb734ccb06baa3d55c1a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:05:48 -0800 Subject: [PATCH] slab: reduce inlining From: Manfred Spraul Reduce the amount of inline functions in slab to the functions that are used in the hot path: - no inline for debug functions - no __always_inline, inline is already __always_inline - remove inline from a few numa support functions. 
Before: text data bss dec hex filename 13588 752 48 14388 3834 mm/slab.o (defconfig) 16671 2492 48 19211 4b0b mm/slab.o (numa) After: text data bss dec hex filename 13366 752 48 14166 3756 mm/slab.o (defconfig) 16230 2492 48 18770 4952 mm/slab.o (numa) Signed-off-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 594a9155c7d..ba288b3877d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -337,7 +337,7 @@ static __always_inline int index_of(const size_t size) #define INDEX_AC index_of(sizeof(struct arraycache_init)) #define INDEX_L3 index_of(sizeof(struct kmem_list3)) -static inline void kmem_list3_init(struct kmem_list3 *parent) +static void kmem_list3_init(struct kmem_list3 *parent) { INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); @@ -818,7 +818,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, #ifdef CONFIG_NUMA static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int); -static inline struct array_cache **alloc_alien_cache(int node, int limit) +static struct array_cache **alloc_alien_cache(int node, int limit) { struct array_cache **ac_ptr; int memsize = sizeof(void *) * MAX_NUMNODES; @@ -845,7 +845,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit) return ac_ptr; } -static inline void free_alien_cache(struct array_cache **ac_ptr) +static void free_alien_cache(struct array_cache **ac_ptr) { int i; @@ -858,8 +858,8 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) kfree(ac_ptr); } -static inline void __drain_alien_cache(kmem_cache_t *cachep, - struct array_cache *ac, int node) +static void __drain_alien_cache(kmem_cache_t *cachep, + struct array_cache *ac, int node) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -1534,7 +1534,7 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) /* For setting up all the kmem_list3s for cache whose buffer_size is same as size of kmem_list3. */ -static inline void set_up_list3s(kmem_cache_t *cachep, int index) +static void set_up_list3s(kmem_cache_t *cachep, int index) { int node; @@ -1937,7 +1937,7 @@ static void check_spinlock_acquired(kmem_cache_t *cachep) #endif } -static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) +static void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); -- cgit v1.2.3-70-g09d2 From 6ed5eb2211204224799b2821656bbbfde26ef200 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:05:49 -0800 Subject: [PATCH] slab: extract virt_to_{cache|slab} Introduce virt_to_cache() and virt_to_slab() functions to reduce duplicate code and introduce a proper abstraction should we want to support other kind of mapping for address to slab and cache (eg. for vmalloc() or I/O memory). 
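The mapping being abstracted is the allocator's existing trick of stashing back-pointers in the backing struct page; roughly (a sketch, with setter names as used by this era's mm/slab.c, reproduced from memory rather than from this patch):

	/*
	 * A slab's backing pages are never on an LRU list, so page->lru.next
	 * is reused as a back-pointer to the cache and page->lru.prev as a
	 * back-pointer to the slab descriptor.  virt_to_cache()/virt_to_slab()
	 * simply bundle virt_to_page() with the matching getters.
	 */
	static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
	{
		page->lru.next = (struct list_head *)cache;	/* read back by page_get_cache() */
	}

	static inline void page_set_slab(struct page *page, struct slab *slab)
	{
		page->lru.prev = (struct list_head *)slab;	/* read back by page_get_slab() */
	}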
Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index ba288b3877d..c2f9e0a330f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -596,6 +596,18 @@ static inline struct slab *page_get_slab(struct page *page) return (struct slab *)page->lru.prev; } +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct page *page = virt_to_page(obj); + return page_get_cache(page); +} + +static inline struct slab *virt_to_slab(const void *obj) +{ + struct page *page = virt_to_page(obj); + return page_get_slab(page); +} + /* These are the default caches for kmalloc. Custom caches can have other sizes. */ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, @@ -1437,7 +1449,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) /* Print some data about the neighboring objects, if they * exist: */ - struct slab *slabp = page_get_slab(virt_to_page(objp)); + struct slab *slabp = virt_to_slab(objp); int objnr; objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; @@ -2767,7 +2779,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, void *objp = objpp[i]; struct slab *slabp; - slabp = page_get_slab(virt_to_page(objp)); + slabp = virt_to_slab(objp); l3 = cachep->nodelists[node]; list_del(&slabp->list); check_spinlock_acquired_node(cachep, node); @@ -2867,7 +2879,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) #ifdef CONFIG_NUMA { struct slab *slabp; - slabp = page_get_slab(virt_to_page(objp)); + slabp = virt_to_slab(objp); if (unlikely(slabp->nodeid != numa_node_id())) { struct array_cache *alien = NULL; int nodeid = slabp->nodeid; @@ -3130,7 +3142,7 @@ void kfree(const void *objp) return; local_irq_save(flags); kfree_debugcheck(objp); - c = page_get_cache(virt_to_page(objp)); + c = virt_to_cache(objp); mutex_debug_check_no_locks_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); @@ -3704,5 +3716,5 @@ unsigned int ksize(const void *objp) if (unlikely(objp == NULL)) return 0; - return obj_size(page_get_cache(virt_to_page(objp))); + return obj_size(virt_to_cache(objp)); } -- cgit v1.2.3-70-g09d2 From 9a2dba4b4912b493070cbc170629fdbf440b01d7 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:05:49 -0800 Subject: [PATCH] slab: rename ac_data to cpu_cache_get Rename the ac_data() function to more descriptive cpu_cache_get(). 
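For context, the accessor sits on the allocator's hottest path: the common-case allocation is a pop from the current CPU's array_cache with no spinlock taken. A simplified sketch (statistics and debug hooks dropped; fastpath_alloc() is a name invented here, the real logic lives in ____cache_alloc() and cache_alloc_refill()):

	static inline void *fastpath_alloc(struct kmem_cache *cachep)
	{
		/* per-CPU stack of recently freed, cache-hot objects */
		struct array_cache *ac = cpu_cache_get(cachep);

		if (likely(ac->avail)) {
			ac->touched = 1;
			return ac->entry[--ac->avail];	/* LIFO pop, interrupts already off */
		}
		return NULL;	/* empty: the real code refills via cache_alloc_refill() */
	}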
Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c2f9e0a330f..b1909386499 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -679,7 +679,7 @@ static void enable_cpucache(kmem_cache_t *cachep); static void cache_reap(void *unused); static int __node_shrink(kmem_cache_t *cachep, int node); -static inline struct array_cache *ac_data(kmem_cache_t *cachep) +static inline struct array_cache *cpu_cache_get(kmem_cache_t *cachep) { return cachep->array[smp_processor_id()]; } @@ -1186,8 +1186,8 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); local_irq_disable(); - BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, ac_data(&cache_cache), + BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(&cache_cache), sizeof(struct arraycache_init)); cache_cache.array[smp_processor_id()] = ptr; local_irq_enable(); @@ -1195,9 +1195,9 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); local_irq_disable(); - BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) + BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) != &initarray_generic.cache); - memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), + memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), sizeof(struct arraycache_init)); malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = ptr; @@ -1235,7 +1235,7 @@ void __init kmem_cache_init(void) g_cpucache_up = FULL; /* Register a cpu startup notifier callback - * that initializes ac_data for all new cpus + * that initializes cpu_cache_get for all new cpus */ register_cpu_notifier(&cpucache_notifier); @@ -1909,11 +1909,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - BUG_ON(!ac_data(cachep)); - ac_data(cachep)->avail = 0; - ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - ac_data(cachep)->batchcount = 1; - ac_data(cachep)->touched = 0; + BUG_ON(!cpu_cache_get(cachep)); + cpu_cache_get(cachep)->avail = 0; + cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep)->batchcount = 1; + cpu_cache_get(cachep)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; } @@ -1992,7 +1992,7 @@ static void do_drain(void *arg) int node = numa_node_id(); check_irq_off(); - ac = ac_data(cachep); + ac = cpu_cache_get(cachep); spin_lock(&cachep->nodelists[node]->list_lock); free_block(cachep, ac->entry, ac->avail, node); spin_unlock(&cachep->nodelists[node]->list_lock); @@ -2518,7 +2518,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) struct array_cache *ac; check_irq_off(); - ac = ac_data(cachep); + ac = cpu_cache_get(cachep); retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2590,7 +2590,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) x = cache_grow(cachep, flags, numa_node_id()); // cache_grow can reenable interrupts, then ac could change. - ac = ac_data(cachep); + ac = cpu_cache_get(cachep); if (!x && ac->avail == 0) // no objects in sight? 
abort return NULL; @@ -2675,7 +2675,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) #endif check_irq_off(); - ac = ac_data(cachep); + ac = cpu_cache_get(cachep); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); ac->touched = 1; @@ -2868,7 +2868,7 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) */ static inline void __cache_free(kmem_cache_t *cachep, void *objp) { - struct array_cache *ac = ac_data(cachep); + struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); @@ -3253,7 +3253,7 @@ static void do_ccupdate_local(void *info) struct array_cache *old; check_irq_off(); - old = ac_data(new->cachep); + old = cpu_cache_get(new->cachep); new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; new->new[smp_processor_id()] = old; @@ -3419,7 +3419,7 @@ static void cache_reap(void *unused) drain_alien_cache(searchp, l3); spin_lock_irq(&l3->list_lock); - drain_array_locked(searchp, ac_data(searchp), 0, + drain_array_locked(searchp, cpu_cache_get(searchp), 0, numa_node_id()); if (time_after(l3->next_reap, jiffies)) -- cgit v1.2.3-70-g09d2 From 343e0d7a93951e35065fdb5e3dd61aece0ec6b3c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:05:50 -0800 Subject: [PATCH] slab: replace kmem_cache_t with struct kmem_cache Replace uses of kmem_cache_t with proper struct kmem_cache in mm/slab.c. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 195 +++++++++++++++++++++++++++++++------------------------------- 1 file changed, 98 insertions(+), 97 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index b1909386499..6fbd6a1cdeb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -55,7 +55,7 @@ * * SMP synchronization: * constructors and destructors are called without any locking. - * Several members in kmem_cache_t and struct slab never change, they + * Several members in struct kmem_cache and struct slab never change, they * are accessed without any locking. * The per-cpu arrays are never accessed from the wrong cpu, no locking, * and local interrupts are disabled so slab code is preempt-safe. @@ -244,7 +244,7 @@ struct slab { */ struct slab_rcu { struct rcu_head head; - kmem_cache_t *cachep; + struct kmem_cache *cachep; void *addr; }; @@ -363,7 +363,7 @@ static void kmem_list3_init(struct kmem_list3 *parent) } while (0) /* - * kmem_cache_t + * struct kmem_cache * * manages a cache. 
*/ @@ -391,15 +391,15 @@ struct kmem_cache { size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ unsigned int colour_next; /* cache colouring */ - kmem_cache_t *slabp_cache; + struct kmem_cache *slabp_cache; unsigned int slab_size; unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor) (void *, kmem_cache_t *, unsigned long); + void (*ctor) (void *, struct kmem_cache *, unsigned long); /* de-constructor func */ - void (*dtor) (void *, kmem_cache_t *, unsigned long); + void (*dtor) (void *, struct kmem_cache *, unsigned long); /* 4) cache creation/removal */ const char *name; @@ -509,23 +509,23 @@ struct kmem_cache { * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] */ -static int obj_offset(kmem_cache_t *cachep) +static int obj_offset(struct kmem_cache *cachep) { return cachep->obj_offset; } -static int obj_size(kmem_cache_t *cachep) +static int obj_size(struct kmem_cache *cachep) { return cachep->obj_size; } -static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) +static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); } -static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) +static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) @@ -534,7 +534,7 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); } -static void **dbg_userword(kmem_cache_t *cachep, void *objp) +static void **dbg_userword(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_STORE_USER)); return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); @@ -636,16 +636,16 @@ static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ -static kmem_cache_t cache_cache = { +static struct kmem_cache cache_cache = { .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, - .buffer_size = sizeof(kmem_cache_t), + .buffer_size = sizeof(struct kmem_cache), .flags = SLAB_NO_REAP, .spinlock = SPIN_LOCK_UNLOCKED, .name = "kmem_cache", #if DEBUG - .obj_size = sizeof(kmem_cache_t), + .obj_size = sizeof(struct kmem_cache), #endif }; @@ -674,17 +674,17 @@ static enum { static DEFINE_PER_CPU(struct work_struct, reap_work); -static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node); -static void enable_cpucache(kmem_cache_t *cachep); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); +static void enable_cpucache(struct kmem_cache *cachep); static void cache_reap(void *unused); -static int __node_shrink(kmem_cache_t *cachep, int node); +static int __node_shrink(struct kmem_cache *cachep, int node); -static inline struct array_cache *cpu_cache_get(kmem_cache_t *cachep) +static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { return cachep->array[smp_processor_id()]; } -static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags) +static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) { struct cache_sizes *csizep = malloc_sizes; @@ -708,7 +708,7 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t 
gfpflags) return csizep->cs_cachep; } -kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags) +struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) { return __find_general_cachep(size, gfpflags); } @@ -781,7 +781,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) -static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) +static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) { printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); @@ -828,7 +828,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, } #ifdef CONFIG_NUMA -static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int); +static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); static struct array_cache **alloc_alien_cache(int node, int limit) { @@ -870,7 +870,7 @@ static void free_alien_cache(struct array_cache **ac_ptr) kfree(ac_ptr); } -static void __drain_alien_cache(kmem_cache_t *cachep, +static void __drain_alien_cache(struct kmem_cache *cachep, struct array_cache *ac, int node) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -883,7 +883,7 @@ static void __drain_alien_cache(kmem_cache_t *cachep, } } -static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) +static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3) { int i = 0; struct array_cache *ac; @@ -908,7 +908,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; - kmem_cache_t *cachep; + struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); int memsize = sizeof(struct kmem_list3); @@ -1046,7 +1046,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid) +static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) { struct kmem_list3 *ptr; @@ -1086,14 +1086,14 @@ void __init kmem_cache_init(void) /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: - * 1) initialize the cache_cache cache: it contains the kmem_cache_t + * 1) initialize the cache_cache cache: it contains the struct kmem_cache * structures of all caches, except cache_cache itself: cache_cache * is statically allocated. * Initially an __init data area is used for the head array and the * kmem_list3 structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. * 2) Create the first kmalloc cache. - * The kmem_cache_t for the new cache is allocated normally. + * The struct kmem_cache for the new cache is allocated normally. * An __init data area is used for the head array. * 3) Create the remaining kmalloc caches, with minimally sized * head arrays. @@ -1224,7 +1224,7 @@ void __init kmem_cache_init(void) /* 6) resize the head arrays to their final sizes */ { - kmem_cache_t *cachep; + struct kmem_cache *cachep; mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) enable_cpucache(cachep); @@ -1267,7 +1267,7 @@ __initcall(cpucache_init); * did not request dmaable memory, we might get it, but that * would be relatively rare and ignorable. 
*/ -static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) +static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct page *page; void *addr; @@ -1293,7 +1293,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) /* * Interface to system's page release. */ -static void kmem_freepages(kmem_cache_t *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, void *addr) { unsigned long i = (1 << cachep->gfporder); struct page *page = virt_to_page(addr); @@ -1315,7 +1315,7 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr) static void kmem_rcu_free(struct rcu_head *head) { struct slab_rcu *slab_rcu = (struct slab_rcu *)head; - kmem_cache_t *cachep = slab_rcu->cachep; + struct kmem_cache *cachep = slab_rcu->cachep; kmem_freepages(cachep, slab_rcu->addr); if (OFF_SLAB(cachep)) @@ -1325,7 +1325,7 @@ static void kmem_rcu_free(struct rcu_head *head) #if DEBUG #ifdef CONFIG_DEBUG_PAGEALLOC -static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, +static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, unsigned long caller) { int size = obj_size(cachep); @@ -1358,7 +1358,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, } #endif -static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) +static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) { int size = obj_size(cachep); addr = &((char *)addr)[obj_offset(cachep)]; @@ -1380,7 +1380,7 @@ static void dump_line(char *data, int offset, int limit) #if DEBUG -static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) +static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) { int i, size; char *realobj; @@ -1409,7 +1409,7 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) } } -static void check_poison_obj(kmem_cache_t *cachep, void *objp) +static void check_poison_obj(struct kmem_cache *cachep, void *objp) { char *realobj; int size, i; @@ -1476,7 +1476,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) * slab_destroy_objs - call the registered destructor for each object in * a slab that is to be destroyed. */ -static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) +static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { @@ -1508,7 +1508,7 @@ static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) } } #else -static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) +static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { if (cachep->dtor) { int i; @@ -1525,7 +1525,7 @@ static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) * Before calling the slab must have been unlinked from the cache. * The cache-lock is not held/needed. */ -static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) +static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) { void *addr = slabp->s_mem - slabp->colouroff; @@ -1546,7 +1546,7 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) /* For setting up all the kmem_list3s for cache whose buffer_size is same as size of kmem_list3. 
*/ -static void set_up_list3s(kmem_cache_t *cachep, int index) +static void set_up_list3s(struct kmem_cache *cachep, int index) { int node; @@ -1566,7 +1566,7 @@ static void set_up_list3s(kmem_cache_t *cachep, int index) * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. */ -static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, +static inline size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, gfp_t flags) { size_t left_over = 0; @@ -1638,13 +1638,13 @@ static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ -kmem_cache_t * +struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), - void (*dtor)(void*, kmem_cache_t *, unsigned long)) + unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), + void (*dtor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; - kmem_cache_t *cachep = NULL; + struct kmem_cache *cachep = NULL; struct list_head *p; /* @@ -1662,7 +1662,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, mutex_lock(&cache_chain_mutex); list_for_each(p, &cache_chain) { - kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); + struct kmem_cache *pc = list_entry(p, struct kmem_cache, next); mm_segment_t old_fs = get_fs(); char tmp; int res; @@ -1762,10 +1762,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, align = ralign; /* Get cache's description obj. */ - cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); + cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL); if (!cachep) goto oops; - memset(cachep, 0, sizeof(kmem_cache_t)); + memset(cachep, 0, sizeof(struct kmem_cache)); #if DEBUG cachep->obj_size = size; @@ -1941,7 +1941,7 @@ static void check_irq_on(void) BUG_ON(irqs_disabled()); } -static void check_spinlock_acquired(kmem_cache_t *cachep) +static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); @@ -1949,7 +1949,7 @@ static void check_spinlock_acquired(kmem_cache_t *cachep) #endif } -static void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) +static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); @@ -1982,12 +1982,12 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) preempt_enable(); } -static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, +static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, int force, int node); static void do_drain(void *arg) { - kmem_cache_t *cachep = (kmem_cache_t *) arg; + struct kmem_cache *cachep = (struct kmem_cache *) arg; struct array_cache *ac; int node = numa_node_id(); @@ -1999,7 +1999,7 @@ static void do_drain(void *arg) ac->avail = 0; } -static void drain_cpu_caches(kmem_cache_t *cachep) +static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_list3 *l3; int node; @@ -2020,7 +2020,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep) spin_unlock_irq(&cachep->spinlock); } -static int __node_shrink(kmem_cache_t *cachep, int node) +static int __node_shrink(struct kmem_cache *cachep, int node) { struct slab *slabp; struct kmem_list3 *l3 = cachep->nodelists[node]; @@ -2049,7 +2049,7 @@ static int 
__node_shrink(kmem_cache_t *cachep, int node) return ret; } -static int __cache_shrink(kmem_cache_t *cachep) +static int __cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; struct kmem_list3 *l3; @@ -2075,7 +2075,7 @@ static int __cache_shrink(kmem_cache_t *cachep) * Releases as many slabs as possible for a cache. * To help debugging, a zero exit status indicates all slabs were released. */ -int kmem_cache_shrink(kmem_cache_t *cachep) +int kmem_cache_shrink(struct kmem_cache *cachep) { if (!cachep || in_interrupt()) BUG(); @@ -2088,7 +2088,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * kmem_cache_destroy - delete a cache * @cachep: the cache to destroy * - * Remove a kmem_cache_t object from the slab cache. + * Remove a struct kmem_cache object from the slab cache. * Returns 0 on success. * * It is expected this function will be called by a module when it is @@ -2101,7 +2101,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * The caller must guarantee that noone will allocate memory from the cache * during the kmem_cache_destroy(). */ -int kmem_cache_destroy(kmem_cache_t *cachep) +int kmem_cache_destroy(struct kmem_cache *cachep) { int i; struct kmem_list3 *l3; @@ -2152,7 +2152,7 @@ int kmem_cache_destroy(kmem_cache_t *cachep) EXPORT_SYMBOL(kmem_cache_destroy); /* Get the memory for a slab management obj. */ -static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, +static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, int colour_off, gfp_t local_flags) { struct slab *slabp; @@ -2178,7 +2178,7 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) return (kmem_bufctl_t *) (slabp + 1); } -static void cache_init_objs(kmem_cache_t *cachep, +static void cache_init_objs(struct kmem_cache *cachep, struct slab *slabp, unsigned long ctor_flags) { int i; @@ -2227,7 +2227,7 @@ static void cache_init_objs(kmem_cache_t *cachep, slabp->free = 0; } -static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) +static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { if (flags & SLAB_DMA) { if (!(cachep->gfpflags & GFP_DMA)) @@ -2238,7 +2238,7 @@ static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) } } -static void *slab_get_obj(kmem_cache_t *cachep, struct slab *slabp, int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) { void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); kmem_bufctl_t next; @@ -2254,7 +2254,7 @@ static void *slab_get_obj(kmem_cache_t *cachep, struct slab *slabp, int nodeid) return objp; } -static void slab_put_obj(kmem_cache_t *cachep, struct slab *slabp, void *objp, +static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, int nodeid) { unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; @@ -2274,7 +2274,7 @@ static void slab_put_obj(kmem_cache_t *cachep, struct slab *slabp, void *objp, slabp->inuse--; } -static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) +static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) { int i; struct page *page; @@ -2293,7 +2293,7 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. 
*/ -static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) +static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct slab *slabp; void *objp; @@ -2404,7 +2404,7 @@ static void kfree_debugcheck(const void *objp) } } -static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, +static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, void *caller) { struct page *page; @@ -2478,7 +2478,7 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, return objp; } -static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) +static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) { kmem_bufctl_t i; int entries = 0; @@ -2511,7 +2511,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) #define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_list3 *l3; @@ -2602,7 +2602,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) } static inline void -cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) +cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { might_sleep_if(flags & __GFP_WAIT); #if DEBUG @@ -2611,7 +2611,7 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) } #if DEBUG -static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, void *caller) { if (!objp) @@ -2660,7 +2660,7 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) #endif -static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) +static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; @@ -2687,7 +2687,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) return objp; } -static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) +static inline void *__cache_alloc(struct kmem_cache *cachep, gfp_t flags) { unsigned long save_flags; void *objp; @@ -2707,7 +2707,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) /* * A interface to enable slab creation on nodeid */ -static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) +static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct list_head *entry; struct slab *slabp; @@ -2769,7 +2769,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) /* * Caller needs to acquire correct kmem_list's list_lock */ -static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, +static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node) { int i; @@ -2807,7 +2807,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, } } -static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) +static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) { int batchcount; struct kmem_list3 *l3; @@ -2866,7 +2866,7 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) * * Called with disabled ints. 
*/ -static inline void __cache_free(kmem_cache_t *cachep, void *objp) +static inline void __cache_free(struct kmem_cache *cachep, void *objp) { struct array_cache *ac = cpu_cache_get(cachep); @@ -2925,7 +2925,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) * Allocate an object from this cache. The flags are only relevant * if the cache has no available objects. */ -void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags) +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { return __cache_alloc(cachep, flags); } @@ -2945,7 +2945,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); * * Currently only used for dentry validation. */ -int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) +int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) { unsigned long addr = (unsigned long)ptr; unsigned long min_addr = PAGE_OFFSET; @@ -2986,7 +2986,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) * New and improved: it will now make sure that the object gets * put on the correct node list so that there is no false sharing. */ -void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { unsigned long save_flags; void *ptr; @@ -3010,7 +3010,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); void *kmalloc_node(size_t size, gfp_t flags, int node) { - kmem_cache_t *cachep; + struct kmem_cache *cachep; cachep = kmem_find_general_cachep(size, flags); if (unlikely(cachep == NULL)) @@ -3043,7 +3043,7 @@ EXPORT_SYMBOL(kmalloc_node); */ void *__kmalloc(size_t size, gfp_t flags) { - kmem_cache_t *cachep; + struct kmem_cache *cachep; /* If you want to save a few bytes .text space: replace * __ with kmem_. @@ -3114,7 +3114,7 @@ EXPORT_SYMBOL(__alloc_percpu); * Free an object which was previously allocated from this * cache. */ -void kmem_cache_free(kmem_cache_t *cachep, void *objp) +void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; @@ -3135,7 +3135,7 @@ EXPORT_SYMBOL(kmem_cache_free); */ void kfree(const void *objp) { - kmem_cache_t *c; + struct kmem_cache *c; unsigned long flags; if (unlikely(!objp)) @@ -3172,13 +3172,13 @@ void free_percpu(const void *objp) EXPORT_SYMBOL(free_percpu); #endif -unsigned int kmem_cache_size(kmem_cache_t *cachep) +unsigned int kmem_cache_size(struct kmem_cache *cachep) { return obj_size(cachep); } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(kmem_cache_t *cachep) +const char *kmem_cache_name(struct kmem_cache *cachep) { return cachep->name; } @@ -3187,7 +3187,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); /* * This initializes kmem_list3 for all nodes. 
*/ -static int alloc_kmemlist(kmem_cache_t *cachep) +static int alloc_kmemlist(struct kmem_cache *cachep) { int node; struct kmem_list3 *l3; @@ -3243,7 +3243,7 @@ static int alloc_kmemlist(kmem_cache_t *cachep) } struct ccupdate_struct { - kmem_cache_t *cachep; + struct kmem_cache *cachep; struct array_cache *new[NR_CPUS]; }; @@ -3259,7 +3259,7 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } -static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared) { struct ccupdate_struct new; @@ -3305,7 +3305,7 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, return 0; } -static void enable_cpucache(kmem_cache_t *cachep) +static void enable_cpucache(struct kmem_cache *cachep) { int err; int limit, shared; @@ -3357,7 +3357,7 @@ static void enable_cpucache(kmem_cache_t *cachep) cachep->name, -err); } -static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, +static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, int force, int node) { int tofree; @@ -3402,12 +3402,12 @@ static void cache_reap(void *unused) } list_for_each(walk, &cache_chain) { - kmem_cache_t *searchp; + struct kmem_cache *searchp; struct list_head *p; int tofree; struct slab *slabp; - searchp = list_entry(walk, kmem_cache_t, next); + searchp = list_entry(walk, struct kmem_cache, next); if (searchp->flags & SLAB_NO_REAP) goto next; @@ -3510,15 +3510,15 @@ static void *s_start(struct seq_file *m, loff_t *pos) if (p == &cache_chain) return NULL; } - return list_entry(p, kmem_cache_t, next); + return list_entry(p, struct kmem_cache, next); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - kmem_cache_t *cachep = p; + struct kmem_cache *cachep = p; ++*pos; return cachep->next.next == &cache_chain ? NULL - : list_entry(cachep->next.next, kmem_cache_t, next); + : list_entry(cachep->next.next, struct kmem_cache, next); } static void s_stop(struct seq_file *m, void *p) @@ -3528,7 +3528,7 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { - kmem_cache_t *cachep = p; + struct kmem_cache *cachep = p; struct list_head *q; struct slab *slabp; unsigned long active_objs; @@ -3678,7 +3678,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, mutex_lock(&cache_chain_mutex); res = -EINVAL; list_for_each(p, &cache_chain) { - kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, + next); if (!strcmp(cachep->name, kbuf)) { if (limit < 1 || -- cgit v1.2.3-70-g09d2 From b958f7d9f35bfb61625f201cd92a3fc39504af7a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 1 Feb 2006 03:05:51 -0800 Subject: [PATCH] dump_stack() in oom handler Sometimes it's nice to know who's calling. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 14bd4ec7959..b05ab8f2a56 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -271,6 +271,7 @@ void out_of_memory(gfp_t gfp_mask, int order) if (printk_ratelimit()) { printk("oom-killer: gfp_mask=0x%x, order=%d\n", gfp_mask, order); + dump_stack(); show_mem(); } -- cgit v1.2.3-70-g09d2 From 7fd6b1413082c303613fc137aca9a004740cacf0 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:05:52 -0800 Subject: [PATCH] slab: fix kzalloc and kstrdup caller report for CONFIG_DEBUG_SLAB Fix kzalloc() and kstrdup() caller report for CONFIG_DEBUG_SLAB. We must pass the caller to __cache_alloc() instead of directly doing __builtin_return_address(0) there; otherwise kzalloc() and kstrdup() are reported as the allocation site instead of the real one. Thanks to Valdis Kletnieks for reporting the problem and Steven Rostedt for the original idea. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 7 +++++++ mm/slab.c | 29 ++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/slab.h b/include/linux/slab.h index 1fb77a9cc14..8cf52939d0a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -76,7 +76,14 @@ struct cache_sizes { kmem_cache_t *cs_dmacachep; }; extern struct cache_sizes malloc_sizes[]; + +#ifndef CONFIG_DEBUG_SLAB extern void *__kmalloc(size_t, gfp_t); +#else +extern void *__kmalloc_track_caller(size_t, gfp_t, void*); +#define __kmalloc(size, flags) \ + __kmalloc_track_caller(size, flags, __builtin_return_address(0)) +#endif static inline void *kmalloc(size_t size, gfp_t flags) { diff --git a/mm/slab.c b/mm/slab.c index 6fbd6a1cdeb..67527268b01 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2687,7 +2687,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) return objp; } -static inline void *__cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static __always_inline void * +__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) { unsigned long save_flags; void *objp; @@ -2698,7 +2699,7 @@ static inline void *__cache_alloc(struct kmem_cache *cachep, gfp_t flags) objp = ____cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, - __builtin_return_address(0)); + caller); prefetchw(objp); return objp; } @@ -2927,7 +2928,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags); + return __cache_alloc(cachep, flags, __builtin_return_address(0)); } EXPORT_SYMBOL(kmem_cache_alloc); @@ -3041,7 +3042,8 @@ EXPORT_SYMBOL(kmalloc_node); * platforms. For example, on i386, it means that the memory must come * from the first 16MB. 
*/ -void *__kmalloc(size_t size, gfp_t flags) +static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, + void *caller) { struct kmem_cache *cachep; @@ -3053,10 +3055,27 @@ void *__kmalloc(size_t size, gfp_t flags) cachep = __find_general_cachep(size, flags); if (unlikely(cachep == NULL)) return NULL; - return __cache_alloc(cachep, flags); + return __cache_alloc(cachep, flags, caller); +} + +#ifndef CONFIG_DEBUG_SLAB + +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc(size, flags, NULL); } EXPORT_SYMBOL(__kmalloc); +#else + +void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) +{ + return __do_kmalloc(size, flags, caller); +} +EXPORT_SYMBOL(__kmalloc_track_caller); + +#endif + #ifdef CONFIG_SMP /** * __alloc_percpu - allocate one copy of the object for every present -- cgit v1.2.3-70-g09d2 From a70773ddb96b74c7afe5a5bc859ba45e3d02899e Mon Sep 17 00:00:00 2001 From: "Randy.Dunlap" Date: Wed, 1 Feb 2006 03:05:52 -0800 Subject: [PATCH] mm/slab: add kernel-doc for one function Fix kernel-doc for calculate_slab_order(). Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 67527268b01..afe9c5f8c57 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1559,8 +1559,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) } /** - * calculate_slab_order - calculate size (page order) of slabs and the number - * of objects per slab. + * calculate_slab_order - calculate size (page order) of slabs + * @cachep: pointer to the cache that is being created + * @size: size of objects to be created in this cache. + * @align: required alignment for the objects. + * @flags: slab allocation flags + * + * Also calculates the number of objects per slab. * * This could be made much more intelligent. For now, try to avoid using * high order pages for slabs. When the gfp() functions are more friendly -- cgit v1.2.3-70-g09d2 From ee13d785eac1fbe7e79ecca77bf7e902734a0b30 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 1 Feb 2006 03:05:53 -0800 Subject: [PATCH] slab: fix sparse warning mm/slab.c:1522:13: error: incompatible types for operation (&) Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index afe9c5f8c57..71370256a7e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1571,8 +1571,8 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. */ -static inline size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, - size_t align, gfp_t flags) +static inline size_t calculate_slab_order(struct kmem_cache *cachep, + size_t size, size_t align, unsigned long flags) { size_t left_over = 0; -- cgit v1.2.3-70-g09d2 From 00ac59adfca8f2f339beb0b67054e786c275553e Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Fri, 3 Feb 2006 21:51:14 +0100 Subject: [PATCH] x86_64: Fix memory policy build without CONFIG_HUGETLBFS > mm/mempolicy.c: In function `huge_zonelist': > mm/mempolicy.c:1045: error: `HPAGE_SHIFT' undeclared (first use in this function) > mm/mempolicy.c:1045: error: (Each undeclared identifier is reported only once > mm/mempolicy.c:1045: error: for each function it appears in.) 
> make[1]: *** [mm/mempolicy.o] Error 1 Need to wrap huge_zonelist function with CONFIG_HUGETLBFS. Signed-off-by: Ken Chen Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 27da6d5c77b..3bd7fb7e4b7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1159,6 +1159,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } +#ifdef CONFIG_HUGETLBFS /* Return a zonelist suitable for a huge page allocation. */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) { @@ -1172,6 +1173,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) } return zonelist_policy(GFP_HIGHUSER, pol); } +#endif /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ -- cgit v1.2.3-70-g09d2 From 88a2a4ac6b671a4b0dd5d2d762418904c05f4104 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Feb 2006 23:27:36 -0800 Subject: [PATCH] percpu data: only iterate over possible CPUs percpu_data blindly allocates bootmem memory to store NR_CPUS instances of cpudata, instead of allocating memory only for possible cpus. As a preparation for changing that, we need to convert various 0 -> NR_CPUS loops to use for_each_cpu(). (The above only applies to users of asm-generic/percpu.h. powerpc has gone it alone and is presently only allocating memory for present CPUs, so it's currently corrupting memory). Signed-off-by: Eric Dumazet Cc: "David S. Miller" Cc: James Bottomley Acked-by: Ingo Molnar Cc: Jens Axboe Cc: Anton Blanchard Acked-by: William Irwin Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/nmi.c | 2 +- block/ll_rw_blk.c | 2 +- drivers/scsi/scsi.c | 2 +- fs/file.c | 3 +-- kernel/sched.c | 2 +- mm/page_alloc.c | 10 ++++++---- net/core/dev.c | 2 +- net/core/utils.c | 4 ++-- net/ipv4/proc.c | 2 +- net/ipv6/proc.c | 2 +- net/socket.c | 2 +- 11 files changed, 17 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index d661703ac1c..63f39a7e2c9 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -138,7 +138,7 @@ static int __init check_nmi_watchdog(void) if (nmi_watchdog == NMI_LOCAL_APIC) smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_cpu(cpu) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; local_irq_enable(); mdelay((10*1000)/nmi_hz); // wait 10 ticks diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index f9fc07efd2d..e5aad831458 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3453,7 +3453,7 @@ int __init blk_dev_init(void) iocontext_cachep = kmem_cache_create("blkdev_ioc", sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL); - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 245ca99a641..c551bb84dbf 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -1245,7 +1245,7 @@ static int __init init_scsi(void) if (error) goto cleanup_sysctl; - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) INIT_LIST_HEAD(&per_cpu(scsi_done_q, i)); devfs_mk_dir("scsi"); diff --git a/fs/file.c b/fs/file.c index fd066b261c7..cea7cbea11d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -379,7 +379,6 @@ static void __devinit 
fdtable_defer_list_init(int cpu) void __init files_defer_init(void) { int i; - /* Really early - can't use for_each_cpu */ - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) fdtable_defer_list_init(i); } diff --git a/kernel/sched.c b/kernel/sched.c index f77f23f8f47..839466fdfb4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6109,7 +6109,7 @@ void __init sched_init(void) runqueue_t *rq; int i, j, k; - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { prio_array_t *array; rq = cpu_rq(i); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44b4eb4202d..dde04ff4be3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1213,18 +1213,21 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; - memset(ret, 0, sizeof(*ret)); + memset(ret, 0, nr * sizeof(unsigned long)); cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { unsigned long *in, *out, off; + if (!cpu_isset(cpu, *cpumask)) + continue; + in = (unsigned long *)&per_cpu(page_states, cpu); cpu = next_cpu(cpu, *cpumask); - if (cpu < NR_CPUS) + if (likely(cpu < NR_CPUS)) prefetch(&per_cpu(page_states, cpu)); out = (unsigned long *)ret; @@ -1886,8 +1889,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static struct per_cpu_pageset - boot_pageset[NR_CPUS]; +static struct per_cpu_pageset boot_pageset[NR_CPUS]; /* * Dynamically allocate memory for the diff --git a/net/core/dev.c b/net/core/dev.c index ffb82073056..2afb0de9532 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3237,7 +3237,7 @@ static int __init net_dev_init(void) * Initialise the packet receive queues. */ - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { struct softnet_data *queue; queue = &per_cpu(softnet_data, i); diff --git a/net/core/utils.c b/net/core/utils.c index ac1d1fcf867..fdc4f38bc46 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -121,7 +121,7 @@ void __init net_random_init(void) { int i; - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { struct nrnd_state *state = &per_cpu(net_rand_state,i); __net_srandom(state, i+jiffies); } @@ -133,7 +133,7 @@ static int net_random_reseed(void) unsigned long seed[NR_CPUS]; get_random_bytes(seed, sizeof(seed)); - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { struct nrnd_state *state = &per_cpu(net_rand_state,i); __net_srandom(state, seed[i]); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 39d49dc333a..1b167c4bb3b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -49,7 +49,7 @@ static int fold_prot_inuse(struct proto *proto) int res = 0; int cpu; - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_cpu(cpu) res += proto->stats[cpu].inuse; return res; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 50a13e75d70..4238b1ed886 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -38,7 +38,7 @@ static int fold_prot_inuse(struct proto *proto) int res = 0; int cpu; - for (cpu=0; cpu<NR_CPUS; cpu++) + for_each_cpu(cpu) res += proto->stats[cpu].inuse; return res; diff --git a/net/socket.c b/net/socket.c index b38a263853c..a00851f981d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2078,7 +2078,7 @@ void socket_seq_show(struct seq_file *seq) int cpu; int counter = 0; - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_cpu(cpu) counter += per_cpu(sockets_in_use, cpu); /* It can be negative, by the way.
8) */ -- cgit v1.2.3-70-g09d2 From 64b4a954b03a1153fb8ae38d6ffbd991e01a1e80 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 4 Feb 2006 23:27:55 -0800 Subject: [PATCH] hugetlb: add comment explaining reasons for Bus Errors I just spent some time researching a Bus Error. Turns out that the huge page fault handler can return VM_FAULT_SIGBUS for various conditions where no huge page is available. Add a note explaining the reasoning in the source. Signed-off-by: Christoph Lameter Acked-by: William Lee Irwin III Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b21d78c941b..ceb3ebb3c39 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -444,6 +444,15 @@ retry: page = alloc_huge_page(vma, address); if (!page) { hugetlb_put_quota(mapping); + /* + * No huge pages available. So this is an OOM + * condition but we do not want to trigger the OOM + * killer, so we return VM_FAULT_SIGBUS. + * + * A program using hugepages may fault with Bus Error + * because no huge pages are available in the cpuset, per + * memory policy or because all are in use! + */ goto out; } -- cgit v1.2.3-70-g09d2 From 2e1217cf96b54d3b2d0162930608159e73507fbf Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sat, 4 Feb 2006 23:27:56 -0800 Subject: [PATCH] NUMA slab locking fixes: move color_next to l3 colour_next is used as an index to add a colouring offset to a new slab in the cache (colour_off * colour_next). Now with the NUMA aware slab allocator, it makes sense to colour slabs added on the same node sequentially with colour_next. This patch moves the colouring index "colour_next" per-node by placing it on kmem_list3 rather than kmem_cache. This also helps simplify locking for CPU up and down paths. Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 71370256a7e..2317096166d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -294,6 +294,7 @@ struct kmem_list3 { unsigned long next_reap; int free_touched; unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ spinlock_t list_lock; struct array_cache *shared; /* shared per node */ struct array_cache **alien; /* on other nodes */ @@ -344,6 +345,7 @@ static void kmem_list3_init(struct kmem_list3 *parent) INIT_LIST_HEAD(&parent->slabs_free); parent->shared = NULL; parent->alien = NULL; + parent->colour_next = 0; spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; @@ -390,7 +392,6 @@ struct kmem_cache { size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ - unsigned int colour_next; /* cache colouring */ struct kmem_cache *slabp_cache; unsigned int slab_size; unsigned int dflags; /* dynamic flags */ @@ -1119,7 +1120,6 @@ void __init kmem_cache_init(void) BUG(); cache_cache.colour = left_over / cache_cache.colour_off; - cache_cache.colour_next = 0; cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + sizeof(struct slab), cache_line_size()); @@ -2324,18 +2324,19 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) */ ctor_flags |= SLAB_CTOR_ATOMIC; - /* About to mess with non-constant members - lock. 
*/ + /* Take the l3 list lock to change the colour_next on this node */ check_irq_off(); - spin_lock(&cachep->spinlock); + l3 = cachep->nodelists[nodeid]; + spin_lock(&l3->list_lock); /* Get colour for the slab, and cal the next value. */ - offset = cachep->colour_next; - cachep->colour_next++; - if (cachep->colour_next >= cachep->colour) - cachep->colour_next = 0; - offset *= cachep->colour_off; + offset = l3->colour_next; + l3->colour_next++; + if (l3->colour_next >= cachep->colour) + l3->colour_next = 0; + spin_unlock(&l3->list_lock); - spin_unlock(&cachep->spinlock); + offset *= cachep->colour_off; check_irq_off(); if (local_flags & __GFP_WAIT) @@ -2367,7 +2368,6 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (local_flags & __GFP_WAIT) local_irq_disable(); check_irq_off(); - l3 = cachep->nodelists[nodeid]; spin_lock(&l3->list_lock); /* Make slab active. */ -- cgit v1.2.3-70-g09d2 From ca3b9b91735316f0ec7f01976f85842e0bfe5c6e Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sat, 4 Feb 2006 23:27:58 -0800 Subject: [PATCH] NUMA slab locking fixes: irq disabling from cahep->spinlock to l3 lock Earlier, we had to disable on chip interrupts while taking the cachep->spinlock because, at cache_grow, on every addition of a slab to a slab cache, we incremented colour_next which was protected by the cachep->spinlock, and cache_grow could occur at interrupt context. Since, now we protect the per-node colour_next with the node's list_lock, we do not need to disable on chip interrupts while taking the per-cache spinlock, but we just need to disable interrupts when taking the per-node kmem_list3 list_lock. Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2317096166d..d3f68543f9f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -987,7 +987,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, cpumask_t mask; mask = node_to_cpumask(node); - spin_lock_irq(&cachep->spinlock); + spin_lock(&cachep->spinlock); /* cpu is dead; no one can alloc from it. 
*/ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; @@ -996,7 +996,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, if (!l3) goto unlock_cache; - spin_lock(&l3->list_lock); + spin_lock_irq(&l3->list_lock); /* Free limit for this kmem_list3 */ l3->free_limit -= cachep->batchcount; @@ -1004,7 +1004,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, free_block(cachep, nc->entry, nc->avail, node); if (!cpus_empty(mask)) { - spin_unlock(&l3->list_lock); + spin_unlock_irq(&l3->list_lock); goto unlock_cache; } @@ -1023,13 +1023,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, /* free slabs belonging to this node */ if (__node_shrink(cachep, node)) { cachep->nodelists[node] = NULL; - spin_unlock(&l3->list_lock); + spin_unlock_irq(&l3->list_lock); kfree(l3); } else { - spin_unlock(&l3->list_lock); + spin_unlock_irq(&l3->list_lock); } unlock_cache: - spin_unlock_irq(&cachep->spinlock); + spin_unlock(&cachep->spinlock); kfree(nc); } mutex_unlock(&cache_chain_mutex); @@ -2011,18 +2011,18 @@ static void drain_cpu_caches(struct kmem_cache *cachep) smp_call_function_all_cpus(do_drain, cachep); check_irq_on(); - spin_lock_irq(&cachep->spinlock); + spin_lock(&cachep->spinlock); for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { - spin_lock(&l3->list_lock); + spin_lock_irq(&l3->list_lock); drain_array_locked(cachep, l3->shared, 1, node); - spin_unlock(&l3->list_lock); + spin_unlock_irq(&l3->list_lock); if (l3->alien) drain_alien_cache(cachep, l3); } } - spin_unlock_irq(&cachep->spinlock); + spin_unlock(&cachep->spinlock); } static int __node_shrink(struct kmem_cache *cachep, int node) @@ -2338,7 +2338,6 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) offset *= cachep->colour_off; - check_irq_off(); if (local_flags & __GFP_WAIT) local_irq_enable(); @@ -2725,6 +2724,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node BUG_ON(!l3); retry: + check_irq_off(); spin_lock(&l3->list_lock); entry = l3->slabs_partial.next; if (entry == &l3->slabs_partial) { @@ -3304,11 +3304,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); check_irq_on(); - spin_lock_irq(&cachep->spinlock); + spin_lock(&cachep->spinlock); cachep->batchcount = batchcount; cachep->limit = limit; cachep->shared = shared; - spin_unlock_irq(&cachep->spinlock); + spin_unlock(&cachep->spinlock); for_each_online_cpu(i) { struct array_cache *ccold = new.new[i]; @@ -3564,8 +3564,7 @@ static int s_show(struct seq_file *m, void *p) int node; struct kmem_list3 *l3; - check_irq_on(); - spin_lock_irq(&cachep->spinlock); + spin_lock(&cachep->spinlock); active_objs = 0; num_slabs = 0; for_each_online_node(node) { @@ -3573,7 +3572,8 @@ static int s_show(struct seq_file *m, void *p) if (!l3) continue; - spin_lock(&l3->list_lock); + check_irq_on(); + spin_lock_irq(&l3->list_lock); list_for_each(q, &l3->slabs_full) { slabp = list_entry(q, struct slab, list); @@ -3600,7 +3600,7 @@ static int s_show(struct seq_file *m, void *p) free_objects += l3->free_objects; shared_avail += l3->shared->avail; - spin_unlock(&l3->list_lock); + spin_unlock_irq(&l3->list_lock); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -3644,7 +3644,7 @@ static int s_show(struct seq_file *m, void *p) } #endif seq_putc(m, '\n'); - spin_unlock_irq(&cachep->spinlock); + spin_unlock(&cachep->spinlock); return 0; } -- cgit v1.2.3-70-g09d2 From 
4484ebf12bdb0ebcdc6e8951243cbab3d7f6f4c1 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sat, 4 Feb 2006 23:27:59 -0800 Subject: [PATCH] NUMA slab locking fixes: fix cpu down and up locking This fixes locking and bugs in cpu_down and cpu_up paths of the NUMA slab allocator. Sonny Rao reported problems sometime back on POWER5 boxes, when the last cpu on the nodes were being offlined. We could not reproduce the same on x86_64 because the cpumask (node_to_cpumask) was not being updated on cpu down. Since that issue is now fixed, we can reproduce Sonny's problems on x86_64 NUMA, and here is the fix. The problem earlier was on CPU_DOWN, if it was the last cpu on the node to go down, the array_caches (shared, alien) and the kmem_list3 of the node were being freed (kfree) with the kmem_list3 lock held. If the l3 or the array_caches were to come from the same cache being cleared, we hit on badness. This patch cleans up the locking in cpu_up and cpu_down path. We cannot really free l3 on cpu down because, there is no node offlining yet and even though a cpu is not yet up, node local memory can be allocated for it. So l3s are usually allocated at keme_cache_create and destroyed at kmem_cache_destroy. Hence, we don't need cachep->spinlock protection to get to the cachep->nodelist[nodeid] either. Patch survived onlining and offlining on a 4 core 2 node Tyan box with a 4 dbench process running all the time. Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 123 +++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 85 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d3f68543f9f..9cc049a942c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -884,14 +884,14 @@ static void __drain_alien_cache(struct kmem_cache *cachep, } } -static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3) +static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) { int i = 0; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { - ac = l3->alien[i]; + ac = alien[i]; if (ac) { spin_lock_irqsave(&ac->lock, flags); __drain_alien_cache(cachep, ac, i); @@ -901,8 +901,11 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3) } #else #define alloc_alien_cache(node, limit) do { } while (0) -#define free_alien_cache(ac_ptr) do { } while (0) -#define drain_alien_cache(cachep, l3) do { } while (0) +#define drain_alien_cache(cachep, alien) do { } while (0) + +static inline void free_alien_cache(struct array_cache **ac_ptr) +{ +} #endif static int __devinit cpuup_callback(struct notifier_block *nfb, @@ -936,6 +939,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + /* + * The l3s don't come and go as CPUs come and + * go. cache_chain_mutex is sufficient + * protection here. 
+ */ cachep->nodelists[node] = l3; } @@ -950,26 +958,47 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, & array cache's */ list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; + struct array_cache *shared; + struct array_cache **alien; nc = alloc_arraycache(node, cachep->limit, - cachep->batchcount); + cachep->batchcount); if (!nc) goto bad; + shared = alloc_arraycache(node, + cachep->shared * cachep->batchcount, + 0xbaadf00d); + if (!shared) + goto bad; +#ifdef CONFIG_NUMA + alien = alloc_alien_cache(node, cachep->limit); + if (!alien) + goto bad; +#endif cachep->array[cpu] = nc; l3 = cachep->nodelists[node]; BUG_ON(!l3); - if (!l3->shared) { - if (!(nc = alloc_arraycache(node, - cachep->shared * - cachep->batchcount, - 0xbaadf00d))) - goto bad; - /* we are serialised from CPU_DEAD or - CPU_UP_CANCELLED by the cpucontrol lock */ - l3->shared = nc; + spin_lock_irq(&l3->list_lock); + if (!l3->shared) { + /* + * We are serialised from CPU_DEAD or + * CPU_UP_CANCELLED by the cpucontrol lock + */ + l3->shared = shared; + shared = NULL; } +#ifdef CONFIG_NUMA + if (!l3->alien) { + l3->alien = alien; + alien = NULL; + } +#endif + spin_unlock_irq(&l3->list_lock); + + kfree(shared); + free_alien_cache(alien); } mutex_unlock(&cache_chain_mutex); break; @@ -978,23 +1007,32 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + /* + * Even if all the cpus of a node are down, we don't free the + * kmem_list3 of any cache. This to avoid a race between + * cpu_down, and a kmalloc allocation from another cpu for + * memory from the node of the cpu going down. The list3 + * structure is usually allocated from kmem_cache_create() and + * gets destroyed at kmem_cache_destroy(). + */ /* fall thru */ case CPU_UP_CANCELED: mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; + struct array_cache *shared; + struct array_cache **alien; cpumask_t mask; mask = node_to_cpumask(node); - spin_lock(&cachep->spinlock); /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; l3 = cachep->nodelists[node]; if (!l3) - goto unlock_cache; + goto free_array_cache; spin_lock_irq(&l3->list_lock); @@ -1005,33 +1043,43 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, if (!cpus_empty(mask)) { spin_unlock_irq(&l3->list_lock); - goto unlock_cache; + goto free_array_cache; } - if (l3->shared) { + shared = l3->shared; + if (shared) { free_block(cachep, l3->shared->entry, l3->shared->avail, node); - kfree(l3->shared); l3->shared = NULL; } - if (l3->alien) { - drain_alien_cache(cachep, l3); - free_alien_cache(l3->alien); - l3->alien = NULL; - } - /* free slabs belonging to this node */ - if (__node_shrink(cachep, node)) { - cachep->nodelists[node] = NULL; - spin_unlock_irq(&l3->list_lock); - kfree(l3); - } else { - spin_unlock_irq(&l3->list_lock); + alien = l3->alien; + l3->alien = NULL; + + spin_unlock_irq(&l3->list_lock); + + kfree(shared); + if (alien) { + drain_alien_cache(cachep, alien); + free_alien_cache(alien); } - unlock_cache: - spin_unlock(&cachep->spinlock); +free_array_cache: kfree(nc); } + /* + * In the previous loop, all the objects were freed to + * the respective cache's slabs, now we can go ahead and + * shrink each nodelist to its limit. 
+ */ + list_for_each_entry(cachep, &cache_chain, next) { + l3 = cachep->nodelists[node]; + if (!l3) + continue; + spin_lock_irq(&l3->list_lock); + /* free slabs belonging to this node */ + __node_shrink(cachep, node); + spin_unlock_irq(&l3->list_lock); + } mutex_unlock(&cache_chain_mutex); break; #endif @@ -2011,7 +2059,6 @@ static void drain_cpu_caches(struct kmem_cache *cachep) smp_call_function_all_cpus(do_drain, cachep); check_irq_on(); - spin_lock(&cachep->spinlock); for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { @@ -2019,10 +2066,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) drain_array_locked(cachep, l3->shared, 1, node); spin_unlock_irq(&l3->list_lock); if (l3->alien) - drain_alien_cache(cachep, l3); + drain_alien_cache(cachep, l3->alien); } } - spin_unlock(&cachep->spinlock); } static int __node_shrink(struct kmem_cache *cachep, int node) @@ -3440,7 +3486,7 @@ static void cache_reap(void *unused) l3 = searchp->nodelists[numa_node_id()]; if (l3->alien) - drain_alien_cache(searchp, l3); + drain_alien_cache(searchp, l3->alien); spin_lock_irq(&l3->list_lock); drain_array_locked(searchp, cpu_cache_get(searchp), 0, @@ -3598,7 +3644,8 @@ static int s_show(struct seq_file *m, void *p) num_slabs++; } free_objects += l3->free_objects; - shared_avail += l3->shared->avail; + if (l3->shared) + shared_avail += l3->shared->avail; spin_unlock_irq(&l3->list_lock); } -- cgit v1.2.3-70-g09d2 From 7a21ef6fe902ac0ad53b45af6851ae5ec3a64299 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 5 Feb 2006 11:26:38 -0800 Subject: mm/slab.c (non-NUMA): Fix compile warning and clean up code The non-NUMA case would do an unmatched "free_alien_cache()" on an alien pointer that had never been allocated. It might not matter from a code generation standpoint (since in the non-NUMA case, the code doesn't actually _do_ anything), but it not only results in a compiler warning, it's really really ugly too. Fix the compiler warning by just having a matching dummy allocation. That also avoids an unnecessary #ifdef in the code. Signed-off-by: Linus Torvalds --- mm/slab.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 9cc049a942c..d66c2b0d971 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -900,12 +900,18 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al } } #else -#define alloc_alien_cache(node, limit) do { } while (0) + #define drain_alien_cache(cachep, alien) do { } while (0) +static inline struct array_cache **alloc_alien_cache(int node, int limit) +{ + return (struct array_cache **) 0x01020304ul; +} + static inline void free_alien_cache(struct array_cache **ac_ptr) { } + #endif static int __devinit cpuup_callback(struct notifier_block *nfb, @@ -970,11 +976,10 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, 0xbaadf00d); if (!shared) goto bad; -#ifdef CONFIG_NUMA + alien = alloc_alien_cache(node, cachep->limit); if (!alien) goto bad; -#endif cachep->array[cpu] = nc; l3 = cachep->nodelists[node]; -- cgit v1.2.3-70-g09d2
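
Note on the caller-report fix above ("slab: fix kzalloc and kstrdup caller report for CONFIG_DEBUG_SLAB"): the whole trick is that the shared allocation helper receives the caller address as an explicit parameter instead of calling __builtin_return_address(0) itself, otherwise every inline wrapper is reported as its own caller. A minimal standalone C sketch of that pattern follows; my_alloc, my_zalloc and do_alloc are invented names for illustration only, not the kernel functions.

#include <stdio.h>
#include <stdlib.h>

/*
 * Shared worker: it does NOT call __builtin_return_address(0) itself,
 * it trusts the caller address handed down to it (the kernel marks the
 * corresponding helper __always_inline in the patch above).
 */
static inline void *do_alloc(size_t size, void *caller)
{
	printf("%zu bytes requested from %p\n", size, caller);
	return malloc(size);
}

/* Exported entry point: records who called *it*. */
void *my_alloc(size_t size)
{
	return do_alloc(size, __builtin_return_address(0));
}

/* Convenience wrapper: still reports its own caller, not itself. */
void *my_zalloc(size_t size)
{
	return do_alloc(size, __builtin_return_address(0));
}

int main(void)
{
	free(my_alloc(16));
	free(my_zalloc(32));
	return 0;
}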
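The "percpu data: only iterate over possible CPUs" patch above replaces fixed 0..NR_CPUS-1 loops with for_each_cpu(), which walks only the CPUs that can actually exist. The rough standalone sketch below expresses the same idea with an ordinary bitmask; possible_mask and for_each_set_cpu are made-up names for the example, not the kernel's cpumask API.

#include <stdio.h>

#define NR_CPUS 32

/* Bitmask of CPUs that may ever be present (set up at boot in the kernel). */
static unsigned long possible_mask = 0x0000000fUL;	/* CPUs 0-3 possible */

/* Visit only set bits instead of blindly walking 0..NR_CPUS-1. */
#define for_each_set_cpu(cpu, mask)				\
	for ((cpu) = 0; (cpu) < NR_CPUS; (cpu)++)		\
		if ((mask) & (1UL << (cpu)))

int main(void)
{
	static int per_cpu_counter[NR_CPUS];
	long total = 0;
	int cpu;

	/* Touch only the per-CPU slots that can actually be used. */
	for_each_set_cpu(cpu, possible_mask)
		per_cpu_counter[cpu] = cpu + 1;

	for_each_set_cpu(cpu, possible_mask)
		total += per_cpu_counter[cpu];

	printf("sum over possible CPUs = %ld\n", total);
	return 0;
}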
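The "move color_next to l3" patch above keeps the slab colouring cursor per node and advances it, with wraparound, under that node's list lock. Ignoring the locking, the arithmetic reduces to the small sketch below; the struct names and the example values are invented purely for illustration.

#include <stdio.h>

/* Per-node state: the colouring cursor lives with the node, not the cache. */
struct node_list {
	unsigned int colour_next;	/* advanced under the node's list lock */
};

/* Per-cache colouring parameters. */
struct cache {
	unsigned int colour;		/* number of distinct colours */
	unsigned int colour_off;	/* bytes per colour step */
};

/* Pick the byte offset for the next slab grown on this node. */
static unsigned int next_colour_offset(struct cache *c, struct node_list *l3)
{
	unsigned int offset = l3->colour_next;

	l3->colour_next++;
	if (l3->colour_next >= c->colour)
		l3->colour_next = 0;	/* wrap around */

	return offset * c->colour_off;
}

int main(void)
{
	struct cache c = { .colour = 3, .colour_off = 64 };
	struct node_list l3 = { .colour_next = 0 };

	for (int i = 0; i < 5; i++)
		printf("slab %d starts at colour offset %u\n",
		       i, next_colour_offset(&c, &l3));
	return 0;
}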
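The CPU-down rework above ("NUMA slab locking fixes: fix cpu down and up locking") is careful to detach the shared and alien array caches from the node list while holding the list lock, and to free them only after the lock has been dropped. A loose userspace analogy of that detach-then-free ordering, using a pthread mutex in place of the kernel spinlock (all names invented for the sketch):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node_state {
	pthread_mutex_t lock;
	int *shared;			/* resource owned by the node */
};

/* Detach the resource under the lock, free it only after unlocking. */
static void teardown(struct node_state *ns)
{
	int *shared;

	pthread_mutex_lock(&ns->lock);
	shared = ns->shared;		/* detach while protected */
	ns->shared = NULL;
	pthread_mutex_unlock(&ns->lock);

	free(shared);			/* potentially slow work, done unlocked */
}

int main(void)
{
	struct node_state ns = { .lock = PTHREAD_MUTEX_INITIALIZER };

	ns.shared = malloc(sizeof(*ns.shared));
	if (ns.shared)
		*ns.shared = 42;

	teardown(&ns);
	printf("shared cache detached and freed outside the lock\n");
	return 0;
}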