Merge branch 'master' of /usr/src/ntfs-2.6/

author: Anton Altaparmakov <aia21@cantab.net> 2006-01-19 16:39:33 +0000
committer: Anton Altaparmakov <aia21@cantab.net> 2006-01-19 16:39:33 +0000
commit: 944d79559d154c12becde0dab327016cf438f46c (patch)
tree: 50c101806f4d3b6585222dda060559eb4f3e005a /mm
parent: d087e4bdd24ebe3ae3d0b265b6573ec901af4b4b (diff)
parent: 0f36b018b2e314d45af86449f1a97facb1fbe300 (diff)
34 files changed, 2898 insertions, 1306 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 21eb51d4da8..a9cb80ae640 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
 
 config FLATMEM_MANUAL
 	bool "Flat Memory"
-	depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE
+	depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
 	help
 	  This option allows you to change some of the ways that
 	  Linux manages its memory internally.  Most users will
@@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS
 	default "4096" if ARM && !CPU_CACHE_VIPT
 	default "4096" if PARISC && !PA20
 	default "4"
+
+#
+# support for page migration
+#
+config MIGRATION
+	def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
+	depends on SWAP
diff --git a/mm/Makefile b/mm/Makefile
index 2fa6d2ca9f2..9aa03fa1dcc 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o slab.o swap.o truncate.o vmscan.o \
-			   prio_tree.o $(mmu-y)
+			   readahead.o swap.o truncate.o vmscan.o \
+			   prio_tree.o util.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 16b9465eb4e..35c32290f71 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 		unsigned long v = ~map[i / BITS_PER_LONG];
 
 		if (gofast && v == ~0UL) {
-			int j, order;
+			int order;
 
 			page = pfn_to_page(pfn);
 			count += BITS_PER_LONG;
-			__ClearPageReserved(page);
 			order = ffs(BITS_PER_LONG) - 1;
-			set_page_refs(page, order);
-			for (j = 1; j < BITS_PER_LONG; j++) {
-				if (j + 16 < BITS_PER_LONG)
-					prefetchw(page + j + 16);
-				__ClearPageReserved(page + j);
-				set_page_count(page + j, 0);
-			}
-			__free_pages(page, order);
+			__free_pages_bootmem(page, order);
 			i += BITS_PER_LONG;
 			page += BITS_PER_LONG;
 		} else if (v) {
@@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 			for (m = 1; m && i < idx; m<<=1, page++, i++) {
 				if (v & m) {
 					count++;
-					__ClearPageReserved(page);
-					set_page_refs(page, 0);
-					__free_page(page);
+					__free_pages_bootmem(page, 0);
 				}
 			}
 		} else {
@@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 	count = 0;
 	for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
 		count++;
-		__ClearPageReserved(page);
-		set_page_count(page, 1);
-		__free_page(page);
+		__free_pages_bootmem(page, 0);
 	}
 	total += count;
 	bdata->node_bootmem_map = NULL;
@@ -393,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
 	return(free_all_bootmem_core(NODE_DATA(0)));
 }
 
-void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal,
-				unsigned long limit)
+void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
 {
 	pg_data_t *pgdat = pgdat_list;
 	void *ptr;
 
 	for_each_pgdat(pgdat)
 		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
-						 align, goal, limit)))
+						 align, goal, 0)))
 			return(ptr);
 
 	/*
@@ -413,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
 }
 
 
-void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align,
-				     unsigned long goal, unsigned long limit)
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
+				   unsigned long goal)
 {
 	void *ptr;
 
-	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit);
+	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return (ptr);
 
-	return __alloc_bootmem_limit(size, align, goal, limit);
+	return __alloc_bootmem(size, align, goal);
+}
+
+#define LOW32LIMIT 0xffffffff
+
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
+{
+	pg_data_t *pgdat = pgdat_list;
+	void *ptr;
+
+	for_each_pgdat(pgdat)
+		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+						 align, goal, LOW32LIMIT)))
+			return(ptr);
+
+	/*
+	 * Whoops, we cannot satisfy the allocation request.
+	 */
+	printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of low memory");
+	return NULL;
 }
 
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+				       unsigned long align, unsigned long goal)
+{
+	return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
+}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 5f19e87bc5a..d257c89e770 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 	if (!file)
 		return -EBADF;
 
+	if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+		ret = -ESPIPE;
+		goto out;
+	}
+
 	mapping = file->f_mapping;
 	if (!mapping || len < 0) {
 		ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bfde15..44da3d47699 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -15,6 +15,7 @@
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/aio.h>
+#include <linux/capability.h>
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -61,7 +62,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
- *  ->i_sem
+ *  ->i_mutex
  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
@@ -73,9 +74,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *    ->lock_page		(access_process_vm)
  *
  *  ->mmap_sem
- *    ->i_sem			(msync)
+ *    ->i_mutex			(msync)
  *
- *  ->i_sem
+ *  ->i_mutex
  *    ->i_alloc_sem             (various)
  *
  *  ->inode_lock
@@ -93,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *    ->private_lock		(try_to_unmap_one)
  *    ->tree_lock		(try_to_unmap_one)
  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
+ *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode_lock		(page_remove_rmap->set_page_dirty)
@@ -276,11 +278,11 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
  * integrity" operation.  It waits upon in-flight writeout before starting and
  * waiting upon new writeout.  If there was an IO error, return it.
  *
- * We need to re-take i_sem during the generic_osync_inode list walk because
+ * We need to re-take i_mutex during the generic_osync_inode list walk because
  * it is otherwise livelockable.
  */
 int sync_page_range(struct inode *inode, struct address_space *mapping,
-			loff_t pos, size_t count)
+			loff_t pos, loff_t count)
 {
 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -290,9 +292,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
 		return 0;
 	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 	if (ret == 0) {
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	if (ret == 0)
 		ret = wait_on_page_writeback_range(mapping, start, end);
@@ -301,13 +303,12 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
 EXPORT_SYMBOL(sync_page_range);
 
 /*
- * Note: Holding i_sem across sync_page_range_nolock is not a good idea
+ * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
  * as it forces O_SYNC writers to different parts of the same file
  * to be serialised right until io completion.
  */
-static int sync_page_range_nolock(struct inode *inode,
-				  struct address_space *mapping,
-				  loff_t pos, size_t count)
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+			   loff_t pos, loff_t count)
 {
 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -322,6 +323,7 @@ static int sync_page_range_nolock(struct inode *inode,
 		ret = wait_on_page_writeback_range(mapping, start, end);
 	return ret;
 }
+EXPORT_SYMBOL(sync_page_range_nolock);
 
 /**
  * filemap_fdatawait - walk the list of under-writeback pages of the given
@@ -343,30 +345,44 @@ EXPORT_SYMBOL(filemap_fdatawait);
 
 int filemap_write_and_wait(struct address_space *mapping)
 {
-	int retval = 0;
+	int err = 0;
 
 	if (mapping->nrpages) {
-		retval = filemap_fdatawrite(mapping);
-		if (retval == 0)
-			retval = filemap_fdatawait(mapping);
+		err = filemap_fdatawrite(mapping);
+		/*
+		 * Even if the above returned an error, some pages may
+		 * have been written (e.g. -ENOSPC), so we wait for them.
+		 * But -EIO is a special case: it may indicate that the
+		 * worst (e.g. a bug) has happened, so we avoid waiting.
+		 */
+		if (err != -EIO) {
+			int err2 = filemap_fdatawait(mapping);
+			if (!err)
+				err = err2;
+		}
 	}
-	return retval;
+	return err;
 }
+EXPORT_SYMBOL(filemap_write_and_wait);
 
 int filemap_write_and_wait_range(struct address_space *mapping,
 				 loff_t lstart, loff_t lend)
 {
-	int retval = 0;
+	int err = 0;
 
 	if (mapping->nrpages) {
-		retval = __filemap_fdatawrite_range(mapping, lstart, lend,
-						    WB_SYNC_ALL);
-		if (retval == 0)
-			retval = wait_on_page_writeback_range(mapping,
-						    lstart >> PAGE_CACHE_SHIFT,
-						    lend >> PAGE_CACHE_SHIFT);
+		err = __filemap_fdatawrite_range(mapping, lstart, lend,
+						 WB_SYNC_ALL);
+		/* See comment of filemap_write_and_wait() */
+		if (err != -EIO) {
+			int err2 = wait_on_page_writeback_range(mapping,
+						lstart >> PAGE_CACHE_SHIFT,
+						lend >> PAGE_CACHE_SHIFT);
+			if (!err)
+				err = err2;
+		}
 	}
-	return retval;
+	return err;
 }
 
 /*
@@ -555,11 +571,12 @@ repeat:
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
 			read_unlock_irq(&mapping->tree_lock);
-			lock_page(page);
+			__lock_page(page);
 			read_lock_irq(&mapping->tree_lock);
 
 			/* Has the page been truncated while we slept? */
-			if (page->mapping != mapping || page->index != offset) {
+			if (unlikely(page->mapping != mapping ||
+				     page->index != offset)) {
 				unlock_page(page);
 				page_cache_release(page);
 				goto repeat;
@@ -831,8 +848,13 @@ readpage:
 		/* Start the actual read. The read will unlock the page. */
 		error = mapping->a_ops->readpage(filp, page);
 
-		if (unlikely(error))
+		if (unlikely(error)) {
+			if (error == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				goto find_page;
+			}
 			goto readpage_error;
+		}
 
 		if (!PageUptodate(page)) {
 			lock_page(page);
@@ -1152,26 +1174,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct page *page; 
-	int error;
+	int ret;
 
-	page = page_cache_alloc_cold(mapping);
-	if (!page)
-		return -ENOMEM;
+	do {
+		page = page_cache_alloc_cold(mapping);
+		if (!page)
+			return -ENOMEM;
+
+		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+		if (ret == 0)
+			ret = mapping->a_ops->readpage(file, page);
+		else if (ret == -EEXIST)
+			ret = 0; /* losing race to add is OK */
 
-	error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
-	if (!error) {
-		error = mapping->a_ops->readpage(file, page);
 		page_cache_release(page);
-		return error;
-	}
 
-	/*
-	 * We arrive here in the unlikely event that someone 
-	 * raced with us and added our page to the cache first
-	 * or we are out of memory for radix-tree nodes.
-	 */
-	page_cache_release(page);
-	return error == -EEXIST ? 0 : error;
+	} while (ret == AOP_TRUNCATED_PAGE);
+		
+	return ret;
 }
 
 #define MMAP_LOTSAMISS  (100)
@@ -1331,10 +1351,14 @@ page_not_uptodate:
 		goto success;
 	}
 
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1358,10 +1382,14 @@ page_not_uptodate:
 		goto success;
 	}
 	ClearPageError(page);
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1444,10 +1472,14 @@ page_not_uptodate:
 		goto success;
 	}
 
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1470,10 +1502,14 @@ page_not_uptodate:
 	}
 
 	ClearPageError(page);
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1858,7 +1894,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	/*
 	 * Sync the fs metadata but not the minor inode changes and
 	 * of course not the data as we did direct DMA for the IO.
-	 * i_sem is held, which protects generic_osync_inode() from
+	 * i_mutex is held, which protects generic_osync_inode() from
 	 * livelocking.
 	 */
 	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -1934,12 +1970,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
 		if (unlikely(status)) {
 			loff_t isize = i_size_read(inode);
+
+			if (status != AOP_TRUNCATED_PAGE)
+				unlock_page(page);
+			page_cache_release(page);
+			if (status == AOP_TRUNCATED_PAGE)
+				continue;
 			/*
 			 * prepare_write() may have instantiated a few blocks
 			 * outside i_size.  Trim these off again.
 			 */
-			unlock_page(page);
-			page_cache_release(page);
 			if (pos + bytes > isize)
 				vmtruncate(inode, isize);
 			break;
@@ -1952,6 +1992,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 						cur_iov, iov_base, bytes);
 		flush_dcache_page(page);
 		status = a_ops->commit_write(file, page, offset, offset+bytes);
+		if (status == AOP_TRUNCATED_PAGE) {
+			page_cache_release(page);
+			continue;
+		}
 		if (likely(copied > 0)) {
 			if (!status)
 				status = copied;
@@ -2066,7 +2110,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	if (err)
 		goto out;
 
-	inode_update_time(inode, 1);
+	file_update_time(file);
 
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2153,10 +2197,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
 
 	BUG_ON(iocb->ki_pos != pos);
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
 						&iocb->ki_pos);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		ssize_t err;
@@ -2178,9 +2222,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
 	struct iovec local_iov = { .iov_base = (void __user *)buf,
 					.iov_len = count };
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		ssize_t err;
@@ -2214,9 +2258,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
 	struct inode *inode = mapping->host;
 	ssize_t ret;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err;
@@ -2230,7 +2274,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
 EXPORT_SYMBOL(generic_file_writev);
 
 /*
- * Called under i_sem for writes to S_ISREG files.   Returns -EIO if something
+ * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
  * went wrong during pagecache shootdown.
  */
 static ssize_t
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9cf687e4a29..b960ac8e591 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
 	*ppos = pos;
 	/*
 	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold i_sem.
+	 * cannot change under us because we hold i_mutex.
 	 */
 	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
@@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	loff_t pos;
 	ssize_t ret;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 
 	if (!access_ok(VERIFY_READ, buf, len)) {
 		ret=-EFAULT;
@@ -383,14 +383,14 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	if (ret)
 		goto out_backing;
 
-	inode_update_time(inode, 1);
+	file_update_time(filp);
 
 	ret = __xip_file_write (filp, buf, count, pos, ppos);
 
  out_backing:
 	current->backing_dev_info = NULL;
  out_up:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7c471..b21d78c941b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,9 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+#include <linux/cpuset.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -36,18 +39,22 @@ static void enqueue_huge_page(struct page *page)
 	free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+				unsigned long address)
 {
 	int nid = numa_node_id();
 	struct page *page = NULL;
+	struct zonelist *zonelist = huge_zonelist(vma, address);
+	struct zone **z;
 
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
+	for (z = zonelist->zones; *z; z++) {
+		nid = (*z)->zone_pgdat->node_id;
+		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+		    !list_empty(&hugepage_freelists[nid]))
+			break;
 	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
+
+	if (*z) {
 		page = list_entry(hugepage_freelists[nid].next,
 				  struct page, lru);
 		list_del(&page->lru);
@@ -85,13 +92,13 @@ void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
-struct page *alloc_huge_page(void)
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
 	int i;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page();
+	page = dequeue_huge_page(vma, addr);
 	if (!page) {
 		spin_unlock(&hugetlb_lock);
 		return NULL;
@@ -194,7 +201,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	spin_lock(&hugetlb_lock);
 	try_to_free_low(count);
 	while (count < nr_huge_pages) {
-		struct page *page = dequeue_huge_page();
+		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
 		update_and_free_page(page);
@@ -261,11 +268,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
 	.nopage = hugetlb_nopage,
 };
 
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+				int writable)
 {
 	pte_t entry;
 
-	if (vma->vm_flags & VM_WRITE) {
+	if (writable) {
 		entry =
 		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 	} else {
@@ -277,12 +285,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
 	return entry;
 }
 
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+				   unsigned long address, pte_t *ptep)
+{
+	pte_t entry;
+
+	entry = pte_mkwrite(pte_mkdirty(*ptep));
+	ptep_set_access_flags(vma, address, ptep, entry, 1);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+}
+
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *vma)
 {
 	pte_t *src_pte, *dst_pte, entry;
 	struct page *ptepage;
 	unsigned long addr;
+	int cow;
+
+	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		src_pte = huge_pte_offset(src, addr);
@@ -294,6 +317,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		spin_lock(&dst->page_table_lock);
 		spin_lock(&src->page_table_lock);
 		if (!pte_none(*src_pte)) {
+			if (cow)
+				ptep_set_wrprotect(src, addr, src_pte);
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
@@ -345,57 +370,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-static struct page *find_lock_huge_page(struct address_space *mapping,
-			unsigned long idx)
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, pte_t pte)
 {
-	struct page *page;
-	int err;
-	struct inode *inode = mapping->host;
-	unsigned long size;
+	struct page *old_page, *new_page;
+	int i, avoidcopy;
 
-retry:
-	page = find_lock_page(mapping, idx);
-	if (page)
-		goto out;
+	old_page = pte_page(pte);
 
-	/* Check to make sure the mapping hasn't been truncated */
-	size = i_size_read(inode) >> HPAGE_SHIFT;
-	if (idx >= size)
-		goto out;
+	/* If no-one else is actually using this page, avoid the copy
+	 * and just make the page writable */
+	avoidcopy = (page_count(old_page) == 1);
+	if (avoidcopy) {
+		set_huge_ptep_writable(vma, address, ptep);
+		return VM_FAULT_MINOR;
+	}
 
-	if (hugetlb_get_quota(mapping))
-		goto out;
-	page = alloc_huge_page();
-	if (!page) {
-		hugetlb_put_quota(mapping);
-		goto out;
+	page_cache_get(old_page);
+	new_page = alloc_huge_page(vma, address);
+
+	if (!new_page) {
+		page_cache_release(old_page);
+
+		/* Logically this is OOM, not a SIGBUS, but an OOM
+		 * could cause the kernel to go killing other
+		 * processes which won't help the hugepage situation
+		 * at all (?) */
+		return VM_FAULT_SIGBUS;
 	}
 
-	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
-	if (err) {
-		put_page(page);
-		hugetlb_put_quota(mapping);
-		if (err == -EEXIST)
-			goto retry;
-		page = NULL;
+	spin_unlock(&mm->page_table_lock);
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+		copy_user_highpage(new_page + i, old_page + i,
+				   address + i*PAGE_SIZE);
+	spin_lock(&mm->page_table_lock);
+
+	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+	if (likely(pte_same(*ptep, pte))) {
+		/* Break COW */
+		set_huge_pte_at(mm, address, ptep,
+				make_huge_pte(vma, new_page, 1));
+		/* Make the old page be freed below */
+		new_page = old_page;
 	}
-out:
-	return page;
+	page_cache_release(new_page);
+	page_cache_release(old_page);
+	return VM_FAULT_MINOR;
 }
 
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, int write_access)
+int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
 	unsigned long idx;
 	unsigned long size;
-	pte_t *pte;
 	struct page *page;
 	struct address_space *mapping;
-
-	pte = huge_pte_alloc(mm, address);
-	if (!pte)
-		goto out;
+	pte_t new_pte;
 
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -405,9 +436,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_lock_huge_page(mapping, idx);
-	if (!page)
-		goto out;
+retry:
+	page = find_lock_page(mapping, idx);
+	if (!page) {
+		if (hugetlb_get_quota(mapping))
+			goto out;
+		page = alloc_huge_page(vma, address);
+		if (!page) {
+			hugetlb_put_quota(mapping);
+			goto out;
+		}
+
+		if (vma->vm_flags & VM_SHARED) {
+			int err;
+
+			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+			if (err) {
+				put_page(page);
+				hugetlb_put_quota(mapping);
+				if (err == -EEXIST)
+					goto retry;
+				goto out;
+			}
+		} else
+			lock_page(page);
+	}
 
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -415,11 +468,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto backout;
 
 	ret = VM_FAULT_MINOR;
-	if (!pte_none(*pte))
+	if (!pte_none(*ptep))
 		goto backout;
 
 	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+				&& (vma->vm_flags & VM_SHARED)));
+	set_huge_pte_at(mm, address, ptep, new_pte);
+
+	if (write_access && !(vma->vm_flags & VM_SHARED)) {
+		/* Optimization, do the COW without a second fault */
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+	}
+
 	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 out:
@@ -433,6 +494,33 @@ backout:
 	goto out;
 }
 
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, int write_access)
+{
+	pte_t *ptep;
+	pte_t entry;
+	int ret;
+
+	ptep = huge_pte_alloc(mm, address);
+	if (!ptep)
+		return VM_FAULT_OOM;
+
+	entry = *ptep;
+	if (pte_none(entry))
+		return hugetlb_no_page(mm, vma, address, ptep, write_access);
+
+	ret = VM_FAULT_MINOR;
+
+	spin_lock(&mm->page_table_lock);
+	/* Check for a racing update before calling hugetlb_cow */
+	if (likely(pte_same(entry, *ptep)))
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+	spin_unlock(&mm->page_table_lock);
+
+	return ret;
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i)
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb3..17256bb2f4e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-/* page_alloc.c */
-extern void set_page_refs(struct page *page, int order);
+static inline void set_page_refs(struct page *page, int order)
+{
+#ifdef CONFIG_MMU
+	set_page_count(page, 1);
+#else
+	int i;
+
+	/*
+	 * We need to reference all the pages for this order, otherwise if
+	 * anyone accesses one of the pages with (get/put) it will be freed.
+	 * - eg: access_process_vm()
+	 */
+	for (i = 0; i < (1 << order); i++)
+		set_page_count(page + i, 1);
+#endif /* CONFIG_MMU */
+}
+
+extern void fastcall __init __free_pages_bootmem(struct page *page,
+						unsigned int order);
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf0400a2..ae0ae3ea299 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 	return 0;
 }
 
+/*
+ * Application wants to free up the pages and associated backing store.
+ * This is effectively punching a hole into the middle of a file.
+ *
+ * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
+ * Other filesystems return -ENOSYS.
+ */
+static long madvise_remove(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	struct address_space *mapping;
+        loff_t offset, endoff;
+
+	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+		return -EINVAL;
+
+	if (!vma->vm_file || !vma->vm_file->f_mapping
+		|| !vma->vm_file->f_mapping->host) {
+			return -EINVAL;
+	}
+
+	mapping = vma->vm_file->f_mapping;
+
+	offset = (loff_t)(start - vma->vm_start)
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	endoff = (loff_t)(end - vma->vm_start - 1)
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	return  vmtruncate_range(mapping->host, offset, endoff);
+}
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	case MADV_RANDOM:
 		error = madvise_behavior(vma, prev, start, end, behavior);
 		break;
+	case MADV_REMOVE:
+		error = madvise_remove(vma, start, end);
+		break;
 
 	case MADV_WILLNEED:
 		error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  *		some pages ahead.
  *  MADV_DONTNEED - the application is finished with the given range,
  *		so the kernel can free resources associated with it.
+ *  MADV_REMOVE - the application wants to free up the given range of
+ *		pages and associated backing store.
  *
  * return values:
  *  zero    - success
diff --git a/mm/memory.c b/mm/memory.c
index d8dde07a365..7a11ddd5060 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ gotten:
 		update_mmu_cache(vma, address, entry);
 		lazy_mmu_prot_update(entry);
 		lru_cache_add_active(new_page);
-		page_add_anon_rmap(new_page, vma, address);
+		page_add_new_anon_rmap(new_page, vma, address);
 
 		/* Free the old page.. */
 		new_page = old_page;
@@ -1770,9 +1770,32 @@ out_big:
 out_busy:
 	return -ETXTBSY;
 }
-
 EXPORT_SYMBOL(vmtruncate);
 
+int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * If the underlying filesystem is not going to provide
+	 * a way to truncate a range of blocks (punch a hole) -
+	 * we should return failure right now.
+	 */
+	if (!inode->i_op || !inode->i_op->truncate_range)
+		return -ENOSYS;
+
+	mutex_lock(&inode->i_mutex);
+	down_write(&inode->i_alloc_sem);
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
+	truncate_inode_pages_range(mapping, offset, end);
+	inode->i_op->truncate_range(inode, offset, end);
+	up_write(&inode->i_alloc_sem);
+	mutex_unlock(&inode->i_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(vmtruncate_range);
+
 /* 
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1954,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto release;
 		inc_mm_counter(mm, anon_rss);
 		lru_cache_add_active(page);
-		SetPageReferenced(page);
-		page_add_anon_rmap(page, vma, address);
+		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
 		page = ZERO_PAGE(address);
@@ -2086,7 +2108,7 @@ retry:
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
 			lru_cache_add_active(new_page);
-			page_add_anon_rmap(new_page, vma, address);
+			page_add_new_anon_rmap(new_page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
@@ -2245,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
+EXPORT_SYMBOL_GPL(__handle_mm_fault);
+
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f6d4af8af8a..a918f77f02f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 				  int nr_pages);
 static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
 {
-	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nr_pages = PAGES_PER_SECTION;
 	int ret;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402cc9c9..73790188b0e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+/* Internal flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
+#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
+#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -93,7 +102,7 @@ static kmem_cache_t *sn_cache;
 
 /* Highest zone. An specific allocation for a zone below that is not
    policied. */
-static int policy_zone;
+int policy_zone = ZONE_DMA;
 
 struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +140,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes) {
-		int k;
-		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (!z->present_pages)
-				continue;
-			zl->zones[num++] = z;
-			if (k > policy_zone)
-				policy_zone = k;
-		}
-	}
+	for_each_node_mask(nd, *nodes)
+		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 	zl->zones[num] = NULL;
 	return zl;
 }
@@ -180,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 		break;
 	}
 	policy->policy = mode;
+	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 	return policy;
 }
 
-/* Ensure all existing pages follow the policy. */
+static void gather_stats(struct page *, void *);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+				unsigned long flags);
+
+/* Scan through pages checking if pages follow certain conditions. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -201,8 +208,28 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		page = vm_normal_page(vma, addr, *pte);
 		if (!page)
 			continue;
+		/*
+		 * The check for PageReserved here is important to avoid
+		 * handling zero pages and other pages that may have been
+		 * marked special by the system.
+		 *
+		 * If PageReserved were not checked here then, for example,
+		 * the location of the zero page could have an influence
+		 * on MPOL_MF_STRICT, zero pages would be counted for
+		 * the per node stats, and there would be useless attempts
+		 * to put zero pages on the migration list.
+		 */
+		if (PageReserved(page))
+			continue;
 		nid = page_to_nid(page);
-		if (!node_isset(nid, *nodes))
+		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+			continue;
+
+		if (flags & MPOL_MF_STATS)
+			gather_stats(page, private);
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+			migrate_page_add(page, private, flags);
+		else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
@@ -210,7 +237,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 }
 
 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -220,14 +249,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes,
+				    flags, private))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -237,14 +269,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes,
+				    flags, private))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		void *private)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -254,36 +289,61 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes,
+				    flags, private))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (
+		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If MPOL_MF_MOVE or MPOL_MF_MOVE_ALL is set then isolate pages from
+ * the LRU and put them on the list passed in via private.
+ */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-	    nodemask_t *nodes, unsigned long flags)
+		const nodemask_t *nodes, unsigned long flags, void *private)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
+	/* Clear the LRU lists so pages can be isolated */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_add_drain_all();
+
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return ERR_PTR(-EFAULT);
-		if (prev && prev->vm_end < vma->vm_start)
-			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+			if (!vma->vm_next && vma->vm_end < end)
+				return ERR_PTR(-EFAULT);
+			if (prev && prev->vm_end < vma->vm_start)
+				return ERR_PTR(-EFAULT);
+		}
+		if (!is_vm_hugetlb_page(vma) &&
+		    ((flags & MPOL_MF_STRICT) ||
+		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+				vma_migratable(vma)))) {
 			unsigned long endvma = vma->vm_end;
+
 			if (endvma > end)
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes,
+						flags, private);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -342,51 +402,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	if (!nodes)
 		return 0;
 
-	/* Update current mems_allowed */
-	cpuset_update_current_mems_allowed();
-	/* Ignore nodes not set in current->mems_allowed */
-	cpuset_restrict_to_mems_allowed(nodes->bits);
-	return mpol_check_policy(mode, nodes);
-}
-
-long do_mbind(unsigned long start, unsigned long len,
-		unsigned long mode, nodemask_t *nmask, unsigned long flags)
-{
-	struct vm_area_struct *vma;
-	struct mm_struct *mm = current->mm;
-	struct mempolicy *new;
-	unsigned long end;
-	int err;
-
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
-		return -EINVAL;
-	if (start & ~PAGE_MASK)
-		return -EINVAL;
-	if (mode == MPOL_DEFAULT)
-		flags &= ~MPOL_MF_STRICT;
-	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
-	end = start + len;
-	if (end < start)
+	cpuset_update_task_memory_state();
+	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 		return -EINVAL;
-	if (end == start)
-		return 0;
-	if (mpol_check_policy(mode, nmask))
-		return -EINVAL;
-	new = mpol_new(mode, nmask);
-	if (IS_ERR(new))
-		return PTR_ERR(new);
-
-	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-			mode,nodes_addr(nodes)[0]);
-
-	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
-	err = PTR_ERR(vma);
-	if (!IS_ERR(vma))
-		err = mbind_range(vma, start, end, new);
-	up_write(&mm->mmap_sem);
-	mpol_free(new);
-	return err;
+	return mpol_check_policy(mode, nodes);
 }
 
 /* Set the process memory policy */
@@ -457,7 +476,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
@@ -509,11 +528,141 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 }
 
 /*
+ * page migration
+ */
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+				unsigned long flags)
+{
+	/*
+	 * Avoid migrating a page that is shared with others.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+		if (isolate_lru_page(page))
+			list_add(&page->lru, pagelist);
+	}
+}
+
+static int swap_pages(struct list_head *pagelist)
+{
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int n;
+
+	n = migrate_pages(pagelist, NULL, &moved, &failed);
+	putback_lru_pages(&failed);
+	putback_lru_pages(&moved);
+
+	return n;
+}
+
+/*
+ * For now migrate_pages simply swaps out the pages from nodes that are in
+ * the source set but not in the target set. In the future, we would
+ * want a function that moves pages between the two nodesets in such
+ * a way as to preserve the physical layout as much as possible.
+ *
+ * Returns the number of pages that could not be moved.
+ */
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+	LIST_HEAD(pagelist);
+	int count = 0;
+	nodemask_t nodes;
+
+	nodes_andnot(nodes, *from_nodes, *to_nodes);
+
+	down_read(&mm->mmap_sem);
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+	if (!list_empty(&pagelist)) {
+		count = swap_pages(&pagelist);
+		putback_lru_pages(&pagelist);
+	}
+
+	up_read(&mm->mmap_sem);
+	return count;
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+		unsigned long mode, nodemask_t *nmask, unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	struct mempolicy *new;
+	unsigned long end;
+	int err;
+	LIST_HEAD(pagelist);
+
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+	    || mode > MPOL_MAX)
+		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+
+	if (mode == MPOL_DEFAULT)
+		flags &= ~MPOL_MF_STRICT;
+
+	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+	end = start + len;
+
+	if (end < start)
+		return -EINVAL;
+	if (end == start)
+		return 0;
+
+	if (mpol_check_policy(mode, nmask))
+		return -EINVAL;
+
+	new = mpol_new(mode, nmask);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
+	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+			mode,nodes_addr(nodes)[0]);
+
+	down_write(&mm->mmap_sem);
+	vma = check_range(mm, start, end, nmask,
+			  flags | MPOL_MF_INVERT, &pagelist);
+
+	err = PTR_ERR(vma);
+	if (!IS_ERR(vma)) {
+		int nr_failed = 0;
+
+		err = mbind_range(vma, start, end, new);
+		if (!list_empty(&pagelist))
+			nr_failed = swap_pages(&pagelist);
+
+		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);
+
+	up_write(&mm->mmap_sem);
+	mpol_free(new);
+	return err;
+}
+
+/*
  * User space interface with variable sized bitmaps for nodelists.
  */
 
 /* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
 	unsigned long k;
@@ -602,6 +751,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	return do_set_mempolicy(mode, &nodes);
 }
 
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+		const unsigned long __user *old_nodes,
+		const unsigned long __user *new_nodes)
+{
+	struct mm_struct *mm;
+	struct task_struct *task;
+	nodemask_t old;
+	nodemask_t new;
+	nodemask_t task_nodes;
+	int err;
+
+	err = get_nodes(&old, old_nodes, maxnode);
+	if (err)
+		return err;
+
+	err = get_nodes(&new, new_nodes, maxnode);
+	if (err)
+		return err;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_pid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm)
+		return -EINVAL;
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+	/* Is the user allowed to access the target nodes? */
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+out:
+	mmput(mm);
+	return err;
+}
+
+
 /* Retrieve NUMA policy */
 asmlinkage long sys_get_mempolicy(int __user *policy,
 				unsigned long __user *nmask,
@@ -708,8 +916,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 #endif
 
 /* Return effective policy for a VMA */
-struct mempolicy *
-get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
+static struct mempolicy * get_vma_policy(struct task_struct *task,
+		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
 
@@ -768,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	return nid;
 }
 
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+	switch (policy->policy) {
+	case MPOL_INTERLEAVE:
+		return interleave_nodes(policy);
+
+	case MPOL_BIND:
+		/*
+		 * Follow bind policy behavior and start allocation at the
+		 * first node.
+		 */
+		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+	case MPOL_PREFERRED:
+		if (policy->v.preferred_node >= 0)
+			return policy->v.preferred_node;
+		/* Fall through */
+
+	default:
+		return numa_node_id();
+	}
+}
+
 /* Do static interleaving for a VMA with known offset. */
 static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
@@ -785,6 +1020,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
 	return nid;
 }
 
+/* Determine a node number for interleave */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+		 struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+	if (vma) {
+		unsigned long off;
+
+		off = vma->vm_pgoff;
+		off += (addr - vma->vm_start) >> shift;
+		return offset_il_node(pol, vma, off);
+	} else
+		return interleave_nodes(pol);
+}
+
+/* Return a zonelist suitable for a huge page allocation. */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+
+	if (pol->policy == MPOL_INTERLEAVE) {
+		unsigned nid;
+
+		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+	}
+	return zonelist_policy(GFP_HIGHUSER, pol);
+}
+
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -829,19 +1092,12 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 
 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 		unsigned nid;
-		if (vma) {
-			unsigned long off;
-			off = vma->vm_pgoff;
-			off += (addr - vma->vm_start) >> PAGE_SHIFT;
-			nid = offset_il_node(pol, vma, off);
-		} else {
-			/* fall back to process interleaving */
-			nid = interleave_nodes(pol);
-		}
+
+		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -862,7 +1118,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
  *	interrupt context and apply the current process NUMA policy.
  *	Returns NULL when no page can be allocated.
  *
- *	Don't call cpuset_update_current_mems_allowed() unless
+ *	Don't call cpuset_update_task_memory_state() unless
  *	1) it's ok to take cpuset_sem (can WAIT), and
  *	2) allocating for current task (not interrupt).
  */
@@ -871,7 +1127,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	struct mempolicy *pol = current->mempolicy;
 
 	if ((gfp & __GFP_WAIT) && !in_interrupt())
-		cpuset_update_current_mems_allowed();
+		cpuset_update_task_memory_state();
 	if (!pol || in_interrupt())
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
@@ -880,6 +1136,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed().  This
+ * keeps mempolicies cpuset relative after its cpuset moves.  See
+ * further kernel/cpuset.c update_nodemask().
+ */
+void *cpuset_being_rebound;
+
 /* Slow path of a mempolicy copy */
 struct mempolicy *__mpol_copy(struct mempolicy *old)
 {
@@ -887,6 +1152,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 
 	if (!new)
 		return ERR_PTR(-ENOMEM);
+	if (current_cpuset_is_being_rebound()) {
+		nodemask_t mems = cpuset_mems_allowed(current);
+		mpol_rebind_policy(old, &mems);
+	}
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
@@ -940,54 +1209,6 @@ void __mpol_free(struct mempolicy *p)
 }
 
 /*
- * Hugetlb policy. Same as above, just works with node numbers instead of
- * zonelists.
- */
-
-/* Find first node suitable for an allocation */
-int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
-	switch (pol->policy) {
-	case MPOL_DEFAULT:
-		return numa_node_id();
-	case MPOL_BIND:
-		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
-	case MPOL_INTERLEAVE:
-		return interleave_nodes(pol);
-	case MPOL_PREFERRED:
-		return pol->v.preferred_node >= 0 ?
-				pol->v.preferred_node : numa_node_id();
-	}
-	BUG();
-	return 0;
-}
-
-/* Find secondary valid nodes for an allocation */
-int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
-	switch (pol->policy) {
-	case MPOL_PREFERRED:
-	case MPOL_DEFAULT:
-	case MPOL_INTERLEAVE:
-		return 1;
-	case MPOL_BIND: {
-		struct zone **z;
-		for (z = pol->v.zonelist->zones; *z; z++)
-			if ((*z)->zone_pgdat->node_id == nid)
-				return 1;
-		return 0;
-	}
-	default:
-		BUG();
-		return 0;
-	}
-}
-
-/*
  * Shared memory backing store policy support.
  *
  * Remember policies even when nobody has shared memory mapped.
@@ -1141,6 +1362,30 @@ restart:
 	return 0;
 }
 
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+				nodemask_t *policy_nodes)
+{
+	info->root = RB_ROOT;
+	spin_lock_init(&info->lock);
+
+	if (policy != MPOL_DEFAULT) {
+		struct mempolicy *newpol;
+
+		/* Falls back to MPOL_DEFAULT on any error */
+		newpol = mpol_new(policy, policy_nodes);
+		if (!IS_ERR(newpol)) {
+			/* Create pseudo-vma that contains just the policy */
+			struct vm_area_struct pvma;
+
+			memset(&pvma, 0, sizeof(struct vm_area_struct));
+			/* Policy covers entire file */
+			pvma.vm_end = TASK_SIZE;
+			mpol_set_shared_policy(info, &pvma, newpol);
+			mpol_free(newpol);
+		}
+	}
+}
+
 int mpol_set_shared_policy(struct shared_policy *info,
 			struct vm_area_struct *vma, struct mempolicy *npol)
 {
@@ -1209,25 +1454,31 @@ void numa_default_policy(void)
 }
 
 /* Migrate a policy to a different set of nodes */
-static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
-							const nodemask_t *new)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
+	nodemask_t *mpolmask;
 	nodemask_t tmp;
 
 	if (!pol)
 		return;
+	mpolmask = &pol->cpuset_mems_allowed;
+	if (nodes_equal(*mpolmask, *newmask))
+		return;
 
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		nodes_remap(tmp, pol->v.nodes, *old, *new);
+		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
-		current->il_next = node_remap(current->il_next, *old, *new);
+		*mpolmask = *newmask;
+		current->il_next = node_remap(current->il_next,
+						*mpolmask, *newmask);
 		break;
 	case MPOL_PREFERRED:
 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-								*old, *new);
+						*mpolmask, *newmask);
+		*mpolmask = *newmask;
 		break;
 	case MPOL_BIND: {
 		nodemask_t nodes;
@@ -1237,7 +1488,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 		nodes_clear(nodes);
 		for (z = pol->v.zonelist->zones; *z; z++)
 			node_set((*z)->zone_pgdat->node_id, nodes);
-		nodes_remap(tmp, nodes, *old, *new);
+		nodes_remap(tmp, nodes, *mpolmask, *newmask);
 		nodes = tmp;
 
 		zonelist = bind_zonelist(&nodes);
@@ -1252,6 +1503,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 			kfree(pol->v.zonelist);
 			pol->v.zonelist = zonelist;
 		}
+		*mpolmask = *newmask;
 		break;
 	}
 	default:
@@ -1261,12 +1513,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 }
 
 /*
- * Someone moved this task to different nodes.  Fixup mempolicies.
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+{
+	mpol_rebind_policy(tsk->mempolicy, new);
+}
+
+/*
+ * Rebind each vma in mm to new nodemask.
  *
- * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
- * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ * Call holding a reference to mm.  Takes mm->mmap_sem during call.
+ */
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+	struct vm_area_struct *vma;
+
+	down_write(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		mpol_rebind_policy(vma->vm_policy, new);
+	up_write(&mm->mmap_sem);
+}
+
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
+
+static const char *policy_types[] = { "default", "prefer", "bind",
+				      "interleave" };
+
+/*
+ * Convert a mempolicy into a string.
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
  */
-void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+{
+	char *p = buffer;
+	int l;
+	nodemask_t nodes;
+	int mode = pol ? pol->policy : MPOL_DEFAULT;
+
+	switch (mode) {
+	case MPOL_DEFAULT:
+		nodes_clear(nodes);
+		break;
+
+	case MPOL_PREFERRED:
+		nodes_clear(nodes);
+		node_set(pol->v.preferred_node, nodes);
+		break;
+
+	case MPOL_BIND:
+		get_zonemask(pol, &nodes);
+		break;
+
+	case MPOL_INTERLEAVE:
+		nodes = pol->v.nodes;
+		break;
+
+	default:
+		BUG();
+		return -EFAULT;
+	}
+
+	l = strlen(policy_types[mode]);
+ 	if (buffer + maxlen < p + l + 1)
+ 		return -ENOSPC;
+
+	strcpy(p, policy_types[mode]);
+	p += l;
+
+	if (!nodes_empty(nodes)) {
+		if (buffer + maxlen < p + 2)
+			return -ENOSPC;
+		*p++ = '=';
+	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
+	}
+	return p - buffer;
+}
+
+struct numa_maps {
+	unsigned long pages;
+	unsigned long anon;
+	unsigned long mapped;
+	unsigned long mapcount_max;
+	unsigned long node[MAX_NUMNODES];
+};
+
+static void gather_stats(struct page *page, void *private)
+{
+	struct numa_maps *md = private;
+	int count = page_mapcount(page);
+
+	if (count)
+		md->mapped++;
+
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
+	md->pages++;
+
+	if (PageAnon(page))
+		md->anon++;
+
+	md->node[page_to_nid(page)]++;
+	cond_resched();
+}
+
+int show_numa_map(struct seq_file *m, void *v)
 {
-	rebind_policy(current->mempolicy, old, new);
+	struct task_struct *task = m->private;
+	struct vm_area_struct *vma = v;
+	struct numa_maps *md;
+	int n;
+	char buffer[50];
+
+	if (!vma->vm_mm)
+		return 0;
+
+	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
+	if (!md)
+		return 0;
+
+	check_pgd_range(vma, vma->vm_start, vma->vm_end,
+		    &node_online_map, MPOL_MF_STATS, md);
+
+	if (md->pages) {
+		mpol_to_str(buffer, sizeof(buffer),
+			    get_vma_policy(task, vma, vma->vm_start));
+
+		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
+			   vma->vm_start, buffer, md->pages,
+			   md->mapped, md->mapcount_max);
+
+		if (md->anon)
+			seq_printf(m," anon=%lu",md->anon);
+
+		for_each_online_node(n)
+			if (md->node[n])
+				seq_printf(m, " N%d=%lu", n, md->node[n]);
+
+		seq_putc(m, '\n');
+	}
+	kfree(md);
+
+	if (m->count < m->size)
+		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+	return 0;
 }
+
diff --git a/mm/mlock.c b/mm/mlock.c
index 4ae3a46ff76..b90c59573ab 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -5,6 +5,7 @@
  *  (C) Copyright 2002 Christoph Hellwig
  */
 
+#include <linux/capability.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/mempolicy.h>
diff --git a/mm/mmap.c b/mm/mmap.c
index 64ba4dbcb7d..47556d2b3e9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
+#include <linux/capability.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/fs.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index ddaeee9a0b6..1903bdf65e4 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -13,6 +13,7 @@
 #include <linux/shm.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
+#include <linux/capability.h>
 #include <linux/fs.h>
 #include <linux/highmem.h>
 #include <linux/security.h>
diff --git a/mm/msync.c b/mm/msync.c
index 1b5b6f662dc..3563a56e1a5 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -137,7 +137,7 @@ static int msync_interval(struct vm_area_struct *vma,
 			ret = filemap_fdatawrite(mapping);
 			if (file->f_op && file->f_op->fsync) {
 				/*
-				 * We don't take i_sem here because mmap_sem
+				 * We don't take i_mutex here because mmap_sem
 				 * is already held.
 				 */
 				err = file->f_op->fsync(file,file->f_dentry,1);
diff --git a/mm/nommu.c b/mm/nommu.c
index c1196812876..c10262d6823 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
 {
 	return 0;
 }
+
+struct page *filemap_nopage(struct vm_area_struct *area,
+			unsigned long address, int *type)
+{
+	BUG();
+	return NULL;
+}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d348b903595..14bd4ec7959 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -274,6 +274,7 @@ void out_of_memory(gfp_t gfp_mask, int order)
 		show_mem();
 	}
 
+	cpuset_lock();
 	read_lock(&tasklist_lock);
 retry:
 	p = select_bad_process();
@@ -284,6 +285,7 @@ retry:
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
+		cpuset_unlock();
 		panic("Out of memory and no killable processes...\n");
 	}
 
@@ -293,12 +295,14 @@ retry:
 
  out:
 	read_unlock(&tasklist_lock);
+	cpuset_unlock();
 	if (mm)
 		mmput(mm);
 
 	/*
 	 * Give "p" a good chance of killing itself before we
-	 * retry to allocate memory.
+	 * retry to allocate memory unless "p" is current
 	 */
-	schedule_timeout_interruptible(1);
+	if (!test_thread_flag(TIF_MEMDIE))
+		schedule_timeout_interruptible(1);
 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0166ea15c9e..945559fb63d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -46,7 +46,7 @@
 static long ratelimit_pages = 32;
 
 static long total_pages;	/* The total number of pages in the machine. */
-static int dirty_exceeded;	/* Dirty mem may be over limit */
+static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
 
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
@@ -212,7 +212,8 @@ static void balance_dirty_pages(struct address_space *mapping)
 		if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
 			break;
 
-		dirty_exceeded = 1;
+		if (!dirty_exceeded)
+			dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -234,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		blk_congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded)
 		dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
@@ -550,11 +551,17 @@ void __init page_writeback_init(void)
 
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
+	int ret;
+
 	if (wbc->nr_to_write <= 0)
 		return 0;
+	wbc->for_writepages = 1;
 	if (mapping->a_ops->writepages)
-		return mapping->a_ops->writepages(mapping, wbc);
-	return generic_writepages(mapping, wbc);
+		ret =  mapping->a_ops->writepages(mapping, wbc);
+	else
+		ret = generic_writepages(mapping, wbc);
+	wbc->for_writepages = 0;
+	return ret;
 }
 
 /**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c87fc..df54e2fc8ee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly;
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
+int percpu_pagelist_fraction;
+
+static void fastcall free_hot_cold_page(struct page *page, int cold);
 
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
@@ -81,6 +85,7 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
+#ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
 	int ret = 0;
@@ -122,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page)
 	return 0;
 }
 
-static void bad_page(const char *function, struct page *page)
+#else
+static inline int bad_range(struct zone *zone, struct page *page)
 {
-	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
-		function, current->comm, page);
-	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-		(int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
-		page->mapping, page_mapcount(page), page_count(page));
-	printk(KERN_EMERG "Backtrace:\n");
+	return 0;
+}
+#endif
+
+static void bad_page(struct page *page)
+{
+	printk(KERN_EMERG "Bad page state in process '%s'\n"
+		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
+		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
+		KERN_EMERG "Backtrace:\n",
+		current->comm, page, (int)(2*sizeof(unsigned long)),
+		(unsigned long)page->flags, page->mapping,
+		page_mapcount(page), page_count(page));
 	dump_stack();
-	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
 	page->flags &= ~(1 << PG_lru	|
 			1 << PG_private |
 			1 << PG_locked	|
@@ -184,19 +196,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	if (!PageCompound(page))
-		return;
-
-	if (page[1].index != order)
-		bad_page(__FUNCTION__, page);
+	if (unlikely(page[1].index != order))
+		bad_page(page);
 
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
-		if (!PageCompound(p))
-			bad_page(__FUNCTION__, page);
-		if (page_private(p) != (unsigned long)page)
-			bad_page(__FUNCTION__, page);
+		if (unlikely(!PageCompound(p) |
+				(page_private(p) != (unsigned long)page)))
+			bad_page(page);
 		ClearPageCompound(p);
 	}
 }
@@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
 /*
  * This function checks whether a page is free && is the buddy
  * we can do coalesce a page and its buddy if
- * (a) the buddy is free &&
- * (b) the buddy is on the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (a) the buddy is not in a hole &&
+ * (b) the buddy is free &&
+ * (c) the buddy is on the buddy system &&
+ * (d) a page and its buddy have the same order.
  * for recording page's order, we use page_private(page) and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
 {
+#ifdef CONFIG_HOLES_IN_ZONE
+	if (!pfn_valid(page_to_pfn(page)))
+		return 0;
+#endif
+
        if (PagePrivate(page)           &&
            (page_order(page) == order) &&
             page_count(page) == 0)
@@ -294,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order)
  * -- wli
  */
 
-static inline void __free_pages_bulk (struct page *page,
+static inline void __free_one_page(struct page *page,
 		struct zone *zone, unsigned int order)
 {
 	unsigned long page_idx;
 	int order_size = 1 << order;
 
-	if (unlikely(order))
+	if (unlikely(PageCompound(page)))
 		destroy_compound_page(page, order);
 
 	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page,
 		struct free_area *area;
 		struct page *buddy;
 
-		combined_idx = __find_combined_index(page_idx, order);
 		buddy = __page_find_buddy(page, page_idx, order);
-
-		if (bad_range(zone, buddy))
-			break;
 		if (!page_is_buddy(buddy, order))
 			break;		/* Move the buddy up one level. */
+
 		list_del(&buddy->lru);
 		area = zone->free_area + order;
 		area->nr_free--;
 		rmv_page_order(buddy);
+		combined_idx = __find_combined_index(page_idx, order);
 		page = page + (combined_idx - page_idx);
 		page_idx = combined_idx;
 		order++;
@@ -334,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page,
 	zone->free_area[order].nr_free++;
 }
 
-static inline int free_pages_check(const char *function, struct page *page)
+static inline int free_pages_check(struct page *page)
 {
-	if (	page_mapcount(page) ||
-		page->mapping != NULL ||
-		page_count(page) != 0 ||
+	if (unlikely(page_mapcount(page) |
+		(page->mapping != NULL)  |
+		(page_count(page) != 0)  |
 		(page->flags & (
 			1 << PG_lru	|
 			1 << PG_private |
@@ -348,8 +360,8 @@ static inline int free_pages_check(const char *function, struct page *page)
 			1 << PG_slab	|
 			1 << PG_swapcache |
 			1 << PG_writeback |
-			1 << PG_reserved )))
-		bad_page(function, page);
+			1 << PG_reserved ))))
+		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
 	/*
@@ -371,51 +383,90 @@ static inline int free_pages_check(const char *function, struct page *page)
  * And clear the zone's pages_scanned counter, to hold off the "all pages are
  * pinned" detection logic.
  */
-static int
-free_pages_bulk(struct zone *zone, int count,
-		struct list_head *list, unsigned int order)
+static void free_pages_bulk(struct zone *zone, int count,
+					struct list_head *list, int order)
 {
-	unsigned long flags;
-	struct page *page = NULL;
-	int ret = 0;
-
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
-	while (!list_empty(list) && count--) {
+	while (count--) {
+		struct page *page;
+
+		BUG_ON(list_empty(list));
 		page = list_entry(list->prev, struct page, lru);
-		/* have to delete it as __free_pages_bulk list manipulates */
+		/* have to delete it as __free_one_page list manipulates */
 		list_del(&page->lru);
-		__free_pages_bulk(page, zone, order);
-		ret++;
+		__free_one_page(page, zone, order);
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
-	return ret;
+	spin_unlock(&zone->lock);
 }
 
-void __free_pages_ok(struct page *page, unsigned int order)
+static void free_one_page(struct zone *zone, struct page *page, int order)
 {
 	LIST_HEAD(list);
+	list_add(&page->lru, &list);
+	free_pages_bulk(zone, 1, &list, order);
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+	unsigned long flags;
 	int i;
 	int reserved = 0;
 
 	arch_free_page(page, order);
+	if (!PageHighMem(page))
+		mutex_debug_check_no_locks_freed(page_address(page),
+						 PAGE_SIZE<<order);
 
 #ifndef CONFIG_MMU
-	if (order > 0)
-		for (i = 1 ; i < (1 << order) ; ++i)
-			__put_page(page + i);
+	for (i = 1 ; i < (1 << order) ; ++i)
+		__put_page(page + i);
 #endif
 
 	for (i = 0 ; i < (1 << order) ; ++i)
-		reserved += free_pages_check(__FUNCTION__, page + i);
+		reserved += free_pages_check(page + i);
 	if (reserved)
 		return;
 
-	list_add(&page->lru, &list);
-	mod_page_state(pgfree, 1 << order);
-	kernel_map_pages(page, 1<<order, 0);
-	free_pages_bulk(page_zone(page), 1, &list, order);
+	kernel_map_pages(page, 1 << order, 0);
+	local_irq_save(flags);
+	__mod_page_state(pgfree, 1 << order);
+	free_one_page(page_zone(page), page, order);
+	local_irq_restore(flags);
+}
+
+/*
+ * permit the bootmem allocator to evade page validation on high-order frees
+ */
+void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
+{
+	if (order == 0) {
+		__ClearPageReserved(page);
+		set_page_count(page, 0);
+
+		free_hot_cold_page(page, 0);
+	} else {
+		LIST_HEAD(list);
+		int loop;
+
+		for (loop = 0; loop < BITS_PER_LONG; loop++) {
+			struct page *p = &page[loop];
+
+			if (loop + 16 < BITS_PER_LONG)
+				prefetchw(p + 16);
+			__ClearPageReserved(p);
+			set_page_count(p, 0);
+		}
+
+		arch_free_page(page, order);
+
+		mod_page_state(pgfree, 1 << order);
+
+		list_add(&page->lru, &list);
+		kernel_map_pages(page, 1 << order, 0);
+		free_pages_bulk(page_zone(page), 1, &list, order);
+	}
 }
 
 
@@ -433,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
  *
  * -- wli
  */
-static inline struct page *
-expand(struct zone *zone, struct page *page,
+static inline void expand(struct zone *zone, struct page *page,
  	int low, int high, struct free_area *area)
 {
 	unsigned long size = 1 << high;
@@ -448,24 +498,6 @@ expand(struct zone *zone, struct page *page,
 		area->nr_free++;
 		set_page_order(&page[size], high);
 	}
-	return page;
-}
-
-void set_page_refs(struct page *page, int order)
-{
-#ifdef CONFIG_MMU
-	set_page_count(page, 1);
-#else
-	int i;
-
-	/*
-	 * We need to reference all the pages for this order, otherwise if
-	 * anyone accesses one of the pages with (get/put) it will be freed.
-	 * - eg: access_process_vm()
-	 */
-	for (i = 0; i < (1 << order); i++)
-		set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
 }
 
 /*
@@ -473,9 +505,9 @@ void set_page_refs(struct page *page, int order)
  */
 static int prep_new_page(struct page *page, int order)
 {
-	if (	page_mapcount(page) ||
-		page->mapping != NULL ||
-		page_count(page) != 0 ||
+	if (unlikely(page_mapcount(page) |
+		(page->mapping != NULL)  |
+		(page_count(page) != 0)  |
 		(page->flags & (
 			1 << PG_lru	|
 			1 << PG_private	|
@@ -486,8 +518,8 @@ static int prep_new_page(struct page *page, int order)
 			1 << PG_slab    |
 			1 << PG_swapcache |
 			1 << PG_writeback |
-			1 << PG_reserved )))
-		bad_page(__FUNCTION__, page);
+			1 << PG_reserved ))))
+		bad_page(page);
 
 	/*
 	 * For now, we report if PG_reserved was found set, but do not
@@ -525,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
 		rmv_page_order(page);
 		area->nr_free--;
 		zone->free_pages -= 1UL << order;
-		return expand(zone, page, order, current_order, area);
+		expand(zone, page, order, current_order, area);
+		return page;
 	}
 
 	return NULL;
@@ -539,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
 static int rmqueue_bulk(struct zone *zone, unsigned int order, 
 			unsigned long count, struct list_head *list)
 {
-	unsigned long flags;
 	int i;
-	int allocated = 0;
-	struct page *page;
 	
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		page = __rmqueue(zone, order);
-		if (page == NULL)
+		struct page *page = __rmqueue(zone, order);
+		if (unlikely(page == NULL))
 			break;
-		allocated++;
 		list_add_tail(&page->lru, list);
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
-	return allocated;
+	spin_unlock(&zone->lock);
+	return i;
 }
 
 #ifdef CONFIG_NUMA
@@ -572,14 +601,13 @@ void drain_remote_pages(void)
 		if (zone->zone_pgdat->node_id == numa_node_id())
 			continue;
 
-		pset = zone->pageset[smp_processor_id()];
+		pset = zone_pcp(zone, smp_processor_id());
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
 
 			pcp = &pset->pcp[i];
-			if (pcp->count)
-				pcp->count -= free_pages_bulk(zone, pcp->count,
-						&pcp->list, 0);
+			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+			pcp->count = 0;
 		}
 	}
 	local_irq_restore(flags);
@@ -589,6 +617,7 @@ void drain_remote_pages(void)
 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
+	unsigned long flags;
 	struct zone *zone;
 	int i;
 
@@ -600,8 +629,10 @@ static void __drain_pages(unsigned int cpu)
 			struct per_cpu_pages *pcp;
 
 			pcp = &pset->pcp[i];
-			pcp->count -= free_pages_bulk(zone, pcp->count,
-						&pcp->list, 0);
+			local_irq_save(flags);
+			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+			pcp->count = 0;
+			local_irq_restore(flags);
 		}
 	}
 }
@@ -647,18 +678,14 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
 {
 #ifdef CONFIG_NUMA
-	unsigned long flags;
-	int cpu;
 	pg_data_t *pg = z->zone_pgdat;
 	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
 	struct per_cpu_pageset *p;
 
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	p = zone_pcp(z,cpu);
+	p = zone_pcp(z, cpu);
 	if (pg == orig) {
 		p->numa_hit++;
 	} else {
@@ -669,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 		p->local_node++;
 	else
 		p->other_node++;
-	local_irq_restore(flags);
 #endif
 }
 
 /*
  * Free a 0-order page
  */
-static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
 static void fastcall free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
@@ -687,18 +712,20 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 
 	if (PageAnon(page))
 		page->mapping = NULL;
-	if (free_pages_check(__FUNCTION__, page))
+	if (free_pages_check(page))
 		return;
 
-	inc_page_state(pgfree);
 	kernel_map_pages(page, 1, 0);
 
 	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
+	__inc_page_state(pgfree);
 	list_add(&page->lru, &pcp->list);
 	pcp->count++;
-	if (pcp->count >= pcp->high)
-		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+	if (pcp->count >= pcp->high) {
+		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+		pcp->count -= pcp->batch;
+	}
 	local_irq_restore(flags);
 	put_cpu();
 }
@@ -727,49 +754,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static struct page *buffered_rmqueue(struct zonelist *zonelist,
+			struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
+	int cpu;
 
 again:
-	if (order == 0) {
+	cpu  = get_cpu();
+	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
 
-		page = NULL;
-		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+		pcp = &zone_pcp(zone, cpu)->pcp[cold];
 		local_irq_save(flags);
-		if (pcp->count <= pcp->low)
+		if (!pcp->count) {
 			pcp->count += rmqueue_bulk(zone, 0,
 						pcp->batch, &pcp->list);
-		if (pcp->count) {
-			page = list_entry(pcp->list.next, struct page, lru);
-			list_del(&page->lru);
-			pcp->count--;
+			if (unlikely(!pcp->count))
+				goto failed;
 		}
-		local_irq_restore(flags);
-		put_cpu();
+		page = list_entry(pcp->list.next, struct page, lru);
+		list_del(&page->lru);
+		pcp->count--;
 	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
-		spin_unlock_irqrestore(&zone->lock, flags);
+		spin_unlock(&zone->lock);
+		if (!page)
+			goto failed;
 	}
 
-	if (page != NULL) {
-		BUG_ON(bad_range(zone, page));
-		mod_page_state_zone(zone, pgalloc, 1 << order);
-		if (prep_new_page(page, order))
-			goto again;
+	__mod_page_state_zone(zone, pgalloc, 1 << order);
+	zone_statistics(zonelist, zone, cpu);
+	local_irq_restore(flags);
+	put_cpu();
+
+	BUG_ON(bad_range(zone, page));
+	if (prep_new_page(page, order))
+		goto again;
 
-		if (gfp_flags & __GFP_ZERO)
-			prep_zero_page(page, order, gfp_flags);
+	if (gfp_flags & __GFP_ZERO)
+		prep_zero_page(page, order, gfp_flags);
 
-		if (order && (gfp_flags & __GFP_COMP))
-			prep_compound_page(page, order);
-	}
+	if (order && (gfp_flags & __GFP_COMP))
+		prep_compound_page(page, order);
 	return page;
+
+failed:
+	local_irq_restore(flags);
+	put_cpu();
+	return NULL;
 }
 
 #define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
@@ -842,12 +878,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = (*z)->pages_high;
 			if (!zone_watermark_ok(*z, order, mark,
 				    classzone_idx, alloc_flags))
-				continue;
+				if (!zone_reclaim_mode ||
+				    !zone_reclaim(*z, gfp_mask, order))
+					continue;
 		}
 
-		page = buffered_rmqueue(*z, order, gfp_mask);
+		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
 		if (page) {
-			zone_statistics(zonelist, *z);
 			break;
 		}
 	} while (*(++z) != NULL);
@@ -896,15 +933,15 @@ restart:
 	 *
 	 * The caller may dip into page reserves a bit more if the caller
 	 * cannot run direct reclaim, or if the caller has realtime scheduling
-	 * policy.
+	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
+	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
 	 */
 	alloc_flags = ALLOC_WMARK_MIN;
 	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
 		alloc_flags |= ALLOC_HARDER;
 	if (gfp_mask & __GFP_HIGH)
 		alloc_flags |= ALLOC_HIGH;
-	if (wait)
-		alloc_flags |= ALLOC_CPUSET;
+	alloc_flags |= ALLOC_CPUSET;
 
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +963,7 @@ restart:
 nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
 			page = get_page_from_freelist(gfp_mask, order,
-				zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+				zonelist, ALLOC_NO_WATERMARKS);
 			if (page)
 				goto got_pg;
 			if (gfp_mask & __GFP_NOFAIL) {
@@ -945,6 +982,7 @@ rebalance:
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
@@ -1171,7 +1209,7 @@ EXPORT_SYMBOL(nr_pagecache);
 DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 #endif
 
-void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
+static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
 	int cpu = 0;
 
@@ -1224,7 +1262,7 @@ void get_full_page_state(struct page_state *ret)
 	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
 }
 
-unsigned long __read_page_state(unsigned long offset)
+unsigned long read_page_state_offset(unsigned long offset)
 {
 	unsigned long ret = 0;
 	int cpu;
@@ -1238,18 +1276,26 @@ unsigned long __read_page_state(unsigned long offset)
 	return ret;
 }
 
-void __mod_page_state(unsigned long offset, unsigned long delta)
+void __mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+	void *ptr;
+
+	ptr = &__get_cpu_var(page_states);
+	*(unsigned long *)(ptr + offset) += delta;
+}
+EXPORT_SYMBOL(__mod_page_state_offset);
+
+void mod_page_state_offset(unsigned long offset, unsigned long delta)
 {
 	unsigned long flags;
-	void* ptr;
+	void *ptr;
 
 	local_irq_save(flags);
 	ptr = &__get_cpu_var(page_states);
-	*(unsigned long*)(ptr + offset) += delta;
+	*(unsigned long *)(ptr + offset) += delta;
 	local_irq_restore(flags);
 }
-
-EXPORT_SYMBOL(__mod_page_state);
+EXPORT_SYMBOL(mod_page_state_offset);
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat)
@@ -1335,7 +1381,7 @@ void show_free_areas(void)
 		show_node(zone);
 		printk("%s per-cpu:", zone->name);
 
-		if (!zone->present_pages) {
+		if (!populated_zone(zone)) {
 			printk(" empty\n");
 			continue;
 		} else
@@ -1347,10 +1393,9 @@ void show_free_areas(void)
 			pageset = zone_pcp(zone, cpu);
 
 			for (temperature = 0; temperature < 2; temperature++)
-				printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
+				printk("cpu %d %s: high %d, batch %d used:%d\n",
 					cpu,
 					temperature ? "cold" : "hot",
-					pageset->pcp[temperature].low,
 					pageset->pcp[temperature].high,
 					pageset->pcp[temperature].batch,
 					pageset->pcp[temperature].count);
@@ -1413,7 +1458,7 @@ void show_free_areas(void)
 
 		show_node(zone);
 		printk("%s: ", zone->name);
-		if (!zone->present_pages) {
+		if (!populated_zone(zone)) {
 			printk("empty\n");
 			continue;
 		}
@@ -1433,36 +1478,29 @@ void show_free_areas(void)
 
 /*
  * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
  */
-static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
-{
-	switch (k) {
-		struct zone *zone;
-	default:
-		BUG();
-	case ZONE_HIGHMEM:
-		zone = pgdat->node_zones + ZONE_HIGHMEM;
-		if (zone->present_pages) {
+static int __init build_zonelists_node(pg_data_t *pgdat,
+			struct zonelist *zonelist, int nr_zones, int zone_type)
+{
+	struct zone *zone;
+
+	BUG_ON(zone_type > ZONE_HIGHMEM);
+
+	do {
+		zone = pgdat->node_zones + zone_type;
+		if (populated_zone(zone)) {
 #ifndef CONFIG_HIGHMEM
-			BUG();
+			BUG_ON(zone_type > ZONE_NORMAL);
 #endif
-			zonelist->zones[j++] = zone;
+			zonelist->zones[nr_zones++] = zone;
+			check_highest_zone(zone_type);
 		}
-	case ZONE_NORMAL:
-		zone = pgdat->node_zones + ZONE_NORMAL;
-		if (zone->present_pages)
-			zonelist->zones[j++] = zone;
-	case ZONE_DMA32:
-		zone = pgdat->node_zones + ZONE_DMA32;
-		if (zone->present_pages)
-			zonelist->zones[j++] = zone;
-	case ZONE_DMA:
-		zone = pgdat->node_zones + ZONE_DMA;
-		if (zone->present_pages)
-			zonelist->zones[j++] = zone;
-	}
+		zone_type--;
 
-	return j;
+	} while (zone_type >= 0);
+	return nr_zones;
 }
 
 static inline int highest_zone(int zone_bits)
@@ -1559,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (node_distance(local_node, node) !=
-				node_distance(local_node, prev_node))
+
+		if (distance != node_distance(local_node, prev_node))
 			node_load[node] += load;
 		prev_node = node;
 		load--;
@@ -1699,18 +1746,16 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
-void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn)
 {
 	struct page *page;
 	unsigned long end_pfn = start_pfn + size;
 	unsigned long pfn;
 
-	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		if (!early_pfn_valid(pfn))
 			continue;
-		if (!early_pfn_in_nid(pfn, nid))
-			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
 		set_page_count(page, 1);
@@ -1754,7 +1799,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
-static int __devinit zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
 {
 	int batch;
 
@@ -1794,19 +1839,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
 	pcp = &p->pcp[0];		/* hot */
 	pcp->count = 0;
-	pcp->low = 0;
 	pcp->high = 6 * batch;
 	pcp->batch = max(1UL, 1 * batch);
 	INIT_LIST_HEAD(&pcp->list);
 
 	pcp = &p->pcp[1];		/* cold*/
 	pcp->count = 0;
-	pcp->low = 0;
 	pcp->high = 2 * batch;
 	pcp->batch = max(1UL, batch/2);
 	INIT_LIST_HEAD(&pcp->list);
 }
 
+/*
+ * setup_pagelist_highmark() gives the hot per-cpu pagelist of pageset p the
+ * high water mark 'high' and a batch of high/4 clamped to [1, PAGE_SHIFT * 8].
+ */
+
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+}
+
+
 #ifdef CONFIG_NUMA
 /*
  * Boot pageset table. One per cpu which is going to be used for all
@@ -1832,18 +1893,22 @@ static struct per_cpu_pageset
  * Dynamically allocate memory for the
  * per cpu pageset array in struct zone.
  */
-static int __devinit process_zones(int cpu)
+static int __meminit process_zones(int cpu)
 {
 	struct zone *zone, *dzone;
 
 	for_each_zone(zone) {
 
-		zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
 					 GFP_KERNEL, cpu_to_node(cpu));
-		if (!zone->pageset[cpu])
+		if (!zone_pcp(zone, cpu))
 			goto bad;
 
-		setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(zone_pcp(zone, cpu),
+			 	(zone->present_pages / percpu_pagelist_fraction));
 	}
 
 	return 0;
@@ -1851,15 +1916,14 @@ bad:
 	for_each_zone(dzone) {
 		if (dzone == zone)
 			break;
-		kfree(dzone->pageset[cpu]);
-		dzone->pageset[cpu] = NULL;
+		kfree(zone_pcp(dzone, cpu));
+		zone_pcp(dzone, cpu) = NULL;
 	}
 	return -ENOMEM;
 }
 
 static inline void free_zone_pagesets(int cpu)
 {
-#ifdef CONFIG_NUMA
 	struct zone *zone;
 
 	for_each_zone(zone) {
@@ -1868,10 +1932,9 @@ static inline void free_zone_pagesets(int cpu)
 		zone_pcp(zone, cpu) = NULL;
 		kfree(pset);
 	}
-#endif
 }
 
-static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+static int __meminit pageset_cpuup_callback(struct notifier_block *nfb,
 		unsigned long action,
 		void *hcpu)
 {
@@ -1911,7 +1974,7 @@ void __init setup_per_cpu_pageset(void)
 
 #endif
 
-static __devinit
+static __meminit
 void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
 	int i;
@@ -1931,7 +1994,7 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 		init_waitqueue_head(zone->wait_table + i);
 }
 
-static __devinit void zone_pcp_init(struct zone *zone)
+static __meminit void zone_pcp_init(struct zone *zone)
 {
 	int cpu;
 	unsigned long batch = zone_batchsize(zone);
@@ -1939,7 +2002,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
 #ifdef CONFIG_NUMA
 		/* Early boot. Slab allocator not functional yet */
-		zone->pageset[cpu] = &boot_pageset[cpu];
+		zone_pcp(zone, cpu) = &boot_pageset[cpu];
 		setup_pageset(&boot_pageset[cpu],0);
 #else
 		setup_pageset(zone_pcp(zone,cpu), batch);
@@ -1949,7 +2012,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
 		zone->name, zone->present_pages, batch);
 }
 
-static __devinit void init_currently_empty_zone(struct zone *zone,
+static __meminit void init_currently_empty_zone(struct zone *zone,
 		unsigned long zone_start_pfn, unsigned long size)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
@@ -2116,7 +2179,7 @@ static int frag_show(struct seq_file *m, void *arg)
 	int order;
 
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!zone->present_pages)
+		if (!populated_zone(zone))
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
@@ -2149,7 +2212,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
 		int i;
 
-		if (!zone->present_pages)
+		if (!populated_zone(zone))
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
@@ -2182,7 +2245,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 		seq_printf(m,
 			   ")"
 			   "\n  pagesets");
-		for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+		for_each_online_cpu(i) {
 			struct per_cpu_pageset *pageset;
 			int j;
 
@@ -2197,12 +2260,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 				seq_printf(m,
 					   "\n    cpu: %i pcp: %i"
 					   "\n              count: %i"
-					   "\n              low:   %i"
 					   "\n              high:  %i"
 					   "\n              batch: %i",
 					   i, j,
 					   pageset->pcp[j].count,
-					   pageset->pcp[j].low,
 					   pageset->pcp[j].high,
 					   pageset->pcp[j].batch);
 			}
@@ -2257,32 +2318,40 @@ static char *vmstat_text[] = {
 	"pgpgout",
 	"pswpin",
 	"pswpout",
-	"pgalloc_high",
 
+	"pgalloc_high",
 	"pgalloc_normal",
+	"pgalloc_dma32",
 	"pgalloc_dma",
+
 	"pgfree",
 	"pgactivate",
 	"pgdeactivate",
 
 	"pgfault",
 	"pgmajfault",
+
 	"pgrefill_high",
 	"pgrefill_normal",
+	"pgrefill_dma32",
 	"pgrefill_dma",
 
 	"pgsteal_high",
 	"pgsteal_normal",
+	"pgsteal_dma32",
 	"pgsteal_dma",
+
 	"pgscan_kswapd_high",
 	"pgscan_kswapd_normal",
-
+	"pgscan_kswapd_dma32",
 	"pgscan_kswapd_dma",
+
 	"pgscan_direct_high",
 	"pgscan_direct_normal",
+	"pgscan_direct_dma32",
 	"pgscan_direct_dma",
-	"pginodesteal",
 
+	"pginodesteal",
 	"slabs_scanned",
 	"kswapd_steal",
 	"kswapd_inodesteal",
@@ -2539,6 +2608,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	for_each_zone(zone) {
+		for_each_online_cpu(cpu) {
+			unsigned long  high;
+			high = zone->present_pages / percpu_pagelist_fraction;
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+		}
+	}
+	return 0;
+}
+
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 52822c98c48..c4b6d0afd73 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -90,7 +90,7 @@ struct pdflush_work {
 
 static int __pdflush(struct pdflush_work *my_work)
 {
-	current->flags |= PF_FLUSHER;
+	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
 	my_work->fn = NULL;
 	my_work->who = current;
 	INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87c..8d6eeaaa629 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 {
 	unsigned page_idx;
 	struct pagevec lru_pvec;
-	int ret = 0;
+	int ret;
 
 	if (mapping->a_ops->readpages) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 		list_del(&page->lru);
 		if (!add_to_page_cache(page, mapping,
 					page->index, GFP_KERNEL)) {
-			mapping->a_ops->readpage(filp, page);
-			if (!pagevec_add(&lru_pvec, page))
-				__pagevec_lru_add(&lru_pvec);
-		} else {
-			page_cache_release(page);
+			ret = mapping->a_ops->readpage(filp, page);
+			if (ret != AOP_TRUNCATED_PAGE) {
+				if (!pagevec_add(&lru_pvec, page))
+					__pagevec_lru_add(&lru_pvec);
+				continue;
+			} /* else fall through to release */
 		}
+		page_cache_release(page);
 	}
 	pagevec_lru_add(&lru_pvec);
+	ret = 0;
 out:
 	return ret;
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index f853c6def15..d85a99d28c0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -20,20 +20,20 @@
 /*
  * Lock ordering in mm:
  *
- * inode->i_sem	(while writing or truncating, not reading or faulting)
+ * inode->i_mutex	(while writing or truncating, not reading or faulting)
  *   inode->i_alloc_sem
  *
  * When a page fault occurs in writing from user to file, down_read
- * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
- * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
- * taken together; in truncation, i_sem is taken outermost.
+ * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
+ * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
+ * taken together; in truncation, i_mutex is taken outermost.
  *
  * mm->mmap_sem
  *   page->flags PG_locked (lock_page)
  *     mapping->i_mmap_lock
  *       anon_vma->lock
  *         mm->page_table_lock or pte_lock
- *           zone->lru_lock (in mark_page_accessed)
+ *           zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  *           swap_lock (in swap_duplicate, swap_info_get)
  *             mmlist_lock (in mmput, drain_mmlist and others)
  *             mapping->private_lock (in __set_page_dirty_buffers)
@@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked)
 }
 
 /**
+ * __page_set_anon_rmap - setup new anonymous rmap
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ */
+static void __page_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	BUG_ON(!anon_vma);
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+
+	page->index = linear_page_index(vma, address);
+
+	/*
+	 * nr_mapped state can be updated without turning off
+	 * interrupts because it is not modified via interrupt.
+	 */
+	__inc_page_state(nr_mapped);
+}
+
+/**
  * page_add_anon_rmap - add pte mapping to an anonymous page
  * @page:	the page to add the mapping to
  * @vma:	the vm area in which the mapping is added
@@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked)
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
-	if (atomic_inc_and_test(&page->_mapcount)) {
-		struct anon_vma *anon_vma = vma->anon_vma;
-
-		BUG_ON(!anon_vma);
-		anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
-		page->mapping = (struct address_space *) anon_vma;
-
-		page->index = linear_page_index(vma, address);
-
-		inc_page_state(nr_mapped);
-	}
+	if (atomic_inc_and_test(&page->_mapcount))
+		__page_set_anon_rmap(page, vma, address);
 	/* else checking page index and mapping is racy */
 }
 
+/**
+ * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ *
+ * Same as page_add_anon_rmap but must only be called on *new* pages.
+ * This means the inc-and-test can be bypassed.
+ */
+void page_add_new_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
+	__page_set_anon_rmap(page, vma, address);
+}
+
 /**
  * page_add_file_rmap - add pte mapping to a file page
  * @page: the page to add the mapping to
@@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page)
 	BUG_ON(!pfn_valid(page_to_pfn(page)));
 
 	if (atomic_inc_and_test(&page->_mapcount))
-		inc_page_state(nr_mapped);
+		__inc_page_state(nr_mapped);
 }
 
 /**
@@ -483,6 +514,13 @@ void page_add_file_rmap(struct page *page)
 void page_remove_rmap(struct page *page)
 {
 	if (atomic_add_negative(-1, &page->_mapcount)) {
+		if (page_mapcount(page) < 0) {
+			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
+			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
+			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
+			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
+		}
+
 		BUG_ON(page_mapcount(page) < 0);
 		/*
 		 * It would be tidy to reset the PageAnon mapping here,
@@ -495,7 +533,7 @@ void page_remove_rmap(struct page *page)
 		 */
 		if (page_test_and_clear_dirty(page))
 			set_page_dirty(page);
-		dec_page_state(nr_mapped);
+		__dec_page_state(nr_mapped);
 	}
 }
 
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61e..ce501bce1c2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
 	} while (next);
 }
 
-static void shmem_truncate(struct inode *inode)
+static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
 	long nr_swaps_freed = 0;
 	int offset;
 	int freed;
+	int punch_hole = 0;
 
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-	idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (idx >= info->next_index)
 		return;
 
 	spin_lock(&info->lock);
 	info->flags |= SHMEM_TRUNCATE;
-	limit = info->next_index;
-	info->next_index = idx;
+	if (likely(end == (loff_t) -1)) {
+		limit = info->next_index;
+		info->next_index = idx;
+	} else {
+		limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		if (limit > info->next_index)
+			limit = info->next_index;
+		punch_hole = 1;
+	}
+
 	topdir = info->i_indirect;
-	if (topdir && idx <= SHMEM_NR_DIRECT) {
+	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
 		info->i_indirect = NULL;
 		nr_pages_to_free++;
 		list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
 			set_page_private(subdir, page_private(subdir) - freed);
 			if (offset)
 				spin_unlock(&info->lock);
-			BUG_ON(page_private(subdir) > offset);
+			if (!punch_hole)
+				BUG_ON(page_private(subdir) > offset);
 		}
 		if (offset)
 			offset = 0;
-		else if (subdir) {
+		else if (subdir && !page_private(subdir)) {
 			dir[diroff] = NULL;
 			nr_pages_to_free++;
 			list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
 		 * Also, though shmem_getpage checks i_size before adding to
 		 * cache, no recheck after: so fix the narrow window there too.
 		 */
-		truncate_inode_pages(inode->i_mapping, inode->i_size);
+		truncate_inode_pages_range(inode->i_mapping, start, end);
 	}
 
 	spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
 	}
 }
 
+static void shmem_truncate(struct inode *inode)
+{
+	shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
+}
+
 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -855,7 +870,7 @@ unlock:
 	swap_free(swap);
 redirty:
 	set_page_dirty(page);
-	return WRITEPAGE_ACTIVATE;	/* Return with the page locked */
+	return AOP_WRITEPAGE_ACTIVATE;	/* Return with the page locked */
 }
 
 #ifdef CONFIG_NUMA
@@ -1255,7 +1270,7 @@ out_nomem:
 	return retval;
 }
 
-static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
@@ -1301,7 +1316,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 		case S_IFREG:
 			inode->i_op = &shmem_inode_operations;
 			inode->i_fop = &shmem_file_operations;
-			mpol_shared_policy_init(&info->policy);
+			mpol_shared_policy_init(&info->policy, sbinfo->policy,
+							&sbinfo->policy_nodes);
 			break;
 		case S_IFDIR:
 			inode->i_nlink++;
@@ -1315,7 +1331,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 			 * Must not load anything in the rbtree,
 			 * mpol_free_shared_policy will not be called.
 			 */
-			mpol_shared_policy_init(&info->policy);
+			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
+						NULL);
 			break;
 		}
 	} else if (sbinfo->max_inodes) {
@@ -1355,7 +1372,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
 	if (!access_ok(VERIFY_READ, buf, count))
 		return -EFAULT;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 
 	pos = *ppos;
 	written = 0;
@@ -1440,7 +1457,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
 	if (written)
 		err = written;
 out:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return err;
 }
 
@@ -1476,7 +1493,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 
 		/*
 		 * We must evaluate after, since reads (unlike writes)
-		 * are called without i_sem protection against truncate
+		 * are called without i_mutex protection against truncate
 		 */
 		nr = PAGE_CACHE_SIZE;
 		i_size = i_size_read(inode);
@@ -1828,7 +1845,9 @@ static struct inode_operations shmem_symlink_inode_operations = {
 	.put_link	= shmem_put_link,
 };
 
-static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
+static int shmem_parse_options(char *options, int *mode, uid_t *uid,
+	gid_t *gid, unsigned long *blocks, unsigned long *inodes,
+	int *policy, nodemask_t *policy_nodes)
 {
 	char *this_char, *value, *rest;
 
@@ -1882,6 +1901,19 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid,
 			*gid = simple_strtoul(value,&rest,0);
 			if (*rest)
 				goto bad_val;
+		} else if (!strcmp(this_char,"mpol")) {
+			if (!strcmp(value,"default"))
+				*policy = MPOL_DEFAULT;
+			else if (!strcmp(value,"preferred"))
+				*policy = MPOL_PREFERRED;
+			else if (!strcmp(value,"bind"))
+				*policy = MPOL_BIND;
+			else if (!strcmp(value,"interleave"))
+				*policy = MPOL_INTERLEAVE;
+			else
+				goto bad_val;
+		} else if (!strcmp(this_char,"mpol_nodelist")) {
+			nodelist_parse(value, *policy_nodes);
 		} else {
 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
 			       this_char);
@@ -1902,12 +1934,14 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 	unsigned long max_blocks = sbinfo->max_blocks;
 	unsigned long max_inodes = sbinfo->max_inodes;
+	int policy = sbinfo->policy;
+	nodemask_t policy_nodes = sbinfo->policy_nodes;
 	unsigned long blocks;
 	unsigned long inodes;
 	int error = -EINVAL;
 
-	if (shmem_parse_options(data, NULL, NULL, NULL,
-				&max_blocks, &max_inodes))
+	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
+				&max_inodes, &policy, &policy_nodes))
 		return error;
 
 	spin_lock(&sbinfo->stat_lock);
@@ -1933,6 +1967,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 	sbinfo->free_blocks = max_blocks - blocks;
 	sbinfo->max_inodes  = max_inodes;
 	sbinfo->free_inodes = max_inodes - inodes;
+	sbinfo->policy = policy;
+	sbinfo->policy_nodes = policy_nodes;
 out:
 	spin_unlock(&sbinfo->stat_lock);
 	return error;
@@ -1957,6 +1993,8 @@ static int shmem_fill_super(struct super_block *sb,
 	struct shmem_sb_info *sbinfo;
 	unsigned long blocks = 0;
 	unsigned long inodes = 0;
+	int policy = MPOL_DEFAULT;
+	nodemask_t policy_nodes = node_online_map;
 
 #ifdef CONFIG_TMPFS
 	/*
@@ -1969,8 +2007,8 @@ static int shmem_fill_super(struct super_block *sb,
 		inodes = totalram_pages - totalhigh_pages;
 		if (inodes > blocks)
 			inodes = blocks;
-		if (shmem_parse_options(data, &mode, &uid, &gid,
-					&blocks, &inodes))
+		if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
+					&inodes, &policy, &policy_nodes))
 			return -EINVAL;
 	}
 #else
@@ -1988,6 +2026,8 @@ static int shmem_fill_super(struct super_block *sb,
 	sbinfo->free_blocks = blocks;
 	sbinfo->max_inodes = inodes;
 	sbinfo->free_inodes = inodes;
+	sbinfo->policy = policy;
+	sbinfo->policy_nodes = policy_nodes;
 
 	sb->s_fs_info = sbinfo;
 	sb->s_maxbytes = SHMEM_MAX_BYTES;
@@ -2083,6 +2123,7 @@ static struct file_operations shmem_file_operations = {
 static struct inode_operations shmem_inode_operations = {
 	.truncate	= shmem_truncate,
 	.setattr	= shmem_notify_change,
+	.truncate_range	= shmem_truncate_range,
 };
 
 static struct inode_operations shmem_dir_inode_operations = {
diff --git a/mm/slab.c b/mm/slab.c
index e5ec26e0c46..6f8495e2185 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
  * Further notes from the original documentation:
  *
  * 11 April '97.  Started multi-threading - markhe
- *	The global cache-chain is protected by the semaphore 'cache_chain_sem'.
+ *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
  *	The sem is only needed when accessing/extending the cache-chain, which
  *	can never happen inside an interrupt (kmem_cache_create(),
  *	kmem_cache_shrink() and kmem_cache_reap()).
@@ -103,6 +103,8 @@
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
 #include	<linux/nodemask.h>
+#include	<linux/mempolicy.h>
+#include	<linux/mutex.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
@@ -130,7 +132,6 @@
 #define	FORCED_DEBUG	0
 #endif
 
-
 /* Shouldn't this be in a header file somewhere? */
 #define	BYTES_PER_WORD		sizeof(void *)
 
@@ -217,12 +218,12 @@ static unsigned long offslab_limit;
  * Slabs are chained into three list: fully used, partial, fully free slabs.
  */
 struct slab {
-	struct list_head	list;
-	unsigned long		colouroff;
-	void			*s_mem;		/* including colour offset */
-	unsigned int		inuse;		/* num of objs active in slab */
-	kmem_bufctl_t		free;
-	unsigned short          nodeid;
+	struct list_head list;
+	unsigned long colouroff;
+	void *s_mem;		/* including colour offset */
+	unsigned int inuse;	/* num of objs active in slab */
+	kmem_bufctl_t free;
+	unsigned short nodeid;
 };
 
 /*
@@ -242,9 +243,9 @@ struct slab {
  * We assume struct slab_rcu can overlay struct slab when destroying.
  */
 struct slab_rcu {
-	struct rcu_head		head;
-	kmem_cache_t		*cachep;
-	void			*addr;
+	struct rcu_head head;
+	kmem_cache_t *cachep;
+	void *addr;
 };
 
 /*
@@ -279,23 +280,23 @@ struct array_cache {
 #define BOOT_CPUCACHE_ENTRIES	1
 struct arraycache_init {
 	struct array_cache cache;
-	void * entries[BOOT_CPUCACHE_ENTRIES];
+	void *entries[BOOT_CPUCACHE_ENTRIES];
 };
 
 /*
  * The slab lists for all objects.
  */
 struct kmem_list3 {
-	struct list_head	slabs_partial;	/* partial list first, better asm code */
-	struct list_head	slabs_full;
-	struct list_head	slabs_free;
-	unsigned long	free_objects;
-	unsigned long	next_reap;
-	int		free_touched;
-	unsigned int 	free_limit;
-	spinlock_t      list_lock;
-	struct array_cache	*shared;	/* shared per node */
-	struct array_cache	**alien;	/* on other nodes */
+	struct list_head slabs_partial;	/* partial list first, better asm code */
+	struct list_head slabs_full;
+	struct list_head slabs_free;
+	unsigned long free_objects;
+	unsigned long next_reap;
+	int free_touched;
+	unsigned int free_limit;
+	spinlock_t list_lock;
+	struct array_cache *shared;	/* shared per node */
+	struct array_cache **alien;	/* on other nodes */
 };
 
 /*
@@ -367,63 +368,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
  *
  * manages a cache.
  */
-	
+
 struct kmem_cache {
 /* 1) per-cpu data, touched during every alloc/free */
-	struct array_cache	*array[NR_CPUS];
-	unsigned int		batchcount;
-	unsigned int		limit;
-	unsigned int 		shared;
-	unsigned int		objsize;
+	struct array_cache *array[NR_CPUS];
+	unsigned int batchcount;
+	unsigned int limit;
+	unsigned int shared;
+	unsigned int objsize;
 /* 2) touched by every alloc & free from the backend */
-	struct kmem_list3	*nodelists[MAX_NUMNODES];
-	unsigned int	 	flags;	/* constant flags */
-	unsigned int		num;	/* # of objs per slab */
-	spinlock_t		spinlock;
+	struct kmem_list3 *nodelists[MAX_NUMNODES];
+	unsigned int flags;	/* constant flags */
+	unsigned int num;	/* # of objs per slab */
+	spinlock_t spinlock;
 
 /* 3) cache_grow/shrink */
 	/* order of pgs per slab (2^n) */
-	unsigned int		gfporder;
+	unsigned int gfporder;
 
 	/* force GFP flags, e.g. GFP_DMA */
-	gfp_t			gfpflags;
+	gfp_t gfpflags;
 
-	size_t			colour;		/* cache colouring range */
-	unsigned int		colour_off;	/* colour offset */
-	unsigned int		colour_next;	/* cache colouring */
-	kmem_cache_t		*slabp_cache;
-	unsigned int		slab_size;
-	unsigned int		dflags;		/* dynamic flags */
+	size_t colour;		/* cache colouring range */
+	unsigned int colour_off;	/* colour offset */
+	unsigned int colour_next;	/* cache colouring */
+	kmem_cache_t *slabp_cache;
+	unsigned int slab_size;
+	unsigned int dflags;	/* dynamic flags */
 
 	/* constructor func */
-	void (*ctor)(void *, kmem_cache_t *, unsigned long);
+	void (*ctor) (void *, kmem_cache_t *, unsigned long);
 
 	/* de-constructor func */
-	void (*dtor)(void *, kmem_cache_t *, unsigned long);
+	void (*dtor) (void *, kmem_cache_t *, unsigned long);
 
 /* 4) cache creation/removal */
-	const char		*name;
-	struct list_head	next;
+	const char *name;
+	struct list_head next;
 
 /* 5) statistics */
 #if STATS
-	unsigned long		num_active;
-	unsigned long		num_allocations;
-	unsigned long		high_mark;
-	unsigned long		grown;
-	unsigned long		reaped;
-	unsigned long 		errors;
-	unsigned long		max_freeable;
-	unsigned long		node_allocs;
-	unsigned long		node_frees;
-	atomic_t		allochit;
-	atomic_t		allocmiss;
-	atomic_t		freehit;
-	atomic_t		freemiss;
+	unsigned long num_active;
+	unsigned long num_allocations;
+	unsigned long high_mark;
+	unsigned long grown;
+	unsigned long reaped;
+	unsigned long errors;
+	unsigned long max_freeable;
+	unsigned long node_allocs;
+	unsigned long node_frees;
+	atomic_t allochit;
+	atomic_t allocmiss;
+	atomic_t freehit;
+	atomic_t freemiss;
 #endif
 #if DEBUG
-	int			dbghead;
-	int			reallen;
+	int dbghead;
+	int reallen;
 #endif
 };
 
@@ -523,14 +524,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
 {
 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 	if (cachep->flags & SLAB_STORE_USER)
-		return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD);
-	return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD);
+		return (unsigned long *)(objp + cachep->objsize -
+					 2 * BYTES_PER_WORD);
+	return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD);
 }
 
 static void **dbg_userword(kmem_cache_t *cachep, void *objp)
 {
 	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
-	return (void**)(objp+cachep->objsize-BYTES_PER_WORD);
+	return (void **)(objp + cachep->objsize - BYTES_PER_WORD);
 }
 
 #else
@@ -607,31 +609,31 @@ struct cache_names {
 static struct cache_names __initdata cache_names[] = {
 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
 #include <linux/kmalloc_sizes.h>
-	{ NULL, }
+	{NULL,}
 #undef CACHE
 };
 
 static struct arraycache_init initarray_cache __initdata =
-	{ { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
+    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 static struct arraycache_init initarray_generic =
-	{ { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
+    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 
 /* internal cache of cache description objs */
 static kmem_cache_t cache_cache = {
-	.batchcount	= 1,
-	.limit		= BOOT_CPUCACHE_ENTRIES,
-	.shared		= 1,
-	.objsize	= sizeof(kmem_cache_t),
-	.flags		= SLAB_NO_REAP,
-	.spinlock	= SPIN_LOCK_UNLOCKED,
-	.name		= "kmem_cache",
+	.batchcount = 1,
+	.limit = BOOT_CPUCACHE_ENTRIES,
+	.shared = 1,
+	.objsize = sizeof(kmem_cache_t),
+	.flags = SLAB_NO_REAP,
+	.spinlock = SPIN_LOCK_UNLOCKED,
+	.name = "kmem_cache",
 #if DEBUG
-	.reallen	= sizeof(kmem_cache_t),
+	.reallen = sizeof(kmem_cache_t),
 #endif
 };
 
 /* Guard access to the cache-chain. */
-static struct semaphore	cache_chain_sem;
+static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 
 /*
@@ -655,9 +657,9 @@ static enum {
 
 static DEFINE_PER_CPU(struct work_struct, reap_work);
 
-static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node);
-static void enable_cpucache (kmem_cache_t *cachep);
-static void cache_reap (void *unused);
+static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
+static void enable_cpucache(kmem_cache_t *cachep);
+static void cache_reap(void *unused);
 static int __node_shrink(kmem_cache_t *cachep, int node);
 
 static inline struct array_cache *ac_data(kmem_cache_t *cachep)
@@ -671,9 +673,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
 
 #if DEBUG
 	/* This happens if someone tries to call
- 	* kmem_cache_create(), or __kmalloc(), before
- 	* the generic caches are initialized.
- 	*/
+	 * kmem_cache_create(), or __kmalloc(), before
+	 * the generic caches are initialized.
+	 */
 	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
 #endif
 	while (size > csizep->cs_size)
@@ -697,10 +699,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep);
 
 /* Cal the num objs, wastage, and bytes left over for a given slab size. */
 static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
-		 int flags, size_t *left_over, unsigned int *num)
+			   int flags, size_t *left_over, unsigned int *num)
 {
 	int i;
-	size_t wastage = PAGE_SIZE<<gfporder;
+	size_t wastage = PAGE_SIZE << gfporder;
 	size_t extra = 0;
 	size_t base = 0;
 
@@ -709,7 +711,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
 		extra = sizeof(kmem_bufctl_t);
 	}
 	i = 0;
-	while (i*size + ALIGN(base+i*extra, align) <= wastage)
+	while (i * size + ALIGN(base + i * extra, align) <= wastage)
 		i++;
 	if (i > 0)
 		i--;
@@ -718,8 +720,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
 		i = SLAB_LIMIT;
 
 	*num = i;
-	wastage -= i*size;
-	wastage -= ALIGN(base+i*extra, align);
+	wastage -= i * size;
+	wastage -= ALIGN(base + i * extra, align);
 	*left_over = wastage;
 }
 
@@ -728,7 +730,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
 static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
 {
 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
-		function, cachep->name, msg);
+	       function, cachep->name, msg);
 	dump_stack();
 }
 
@@ -755,9 +757,9 @@ static void __devinit start_cpu_timer(int cpu)
 }
 
 static struct array_cache *alloc_arraycache(int node, int entries,
-						int batchcount)
+					    int batchcount)
 {
-	int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
+	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 	struct array_cache *nc = NULL;
 
 	nc = kmalloc_node(memsize, GFP_KERNEL, node);
@@ -772,10 +774,12 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 }
 
 #ifdef CONFIG_NUMA
+static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
+
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
 	struct array_cache **ac_ptr;
-	int memsize = sizeof(void*)*MAX_NUMNODES;
+	int memsize = sizeof(void *) * MAX_NUMNODES;
 	int i;
 
 	if (limit > 1)
@@ -789,7 +793,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit)
 			}
 			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
 			if (!ac_ptr[i]) {
-				for (i--; i <=0; i--)
+				for (i--; i <= 0; i--)
 					kfree(ac_ptr[i]);
 				kfree(ac_ptr);
 				return NULL;
@@ -807,12 +811,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
 		return;
 
 	for_each_node(i)
-		kfree(ac_ptr[i]);
+	    kfree(ac_ptr[i]);
 
 	kfree(ac_ptr);
 }
 
-static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
+static inline void __drain_alien_cache(kmem_cache_t *cachep,
+				       struct array_cache *ac, int node)
 {
 	struct kmem_list3 *rl3 = cachep->nodelists[node];
 
@@ -826,7 +831,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache
 
 static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
 {
-	int i=0;
+	int i = 0;
 	struct array_cache *ac;
 	unsigned long flags;
 
@@ -846,18 +851,17 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
 #endif
 
 static int __devinit cpuup_callback(struct notifier_block *nfb,
-				  unsigned long action, void *hcpu)
+				    unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
-	kmem_cache_t* cachep;
+	kmem_cache_t *cachep;
 	struct kmem_list3 *l3 = NULL;
 	int node = cpu_to_node(cpu);
 	int memsize = sizeof(struct kmem_list3);
-	struct array_cache *nc = NULL;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 		/* we need to do this right in the beginning since
 		 * alloc_arraycache's are going to use this list.
 		 * kmalloc_node allows us to add the slab to the right
@@ -871,27 +875,29 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 			 */
 			if (!cachep->nodelists[node]) {
 				if (!(l3 = kmalloc_node(memsize,
-						GFP_KERNEL, node)))
+							GFP_KERNEL, node)))
 					goto bad;
 				kmem_list3_init(l3);
 				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-				  ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
 				cachep->nodelists[node] = l3;
 			}
 
 			spin_lock_irq(&cachep->nodelists[node]->list_lock);
 			cachep->nodelists[node]->free_limit =
-				(1 + nr_cpus_node(node)) *
-				cachep->batchcount + cachep->num;
+			    (1 + nr_cpus_node(node)) *
+			    cachep->batchcount + cachep->num;
 			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
 		}
 
 		/* Now we can go ahead with allocating the shared array's
-		  & array cache's */
+		   & array cache's */
 		list_for_each_entry(cachep, &cache_chain, next) {
+			struct array_cache *nc;
+
 			nc = alloc_arraycache(node, cachep->limit,
-					cachep->batchcount);
+					      cachep->batchcount);
 			if (!nc)
 				goto bad;
 			cachep->array[cpu] = nc;
@@ -900,16 +906,17 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 			BUG_ON(!l3);
 			if (!l3->shared) {
 				if (!(nc = alloc_arraycache(node,
-					cachep->shared*cachep->batchcount,
-					0xbaadf00d)))
-					goto  bad;
+							    cachep->shared *
+							    cachep->batchcount,
+							    0xbaadf00d)))
+					goto bad;
 
 				/* we are serialised from CPU_DEAD or
-				  CPU_UP_CANCELLED by the cpucontrol lock */
+				   CPU_UP_CANCELLED by the cpucontrol lock */
 				l3->shared = nc;
 			}
 		}
-		up(&cache_chain_sem);
+		mutex_unlock(&cache_chain_mutex);
 		break;
 	case CPU_ONLINE:
 		start_cpu_timer(cpu);
@@ -918,7 +925,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 	case CPU_DEAD:
 		/* fall thru */
 	case CPU_UP_CANCELED:
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
@@ -942,13 +949,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 				free_block(cachep, nc->entry, nc->avail, node);
 
 			if (!cpus_empty(mask)) {
-                                spin_unlock(&l3->list_lock);
-                                goto unlock_cache;
-                        }
+				spin_unlock(&l3->list_lock);
+				goto unlock_cache;
+			}
 
 			if (l3->shared) {
 				free_block(cachep, l3->shared->entry,
-						l3->shared->avail, node);
+					   l3->shared->avail, node);
 				kfree(l3->shared);
 				l3->shared = NULL;
 			}
@@ -966,17 +973,17 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 			} else {
 				spin_unlock(&l3->list_lock);
 			}
-unlock_cache:
+		      unlock_cache:
 			spin_unlock_irq(&cachep->spinlock);
 			kfree(nc);
 		}
-		up(&cache_chain_sem);
+		mutex_unlock(&cache_chain_mutex);
 		break;
 #endif
 	}
 	return NOTIFY_OK;
-bad:
-	up(&cache_chain_sem);
+      bad:
+	mutex_unlock(&cache_chain_mutex);
 	return NOTIFY_BAD;
 }
 
@@ -985,8 +992,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
 /*
  * swap the static kmem_list3 with kmalloced memory
  */
-static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
-		int nodeid)
+static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
 {
 	struct kmem_list3 *ptr;
 
@@ -1045,7 +1051,6 @@ void __init kmem_cache_init(void)
 	 */
 
 	/* 1) create the cache_cache */
-	init_MUTEX(&cache_chain_sem);
 	INIT_LIST_HEAD(&cache_chain);
 	list_add(&cache_cache.next, &cache_chain);
 	cache_cache.colour_off = cache_line_size();
@@ -1055,14 +1060,14 @@ void __init kmem_cache_init(void)
 	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
 
 	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
-				&left_over, &cache_cache.num);
+		       &left_over, &cache_cache.num);
 	if (!cache_cache.num)
 		BUG();
 
-	cache_cache.colour = left_over/cache_cache.colour_off;
+	cache_cache.colour = left_over / cache_cache.colour_off;
 	cache_cache.colour_next = 0;
-	cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
-				sizeof(struct slab), cache_line_size());
+	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
+				      sizeof(struct slab), cache_line_size());
 
 	/* 2+3) create the kmalloc caches */
 	sizes = malloc_sizes;
@@ -1074,14 +1079,18 @@ void __init kmem_cache_init(void)
 	 */
 
 	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
-				sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
-				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+						      sizes[INDEX_AC].cs_size,
+						      ARCH_KMALLOC_MINALIGN,
+						      (ARCH_KMALLOC_FLAGS |
+						       SLAB_PANIC), NULL, NULL);
 
 	if (INDEX_AC != INDEX_L3)
 		sizes[INDEX_L3].cs_cachep =
-			kmem_cache_create(names[INDEX_L3].name,
-				sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
-				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+		    kmem_cache_create(names[INDEX_L3].name,
+				      sizes[INDEX_L3].cs_size,
+				      ARCH_KMALLOC_MINALIGN,
+				      (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
+				      NULL);
 
 	while (sizes->cs_size != ULONG_MAX) {
 		/*
@@ -1091,35 +1100,41 @@ void __init kmem_cache_init(void)
 		 * Note for systems short on memory removing the alignment will
 		 * allow tighter packing of the smaller caches.
 		 */
-		if(!sizes->cs_cachep)
+		if (!sizes->cs_cachep)
 			sizes->cs_cachep = kmem_cache_create(names->name,
-				sizes->cs_size, ARCH_KMALLOC_MINALIGN,
-				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+							     sizes->cs_size,
+							     ARCH_KMALLOC_MINALIGN,
+							     (ARCH_KMALLOC_FLAGS
+							      | SLAB_PANIC),
+							     NULL, NULL);
 
 		/* Inc off-slab bufctl limit until the ceiling is hit. */
 		if (!(OFF_SLAB(sizes->cs_cachep))) {
-			offslab_limit = sizes->cs_size-sizeof(struct slab);
+			offslab_limit = sizes->cs_size - sizeof(struct slab);
 			offslab_limit /= sizeof(kmem_bufctl_t);
 		}
 
 		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
-			sizes->cs_size, ARCH_KMALLOC_MINALIGN,
-			(ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
-			NULL, NULL);
+							sizes->cs_size,
+							ARCH_KMALLOC_MINALIGN,
+							(ARCH_KMALLOC_FLAGS |
+							 SLAB_CACHE_DMA |
+							 SLAB_PANIC), NULL,
+							NULL);
 
 		sizes++;
 		names++;
 	}
 	/* 4) Replace the bootstrap head arrays */
 	{
-		void * ptr;
+		void *ptr;
 
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 
 		local_irq_disable();
 		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
 		memcpy(ptr, ac_data(&cache_cache),
-				sizeof(struct arraycache_init));
+		       sizeof(struct arraycache_init));
 		cache_cache.array[smp_processor_id()] = ptr;
 		local_irq_enable();
 
@@ -1127,11 +1142,11 @@ void __init kmem_cache_init(void)
 
 		local_irq_disable();
 		BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
-				!= &initarray_generic.cache);
+		       != &initarray_generic.cache);
 		memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
-				sizeof(struct arraycache_init));
+		       sizeof(struct arraycache_init));
 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
-						ptr;
+		    ptr;
 		local_irq_enable();
 	}
 	/* 5) Replace the bootstrap kmem_list3's */
@@ -1139,16 +1154,16 @@ void __init kmem_cache_init(void)
 		int node;
 		/* Replace the static kmem_list3 structures for the boot cpu */
 		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
-				numa_node_id());
+			  numa_node_id());
 
 		for_each_online_node(node) {
 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
-					&initkmem_list3[SIZE_AC+node], node);
+				  &initkmem_list3[SIZE_AC + node], node);
 
 			if (INDEX_AC != INDEX_L3) {
 				init_list(malloc_sizes[INDEX_L3].cs_cachep,
-						&initkmem_list3[SIZE_L3+node],
-						node);
+					  &initkmem_list3[SIZE_L3 + node],
+					  node);
 			}
 		}
 	}
@@ -1156,10 +1171,10 @@ void __init kmem_cache_init(void)
 	/* 6) resize the head arrays to their final sizes */
 	{
 		kmem_cache_t *cachep;
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next)
-			enable_cpucache(cachep);
-		up(&cache_chain_sem);
+		    enable_cpucache(cachep);
+		mutex_unlock(&cache_chain_mutex);
 	}
 
 	/* Done! */
@@ -1184,7 +1199,7 @@ static int __init cpucache_init(void)
 	 * pages to gfp.
 	 */
 	for_each_online_cpu(cpu)
-		start_cpu_timer(cpu);
+	    start_cpu_timer(cpu);
 
 	return 0;
 }
@@ -1226,7 +1241,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
  */
 static void kmem_freepages(kmem_cache_t *cachep, void *addr)
 {
-	unsigned long i = (1<<cachep->gfporder);
+	unsigned long i = (1 << cachep->gfporder);
 	struct page *page = virt_to_page(addr);
 	const unsigned long nr_freed = i;
 
@@ -1239,13 +1254,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += nr_freed;
 	free_pages((unsigned long)addr, cachep->gfporder);
-	if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 
-		atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
+	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+		atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
 }
 
 static void kmem_rcu_free(struct rcu_head *head)
 {
-	struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
+	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
 	kmem_cache_t *cachep = slab_rcu->cachep;
 
 	kmem_freepages(cachep, slab_rcu->addr);
@@ -1257,19 +1272,19 @@ static void kmem_rcu_free(struct rcu_head *head)
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
-				unsigned long caller)
+			    unsigned long caller)
 {
 	int size = obj_reallen(cachep);
 
-	addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];
+	addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)];
 
-	if (size < 5*sizeof(unsigned long))
+	if (size < 5 * sizeof(unsigned long))
 		return;
 
-	*addr++=0x12345678;
-	*addr++=caller;
-	*addr++=smp_processor_id();
-	size -= 3*sizeof(unsigned long);
+	*addr++ = 0x12345678;
+	*addr++ = caller;
+	*addr++ = smp_processor_id();
+	size -= 3 * sizeof(unsigned long);
 	{
 		unsigned long *sptr = &caller;
 		unsigned long svalue;
@@ -1277,7 +1292,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
 		while (!kstack_end(sptr)) {
 			svalue = *sptr++;
 			if (kernel_text_address(svalue)) {
-				*addr++=svalue;
+				*addr++ = svalue;
 				size -= sizeof(unsigned long);
 				if (size <= sizeof(unsigned long))
 					break;
@@ -1285,25 +1300,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
 		}
 
 	}
-	*addr++=0x87654321;
+	*addr++ = 0x87654321;
 }
 #endif
 
 static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
 {
 	int size = obj_reallen(cachep);
-	addr = &((char*)addr)[obj_dbghead(cachep)];
+	addr = &((char *)addr)[obj_dbghead(cachep)];
 
 	memset(addr, val, size);
-	*(unsigned char *)(addr+size-1) = POISON_END;
+	*(unsigned char *)(addr + size - 1) = POISON_END;
 }
 
 static void dump_line(char *data, int offset, int limit)
 {
 	int i;
 	printk(KERN_ERR "%03x:", offset);
-	for (i=0;i<limit;i++) {
-		printk(" %02x", (unsigned char)data[offset+i]);
+	for (i = 0; i < limit; i++) {
+		printk(" %02x", (unsigned char)data[offset + i]);
 	}
 	printk("\n");
 }
@@ -1318,24 +1333,24 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
 
 	if (cachep->flags & SLAB_RED_ZONE) {
 		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
-			*dbg_redzone1(cachep, objp),
-			*dbg_redzone2(cachep, objp));
+		       *dbg_redzone1(cachep, objp),
+		       *dbg_redzone2(cachep, objp));
 	}
 
 	if (cachep->flags & SLAB_STORE_USER) {
 		printk(KERN_ERR "Last user: [<%p>]",
-				*dbg_userword(cachep, objp));
+		       *dbg_userword(cachep, objp));
 		print_symbol("(%s)",
-				(unsigned long)*dbg_userword(cachep, objp));
+			     (unsigned long)*dbg_userword(cachep, objp));
 		printk("\n");
 	}
-	realobj = (char*)objp+obj_dbghead(cachep);
+	realobj = (char *)objp + obj_dbghead(cachep);
 	size = obj_reallen(cachep);
-	for (i=0; i<size && lines;i+=16, lines--) {
+	for (i = 0; i < size && lines; i += 16, lines--) {
 		int limit;
 		limit = 16;
-		if (i+limit > size)
-			limit = size-i;
+		if (i + limit > size)
+			limit = size - i;
 		dump_line(realobj, i, limit);
 	}
 }
@@ -1346,27 +1361,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
 	int size, i;
 	int lines = 0;
 
-	realobj = (char*)objp+obj_dbghead(cachep);
+	realobj = (char *)objp + obj_dbghead(cachep);
 	size = obj_reallen(cachep);
 
-	for (i=0;i<size;i++) {
+	for (i = 0; i < size; i++) {
 		char exp = POISON_FREE;
-		if (i == size-1)
+		if (i == size - 1)
 			exp = POISON_END;
 		if (realobj[i] != exp) {
 			int limit;
 			/* Mismatch ! */
 			/* Print header */
 			if (lines == 0) {
-				printk(KERN_ERR "Slab corruption: start=%p, len=%d\n",
-						realobj, size);
+				printk(KERN_ERR
+				       "Slab corruption: start=%p, len=%d\n",
+				       realobj, size);
 				print_objinfo(cachep, objp, 0);
 			}
 			/* Hexdump the affected line */
-			i = (i/16)*16;
+			i = (i / 16) * 16;
 			limit = 16;
-			if (i+limit > size)
-				limit = size-i;
+			if (i + limit > size)
+				limit = size - i;
 			dump_line(realobj, i, limit);
 			i += 16;
 			lines++;
@@ -1382,19 +1398,19 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
 		struct slab *slabp = page_get_slab(virt_to_page(objp));
 		int objnr;
 
-		objnr = (objp-slabp->s_mem)/cachep->objsize;
+		objnr = (objp - slabp->s_mem) / cachep->objsize;
 		if (objnr) {
-			objp = slabp->s_mem+(objnr-1)*cachep->objsize;
-			realobj = (char*)objp+obj_dbghead(cachep);
+			objp = slabp->s_mem + (objnr - 1) * cachep->objsize;
+			realobj = (char *)objp + obj_dbghead(cachep);
 			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
-						realobj, size);
+			       realobj, size);
 			print_objinfo(cachep, objp, 2);
 		}
-		if (objnr+1 < cachep->num) {
-			objp = slabp->s_mem+(objnr+1)*cachep->objsize;
-			realobj = (char*)objp+obj_dbghead(cachep);
+		if (objnr + 1 < cachep->num) {
+			objp = slabp->s_mem + (objnr + 1) * cachep->objsize;
+			realobj = (char *)objp + obj_dbghead(cachep);
 			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
-						realobj, size);
+			       realobj, size);
 			print_objinfo(cachep, objp, 2);
 		}
 	}
@@ -1405,7 +1421,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
  * Before calling the slab must have been unlinked from the cache.
  * The cache-lock is not held/needed.
  */
-static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
+static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
 {
 	void *addr = slabp->s_mem - slabp->colouroff;
 
@@ -1416,8 +1432,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 
 		if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
-			if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
-				kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
+			if ((cachep->objsize % PAGE_SIZE) == 0
+			    && OFF_SLAB(cachep))
+				kernel_map_pages(virt_to_page(objp),
+						 cachep->objsize / PAGE_SIZE,
+						 1);
 			else
 				check_poison_obj(cachep, objp);
 #else
@@ -1427,20 +1446,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 		if (cachep->flags & SLAB_RED_ZONE) {
 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
 				slab_error(cachep, "start of a freed object "
-							"was overwritten");
+					   "was overwritten");
 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
 				slab_error(cachep, "end of a freed object "
-							"was overwritten");
+					   "was overwritten");
 		}
 		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
-			(cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0);
+			(cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0);
 	}
 #else
 	if (cachep->dtor) {
 		int i;
 		for (i = 0; i < cachep->num; i++) {
-			void* objp = slabp->s_mem+cachep->objsize*i;
-			(cachep->dtor)(objp, cachep, 0);
+			void *objp = slabp->s_mem + cachep->objsize * i;
+			(cachep->dtor) (objp, cachep, 0);
 		}
 	}
 #endif
@@ -1448,7 +1467,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
 		struct slab_rcu *slab_rcu;
 
-		slab_rcu = (struct slab_rcu *) slabp;
+		slab_rcu = (struct slab_rcu *)slabp;
 		slab_rcu->cachep = cachep;
 		slab_rcu->addr = addr;
 		call_rcu(&slab_rcu->head, kmem_rcu_free);
@@ -1466,11 +1485,58 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index)
 	int node;
 
 	for_each_online_node(node) {
-		cachep->nodelists[node] = &initkmem_list3[index+node];
+		cachep->nodelists[node] = &initkmem_list3[index + node];
 		cachep->nodelists[node]->next_reap = jiffies +
-			REAPTIMEOUT_LIST3 +
-			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+		    REAPTIMEOUT_LIST3 +
+		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+	}
+}
+
+/**
+ * calculate_slab_order - calculate size (page order) of slabs and the number
+ *                        of objects per slab.
+ *
+ * This could be made much more intelligent.  For now, try to avoid using
+ * high order pages for slabs.  When the gfp() functions are more friendly
+ * towards high-order requests, this should be changed.
+ */
+static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
+					  size_t align, gfp_t flags)
+{
+	size_t left_over = 0;
+
+	for (;; cachep->gfporder++) {
+		unsigned int num;
+		size_t remainder;
+
+		if (cachep->gfporder > MAX_GFP_ORDER) {
+			cachep->num = 0;
+			break;
+		}
+
+		cache_estimate(cachep->gfporder, size, align, flags,
+			       &remainder, &num);
+		if (!num)
+			continue;
+		/* More than offslab_limit objects will cause problems */
+		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+			break;
+
+		cachep->num = num;
+		left_over = remainder;
+
+		/*
+		 * Large number of objects is good, but very large slabs are
+		 * currently bad for the gfp()s.
+		 */
+		if (cachep->gfporder >= slab_break_gfp_order)
+			break;
+
+		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
+			/* Acceptable internal fragmentation */
+			break;
 	}
+	return left_over;
 }
 
 /**
@@ -1519,16 +1585,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * Sanity checks... these are all serious usage bugs.
 	 */
 	if ((!name) ||
-		in_interrupt() ||
-		(size < BYTES_PER_WORD) ||
-		(size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
-		(dtor && !ctor)) {
-			printk(KERN_ERR "%s: Early error in slab %s\n",
-					__FUNCTION__, name);
-			BUG();
-		}
+	    in_interrupt() ||
+	    (size < BYTES_PER_WORD) ||
+	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
+		printk(KERN_ERR "%s: Early error in slab %s\n",
+		       __FUNCTION__, name);
+		BUG();
+	}
 
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 
 	list_for_each(p, &cache_chain) {
 		kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
@@ -1546,11 +1611,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		set_fs(old_fs);
 		if (res) {
 			printk("SLAB: cache with size %d has lost its name\n",
-					pc->objsize);
+			       pc->objsize);
 			continue;
 		}
 
-		if (!strcmp(pc->name,name)) {
+		if (!strcmp(pc->name, name)) {
 			printk("kmem_cache_create: duplicate cache %s\n", name);
 			dump_stack();
 			goto oops;
@@ -1562,10 +1627,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
 		/* No constructor, but inital state check requested */
 		printk(KERN_ERR "%s: No con, but init state check "
-				"requested - %s\n", __FUNCTION__, name);
+		       "requested - %s\n", __FUNCTION__, name);
 		flags &= ~SLAB_DEBUG_INITIAL;
 	}
-
 #if FORCED_DEBUG
 	/*
 	 * Enable redzoning and last user accounting, except for caches with
@@ -1573,8 +1637,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * above the next power of two: caches with object sizes just above a
 	 * power of two have a significant amount of internal fragmentation.
 	 */
-	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
-		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
+	if ((size < 4096
+	     || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
+		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
 	if (!(flags & SLAB_DESTROY_BY_RCU))
 		flags |= SLAB_POISON;
 #endif
@@ -1595,9 +1660,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * unaligned accesses for some archs when redzoning is used, and makes
 	 * sure any on-slab bufctl's are also correctly aligned.
 	 */
-	if (size & (BYTES_PER_WORD-1)) {
-		size += (BYTES_PER_WORD-1);
-		size &= ~(BYTES_PER_WORD-1);
+	if (size & (BYTES_PER_WORD - 1)) {
+		size += (BYTES_PER_WORD - 1);
+		size &= ~(BYTES_PER_WORD - 1);
 	}
 
 	/* calculate out the final buffer alignment: */
@@ -1608,7 +1673,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		 * objects into one cacheline.
 		 */
 		ralign = cache_line_size();
-		while (size <= ralign/2)
+		while (size <= ralign / 2)
 			ralign /= 2;
 	} else {
 		ralign = BYTES_PER_WORD;
@@ -1617,13 +1682,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (ralign < ARCH_SLAB_MINALIGN) {
 		ralign = ARCH_SLAB_MINALIGN;
 		if (ralign > BYTES_PER_WORD)
-			flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
+			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
 	/* 3) caller mandated alignment: disables debug if necessary */
 	if (ralign < align) {
 		ralign = align;
 		if (ralign > BYTES_PER_WORD)
-			flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
+			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
 	/* 4) Store it. Note that the debug code below can reduce
 	 *    the alignment to BYTES_PER_WORD.
@@ -1645,7 +1710,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 		/* add space for red zone words */
 		cachep->dbghead += BYTES_PER_WORD;
-		size += 2*BYTES_PER_WORD;
+		size += 2 * BYTES_PER_WORD;
 	}
 	if (flags & SLAB_STORE_USER) {
 		/* user store requires word alignment and
@@ -1656,7 +1721,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		size += BYTES_PER_WORD;
 	}
 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
-	if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
+	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
+	    && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
 		cachep->dbghead += PAGE_SIZE - size;
 		size = PAGE_SIZE;
 	}
@@ -1664,7 +1730,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 #endif
 
 	/* Determine if the slab management is 'on' or 'off' slab. */
-	if (size >= (PAGE_SIZE>>3))
+	if (size >= (PAGE_SIZE >> 3))
 		/*
 		 * Size is large, assume best to place the slab management obj
 		 * off-slab (should allow better packing of objs).
@@ -1681,47 +1747,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		 */
 		cachep->gfporder = 0;
 		cache_estimate(cachep->gfporder, size, align, flags,
-					&left_over, &cachep->num);
-	} else {
-		/*
-		 * Calculate size (in pages) of slabs, and the num of objs per
-		 * slab.  This could be made much more intelligent.  For now,
-		 * try to avoid using high page-orders for slabs.  When the
-		 * gfp() funcs are more friendly towards high-order requests,
-		 * this should be changed.
-		 */
-		do {
-			unsigned int break_flag = 0;
-cal_wastage:
-			cache_estimate(cachep->gfporder, size, align, flags,
-						&left_over, &cachep->num);
-			if (break_flag)
-				break;
-			if (cachep->gfporder >= MAX_GFP_ORDER)
-				break;
-			if (!cachep->num)
-				goto next;
-			if (flags & CFLGS_OFF_SLAB &&
-					cachep->num > offslab_limit) {
-				/* This num of objs will cause problems. */
-				cachep->gfporder--;
-				break_flag++;
-				goto cal_wastage;
-			}
-
-			/*
-			 * Large num of objs is good, but v. large slabs are
-			 * currently bad for the gfp()s.
-			 */
-			if (cachep->gfporder >= slab_break_gfp_order)
-				break;
-
-			if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
-				break;	/* Acceptable internal fragmentation. */
-next:
-			cachep->gfporder++;
-		} while (1);
-	}
+			       &left_over, &cachep->num);
+	} else
+		left_over = calculate_slab_order(cachep, size, align, flags);
 
 	if (!cachep->num) {
 		printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1729,8 +1757,8 @@ next:
 		cachep = NULL;
 		goto oops;
 	}
-	slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
-				+ sizeof(struct slab), align);
+	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+			  + sizeof(struct slab), align);
 
 	/*
 	 * If the slab has been placed off-slab, and we have enough space then
@@ -1743,14 +1771,15 @@ next:
 
 	if (flags & CFLGS_OFF_SLAB) {
 		/* really off slab. No need for manual alignment */
-		slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
+		slab_size =
+		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
 	}
 
 	cachep->colour_off = cache_line_size();
 	/* Offset must be a multiple of the alignment. */
 	if (cachep->colour_off < align)
 		cachep->colour_off = align;
-	cachep->colour = left_over/cachep->colour_off;
+	cachep->colour = left_over / cachep->colour_off;
 	cachep->slab_size = slab_size;
 	cachep->flags = flags;
 	cachep->gfpflags = 0;
@@ -1777,7 +1806,7 @@ next:
 			 * the creation of further caches will BUG().
 			 */
 			cachep->array[smp_processor_id()] =
-				&initarray_generic.cache;
+			    &initarray_generic.cache;
 
 			/* If the cache that's used by
 			 * kmalloc(sizeof(kmem_list3)) is the first cache,
@@ -1791,8 +1820,7 @@ next:
 				g_cpucache_up = PARTIAL_AC;
 		} else {
 			cachep->array[smp_processor_id()] =
-				kmalloc(sizeof(struct arraycache_init),
-						GFP_KERNEL);
+			    kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 
 			if (g_cpucache_up == PARTIAL_AC) {
 				set_up_list3s(cachep, SIZE_L3);
@@ -1802,16 +1830,18 @@ next:
 				for_each_online_node(node) {
 
 					cachep->nodelists[node] =
-						kmalloc_node(sizeof(struct kmem_list3),
-								GFP_KERNEL, node);
+					    kmalloc_node(sizeof
+							 (struct kmem_list3),
+							 GFP_KERNEL, node);
 					BUG_ON(!cachep->nodelists[node]);
-					kmem_list3_init(cachep->nodelists[node]);
+					kmem_list3_init(cachep->
+							nodelists[node]);
 				}
 			}
 		}
 		cachep->nodelists[numa_node_id()]->next_reap =
-			jiffies + REAPTIMEOUT_LIST3 +
-			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+		    jiffies + REAPTIMEOUT_LIST3 +
+		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
 		BUG_ON(!ac_data(cachep));
 		ac_data(cachep)->avail = 0;
@@ -1820,16 +1850,16 @@ next:
 		ac_data(cachep)->touched = 0;
 		cachep->batchcount = 1;
 		cachep->limit = BOOT_CPUCACHE_ENTRIES;
-	} 
+	}
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
 	unlock_cpu_hotplug();
-oops:
+      oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
-			name);
-	up(&cache_chain_sem);
+		      name);
+	mutex_unlock(&cache_chain_mutex);
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -1871,7 +1901,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
 /*
  * Waits for all CPUs to execute func().
  */
-static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
+static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
 {
 	check_irq_on();
 	preempt_disable();
@@ -1886,12 +1916,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
 	preempt_enable();
 }
 
-static void drain_array_locked(kmem_cache_t* cachep,
-				struct array_cache *ac, int force, int node);
+static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
+				int force, int node);
 
 static void do_drain(void *arg)
 {
-	kmem_cache_t *cachep = (kmem_cache_t*)arg;
+	kmem_cache_t *cachep = (kmem_cache_t *) arg;
 	struct array_cache *ac;
 	int node = numa_node_id();
 
@@ -1911,7 +1941,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep)
 	smp_call_function_all_cpus(do_drain, cachep);
 	check_irq_on();
 	spin_lock_irq(&cachep->spinlock);
-	for_each_online_node(node)  {
+	for_each_online_node(node) {
 		l3 = cachep->nodelists[node];
 		if (l3) {
 			spin_lock(&l3->list_lock);
@@ -1949,8 +1979,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node)
 		slab_destroy(cachep, slabp);
 		spin_lock_irq(&l3->list_lock);
 	}
-	ret = !list_empty(&l3->slabs_full) ||
-		!list_empty(&l3->slabs_partial);
+	ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
 	return ret;
 }
 
@@ -2006,7 +2035,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
  * The caller must guarantee that noone will allocate memory from the cache
  * during the kmem_cache_destroy().
  */
-int kmem_cache_destroy(kmem_cache_t * cachep)
+int kmem_cache_destroy(kmem_cache_t *cachep)
 {
 	int i;
 	struct kmem_list3 *l3;
@@ -2018,18 +2047,18 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
 	lock_cpu_hotplug();
 
 	/* Find the cache in the chain of caches. */
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 	/*
 	 * the chain is never empty, cache_cache is never destroyed
 	 */
 	list_del(&cachep->next);
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 
 	if (__cache_shrink(cachep)) {
 		slab_error(cachep, "Can't free all objects");
-		down(&cache_chain_sem);
-		list_add(&cachep->next,&cache_chain);
-		up(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
+		list_add(&cachep->next, &cache_chain);
+		mutex_unlock(&cache_chain_mutex);
 		unlock_cpu_hotplug();
 		return 1;
 	}
@@ -2038,7 +2067,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
 		synchronize_rcu();
 
 	for_each_online_cpu(i)
-		kfree(cachep->array[i]);
+	    kfree(cachep->array[i]);
 
 	/* NUMA: free the list3 structures */
 	for_each_online_node(i) {
@@ -2057,39 +2086,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
 EXPORT_SYMBOL(kmem_cache_destroy);
 
 /* Get the memory for a slab management obj. */
-static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
-			int colour_off, gfp_t local_flags)
+static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
+				   int colour_off, gfp_t local_flags)
 {
 	struct slab *slabp;
-	
+
 	if (OFF_SLAB(cachep)) {
 		/* Slab management obj is off-slab. */
 		slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
 		if (!slabp)
 			return NULL;
 	} else {
-		slabp = objp+colour_off;
+		slabp = objp + colour_off;
 		colour_off += cachep->slab_size;
 	}
 	slabp->inuse = 0;
 	slabp->colouroff = colour_off;
-	slabp->s_mem = objp+colour_off;
+	slabp->s_mem = objp + colour_off;
 
 	return slabp;
 }
 
 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
 {
-	return (kmem_bufctl_t *)(slabp+1);
+	return (kmem_bufctl_t *) (slabp + 1);
 }
 
 static void cache_init_objs(kmem_cache_t *cachep,
-			struct slab *slabp, unsigned long ctor_flags)
+			    struct slab *slabp, unsigned long ctor_flags)
 {
 	int i;
 
 	for (i = 0; i < cachep->num; i++) {
-		void *objp = slabp->s_mem+cachep->objsize*i;
+		void *objp = slabp->s_mem + cachep->objsize * i;
 #if DEBUG
 		/* need to poison the objs? */
 		if (cachep->flags & SLAB_POISON)
@@ -2107,25 +2136,28 @@ static void cache_init_objs(kmem_cache_t *cachep,
 		 * Otherwise, deadlock. They must also be threaded.
 		 */
 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
-			cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags);
+			cachep->ctor(objp + obj_dbghead(cachep), cachep,
+				     ctor_flags);
 
 		if (cachep->flags & SLAB_RED_ZONE) {
 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
 				slab_error(cachep, "constructor overwrote the"
-							" end of an object");
+					   " end of an object");
 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
 				slab_error(cachep, "constructor overwrote the"
-							" start of an object");
+					   " start of an object");
 		}
-		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
-	       		kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
+		    && cachep->flags & SLAB_POISON)
+			kernel_map_pages(virt_to_page(objp),
+					 cachep->objsize / PAGE_SIZE, 0);
 #else
 		if (cachep->ctor)
 			cachep->ctor(objp, cachep, ctor_flags);
 #endif
-		slab_bufctl(slabp)[i] = i+1;
+		slab_bufctl(slabp)[i] = i + 1;
 	}
-	slab_bufctl(slabp)[i-1] = BUFCTL_END;
+	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
 	slabp->free = 0;
 }
 
@@ -2161,17 +2193,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
  */
 static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 {
-	struct slab	*slabp;
-	void		*objp;
-	size_t		 offset;
-	gfp_t	 	 local_flags;
-	unsigned long	 ctor_flags;
+	struct slab *slabp;
+	void *objp;
+	size_t offset;
+	gfp_t local_flags;
+	unsigned long ctor_flags;
 	struct kmem_list3 *l3;
 
 	/* Be lazy and only check for valid flags here,
- 	 * keeping it out of the critical path in kmem_cache_alloc().
+	 * keeping it out of the critical path in kmem_cache_alloc().
 	 */
-	if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
+	if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
 		BUG();
 	if (flags & SLAB_NO_GROW)
 		return 0;
@@ -2237,9 +2269,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 	l3->free_objects += cachep->num;
 	spin_unlock(&l3->list_lock);
 	return 1;
-opps1:
+      opps1:
 	kmem_freepages(cachep, objp);
-failed:
+      failed:
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	return 0;
@@ -2259,18 +2291,19 @@ static void kfree_debugcheck(const void *objp)
 
 	if (!virt_addr_valid(objp)) {
 		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
-			(unsigned long)objp);	
-		BUG();	
+		       (unsigned long)objp);
+		BUG();
 	}
 	page = virt_to_page(objp);
 	if (!PageSlab(page)) {
-		printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp);
+		printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
+		       (unsigned long)objp);
 		BUG();
 	}
 }
 
 static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
-					void *caller)
+				   void *caller)
 {
 	struct page *page;
 	unsigned int objnr;
@@ -2281,20 +2314,26 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
 	page = virt_to_page(objp);
 
 	if (page_get_cache(page) != cachep) {
-		printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n",
-				page_get_cache(page),cachep);
+		printk(KERN_ERR
+		       "mismatch in kmem_cache_free: expected cache %p, got %p\n",
+		       page_get_cache(page), cachep);
 		printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
-		printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name);
+		printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
+		       page_get_cache(page)->name);
 		WARN_ON(1);
 	}
 	slabp = page_get_slab(page);
 
 	if (cachep->flags & SLAB_RED_ZONE) {
-		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
-			slab_error(cachep, "double free, or memory outside"
-						" object was overwritten");
-			printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
-					objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp));
+		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
+		    || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
+			slab_error(cachep,
+				   "double free, or memory outside"
+				   " object was overwritten");
+			printk(KERN_ERR
+			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
+			       objp, *dbg_redzone1(cachep, objp),
+			       *dbg_redzone2(cachep, objp));
 		}
 		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
@@ -2302,30 +2341,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
 	if (cachep->flags & SLAB_STORE_USER)
 		*dbg_userword(cachep, objp) = caller;
 
-	objnr = (objp-slabp->s_mem)/cachep->objsize;
+	objnr = (objp - slabp->s_mem) / cachep->objsize;
 
 	BUG_ON(objnr >= cachep->num);
-	BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize);
+	BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize);
 
 	if (cachep->flags & SLAB_DEBUG_INITIAL) {
 		/* Need to call the slab's constructor so the
 		 * caller can perform a verify of its state (debugging).
 		 * Called without the cache-lock held.
 		 */
-		cachep->ctor(objp+obj_dbghead(cachep),
-					cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
+		cachep->ctor(objp + obj_dbghead(cachep),
+			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
 	}
 	if (cachep->flags & SLAB_POISON && cachep->dtor) {
 		/* we want to cache poison the object,
 		 * call the destruction callback
 		 */
-		cachep->dtor(objp+obj_dbghead(cachep), cachep, 0);
+		cachep->dtor(objp + obj_dbghead(cachep), cachep, 0);
 	}
 	if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
 			store_stackinfo(cachep, objp, (unsigned long)caller);
-	       		kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
+			kernel_map_pages(virt_to_page(objp),
+					 cachep->objsize / PAGE_SIZE, 0);
 		} else {
 			poison_obj(cachep, objp, POISON_FREE);
 		}
@@ -2340,7 +2380,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
 {
 	kmem_bufctl_t i;
 	int entries = 0;
-	
+
 	/* Check slab's freelist to see if this obj is there. */
 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
 		entries++;
@@ -2348,13 +2388,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
 			goto bad;
 	}
 	if (entries != cachep->num - slabp->inuse) {
-bad:
-		printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-				cachep->name, cachep->num, slabp, slabp->inuse);
-		for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) {
-			if ((i%16)==0)
+	      bad:
+		printk(KERN_ERR
+		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
+		       cachep->name, cachep->num, slabp, slabp->inuse);
+		for (i = 0;
+		     i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+		     i++) {
+			if ((i % 16) == 0)
 				printk("\n%03x:", i);
-			printk(" %02x", ((unsigned char*)slabp)[i]);
+			printk(" %02x", ((unsigned char *)slabp)[i]);
 		}
 		printk("\n");
 		BUG();
@@ -2374,7 +2417,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
 
 	check_irq_off();
 	ac = ac_data(cachep);
-retry:
+      retry:
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
 		/* if there was little recent activity on this
@@ -2396,8 +2439,8 @@ retry:
 			shared_array->avail -= batchcount;
 			ac->avail = batchcount;
 			memcpy(ac->entry,
-				&(shared_array->entry[shared_array->avail]),
-				sizeof(void*)*batchcount);
+			       &(shared_array->entry[shared_array->avail]),
+			       sizeof(void *) * batchcount);
 			shared_array->touched = 1;
 			goto alloc_done;
 		}
@@ -2425,7 +2468,7 @@ retry:
 
 			/* get obj pointer */
 			ac->entry[ac->avail++] = slabp->s_mem +
-				slabp->free*cachep->objsize;
+			    slabp->free * cachep->objsize;
 
 			slabp->inuse++;
 			next = slab_bufctl(slabp)[slabp->free];
@@ -2433,7 +2476,7 @@ retry:
 			slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
 			WARN_ON(numa_node_id() != slabp->nodeid);
 #endif
-		       	slabp->free = next;
+			slabp->free = next;
 		}
 		check_slabp(cachep, slabp);
 
@@ -2445,9 +2488,9 @@ retry:
 			list_add(&slabp->list, &l3->slabs_partial);
 	}
 
-must_grow:
+      must_grow:
 	l3->free_objects -= ac->avail;
-alloc_done:
+      alloc_done:
 	spin_unlock(&l3->list_lock);
 
 	if (unlikely(!ac->avail)) {
@@ -2459,7 +2502,7 @@ alloc_done:
 		if (!x && ac->avail == 0)	// no objects in sight? abort
 			return NULL;
 
-		if (!ac->avail)		// objects refilled by interrupt?
+		if (!ac->avail)	// objects refilled by interrupt?
 			goto retry;
 	}
 	ac->touched = 1;
@@ -2476,16 +2519,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
 }
 
 #if DEBUG
-static void *
-cache_alloc_debugcheck_after(kmem_cache_t *cachep,
-			gfp_t flags, void *objp, void *caller)
+static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
+					void *objp, void *caller)
 {
-	if (!objp)	
+	if (!objp)
 		return objp;
- 	if (cachep->flags & SLAB_POISON) {
+	if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
-			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1);
+			kernel_map_pages(virt_to_page(objp),
+					 cachep->objsize / PAGE_SIZE, 1);
 		else
 			check_poison_obj(cachep, objp);
 #else
@@ -2497,24 +2540,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
 		*dbg_userword(cachep, objp) = caller;
 
 	if (cachep->flags & SLAB_RED_ZONE) {
-		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
-			slab_error(cachep, "double free, or memory outside"
-						" object was overwritten");
-			printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
-					objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp));
+		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
+		    || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
+			slab_error(cachep,
+				   "double free, or memory outside"
+				   " object was overwritten");
+			printk(KERN_ERR
+			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
+			       objp, *dbg_redzone1(cachep, objp),
+			       *dbg_redzone2(cachep, objp));
 		}
 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
 	}
 	objp += obj_dbghead(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON) {
-		unsigned long	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
+		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
 
 		if (!(flags & __GFP_WAIT))
 			ctor_flags |= SLAB_CTOR_ATOMIC;
 
 		cachep->ctor(objp, cachep, ctor_flags);
-	}	
+	}
 	return objp;
 }
 #else
@@ -2523,9 +2570,18 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
 
 static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 {
-	void* objp;
+	void *objp;
 	struct array_cache *ac;
 
+#ifdef CONFIG_NUMA
+	if (unlikely(current->mempolicy && !in_interrupt())) {
+		int nid = slab_node(current->mempolicy);
+
+		if (nid != numa_node_id())
+			return __cache_alloc_node(cachep, flags, nid);
+	}
+#endif
+
 	check_irq_off();
 	ac = ac_data(cachep);
 	if (likely(ac->avail)) {
@@ -2542,7 +2598,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 {
 	unsigned long save_flags;
-	void* objp;
+	void *objp;
 
 	cache_alloc_debugcheck_before(cachep, flags);
 
@@ -2550,7 +2606,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 	objp = ____cache_alloc(cachep, flags);
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
-					__builtin_return_address(0));
+					    __builtin_return_address(0));
 	prefetchw(objp);
 	return objp;
 }
@@ -2562,74 +2618,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 {
 	struct list_head *entry;
- 	struct slab *slabp;
- 	struct kmem_list3 *l3;
- 	void *obj;
- 	kmem_bufctl_t next;
- 	int x;
-
- 	l3 = cachep->nodelists[nodeid];
- 	BUG_ON(!l3);
-
-retry:
- 	spin_lock(&l3->list_lock);
- 	entry = l3->slabs_partial.next;
- 	if (entry == &l3->slabs_partial) {
- 		l3->free_touched = 1;
- 		entry = l3->slabs_free.next;
- 		if (entry == &l3->slabs_free)
- 			goto must_grow;
- 	}
-
- 	slabp = list_entry(entry, struct slab, list);
- 	check_spinlock_acquired_node(cachep, nodeid);
- 	check_slabp(cachep, slabp);
-
- 	STATS_INC_NODEALLOCS(cachep);
- 	STATS_INC_ACTIVE(cachep);
- 	STATS_SET_HIGH(cachep);
-
- 	BUG_ON(slabp->inuse == cachep->num);
-
- 	/* get obj pointer */
- 	obj =  slabp->s_mem + slabp->free*cachep->objsize;
- 	slabp->inuse++;
- 	next = slab_bufctl(slabp)[slabp->free];
+	struct slab *slabp;
+	struct kmem_list3 *l3;
+	void *obj;
+	kmem_bufctl_t next;
+	int x;
+
+	l3 = cachep->nodelists[nodeid];
+	BUG_ON(!l3);
+
+      retry:
+	spin_lock(&l3->list_lock);
+	entry = l3->slabs_partial.next;
+	if (entry == &l3->slabs_partial) {
+		l3->free_touched = 1;
+		entry = l3->slabs_free.next;
+		if (entry == &l3->slabs_free)
+			goto must_grow;
+	}
+
+	slabp = list_entry(entry, struct slab, list);
+	check_spinlock_acquired_node(cachep, nodeid);
+	check_slabp(cachep, slabp);
+
+	STATS_INC_NODEALLOCS(cachep);
+	STATS_INC_ACTIVE(cachep);
+	STATS_SET_HIGH(cachep);
+
+	BUG_ON(slabp->inuse == cachep->num);
+
+	/* get obj pointer */
+	obj = slabp->s_mem + slabp->free * cachep->objsize;
+	slabp->inuse++;
+	next = slab_bufctl(slabp)[slabp->free];
 #if DEBUG
- 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
 #endif
- 	slabp->free = next;
- 	check_slabp(cachep, slabp);
- 	l3->free_objects--;
- 	/* move slabp to correct slabp list: */
- 	list_del(&slabp->list);
-
- 	if (slabp->free == BUFCTL_END) {
- 		list_add(&slabp->list, &l3->slabs_full);
- 	} else {
- 		list_add(&slabp->list, &l3->slabs_partial);
- 	}
+	slabp->free = next;
+	check_slabp(cachep, slabp);
+	l3->free_objects--;
+	/* move slabp to correct slabp list: */
+	list_del(&slabp->list);
+
+	if (slabp->free == BUFCTL_END) {
+		list_add(&slabp->list, &l3->slabs_full);
+	} else {
+		list_add(&slabp->list, &l3->slabs_partial);
+	}
 
- 	spin_unlock(&l3->list_lock);
- 	goto done;
+	spin_unlock(&l3->list_lock);
+	goto done;
 
-must_grow:
- 	spin_unlock(&l3->list_lock);
- 	x = cache_grow(cachep, flags, nodeid);
+      must_grow:
+	spin_unlock(&l3->list_lock);
+	x = cache_grow(cachep, flags, nodeid);
 
- 	if (!x)
- 		return NULL;
+	if (!x)
+		return NULL;
 
- 	goto retry;
-done:
- 	return obj;
+	goto retry;
+      done:
+	return obj;
 }
 #endif
 
 /*
  * Caller needs to acquire correct kmem_list's list_lock
  */
-static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node)
+static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
+		       int node)
 {
 	int i;
 	struct kmem_list3 *l3;
@@ -2652,7 +2709,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
 
 		if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
 			printk(KERN_ERR "slab: double free detected in cache "
-					"'%s', objp %p\n", cachep->name, objp);
+			       "'%s', objp %p\n", cachep->name, objp);
 			BUG();
 		}
 #endif
@@ -2696,20 +2753,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
 	spin_lock(&l3->list_lock);
 	if (l3->shared) {
 		struct array_cache *shared_array = l3->shared;
-		int max = shared_array->limit-shared_array->avail;
+		int max = shared_array->limit - shared_array->avail;
 		if (max) {
 			if (batchcount > max)
 				batchcount = max;
 			memcpy(&(shared_array->entry[shared_array->avail]),
-					ac->entry,
-					sizeof(void*)*batchcount);
+			       ac->entry, sizeof(void *) * batchcount);
 			shared_array->avail += batchcount;
 			goto free_done;
 		}
 	}
 
 	free_block(cachep, ac->entry, batchcount, node);
-free_done:
+      free_done:
 #if STATS
 	{
 		int i = 0;
@@ -2731,10 +2787,9 @@ free_done:
 	spin_unlock(&l3->list_lock);
 	ac->avail -= batchcount;
 	memmove(ac->entry, &(ac->entry[batchcount]),
-			sizeof(void*)*ac->avail);
+		sizeof(void *) * ac->avail);
 }
 
-
 /*
  * __cache_free
  * Release an obj back to its cache. If the obj has a constructed
@@ -2759,7 +2814,8 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
 		if (unlikely(slabp->nodeid != numa_node_id())) {
 			struct array_cache *alien = NULL;
 			int nodeid = slabp->nodeid;
-			struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()];
+			struct kmem_list3 *l3 =
+			    cachep->nodelists[numa_node_id()];
 
 			STATS_INC_NODEFREES(cachep);
 			if (l3->alien && l3->alien[nodeid]) {
@@ -2767,15 +2823,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
 				spin_lock(&alien->lock);
 				if (unlikely(alien->avail == alien->limit))
 					__drain_alien_cache(cachep,
-							alien, nodeid);
+							    alien, nodeid);
 				alien->entry[alien->avail++] = objp;
 				spin_unlock(&alien->lock);
 			} else {
 				spin_lock(&(cachep->nodelists[nodeid])->
-						list_lock);
+					  list_lock);
 				free_block(cachep, &objp, 1, nodeid);
 				spin_unlock(&(cachep->nodelists[nodeid])->
-						list_lock);
+					    list_lock);
 			}
 			return;
 		}
@@ -2822,9 +2878,9 @@ EXPORT_SYMBOL(kmem_cache_alloc);
  */
 int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
 {
-	unsigned long addr = (unsigned long) ptr;
+	unsigned long addr = (unsigned long)ptr;
 	unsigned long min_addr = PAGE_OFFSET;
-	unsigned long align_mask = BYTES_PER_WORD-1;
+	unsigned long align_mask = BYTES_PER_WORD - 1;
 	unsigned long size = cachep->objsize;
 	struct page *page;
 
@@ -2844,7 +2900,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
 	if (unlikely(page_get_cache(page) != cachep))
 		goto out;
 	return 1;
-out:
+      out:
 	return 0;
 }
 
@@ -2871,8 +2927,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 
 	if (unlikely(!cachep->nodelists[nodeid])) {
 		/* Fall back to __cache_alloc if we run into trouble */
-		printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name);
-		return __cache_alloc(cachep,flags);
+		printk(KERN_WARNING
+		       "slab: not allocating in inactive node %d for cache %s\n",
+		       nodeid, cachep->name);
+		return __cache_alloc(cachep, flags);
 	}
 
 	cache_alloc_debugcheck_before(cachep, flags);
@@ -2882,7 +2940,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 	else
 		ptr = __cache_alloc_node(cachep, flags, nodeid);
 	local_irq_restore(save_flags);
-	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));
+	ptr =
+	    cache_alloc_debugcheck_after(cachep, flags, ptr,
+					 __builtin_return_address(0));
 
 	return ptr;
 }
@@ -2944,12 +3004,11 @@ EXPORT_SYMBOL(__kmalloc);
  * Objects should be dereferenced using the per_cpu_ptr macro only.
  *
  * @size: how many bytes of memory are required.
- * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
  */
-void *__alloc_percpu(size_t size, size_t align)
+void *__alloc_percpu(size_t size)
 {
 	int i;
-	struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
+	struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
 
 	if (!pdata)
 		return NULL;
@@ -2973,9 +3032,9 @@ void *__alloc_percpu(size_t size, size_t align)
 	}
 
 	/* Catch derefs w/o wrappers */
-	return (void *) (~(unsigned long) pdata);
+	return (void *)(~(unsigned long)pdata);
 
-unwind_oom:
+      unwind_oom:
 	while (--i >= 0) {
 		if (!cpu_possible(i))
 			continue;
@@ -3006,20 +3065,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp)
 EXPORT_SYMBOL(kmem_cache_free);
 
 /**
- * kzalloc - allocate memory. The memory is set to zero.
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- */
-void *kzalloc(size_t size, gfp_t flags)
-{
-	void *ret = kmalloc(size, flags);
-	if (ret)
-		memset(ret, 0, size);
-	return ret;
-}
-EXPORT_SYMBOL(kzalloc);
-
-/**
  * kfree - free previously allocated memory
  * @objp: pointer returned by kmalloc.
  *
@@ -3038,7 +3083,8 @@ void kfree(const void *objp)
 	local_irq_save(flags);
 	kfree_debugcheck(objp);
 	c = page_get_cache(virt_to_page(objp));
-	__cache_free(c, (void*)objp);
+	mutex_debug_check_no_locks_freed(objp, obj_reallen(c));
+	__cache_free(c, (void *)objp);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kfree);
@@ -3051,17 +3097,16 @@ EXPORT_SYMBOL(kfree);
  * Don't free memory not originally allocated by alloc_percpu()
  * The complemented objp is to check for that.
  */
-void
-free_percpu(const void *objp)
+void free_percpu(const void *objp)
 {
 	int i;
-	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
+	struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
 
 	/*
 	 * We allocate for all cpus so we cannot use for online cpu here.
 	 */
 	for_each_cpu(i)
-		kfree(p->ptrs[i]);
+	    kfree(p->ptrs[i]);
 	kfree(p);
 }
 EXPORT_SYMBOL(free_percpu);
@@ -3095,44 +3140,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
 		if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
 			goto fail;
 #endif
-		if (!(new = alloc_arraycache(node, (cachep->shared*
-				cachep->batchcount), 0xbaadf00d)))
+		if (!(new = alloc_arraycache(node, (cachep->shared *
+						    cachep->batchcount),
+					     0xbaadf00d)))
 			goto fail;
 		if ((l3 = cachep->nodelists[node])) {
 
 			spin_lock_irq(&l3->list_lock);
 
 			if ((nc = cachep->nodelists[node]->shared))
-				free_block(cachep, nc->entry,
-							nc->avail, node);
+				free_block(cachep, nc->entry, nc->avail, node);
 
 			l3->shared = new;
 			if (!cachep->nodelists[node]->alien) {
 				l3->alien = new_alien;
 				new_alien = NULL;
 			}
-			l3->free_limit = (1 + nr_cpus_node(node))*
-				cachep->batchcount + cachep->num;
+			l3->free_limit = (1 + nr_cpus_node(node)) *
+			    cachep->batchcount + cachep->num;
 			spin_unlock_irq(&l3->list_lock);
 			kfree(nc);
 			free_alien_cache(new_alien);
 			continue;
 		}
 		if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
-						GFP_KERNEL, node)))
+					GFP_KERNEL, node)))
 			goto fail;
 
 		kmem_list3_init(l3);
 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 		l3->shared = new;
 		l3->alien = new_alien;
-		l3->free_limit = (1 + nr_cpus_node(node))*
-			cachep->batchcount + cachep->num;
+		l3->free_limit = (1 + nr_cpus_node(node)) *
+		    cachep->batchcount + cachep->num;
 		cachep->nodelists[node] = l3;
 	}
 	return err;
-fail:
+      fail:
 	err = -ENOMEM;
 	return err;
 }
@@ -3154,18 +3199,19 @@ static void do_ccupdate_local(void *info)
 	new->new[smp_processor_id()] = old;
 }
 
-
 static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
-				int shared)
+			    int shared)
 {
 	struct ccupdate_struct new;
 	int i, err;
 
-	memset(&new.new,0,sizeof(new.new));
+	memset(&new.new, 0, sizeof(new.new));
 	for_each_online_cpu(i) {
-		new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount);
+		new.new[i] =
+		    alloc_arraycache(cpu_to_node(i), limit, batchcount);
 		if (!new.new[i]) {
-			for (i--; i >= 0; i--) kfree(new.new[i]);
+			for (i--; i >= 0; i--)
+				kfree(new.new[i]);
 			return -ENOMEM;
 		}
 	}
@@ -3193,13 +3239,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
 	err = alloc_kmemlist(cachep);
 	if (err) {
 		printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
-				cachep->name, -err);
+		       cachep->name, -err);
 		BUG();
 	}
 	return 0;
 }
 
-
 static void enable_cpucache(kmem_cache_t *cachep)
 {
 	int err;
@@ -3246,14 +3291,14 @@ static void enable_cpucache(kmem_cache_t *cachep)
 	if (limit > 32)
 		limit = 32;
 #endif
-	err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
+	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
 	if (err)
 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
-					cachep->name, -err);
+		       cachep->name, -err);
 }
 
-static void drain_array_locked(kmem_cache_t *cachep,
-				struct array_cache *ac, int force, int node)
+static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
+				int force, int node)
 {
 	int tofree;
 
@@ -3261,14 +3306,14 @@ static void drain_array_locked(kmem_cache_t *cachep,
 	if (ac->touched && !force) {
 		ac->touched = 0;
 	} else if (ac->avail) {
-		tofree = force ? ac->avail : (ac->limit+4)/5;
+		tofree = force ? ac->avail : (ac->limit + 4) / 5;
 		if (tofree > ac->avail) {
-			tofree = (ac->avail+1)/2;
+			tofree = (ac->avail + 1) / 2;
 		}
 		free_block(cachep, ac->entry, tofree, node);
 		ac->avail -= tofree;
 		memmove(ac->entry, &(ac->entry[tofree]),
-					sizeof(void*)*ac->avail);
+			sizeof(void *) * ac->avail);
 	}
 }
 
@@ -3281,7 +3326,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
  * - clear the per-cpu caches for this CPU.
  * - return freeable pages to the main free memory pool.
  *
- * If we cannot acquire the cache chain semaphore then just give up - we'll
+ * If we cannot acquire the cache chain mutex then just give up - we'll
  * try again on the next iteration.
  */
 static void cache_reap(void *unused)
@@ -3289,15 +3334,16 @@ static void cache_reap(void *unused)
 	struct list_head *walk;
 	struct kmem_list3 *l3;
 
-	if (down_trylock(&cache_chain_sem)) {
+	if (!mutex_trylock(&cache_chain_mutex)) {
 		/* Give up. Setup the next iteration. */
-		schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
+		schedule_delayed_work(&__get_cpu_var(reap_work),
+				      REAPTIMEOUT_CPUC);
 		return;
 	}
 
 	list_for_each(walk, &cache_chain) {
 		kmem_cache_t *searchp;
-		struct list_head* p;
+		struct list_head *p;
 		int tofree;
 		struct slab *slabp;
 
@@ -3314,7 +3360,7 @@ static void cache_reap(void *unused)
 		spin_lock_irq(&l3->list_lock);
 
 		drain_array_locked(searchp, ac_data(searchp), 0,
-				numa_node_id());
+				   numa_node_id());
 
 		if (time_after(l3->next_reap, jiffies))
 			goto next_unlock;
@@ -3323,14 +3369,16 @@ static void cache_reap(void *unused)
 
 		if (l3->shared)
 			drain_array_locked(searchp, l3->shared, 0,
-				numa_node_id());
+					   numa_node_id());
 
 		if (l3->free_touched) {
 			l3->free_touched = 0;
 			goto next_unlock;
 		}
 
-		tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num);
+		tofree =
+		    (l3->free_limit + 5 * searchp->num -
+		     1) / (5 * searchp->num);
 		do {
 			p = l3->slabs_free.next;
 			if (p == &(l3->slabs_free))
@@ -3350,14 +3398,14 @@ static void cache_reap(void *unused)
 			spin_unlock_irq(&l3->list_lock);
 			slab_destroy(searchp, slabp);
 			spin_lock_irq(&l3->list_lock);
-		} while(--tofree > 0);
-next_unlock:
+		} while (--tofree > 0);
+	      next_unlock:
 		spin_unlock_irq(&l3->list_lock);
-next:
+	      next:
 		cond_resched();
 	}
 	check_irq_on();
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 	drain_remote_pages();
 	/* Setup the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
@@ -3365,32 +3413,37 @@ next:
 
 #ifdef CONFIG_PROC_FS
 
-static void *s_start(struct seq_file *m, loff_t *pos)
+static void print_slabinfo_header(struct seq_file *m)
 {
-	loff_t n = *pos;
-	struct list_head *p;
-
-	down(&cache_chain_sem);
-	if (!n) {
-		/*
-		 * Output format version, so at least we can change it
-		 * without _too_ many complaints.
-		 */
+	/*
+	 * Output format version, so at least we can change it
+	 * without _too_ many complaints.
+	 */
 #if STATS
-		seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
 #else
-		seq_puts(m, "slabinfo - version: 2.1\n");
+	seq_puts(m, "slabinfo - version: 2.1\n");
 #endif
-		seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
-		seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
-		seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
+		 "<objperslab> <pagesperslab>");
+	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
 #if STATS
-		seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
-				" <error> <maxfreeable> <nodeallocs> <remotefrees>");
-		seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
+	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
+		 "<error> <maxfreeable> <nodeallocs> <remotefrees>");
+	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
 #endif
-		seq_putc(m, '\n');
-	}
+	seq_putc(m, '\n');
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t n = *pos;
+	struct list_head *p;
+
+	mutex_lock(&cache_chain_mutex);
+	if (!n)
+		print_slabinfo_header(m);
 	p = cache_chain.next;
 	while (n--) {
 		p = p->next;
@@ -3405,23 +3458,23 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 	kmem_cache_t *cachep = p;
 	++*pos;
 	return cachep->next.next == &cache_chain ? NULL
-		: list_entry(cachep->next.next, kmem_cache_t, next);
+	    : list_entry(cachep->next.next, kmem_cache_t, next);
 }
 
 static void s_stop(struct seq_file *m, void *p)
 {
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 }
 
 static int s_show(struct seq_file *m, void *p)
 {
 	kmem_cache_t *cachep = p;
 	struct list_head *q;
-	struct slab	*slabp;
-	unsigned long	active_objs;
-	unsigned long	num_objs;
-	unsigned long	active_slabs = 0;
-	unsigned long	num_slabs, free_objects = 0, shared_avail = 0;
+	struct slab *slabp;
+	unsigned long active_objs;
+	unsigned long num_objs;
+	unsigned long active_slabs = 0;
+	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
 	const char *name;
 	char *error = NULL;
 	int node;
@@ -3438,14 +3491,14 @@ static int s_show(struct seq_file *m, void *p)
 
 		spin_lock(&l3->list_lock);
 
-		list_for_each(q,&l3->slabs_full) {
+		list_for_each(q, &l3->slabs_full) {
 			slabp = list_entry(q, struct slab, list);
 			if (slabp->inuse != cachep->num && !error)
 				error = "slabs_full accounting error";
 			active_objs += cachep->num;
 			active_slabs++;
 		}
-		list_for_each(q,&l3->slabs_partial) {
+		list_for_each(q, &l3->slabs_partial) {
 			slabp = list_entry(q, struct slab, list);
 			if (slabp->inuse == cachep->num && !error)
 				error = "slabs_partial inuse accounting error";
@@ -3454,7 +3507,7 @@ static int s_show(struct seq_file *m, void *p)
 			active_objs += slabp->inuse;
 			active_slabs++;
 		}
-		list_for_each(q,&l3->slabs_free) {
+		list_for_each(q, &l3->slabs_free) {
 			slabp = list_entry(q, struct slab, list);
 			if (slabp->inuse && !error)
 				error = "slabs_free/inuse accounting error";
@@ -3465,25 +3518,24 @@ static int s_show(struct seq_file *m, void *p)
 
 		spin_unlock(&l3->list_lock);
 	}
-	num_slabs+=active_slabs;
-	num_objs = num_slabs*cachep->num;
+	num_slabs += active_slabs;
+	num_objs = num_slabs * cachep->num;
 	if (num_objs - active_objs != free_objects && !error)
 		error = "free_objects accounting error";
 
-	name = cachep->name; 
+	name = cachep->name;
 	if (error)
 		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
 
 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
-		name, active_objs, num_objs, cachep->objsize,
-		cachep->num, (1<<cachep->gfporder));
+		   name, active_objs, num_objs, cachep->objsize,
+		   cachep->num, (1 << cachep->gfporder));
 	seq_printf(m, " : tunables %4u %4u %4u",
-			cachep->limit, cachep->batchcount,
-			cachep->shared);
+		   cachep->limit, cachep->batchcount, cachep->shared);
 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
-			active_slabs, num_slabs, shared_avail);
+		   active_slabs, num_slabs, shared_avail);
 #if STATS
-	{	/* list3 stats */
+	{			/* list3 stats */
 		unsigned long high = cachep->high_mark;
 		unsigned long allocs = cachep->num_allocations;
 		unsigned long grown = cachep->grown;
@@ -3494,9 +3546,7 @@ static int s_show(struct seq_file *m, void *p)
 		unsigned long node_frees = cachep->node_frees;
 
 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
-				%4lu %4lu %4lu %4lu",
-				allocs, high, grown, reaped, errors,
-				max_freeable, node_allocs, node_frees);
+				%4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
 	}
 	/* cpu stats */
 	{
@@ -3506,7 +3556,7 @@ static int s_show(struct seq_file *m, void *p)
 		unsigned long freemiss = atomic_read(&cachep->freemiss);
 
 		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
-			allochit, allocmiss, freehit, freemiss);
+			   allochit, allocmiss, freehit, freemiss);
 	}
 #endif
 	seq_putc(m, '\n');
@@ -3529,10 +3579,10 @@ static int s_show(struct seq_file *m, void *p)
  */
 
 struct seq_operations slabinfo_op = {
-	.start	= s_start,
-	.next	= s_next,
-	.stop	= s_stop,
-	.show	= s_show,
+	.start = s_start,
+	.next = s_next,
+	.stop = s_stop,
+	.show = s_show,
 };
 
 #define MAX_SLABINFO_WRITE 128
@@ -3543,18 +3593,18 @@ struct seq_operations slabinfo_op = {
  * @count: data length
  * @ppos: unused
  */
-ssize_t slabinfo_write(struct file *file, const char __user *buffer,
-				size_t count, loff_t *ppos)
+ssize_t slabinfo_write(struct file *file, const char __user * buffer,
+		       size_t count, loff_t *ppos)
 {
-	char kbuf[MAX_SLABINFO_WRITE+1], *tmp;
+	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
 	int limit, batchcount, shared, res;
 	struct list_head *p;
-	
+
 	if (count > MAX_SLABINFO_WRITE)
 		return -EINVAL;
 	if (copy_from_user(&kbuf, buffer, count))
 		return -EFAULT;
-	kbuf[MAX_SLABINFO_WRITE] = '\0'; 
+	kbuf[MAX_SLABINFO_WRITE] = '\0';
 
 	tmp = strchr(kbuf, ' ');
 	if (!tmp)
@@ -3565,25 +3615,24 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 		return -EINVAL;
 
 	/* Find the cache in the chain of caches. */
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 	res = -EINVAL;
-	list_for_each(p,&cache_chain) {
+	list_for_each(p, &cache_chain) {
 		kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
 
 		if (!strcmp(cachep->name, kbuf)) {
 			if (limit < 1 ||
 			    batchcount < 1 ||
-			    batchcount > limit ||
-			    shared < 0) {
+			    batchcount > limit || shared < 0) {
 				res = 0;
 			} else {
 				res = do_tune_cpucache(cachep, limit,
-							batchcount, shared);
+						       batchcount, shared);
 			}
 			break;
 		}
 	}
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 	if (res >= 0)
 		res = count;
 	return res;
@@ -3609,26 +3658,3 @@ unsigned int ksize(const void *objp)
 
 	return obj_reallen(page_get_cache(virt_to_page(objp)));
 }
-
-
-/*
- * kstrdup - allocate space for and copy an existing string
- *
- * @s: the string to duplicate
- * @gfp: the GFP mask used in the kmalloc() call when allocating memory
- */
-char *kstrdup(const char *s, gfp_t gfp)
-{
-	size_t len;
-	char *buf;
-
-	if (!s)
-		return NULL;
-
-	len = strlen(s) + 1;
-	buf = kmalloc(len, gfp);
-	if (buf)
-		memcpy(buf, s, len);
-	return buf;
-}
-EXPORT_SYMBOL(kstrdup);
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 00000000000..1c240c4b71d
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,385 @@
+/*
+ * SLOB Allocator: Simple List Of Blocks
+ *
+ * Matt Mackall <mpm@selenic.com> 12/30/03
+ *
+ * How SLOB works:
+ *
+ * The core of SLOB is a traditional K&R style heap allocator, with
+ * support for returning aligned objects. The granularity of this
+ * allocator is 8 bytes on x86, though it's perhaps possible to reduce
+ * this to 4 if it's deemed worth the effort. The slob heap is a
+ * singly-linked list of pages from __get_free_page, grown on demand
+ * and allocation from the heap is currently first-fit.
+ *
+ * Above this is an implementation of kmalloc/kfree. Blocks returned
+ * from kmalloc are 8-byte aligned and prepended with an 8-byte header.
+ * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
+ * __get_free_pages directly so that it can return page-aligned blocks
+ * and keeps a linked list of such pages and their orders. These
+ * objects are detected in kfree() by their page alignment.
+ *
+ * SLAB is emulated on top of SLOB by simply calling constructors and
+ * destructors for every SLAB allocation. Objects are returned with
+ * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
+ * set, in which case the low-level allocator will fragment blocks to
+ * create the proper alignment. Again, objects of page-size or greater
+ * are allocated by calling __get_free_pages. As SLAB objects know
+ * their size, no separate size bookkeeping is necessary and there is
+ * essentially no allocation space overhead.
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+
+struct slob_block {		/* heads each free block and each kmalloc'd block */
+	int units;		/* block length, counted in SLOB_UNITs */
+	struct slob_block *next;	/* next block on the circular free list */
+};
+typedef struct slob_block slob_t;
+
+#define SLOB_UNIT sizeof(slob_t)
+#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
+#define SLOB_ALIGN L1_CACHE_BYTES
+
+struct bigblock {		/* bookkeeping for one page-level kmalloc() */
+	int order;		/* order handed to __get_free_pages() */
+	void *pages;		/* address returned by __get_free_pages() */
+	struct bigblock *next;	/* next entry on the bigblocks list */
+};
+typedef struct bigblock bigblock_t;
+
+static slob_t arena = { .next = &arena, .units = 1 };
+static slob_t *slobfree = &arena;
+static bigblock_t *bigblocks;
+static DEFINE_SPINLOCK(slob_lock);
+static DEFINE_SPINLOCK(block_lock);
+
+static void slob_free(void *b, int size);
+/* slob_alloc - first-fit allocation from the free list; returns 0 on failure */
+static void *slob_alloc(size_t size, gfp_t gfp, int align)
+{
+	slob_t *prev, *cur, *aligned = 0;
+	int delta = 0, units = SLOB_UNITS(size);	/* request rounded up to units */
+	unsigned long flags;
+
+	spin_lock_irqsave(&slob_lock, flags);
+	prev = slobfree;
+	for (cur = prev->next; ; prev = cur, cur = cur->next) {
+		if (align) {
+			aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+			delta = aligned - cur;	/* pointer difference, so in slob_t units */
+		}
+		if (cur->units >= units + delta) { /* room enough? */
+			if (delta) { /* need to fragment head to align? */
+				aligned->units = cur->units - delta;
+				aligned->next = cur->next;
+				cur->next = aligned;
+				cur->units = delta;
+				prev = cur;
+				cur = aligned;
+			}
+
+			if (cur->units == units) /* exact fit? */
+				prev->next = cur->next; /* unlink */
+			else { /* fragment */
+				prev->next = cur + units;
+				prev->next->units = cur->units - units;
+				prev->next->next = cur->next;
+				cur->units = units;
+			}
+
+			slobfree = prev;
+			spin_unlock_irqrestore(&slob_lock, flags);
+			return cur;
+		}
+		if (cur == slobfree) {	/* walked the whole list without a fit */
+			spin_unlock_irqrestore(&slob_lock, flags);
+
+			if (size == PAGE_SIZE) /* trying to shrink arena? */
+				return 0;
+
+			cur = (slob_t *)__get_free_page(gfp);
+			if (!cur)
+				return 0;
+
+			slob_free(cur, PAGE_SIZE);	/* put the new page on the free list */
+			spin_lock_irqsave(&slob_lock, flags);
+			cur = slobfree;	/* continue the scan from the list head */
+		}
+	}
+}
+/* slob_free - put a block back on the address-ordered free list, merging neighbours */
+static void slob_free(void *block, int size)
+{
+	slob_t *cur, *b = (slob_t *)block;
+	unsigned long flags;
+
+	if (!block)
+		return;
+
+	if (size)	/* size 0: keep the unit count already stored in the header */
+		b->units = SLOB_UNITS(size);
+
+	/* Find reinsertion point */
+	spin_lock_irqsave(&slob_lock, flags);
+	for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
+		if (cur >= cur->next && (b > cur || b < cur->next))
+			break;	/* at the wrap point and b lies beyond either end */
+
+	if (b + b->units == cur->next) {	/* b ends where the next block starts: merge */
+		b->units += cur->next->units;
+		b->next = cur->next->next;
+	} else
+		b->next = cur->next;
+
+	if (cur + cur->units == b) {	/* cur ends where b starts: merge */
+		cur->units += b->units;
+		cur->next = b->next;
+	} else
+		cur->next = b;
+
+	slobfree = cur;
+
+	spin_unlock_irqrestore(&slob_lock, flags);
+}
+/* find_order - smallest page order with (PAGE_SIZE << order) >= @size bytes */
+static int FASTCALL(find_order(int size));
+static int fastcall find_order(int size)
+{
+	int order = 0;
+	for (size = (size - 1) >> PAGE_SHIFT; size > 0; size >>= 1)
+		order++;	/* one doubling per bit left in the rounded-up page count */
+	return order;
+}
+/* kmalloc - small requests come from the slob heap, large ones from whole pages */
+void *kmalloc(size_t size, gfp_t gfp)
+{
+	slob_t *m;
+	bigblock_t *bb;
+	unsigned long flags;
+
+	if (size < PAGE_SIZE - SLOB_UNIT) {
+		m = slob_alloc(size + SLOB_UNIT, gfp, 0);	/* room for the header */
+		return m ? (void *)(m + 1) : 0;	/* caller's memory starts after the header */
+	}
+
+	bb = slob_alloc(sizeof(bigblock_t), gfp, 0);	/* record for the page allocation */
+	if (!bb)
+		return 0;
+
+	bb->order = find_order(size);
+	bb->pages = (void *)__get_free_pages(gfp, bb->order);
+
+	if (bb->pages) {
+		spin_lock_irqsave(&block_lock, flags);
+		bb->next = bigblocks;	/* push onto the head of the bigblocks list */
+		bigblocks = bb;
+		spin_unlock_irqrestore(&block_lock, flags);
+		return bb->pages;
+	}
+
+	slob_free(bb, sizeof(bigblock_t));	/* page allocation failed: drop the record */
+	return 0;
+}
+
+EXPORT_SYMBOL(kmalloc);
+/* kfree - release memory obtained from kmalloc(); a NULL @block is ignored */
+void kfree(const void *block)
+{
+	bigblock_t *bb, **last = &bigblocks;
+	unsigned long flags;
+
+	if (!block)
+		return;
+
+	if (!((unsigned long)block & (PAGE_SIZE-1))) {
+		/* might be on the big block list */
+		spin_lock_irqsave(&block_lock, flags);
+		for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
+			if (bb->pages == block) {
+				*last = bb->next;	/* unlink the record from the list */
+				spin_unlock_irqrestore(&block_lock, flags);
+				free_pages((unsigned long)block, bb->order);
+				slob_free(bb, sizeof(bigblock_t));
+				return;
+			}
+		}
+		spin_unlock_irqrestore(&block_lock, flags);
+	}
+
+	slob_free((slob_t *)block - 1, 0);	/* step back to the header; it holds the size */
+	return;
+}
+
+EXPORT_SYMBOL(kfree);
+/* ksize - usable size in bytes of a kmalloc()ed block; 0 for a NULL @block */
+unsigned int ksize(const void *block)
+{
+	bigblock_t *bb;
+	unsigned long flags;
+
+	if (!block)
+		return 0;
+
+	if (!((unsigned long)block & (PAGE_SIZE-1))) {	/* page aligned: may be a big block */
+		spin_lock_irqsave(&block_lock, flags);
+		for (bb = bigblocks; bb; bb = bb->next)
+			if (bb->pages == block) {
+				spin_unlock_irqrestore(&block_lock, flags);	/* the lock taken above */
+				return PAGE_SIZE << bb->order;
+			}
+		spin_unlock_irqrestore(&block_lock, flags);
+	}
+
+	return ((slob_t *)block - 1)->units * SLOB_UNIT;	/* size kept in the header */
+}
+
+struct kmem_cache {	/* descriptor filled in by kmem_cache_create() */
+	unsigned int size, align;	/* object size and alignment handed to slob_alloc() */
+	const char *name;	/* caller's name pointer, stored as-is */
+	void (*ctor)(void *, struct kmem_cache *, unsigned long);	/* run by kmem_cache_alloc() */
+	void (*dtor)(void *, struct kmem_cache *, unsigned long);	/* run by kmem_cache_free() */
+};
+/* kmem_cache_create - allocate and fill a cache descriptor; returns 0 on failure */
+struct kmem_cache *kmem_cache_create(const char *name, size_t size,
+	size_t align, unsigned long flags,
+	void (*ctor)(void*, struct kmem_cache *, unsigned long),
+	void (*dtor)(void*, struct kmem_cache *, unsigned long))
+{
+	struct kmem_cache *c;
+	/* @flags carries SLAB_* creation bits, so it must not be used as a gfp mask */
+	c = slob_alloc(sizeof(struct kmem_cache), GFP_KERNEL, 0);
+
+	if (c) {
+		c->name = name;
+		c->size = size;
+		c->ctor = ctor;
+		c->dtor = dtor;
+		/* ignore alignment unless it's forced */
+		c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
+		if (c->align < align)	/* an explicit @align larger than that wins */
+			c->align = align;
+	}
+
+	return c;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+/* kmem_cache_destroy - give the cache descriptor back to the heap; always returns 0 */
+int kmem_cache_destroy(struct kmem_cache *c)
+{
+	slob_free(c, sizeof(struct kmem_cache));
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+/* kmem_cache_alloc - allocate one object of cache @c and construct it; 0 on failure */
+void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+{
+	void *b;
+
+	if (c->size < PAGE_SIZE)
+		b = slob_alloc(c->size, flags, c->align);
+	else	/* page-sized and larger objects bypass the slob heap */
+		b = (void *)__get_free_pages(flags, find_order(c->size));
+
+	if (b && c->ctor)	/* never run the constructor on a failed allocation */
+		c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
+
+	return b;
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+/* kmem_cache_free - run the destructor, if any, then release object @b of cache @c */
+void kmem_cache_free(struct kmem_cache *c, void *b)
+{
+	if (c->dtor)
+		c->dtor(b, c, 0);
+
+	if (c->size < PAGE_SIZE)	/* same size test as kmem_cache_alloc() */
+		slob_free(b, c->size);
+	else
+		free_pages((unsigned long)b, find_order(c->size));
+}
+EXPORT_SYMBOL(kmem_cache_free);
+/* kmem_cache_size - object size recorded when the cache was created */
+unsigned int kmem_cache_size(struct kmem_cache *c)
+{
+	return c->size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+/* kmem_cache_name - name pointer recorded when the cache was created */
+const char *kmem_cache_name(struct kmem_cache *c)
+{
+	return c->name;
+}
+EXPORT_SYMBOL(kmem_cache_name);
+/* Timer whose handler is kmem_cache_init(); each run re-arms it via mod_timer() */
+static struct timer_list slob_timer = TIMER_INITIALIZER(
+	(void (*)(unsigned long))kmem_cache_init, 0, 0);
+
+void kmem_cache_init(void)
+{
+	void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);	/* page-sized block, if one is free */
+
+	if (p)
+		free_page((unsigned long)p);	/* hand it back to the page allocator */
+
+	mod_timer(&slob_timer, jiffies + HZ);	/* fire again HZ jiffies from now */
+}
+
+atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
+EXPORT_SYMBOL(slab_reclaim_pages);
+
+#ifdef CONFIG_SMP
+/* __alloc_percpu - zeroed @size-byte area per possible CPU; @align is not used here */
+void *__alloc_percpu(size_t size, size_t align)
+{
+	int i;
+	struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
+
+	if (!pdata)
+		return NULL;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
+		if (!pdata->ptrs[i])
+			goto unwind_oom;
+		memset(pdata->ptrs[i], 0, size);
+	}
+
+	/* Catch derefs w/o wrappers */
+	return (void *) (~(unsigned long) pdata);	/* free_percpu() inverts it back */
+
+unwind_oom:
+	while (--i >= 0) {	/* free only the areas allocated before the failure */
+		if (!cpu_possible(i))
+			continue;
+		kfree(pdata->ptrs[i]);
+	}
+	kfree(pdata);
+	return NULL;
+}
+EXPORT_SYMBOL(__alloc_percpu);
+/* free_percpu - release every per-CPU area and the table behind an __alloc_percpu() cookie */
+void
+free_percpu(const void *objp)
+{
+	int i;
+	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);	/* undo the inversion */
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		kfree(p->ptrs[i]);
+	}
+	kfree(p);
+}
+EXPORT_SYMBOL(free_percpu);
+
+#endif
diff --git a/mm/sparse.c b/mm/sparse.c
index 72079b538e2..0a51f36ba3a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -18,10 +18,10 @@
  */
 #ifdef CONFIG_SPARSEMEM_EXTREME
 struct mem_section *mem_section[NR_SECTION_ROOTS]
-	____cacheline_maxaligned_in_smp;
+	____cacheline_internodealigned_in_smp;
 #else
 struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
-	____cacheline_maxaligned_in_smp;
+	____cacheline_internodealigned_in_smp;
 #endif
 EXPORT_SYMBOL(mem_section);
 
diff --git a/mm/swap.c b/mm/swap.c
index 73d351439ef..bc2442a7b0e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -156,18 +156,50 @@ void fastcall lru_cache_add_active(struct page *page)
 	put_cpu_var(lru_add_active_pvecs);
 }
 
-void lru_add_drain(void)
+static void __lru_add_drain(int cpu)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
 
+	/* CPU is dead, so no locking needed. */
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
-	pvec = &__get_cpu_var(lru_add_active_pvecs);
+	pvec = &per_cpu(lru_add_active_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add_active(pvec);
-	put_cpu_var(lru_add_pvecs);
 }
 
+void lru_add_drain(void)
+{
+	__lru_add_drain(get_cpu());
+	put_cpu();
+}
+
+#ifdef CONFIG_NUMA
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+}
+
+#else
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+	lru_add_drain();
+	return 0;
+}
+#endif
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
@@ -378,6 +410,8 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 	return pagevec_count(pvec);
 }
 
+EXPORT_SYMBOL(pagevec_lookup);
+
 unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t *index, int tag, unsigned nr_pages)
 {
@@ -412,17 +446,6 @@ void vm_acct_memory(long pages)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void lru_drain_cache(unsigned int cpu)
-{
-	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
-
-	/* CPU is dead, so no locking needed. */
-	if (pagevec_count(pvec))
-		__pagevec_lru_add(pvec);
-	pvec = &per_cpu(lru_add_active_pvecs, cpu);
-	if (pagevec_count(pvec))
-		__pagevec_lru_add_active(pvec);
-}
 
 /* Drop the CPU's cached committed space back into the central pool. */
 static int cpu_swap_callback(struct notifier_block *nfb,
@@ -435,7 +458,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
 	if (action == CPU_DEAD) {
 		atomic_add(*committed, &vm_committed_space);
 		*committed = 0;
-		lru_drain_cache((long)hcpu);
+		__lru_add_drain((long)hcpu);
 	}
 	return NOTIFY_OK;
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57b1de..7b09ac503fe 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
+#include <linux/pagevec.h>
 
 #include <asm/pgtable.h>
 
@@ -140,7 +141,7 @@ void __delete_from_swap_cache(struct page *page)
  * Allocate swap space for the page and add the page to the
  * swap cache.  Caller needs to hold the page lock. 
  */
-int add_to_swap(struct page * page)
+int add_to_swap(struct page * page, gfp_t gfp_mask)
 {
 	swp_entry_t entry;
 	int err;
@@ -165,7 +166,7 @@ int add_to_swap(struct page * page)
 		 * Add it to the swap cache and mark it dirty
 		 */
 		err = __add_to_swap_cache(page, entry,
-				GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN);
+				gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
 
 		switch (err) {
 		case 0:				/* Success */
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
  */
 void free_pages_and_swap_cache(struct page **pages, int nr)
 {
-	int chunk = 16;
 	struct page **pagep = pages;
 
 	lru_add_drain();
 	while (nr) {
-		int todo = min(chunk, nr);
+		int todo = min(nr, PAGEVEC_SIZE);
 		int i;
 
 		for (i = 0; i < todo; i++)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace301..f1e69c30d20 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,8 @@
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/capability.h>
 #include <linux/syscalls.h>
 
 #include <asm/pgtable.h>
@@ -45,12 +47,12 @@ struct swap_list_t swap_list = {-1, -1};
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
-static DECLARE_MUTEX(swapon_sem);
+static DEFINE_MUTEX(swapon_mutex);
 
 /*
  * We need this because the bdev->unplug_fn can sleep and we cannot
  * hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a semaphore.
+ * cannot be turned into a mutex.
  */
 static DECLARE_RWSEM(swap_unplug_sem);
 
@@ -211,6 +213,26 @@ noswap:
 	return (swp_entry_t) {0};
 }
 
+swp_entry_t get_swap_page_of_type(int type)	/* allocate a swap slot from swap_info[type] only */
+{
+	struct swap_info_struct *si;
+	pgoff_t offset;
+
+	spin_lock(&swap_lock);
+	si = swap_info + type;
+	if (si->flags & SWP_WRITEOK) {
+		nr_swap_pages--;	/* account for the slot before scanning */
+		offset = scan_swap_map(si);
+		if (offset) {
+			spin_unlock(&swap_lock);
+			return swp_entry(type, offset);
+		}
+		nr_swap_pages++;	/* scan found nothing: undo the accounting */
+	}
+	spin_unlock(&swap_lock);
+	return (swp_entry_t) {0};	/* zero entry when no slot was obtained */
+}
+
 static struct swap_info_struct * swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct * p;
@@ -1140,7 +1162,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	up_write(&swap_unplug_sem);
 
 	destroy_swap_extents(p);
-	down(&swapon_sem);
+	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	drain_mmlist();
 
@@ -1159,7 +1181,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	p->swap_map = NULL;
 	p->flags = 0;
 	spin_unlock(&swap_lock);
-	up(&swapon_sem);
+	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
@@ -1167,9 +1189,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 		set_blocksize(bdev, p->old_block_size);
 		bd_release(bdev);
 	} else {
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		inode->i_flags &= ~S_SWAPFILE;
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	filp_close(swap_file, NULL);
 	err = 0;
@@ -1188,7 +1210,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 	int i;
 	loff_t l = *pos;
 
-	down(&swapon_sem);
+	mutex_lock(&swapon_mutex);
 
 	for (i = 0; i < nr_swapfiles; i++, ptr++) {
 		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
@@ -1217,7 +1239,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
 
 static void swap_stop(struct seq_file *swap, void *v)
 {
-	up(&swapon_sem);
+	mutex_unlock(&swapon_mutex);
 }
 
 static int swap_show(struct seq_file *swap, void *v)
@@ -1386,7 +1408,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		p->bdev = bdev;
 	} else if (S_ISREG(inode->i_mode)) {
 		p->bdev = inode->i_sb->s_bdev;
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		did_down = 1;
 		if (IS_SWAPFILE(inode)) {
 			error = -EBUSY;
@@ -1422,7 +1444,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
 		swap_header_version = 2;
 	else {
-		printk("Unable to find swap-space signature\n");
+		printk(KERN_ERR "Unable to find swap-space signature\n");
 		error = -EINVAL;
 		goto bad_swap;
 	}
@@ -1473,7 +1495,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 			goto bad_swap;
 		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
 			goto bad_swap;
-		
+
 		/* OK, set up the swap map and apply the bad block list */
 		if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
 			error = -ENOMEM;
@@ -1482,17 +1504,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 
 		error = 0;
 		memset(p->swap_map, 0, maxpages * sizeof(short));
-		for (i=0; i<swap_header->info.nr_badpages; i++) {
-			int page = swap_header->info.badpages[i];
-			if (page <= 0 || page >= swap_header->info.last_page)
+		for (i = 0; i < swap_header->info.nr_badpages; i++) {
+			int page_nr = swap_header->info.badpages[i];
+			if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
 				error = -EINVAL;
 			else
-				p->swap_map[page] = SWAP_MAP_BAD;
+				p->swap_map[page_nr] = SWAP_MAP_BAD;
 		}
 		nr_good_pages = swap_header->info.last_page -
 				swap_header->info.nr_badpages -
 				1 /* header page */;
-		if (error) 
+		if (error)
 			goto bad_swap;
 	}
 
@@ -1519,7 +1541,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		goto bad_swap;
 	}
 
-	down(&swapon_sem);
+	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	p->flags = SWP_ACTIVE;
 	nr_swap_pages += nr_good_pages;
@@ -1545,7 +1567,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		swap_info[prev].next = p - swap_info;
 	}
 	spin_unlock(&swap_lock);
-	up(&swapon_sem);
+	mutex_unlock(&swapon_mutex);
 	error = 0;
 	goto out;
 bad_swap:
@@ -1576,7 +1598,7 @@ out:
 	if (did_down) {
 		if (!error)
 			inode->i_flags |= S_SWAPFILE;
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	return error;
 }
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44ed..f9d6a9cc91c 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 		goto close_file;
 
 	d_instantiate(dentry, inode);
-	inode->i_size = size;
 	inode->i_nlink = 0;	/* It is unlinked */
+
 	file->f_vfsmnt = mntget(shm_mnt);
 	file->f_dentry = dentry;
 	file->f_mapping = inode->i_mapping;
 	file->f_op = &ramfs_file_operations;
 	file->f_mode = FMODE_WRITE | FMODE_READ;
+
+	/* notify everyone as to the change of file size */
+	error = do_truncate(dentry, size, 0, file);
+	if (error < 0)
+		goto close_file;
+
 	return file;
 
 close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 {
 	return 0;
 }
+
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);
+#ifndef CONFIG_MMU
+	return ramfs_nommu_mmap(file, vma);
+#else
+	return 0;
+#endif
+}
+
+#ifndef CONFIG_MMU
+unsigned long shmem_get_unmapped_area(struct file *file,
+				      unsigned long addr,
+				      unsigned long len,
+				      unsigned long pgoff,
+				      unsigned long flags)
+{
+	return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
diff --git a/mm/truncate.c b/mm/truncate.c
index 9173ab50060..6cb3fff25f6 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 }
 
 /**
- * truncate_inode_pages - truncate *all* the pages from an offset
+ * truncate_inode_pages - truncate range of pages specified by start and
+ * end byte offsets
  * @mapping: mapping to truncate
  * @lstart: offset from which to truncate
+ * @lend: offset to which to truncate
  *
- * Truncate the page cache at a set offset, removing the pages that are beyond
- * that offset (and zeroing out partial pages).
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial page
+ * (if lstart is not page aligned)).
  *
  * Truncate takes two passes - the first pass is nonblocking.  It will not
  * block on page locks and it will not block on writeback.  The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
  * We pass down the cache-hot hint to the page freeing code.  Even if the
  * mapping is large, it is probably the case that the final pages are the most
  * recently touched, and freeing happens in ascending file offset order.
- *
- * Called under (and serialised by) inode->i_sem.
  */
-void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+void truncate_inode_pages_range(struct address_space *mapping,
+				loff_t lstart, loff_t lend)
 {
 	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	pgoff_t end;
 	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 	if (mapping->nrpages == 0)
 		return;
 
+	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+	end = (lend >> PAGE_CACHE_SHIFT);
+
 	pagevec_init(&pvec, 0);
 	next = start;
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (next <= end &&
+	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
 
+			if (page_index > end) {
+				next = page_index;
+				break;
+			}
+
 			if (page_index > next)
 				next = page_index;
 			next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 			next = start;
 			continue;
 		}
+		if (pvec.pages[0]->index > end) {
+			pagevec_release(&pvec);
+			break;
+		}
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
+			if (page->index > end)
+				break;
 			lock_page(page);
 			wait_on_page_writeback(page);
 			if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 		pagevec_release(&pvec);
 	}
 }
+EXPORT_SYMBOL(truncate_inode_pages_range);
 
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_mutex.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+}
 EXPORT_SYMBOL(truncate_inode_pages);
 
 /**
@@ -219,7 +249,6 @@ unlock:
 				break;
 		}
 		pagevec_release(&pvec);
-		cond_resched();
 	}
 	return ret;
 }
diff --git a/mm/util.c b/mm/util.c
new file mode 100644
index 00000000000..5f4bb59da63
--- /dev/null
+++ b/mm/util.c
@@ -0,0 +1,39 @@
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+/**
+ * kzalloc - allocate memory. The memory is set to zero.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ */
+void *kzalloc(size_t size, gfp_t flags)
+{
+	void *ret = kmalloc(size, flags);
+	if (ret)	/* only clear memory that was actually obtained */
+		memset(ret, 0, size);
+	return ret;	/* whatever kmalloc() returned, zero-filled if non-NULL */
+}
+EXPORT_SYMBOL(kzalloc);
+
+/**
+ * kstrdup - allocate space for and copy an existing string
+ *
+ * @s: the string to duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ */
+char *kstrdup(const char *s, gfp_t gfp)
+{
+	size_t len;
+	char *buf;
+
+	if (!s)	/* NULL in, NULL out */
+		return NULL;
+
+	len = strlen(s) + 1;	/* include the terminating NUL */
+	buf = kmalloc(len, gfp);
+	if (buf)
+		memcpy(buf, s, len);
+	return buf;	/* NULL if kmalloc() failed */
+}
+EXPORT_SYMBOL(kstrdup);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0cd81c32de..2e34b61a70c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
 
 	unsigned long nr_mapped;	/* From page_state */
 
-	/* How many pages shrink_cache() should reclaim */
-	int nr_to_reclaim;
-
 	/* Ask shrink_caches, or shrink_zone to scan at this priority */
 	unsigned int priority;
 
@@ -186,8 +183,7 @@ EXPORT_SYMBOL(remove_shrinker);
  *
  * Returns the number of slab objects which we shrunk.
  */
-static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages)
+int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
 {
 	struct shrinker *shrinker;
 	int ret = 0;
@@ -275,9 +271,7 @@ static inline int is_page_cache_freeable(struct page *page)
 
 static int may_write_to_queue(struct backing_dev_info *bdi)
 {
-	if (current_is_kswapd())
-		return 1;
-	if (current_is_pdflush())	/* This is unlikely, but why not... */
+	if (current->flags & PF_SWAPWRITE)
 		return 1;
 	if (!bdi_write_congested(bdi))
 		return 1;
@@ -367,7 +361,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 		res = mapping->a_ops->writepage(page, &wbc);
 		if (res < 0)
 			handle_write_error(mapping, page, res);
-		if (res == WRITEPAGE_ACTIVATE) {
+		if (res == AOP_WRITEPAGE_ACTIVATE) {
 			ClearPageReclaim(page);
 			return PAGE_ACTIVATE;
 		}
@@ -382,6 +376,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 	return PAGE_CLEAN;
 }
 
+static int remove_mapping(struct address_space *mapping, struct page *page)
+{
+	if (!mapping)
+		return 0;		/* truncate got there first */
+
+	write_lock_irq(&mapping->tree_lock);
+
+	/*
+	 * The non-racy check for busy page.  It is critical to check
+	 * PageDirty _after_ making sure that the page is freeable and
+	 * not in use by anybody. 	(pagecache + us == 2)
+	 */
+	if (unlikely(page_count(page) != 2))
+		goto cannot_free;
+	smp_rmb();
+	if (unlikely(PageDirty(page)))
+		goto cannot_free;
+
+	if (PageSwapCache(page)) {
+		swp_entry_t swap = { .val = page_private(page) };
+		__delete_from_swap_cache(page);
+		write_unlock_irq(&mapping->tree_lock);
+		swap_free(swap);
+		__put_page(page);	/* The pagecache ref */
+		return 1;
+	}
+
+	__remove_from_page_cache(page);
+	write_unlock_irq(&mapping->tree_lock);
+	__put_page(page);
+	return 1;
+
+cannot_free:
+	write_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
@@ -432,7 +463,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!sc->may_swap)
 				goto keep_locked;
-			if (!add_to_swap(page))
+			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
 		}
 #endif /* CONFIG_SWAP */
@@ -515,36 +546,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 				goto free_it;
 		}
 
-		if (!mapping)
-			goto keep_locked;	/* truncate got there first */
-
-		write_lock_irq(&mapping->tree_lock);
-
-		/*
-		 * The non-racy check for busy page.  It is critical to check
-		 * PageDirty _after_ making sure that the page is freeable and
-		 * not in use by anybody. 	(pagecache + us == 2)
-		 */
-		if (unlikely(page_count(page) != 2))
-			goto cannot_free;
-		smp_rmb();
-		if (unlikely(PageDirty(page)))
-			goto cannot_free;
-
-#ifdef CONFIG_SWAP
-		if (PageSwapCache(page)) {
-			swp_entry_t swap = { .val = page_private(page) };
-			__delete_from_swap_cache(page);
-			write_unlock_irq(&mapping->tree_lock);
-			swap_free(swap);
-			__put_page(page);	/* The pagecache ref */
-			goto free_it;
-		}
-#endif /* CONFIG_SWAP */
-
-		__remove_from_page_cache(page);
-		write_unlock_irq(&mapping->tree_lock);
-		__put_page(page);
+		if (!remove_mapping(mapping, page))
+			goto keep_locked;
 
 free_it:
 		unlock_page(page);
@@ -553,10 +556,6 @@ free_it:
 			__pagevec_release_nonlru(&freed_pvec);
 		continue;
 
-cannot_free:
-		write_unlock_irq(&mapping->tree_lock);
-		goto keep_locked;
-
 activate_locked:
 		SetPageActive(page);
 		pgactivate++;
@@ -574,6 +573,228 @@ keep:
 	return reclaimed;
 }
 
+#ifdef CONFIG_MIGRATION
+static inline void move_to_lru(struct page *page)
+{
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		/*
+		 * lru_cache_add_active checks that
+		 * the PG_active bit is off.
+		 */
+		ClearPageActive(page);
+		lru_cache_add_active(page);
+	} else {
+		lru_cache_add(page);
+	}
+	put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU.
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+	int count = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		move_to_lru(page);
+		count++;
+	}
+	return count;
+}
+
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ */
+static int swap_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (page_mapped(page) && mapping)
+		if (try_to_unmap(page) != SWAP_SUCCESS)
+			goto unlock_retry;
+
+	if (PageDirty(page)) {
+		/* Page is dirty, try to write it out here */
+		switch(pageout(page, mapping)) {
+		case PAGE_KEEP:
+		case PAGE_ACTIVATE:
+			goto unlock_retry;
+
+		case PAGE_SUCCESS:
+			goto retry;
+
+		case PAGE_CLEAN:
+			; /* try to free the page below */
+		}
+	}
+
+	if (PagePrivate(page)) {
+		if (!try_to_release_page(page, GFP_KERNEL) ||
+		    (!mapping && page_count(page) == 1))
+			goto unlock_retry;
+	}
+
+	if (remove_mapping(mapping, page)) {
+		/* Success */
+		unlock_page(page);
+		return 0;
+	}
+
+unlock_retry:
+	unlock_page(page);
+
+retry:
+	return -EAGAIN;
+}
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because "to" has become empty
+ * or no retryable pages exist anymore.
+ *
+ * SIMPLIFIED VERSION: This implementation of migrate_pages
+ * is only swapping out pages and never touches the second
+ * list. The direct migration patchset
+ * extends this function to avoid the use of swap.
+ *
+ * Return: Number of pages not migrated when "to" ran empty.
+ */
+int migrate_pages(struct list_head *from, struct list_head *to,
+		  struct list_head *moved, struct list_head *failed)
+{
+	int retry;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+	int rc;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+redo:
+	retry = 0;
+
+	list_for_each_entry_safe(page, page2, from, lru) {
+		cond_resched();
+
+		rc = 0;
+		if (page_count(page) == 1)
+			/* page was freed from under us. So we are done. */
+			goto next;
+
+		/*
+		 * Skip locked pages during the first two passes to give the
+		 * functions holding the lock time to release the page. Later we
+		 * use lock_page() to have a higher chance of acquiring the
+		 * lock.
+		 */
+		rc = -EAGAIN;
+		if (pass > 2)
+			lock_page(page);
+		else
+			if (TestSetPageLocked(page))
+				goto next;
+
+		/*
+		 * Only wait on writeback if we have already done a pass where
+		 * we may have triggered writeouts for lots of pages.
+		 */
+		if (pass > 0) {
+			wait_on_page_writeback(page);
+		} else {
+			if (PageWriteback(page))
+				goto unlock_page;
+		}
+
+		/*
+		 * Anonymous pages must have swap cache references otherwise
+		 * the information contained in the page maps cannot be
+		 * preserved.
+		 */
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!add_to_swap(page, GFP_KERNEL)) {
+				rc = -ENOMEM;
+				goto unlock_page;
+			}
+		}
+
+		/*
+		 * Page is properly locked and writeback is complete.
+		 * Try to migrate the page.
+		 */
+		rc = swap_page(page);
+		goto next;
+
+unlock_page:
+		unlock_page(page);
+
+next:
+		if (rc == -EAGAIN) {
+			retry++;
+		} else if (rc) {
+			/* Permanent failure */
+			list_move(&page->lru, failed);
+			nr_failed++;
+		} else {
+			/* Success */
+			list_move(&page->lru, moved);
+		}
+	}
+	if (retry && pass++ < 10)
+		goto redo;
+
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+
+	return nr_failed + retry;
+}
+
+/*
+ * Isolate one page from the LRU lists, taking an extra
+ * reference on it. The page is not put on any list here.
+ *
+ * Result:
+ *  0 = page not on LRU list
+ *  1 = page removed from LRU list with its refcount elevated.
+ */
+int isolate_lru_page(struct page *page)
+{
+	int ret = 0;
+
+	if (PageLRU(page)) {	/* first check, made without the lock */
+		struct zone *zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (TestClearPageLRU(page)) {	/* re-check under lru_lock */
+			ret = 1;
+			get_page(page);	/* the extra reference */
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	return ret;
+}
+#endif
+
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
@@ -653,17 +874,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
 			goto done;
 
 		max_scan -= nr_scan;
-		if (current_is_kswapd())
-			mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
-		else
-			mod_page_state_zone(zone, pgscan_direct, nr_scan);
 		nr_freed = shrink_list(&page_list, sc);
-		if (current_is_kswapd())
-			mod_page_state(kswapd_steal, nr_freed);
-		mod_page_state_zone(zone, pgsteal, nr_freed);
-		sc->nr_to_reclaim -= nr_freed;
 
-		spin_lock_irq(&zone->lru_lock);
+		local_irq_disable();
+		if (current_is_kswapd()) {
+			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+			__mod_page_state(kswapd_steal, nr_freed);
+		} else
+			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
+		__mod_page_state_zone(zone, pgsteal, nr_freed);
+
+		spin_lock(&zone->lru_lock);
 		/*
 		 * Put back any unfreeable pages.
 		 */
@@ -825,11 +1046,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 		}
 	}
 	zone->nr_active += pgmoved;
-	spin_unlock_irq(&zone->lru_lock);
-	pagevec_release(&pvec);
+	spin_unlock(&zone->lru_lock);
+
+	__mod_page_state_zone(zone, pgrefill, pgscanned);
+	__mod_page_state(pgdeactivate, pgdeactivate);
+	local_irq_enable();
 
-	mod_page_state_zone(zone, pgrefill, pgscanned);
-	mod_page_state(pgdeactivate, pgdeactivate);
+	pagevec_release(&pvec);
 }
 
 /*
@@ -861,8 +1084,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 	else
 		nr_inactive = 0;
 
-	sc->nr_to_reclaim = sc->swap_cluster_max;
-
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
 			sc->nr_to_scan = min(nr_active,
@@ -876,8 +1097,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 					(unsigned long)sc->swap_cluster_max);
 			nr_inactive -= sc->nr_to_scan;
 			shrink_cache(zone, sc);
-			if (sc->nr_to_reclaim <= 0)
-				break;
 		}
 	}
 
@@ -910,7 +1129,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (zone->present_pages == 0)
+		if (!populated_zone(zone))
 			continue;
 
 		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
@@ -1084,7 +1303,7 @@ loop_again:
 			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
 				struct zone *zone = pgdat->node_zones + i;
 
-				if (zone->present_pages == 0)
+				if (!populated_zone(zone))
 					continue;
 
 				if (zone->all_unreclaimable &&
@@ -1121,7 +1340,7 @@ scan:
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
 
-			if (zone->present_pages == 0)
+			if (!populated_zone(zone))
 				continue;
 
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1238,7 +1457,7 @@ static int kswapd(void *p)
 	 * us from recursively trying to free more memory as we're
 	 * trying to free the first piece of memory in the first place).
 	 */
-	tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
+	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 
 	order = 0;
 	for ( ; ; ) {
@@ -1273,7 +1492,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 {
 	pg_data_t *pgdat;
 
-	if (zone->present_pages == 0)
+	if (!populated_zone(zone))
 		return;
 
 	pgdat = zone->zone_pgdat;
@@ -1354,30 +1573,51 @@ static int __init kswapd_init(void)
 
 module_init(kswapd_init)
 
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
 
 /*
+ * Minimum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
  * Try to free up some pages from this zone through reclaim.
  */
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	struct scan_control sc;
 	int nr_pages = 1 << order;
-	int total_reclaimed = 0;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask	= gfp_mask,
+		.may_writepage	= 0,
+		.may_swap	= 0,
+		.nr_mapped	= read_page_state(nr_mapped),
+		.nr_scanned	= 0,
+		.nr_reclaimed	= 0,
+		.priority	= 0
+	};
 
-	/* The reclaim may sleep, so don't do it if sleep isn't allowed */
-	if (!(gfp_mask & __GFP_WAIT))
-		return 0;
-	if (zone->all_unreclaimable)
-		return 0;
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->zone_pgdat->node_id != numa_node_id() ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+			return 0;
 
-	sc.gfp_mask = gfp_mask;
-	sc.may_writepage = 0;
-	sc.may_swap = 0;
-	sc.nr_mapped = read_page_state(nr_mapped);
-	sc.nr_scanned = 0;
-	sc.nr_reclaimed = 0;
-	/* scan at the highest priority */
-	sc.priority = 0;
 	disable_swap_token();
 
 	if (nr_pages > SWAP_CLUSTER_MAX)
@@ -1385,44 +1625,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	else
 		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 
-	/* Don't reclaim the zone if there are other reclaimers active */
-	if (atomic_read(&zone->reclaim_in_progress) > 0)
-		goto out;
-
+	cond_resched();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
 	shrink_zone(zone, &sc);
-	total_reclaimed = sc.nr_reclaimed;
-
- out:
-	return total_reclaimed;
-}
-
-asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
-				     unsigned int state)
-{
-	struct zone *z;
-	int i;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+	p->reclaim_state = NULL;
+	current->flags &= ~PF_MEMALLOC;
 
-	if (node >= MAX_NUMNODES || !node_online(node))
-		return -EINVAL;
+	if (sc.nr_reclaimed == 0)
+		zone->last_unsuccessful_zone_reclaim = jiffies;
 
-	/* This will break if we ever add more zones */
-	if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
-		return -EINVAL;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		if (!(zone & 1<<i))
-			continue;
-
-		z = &NODE_DATA(node)->node_zones[i];
-
-		if (state)
-			z->reclaim_pages = 1;
-		else
-			z->reclaim_pages = 0;
-	}
-
-	return 0;
+	return sc.nr_reclaimed > nr_pages;
 }
+#endif
+
author	Anton Altaparmakov <aia21@cantab.net>	2006-01-19 16:39:33 +0000
committer	Anton Altaparmakov <aia21@cantab.net>	2006-01-19 16:39:33 +0000
commit	944d79559d154c12becde0dab327016cf438f46c (patch)
tree	50c101806f4d3b6585222dda060559eb4f3e005a /mm
parent	d087e4bdd24ebe3ae3d0b265b6573ec901af4b4b (diff)
parent	0f36b018b2e314d45af86449f1a97facb1fbe300 (diff)