21 files changed, 328 insertions, 100 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d213feded10..50353e0dac1 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -29,9 +29,7 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-EXPORT_SYMBOL(max_pfn);		/* This is exported so
-				 * dma_get_required_mask(), which uses
-				 * it, can be an inline function */
+EXPORT_UNUSED_SYMBOL(max_pfn);  /*  June 2006  */
 
 static LIST_HEAD(bdata_list);
 #ifdef CONFIG_CRASH_DUMP
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0a03357a1f8..168c78a121b 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -23,18 +23,6 @@
 /*
  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
  * deactivate the pages and clear PG_Referenced.
- *
- * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
- * offsets `offset' and `offset+len' inclusive.  Any pages which are currently
- * under writeout are skipped, whether or not they are dirty.
- *
- * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
- * offsets `offset' and `offset+len'.
- *
- * By combining these two operations the application may do several things:
- *
- * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
- *
  */
 asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 {
@@ -85,7 +73,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		file->f_ra.ra_pages = bdi->ra_pages * 2;
 		break;
 	case POSIX_FADV_WILLNEED:
-	case POSIX_FADV_NOREUSE:
 		if (!mapping->a_ops->readpage) {
 			ret = -EINVAL;
 			break;
@@ -106,6 +93,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		if (ret > 0)
 			ret = 0;
 		break;
+	case POSIX_FADV_NOREUSE:
+		break;
 	case POSIX_FADV_DONTNEED:
 		if (!bdi_write_congested(mapping->backing_dev_info))
 			filemap_flush(mapping);
diff --git a/mm/filemap.c b/mm/filemap.c
index d087fc3d328..b9a60c43b61 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -849,8 +849,6 @@ static void shrink_readahead_size_eio(struct file *filp,
 		return;
 
 	ra->ra_pages /= 4;
-	printk(KERN_WARNING "Reducing readahead size to %luK\n",
-			ra->ra_pages << (PAGE_CACHE_SHIFT - 10));
 }
 
 /**
diff --git a/mm/memory.c b/mm/memory.c
index 7e2a4b1580e..109e9866237 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -47,6 +47,7 @@
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
+#include <linux/delayacct.h>
 #include <linux/init.h>
 
 #include <asm/pgalloc.h>
@@ -503,7 +504,7 @@ again:
 		return -ENOMEM;
 	src_pte = pte_offset_map_nested(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
-	spin_lock(src_ptl);
+	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 
 	do {
 		/*
@@ -1549,9 +1550,9 @@ gotten:
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		lazy_mmu_prot_update(entry);
 		ptep_establish(vma, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
-		lazy_mmu_prot_update(entry);
 		lru_cache_add_active(new_page);
 		page_add_new_anon_rmap(new_page, vma, address);
 
@@ -1853,7 +1854,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 
 	return 0;
 }
-EXPORT_SYMBOL(vmtruncate_range);
+EXPORT_UNUSED_SYMBOL(vmtruncate_range);  /*  June 2006  */
 
 /* 
  * Primitive swap readahead code. We simply read an aligned block of
@@ -1934,6 +1935,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		migration_entry_wait(mm, pmd, address);
 		goto out;
 	}
+	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
@@ -1946,6 +1948,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
+			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 			goto unlock;
 		}
 
@@ -1955,6 +1958,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		grab_swap_token();
 	}
 
+	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 	mark_page_accessed(page);
 	lock_page(page);
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 01c9fb97c61..c37319542b7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -52,6 +52,9 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
 	int nr_pages = PAGES_PER_SECTION;
 	int ret;
 
+	if (pfn_valid(phys_start_pfn))
+		return -EEXIST;
+
 	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
 
 	if (ret < 0)
@@ -76,15 +79,22 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
 {
 	unsigned long i;
 	int err = 0;
+	int start_sec, end_sec;
+	/* while initializing mem_map, align hot-added range to section */
+	start_sec = pfn_to_section_nr(phys_start_pfn);
+	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
 
-	for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
-		err = __add_section(zone, phys_start_pfn + i);
+	for (i = start_sec; i <= end_sec; i++) {
+		err = __add_section(zone, i << PFN_SECTION_SHIFT);
 
-		/* We want to keep adding the rest of the
-		 * sections if the first ones already exist
+		/*
+		 * EEXIST is finally dealt with by ioresource collision
+		 * check. See add_memory() => register_memory_resource().
+		 * A warning will be printed if there is a collision.
 		 */
 		if (err && (err != -EEXIST))
 			break;
+		err = 0;
 	}
 
 	return err;
@@ -156,7 +166,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	res.flags = IORESOURCE_MEM; /* we just need system ram */
 	section_end = res.end;
 
-	while (find_next_system_ram(&res) >= 0) {
+	while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
 		start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
 		nr_pages = (unsigned long)
                            ((res.end + 1 - res.start) >> PAGE_SHIFT);
@@ -213,10 +223,9 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
 
 /* add this memory to iomem resource */
-static void register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(u64 start, u64 size)
 {
 	struct resource *res;
-
 	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 	BUG_ON(!res);
 
@@ -228,7 +237,18 @@ static void register_memory_resource(u64 start, u64 size)
 		printk("System RAM resource %llx - %llx cannot be added\n",
 		(unsigned long long)res->start, (unsigned long long)res->end);
 		kfree(res);
+		res = NULL;
 	}
+	return res;
+}
+
+static void release_memory_resource(struct resource *res)
+{
+	if (!res)
+		return;
+	release_resource(res);
+	kfree(res);
+	return;
 }
 
 
@@ -237,8 +257,13 @@ int add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat = NULL;
 	int new_pgdat = 0;
+	struct resource *res;
 	int ret;
 
+	res = register_memory_resource(start, size);
+	if (!res)
+		return -EEXIST;
+
 	if (!node_online(nid)) {
 		pgdat = hotadd_new_pgdat(nid, start);
 		if (!pgdat)
@@ -268,14 +293,13 @@ int add_memory(int nid, u64 start, u64 size)
 		BUG_ON(ret);
 	}
 
-	/* register this memory as resource */
-	register_memory_resource(start, size);
-
 	return ret;
 error:
 	/* rollback pgdat allocation and others */
 	if (new_pgdat)
 		rollback_node_hotadd(nid, pgdat);
+	if (res)
+		release_memory_resource(res);
 
 	return ret;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e07e27e846a..a9963ceddd6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1176,7 +1176,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 	if (vma) {
 		unsigned long off;
 
-		off = vma->vm_pgoff;
+		/*
+		 * for small pages, there is no difference between
+		 * shift and PAGE_SHIFT, so the bit-shift is safe.
+		 * for huge pages, since vm_pgoff is in units of small
+		 * pages, we need to shift off the always 0 bits to get
+		 * a useful offset.
+		 */
+		BUG_ON(shift < PAGE_SHIFT);
+		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
 		off += (addr - vma->vm_start) >> shift;
 		return offset_il_node(pol, vma, off);
 	} else
diff --git a/mm/mempool.c b/mm/mempool.c
index fe6e05289cc..ccd8cb8cd41 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -238,8 +238,13 @@ repeat_alloc:
 	init_wait(&wait);
 	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
 	smp_mb();
-	if (!pool->curr_nr)
-		io_schedule();
+	if (!pool->curr_nr) {
+		/*
+		 * FIXME: this should be io_schedule().  The timeout is there
+		 * as a workaround for some DM problems in 2.6.18.
+		 */
+		io_schedule_timeout(5*HZ);
+	}
 	finish_wait(&pool->wait, &wait);
 
 	goto repeat_alloc;
diff --git a/mm/mmap.c b/mm/mmap.c
index c1868ecdbc5..e66a0b524af 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -30,6 +30,10 @@
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
 
+#ifndef arch_mmap_check
+#define arch_mmap_check(addr, len, flags)	(0)
+#endif
+
 static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
@@ -913,6 +917,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	if (!len)
 		return -EINVAL;
 
+	error = arch_mmap_check(addr, len, flags);
+	if (error)
+		return error;
+
 	/* Careful about overflows.. */
 	len = PAGE_ALIGN(len);
 	if (!len || len > TASK_SIZE)
@@ -1859,6 +1867,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	unsigned long flags;
 	struct rb_node ** rb_link, * rb_parent;
 	pgoff_t pgoff = addr >> PAGE_SHIFT;
+	int error;
 
 	len = PAGE_ALIGN(len);
 	if (!len)
@@ -1867,6 +1876,12 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
 		return -EINVAL;
 
+	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+
+	error = arch_mmap_check(addr, len, flags);
+	if (error)
+		return error;
+
 	/*
 	 * mlock MCL_FUTURE?
 	 */
@@ -1907,8 +1922,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	if (security_vm_enough_memory(len >> PAGE_SHIFT))
 		return -ENOMEM;
 
-	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-
 	/* Can we just expand an old private anonymous mapping? */
 	if (vma_merge(mm, prev, addr, addr + len, flags,
 					NULL, NULL, pgoff, NULL))
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 0959ee1a479..febea1c9816 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,7 +14,7 @@ struct pglist_data *first_online_pgdat(void)
 	return NODE_DATA(first_online_node);
 }
 
-EXPORT_SYMBOL(first_online_pgdat);
+EXPORT_UNUSED_SYMBOL(first_online_pgdat);  /*  June 2006  */
 
 struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 {
@@ -24,7 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 		return NULL;
 	return NODE_DATA(nid);
 }
-EXPORT_SYMBOL(next_online_pgdat);
+EXPORT_UNUSED_SYMBOL(next_online_pgdat);  /*  June 2006  */
 
 
 /*
@@ -45,5 +45,5 @@ struct zone *next_zone(struct zone *zone)
 	}
 	return zone;
 }
-EXPORT_SYMBOL(next_zone);
+EXPORT_UNUSED_SYMBOL(next_zone);  /*  June 2006  */
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 1903bdf65e4..7c15cf3373a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -97,7 +97,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
  	new_pte = pte_offset_map_nested(new_pmd, new_addr);
 	new_ptl = pte_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
-		spin_lock(new_ptl);
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
diff --git a/mm/nommu.c b/mm/nommu.c
index 5151c44a825..c576df71e3b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1070,6 +1070,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
 	return 0;
 }
+EXPORT_SYMBOL(remap_pfn_range);
 
 void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
@@ -1090,6 +1091,7 @@ void unmap_mapping_range(struct address_space *mapping,
 			 int even_cows)
 {
 }
+EXPORT_SYMBOL(unmap_mapping_range);
 
 /*
  * Check that a process has enough memory to allocate a new virtual
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d46ed0f1dc0..b9af136e5cf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -225,7 +225,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
  * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
  * we select a process with CAP_SYS_RAW_IO set).
  */
-static void __oom_kill_task(task_t *p, const char *message)
+static void __oom_kill_task(struct task_struct *p, const char *message)
 {
 	if (p->pid == 1) {
 		WARN_ON(1);
@@ -255,10 +255,10 @@ static void __oom_kill_task(task_t *p, const char *message)
 	force_sig(SIGKILL, p);
 }
 
-static int oom_kill_task(task_t *p, const char *message)
+static int oom_kill_task(struct task_struct *p, const char *message)
 {
 	struct mm_struct *mm;
-	task_t * g, * q;
+	struct task_struct *g, *q;
 
 	mm = p->mm;
 
@@ -316,7 +316,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
  */
 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
-	task_t *p;
+	struct task_struct *p;
 	unsigned long points = 0;
 
 	if (printk_ratelimit()) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3e792a583f3..54a4f5375bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2005,6 +2005,10 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
+#ifdef CONFIG_NUMA
+		zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio)
+						/ 100;
+#endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
 		spin_lock_init(&zone->lru_lock);
@@ -2298,6 +2302,24 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+#ifdef CONFIG_NUMA
+int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	for_each_zone(zone)
+		zone->min_unmapped_ratio = (zone->present_pages *
+				sysctl_min_unmapped_ratio) / 100;
+	return 0;
+}
+#endif
+
 /*
  * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
  *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
diff --git a/mm/slab.c b/mm/slab.c
index 3936af34454..21ba0603570 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -674,6 +674,37 @@ static struct kmem_cache cache_cache = {
 #endif
 };
 
+#ifdef CONFIG_LOCKDEP
+
+/*
+ * Slab sometimes uses the kmalloc slabs to store the slab headers
+ * for other slabs "off slab".
+ * The locking for this is tricky in that it nests within the locks
+ * of all other slabs in a few places; to deal with this special
+ * locking we put on-slab caches into a separate lock-class.
+ */
+static struct lock_class_key on_slab_key;
+
+static inline void init_lock_keys(struct cache_sizes *s)
+{
+	int q;
+
+	for (q = 0; q < MAX_NUMNODES; q++) {
+		if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep))
+			continue;
+		lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock,
+				  &on_slab_key);
+	}
+}
+
+#else
+static inline void init_lock_keys(struct cache_sizes *s)
+{
+}
+#endif
+
+
+
 /* Guard access to the cache-chain. */
 static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
@@ -1075,7 +1106,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 
 #endif
 
-static int __devinit cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 				    unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -1272,6 +1303,11 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 
 	local_irq_disable();
 	memcpy(ptr, list, sizeof(struct kmem_list3));
+	/*
+	 * Do not assume that spinlocks can be initialized via memcpy:
+	 */
+	spin_lock_init(&ptr->list_lock);
+
 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
 	cachep->nodelists[nodeid] = ptr;
 	local_irq_enable();
@@ -1386,6 +1422,7 @@ void __init kmem_cache_init(void)
 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
 					NULL, NULL);
 		}
+		init_lock_keys(sizes);
 
 		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
 					sizes->cs_size,
@@ -1398,7 +1435,7 @@ void __init kmem_cache_init(void)
 	}
 	/* 4) Replace the bootstrap head arrays */
 	{
-		void *ptr;
+		struct array_cache *ptr;
 
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 
@@ -1406,6 +1443,11 @@ void __init kmem_cache_init(void)
 		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
 		memcpy(ptr, cpu_cache_get(&cache_cache),
 		       sizeof(struct arraycache_init));
+		/*
+		 * Do not assume that spinlocks can be initialized via memcpy:
+		 */
+		spin_lock_init(&ptr->lock);
+
 		cache_cache.array[smp_processor_id()] = ptr;
 		local_irq_enable();
 
@@ -1416,6 +1458,11 @@ void __init kmem_cache_init(void)
 		       != &initarray_generic.cache);
 		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
 		       sizeof(struct arraycache_init));
+		/*
+		 * Do not assume that spinlocks can be initialized via memcpy:
+		 */
+		spin_lock_init(&ptr->lock);
+
 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
 		    ptr;
 		local_irq_enable();
@@ -3177,7 +3224,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 EXPORT_SYMBOL(kmem_cache_alloc);
 
 /**
- * kmem_cache_alloc - Allocate an object. The memory is set to zero.
+ * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
  * @cache: The cache to allocate from.
  * @flags: See kmalloc().
  *
diff --git a/mm/swap.c b/mm/swap.c
index 8fd095c4ae5..687686a61f7 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -54,6 +54,26 @@ void put_page(struct page *page)
 }
 EXPORT_SYMBOL(put_page);
 
+/**
+ * put_pages_list(): release a list of pages
+ *
+ * Release a list of pages which are strung together on page.lru.  Currently
+ * used by read_cache_pages() and related error recovery code.
+ *
+ * @pages: list of pages threaded on page->lru
+ */
+void put_pages_list(struct list_head *pages)
+{
+	while (!list_empty(pages)) {
+		struct page *victim;
+
+		victim = list_entry(pages->prev, struct page, lru);
+		list_del(&victim->lru);
+		page_cache_release(victim);
+	}
+}
+EXPORT_SYMBOL(put_pages_list);
+
 /*
  * Writeback is about to end against a page which has been marked for immediate
  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
diff --git a/mm/swap_state.c b/mm/swap_state.c
index fccbd9bba77..5f7cf2a4cb5 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -38,7 +38,7 @@ static struct backing_dev_info swap_backing_dev_info = {
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-	.tree_lock	= RW_LOCK_UNLOCKED,
+	.tree_lock	= __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
 	.a_ops		= &swap_aops,
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
 	.backing_dev_info = &swap_backing_dev_info,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e70d6c6d6fe..f1f5ec78378 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -442,11 +442,12 @@ int swap_type_of(dev_t device)
 
 		if (!(swap_info[i].flags & SWP_WRITEOK))
 			continue;
+
 		if (!device) {
 			spin_unlock(&swap_lock);
 			return i;
 		}
-		inode = swap_info->swap_file->f_dentry->d_inode;
+		inode = swap_info[i].swap_file->f_dentry->d_inode;
 		if (S_ISBLK(inode->i_mode) &&
 		    device == MKDEV(imajor(inode), iminor(inode))) {
 			spin_unlock(&swap_lock);
diff --git a/mm/truncate.c b/mm/truncate.c
index cf1b015df4a..c6ab55ec688 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -68,10 +68,10 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 		return 0;
 
 	write_lock_irq(&mapping->tree_lock);
-	if (PageDirty(page)) {
-		write_unlock_irq(&mapping->tree_lock);
-		return 0;
-	}
+	if (PageDirty(page))
+		goto failed;
+	if (page_count(page) != 2)	/* caller's ref + pagecache ref */
+		goto failed;
 
 	BUG_ON(PagePrivate(page));
 	__remove_from_page_cache(page);
@@ -79,6 +79,9 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
+failed:
+	write_unlock_irq(&mapping->tree_lock);
+	return 0;
 }
 
 /**
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35f8553f893..266162d2ba2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -330,6 +330,8 @@ void __vunmap(void *addr, int deallocate_pages)
 		return;
 	}
 
+	debug_check_no_locks_freed(addr, area->size);
+
 	if (deallocate_pages) {
 		int i;
 
@@ -338,7 +340,7 @@ void __vunmap(void *addr, int deallocate_pages)
 			__free_page(area->pages[i]);
 		}
 
-		if (area->nr_pages > PAGE_SIZE/sizeof(struct page *))
+		if (area->flags & VM_VPAGES)
 			vfree(area->pages);
 		else
 			kfree(area->pages);
@@ -425,9 +427,10 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 
 	area->nr_pages = nr_pages;
 	/* Please note that the recursion is strictly bounded. */
-	if (array_size > PAGE_SIZE)
+	if (array_size > PAGE_SIZE) {
 		pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
-	else
+		area->flags |= VM_VPAGES;
+	} else
 		pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
 	area->pages = pages;
 	if (!area->pages) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ff2ebe9458a..5d4c4d02254 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1503,10 +1503,6 @@ module_init(kswapd_init)
  *
  * If non-zero call zone_reclaim when the number of free pages falls below
  * the watermarks.
- *
- * In the future we may add flags to the mode. However, the page allocator
- * should only have to check that zone_reclaim_mode != 0 before calling
- * zone_reclaim().
  */
 int zone_reclaim_mode __read_mostly;
 
@@ -1524,6 +1520,12 @@ int zone_reclaim_mode __read_mostly;
 #define ZONE_RECLAIM_PRIORITY 4
 
 /*
+ * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+
+/*
  * Try to free up some pages from this zone through reclaim.
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -1590,18 +1592,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int node_id;
 
 	/*
-	 * Do not reclaim if there are not enough reclaimable pages in this
-	 * zone that would satify this allocations.
+	 * Zone reclaim reclaims unmapped file backed pages.
 	 *
-	 * All unmapped pagecache pages are reclaimable.
-	 *
-	 * Both counters may be temporarily off a bit so we use
-	 * SWAP_CLUSTER_MAX as the boundary. It may also be good to
-	 * leave a few frequently used unmapped pagecache pages around.
+	 * A small portion of unmapped file backed pages is needed for
+	 * file I/O, otherwise pages read by file I/O will be immediately
+	 * thrown out if the zone is overallocated. So we do not reclaim
+	 * if less than a specified percentage of the zone is used by
+	 * unmapped file backed pages.
 	 */
 	if (zone_page_state(zone, NR_FILE_PAGES) -
-		zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX)
-			return 0;
+	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio)
+		return 0;
 
 	/*
 	 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 73b83d67bab..c1b5f4106b3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
 #include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat)
@@ -81,6 +82,7 @@ void all_vm_events(unsigned long *ret)
 {
 	sum_vm_events(ret, &cpu_online_map);
 }
+EXPORT_SYMBOL_GPL(all_vm_events);
 
 #ifdef CONFIG_HOTPLUG
 /*
@@ -113,17 +115,72 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
-#define STAT_THRESHOLD 32
+static int calculate_threshold(struct zone *zone)
+{
+	int threshold;
+	int mem;	/* memory in 128 MB units */
+
+	/*
+	 * The threshold scales with the number of processors and the amount
+	 * of memory per zone. More memory means that we can defer updates for
+	 * longer, more processors could lead to more contention.
+	 * fls() is used to have a cheap way of logarithmic scaling.
+	 *
+	 * Some sample thresholds:
+	 *
+	 * Threshold	Processors	(fls)	Zonesize	1+fls(mem)
+	 * ------------------------------------------------------------------
+	 * 8		1		1	0.9-1 GB	4
+	 * 16		2		2	0.9-1 GB	4
+	 * 20 		2		2	1-2 GB		5
+	 * 24		2		2	2-4 GB		6
+	 * 28		2		2	4-8 GB		7
+	 * 32		2		2	8-16 GB		8
+	 * 4		2		2	<128M		1
+	 * 30		4		3	2-4 GB		5
+	 * 48		4		3	8-16 GB		8
+	 * 32		8		4	1-2 GB		4
+	 * 32		8		4	0.9-1GB		4
+	 * 10		16		5	<128M		1
+	 * 40		16		5	900M		4
+	 * 70		64		7	2-4 GB		5
+	 * 84		64		7	4-8 GB		6
+	 * 108		512		9	4-8 GB		6
+	 * 125		1024		10	8-16 GB		8
+	 * 125		1024		10	16-32 GB	9
+	 */
+
+	mem = zone->present_pages >> (27 - PAGE_SHIFT);
+
+	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
 
 /*
- * Determine pointer to currently valid differential byte given a zone and
- * the item number.
- *
- * Preemption must be off
+ * Refresh the thresholds for each zone.
  */
-static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
+static void refresh_zone_stat_thresholds(void)
 {
-	return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
+	struct zone *zone;
+	int cpu;
+	int threshold;
+
+	for_each_zone(zone) {
+
+		if (!zone->present_pages)
+			continue;
+
+		threshold = calculate_threshold(zone);
+
+		for_each_online_cpu(cpu)
+			zone_pcp(zone, cpu)->stat_threshold = threshold;
+	}
 }
 
 /*
@@ -132,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 				int delta)
 {
-	s8 *p;
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
 	long x;
 
-	p = diff_pointer(zone, item);
 	x = delta + *p;
 
-	if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
+	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
 		zone_page_state_add(x, zone, item);
 		x = 0;
 	}
-
 	*p = x;
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
@@ -171,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state);
  * No overflow check is necessary and therefore the differential can be
  * incremented or decremented in place which may allow the compilers to
  * generate better code.
- *
  * The increment or decrement is known and therefore one boundary check can
  * be omitted.
  *
+ * NOTE: These functions are very performance sensitive. Change only
+ * with care.
+ *
  * Some processors have inc/dec instructions that are atomic vs an interrupt.
  * However, the code must first determine the differential location in a zone
  * based on the processor number and then inc/dec the counter. There is no
@@ -184,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state);
  */
 static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	s8 *p = diff_pointer(zone, item);
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
 
 	(*p)++;
 
-	if (unlikely(*p > STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
+	if (unlikely(*p > pcp->stat_threshold)) {
+		int overstep = pcp->stat_threshold / 2;
+
+		zone_page_state_add(*p + overstep, zone, item);
+		*p = -overstep;
 	}
 }
 
@@ -203,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state);
 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	struct zone *zone = page_zone(page);
-	s8 *p = diff_pointer(zone, item);
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
 
 	(*p)--;
 
-	if (unlikely(*p < -STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
+	if (unlikely(*p < - pcp->stat_threshold)) {
+		int overstep = pcp->stat_threshold / 2;
+
+		zone_page_state_add(*p - overstep, zone, item);
+		*p = overstep;
 	}
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
@@ -238,19 +302,9 @@ EXPORT_SYMBOL(inc_zone_page_state);
 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	unsigned long flags;
-	struct zone *zone;
-	s8 *p;
 
-	zone = page_zone(page);
 	local_irq_save(flags);
-	p = diff_pointer(zone, item);
-
-	(*p)--;
-
-	if (unlikely(*p < -STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
-	}
+	__dec_zone_page_state(page, item);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
@@ -524,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 					   pageset->pcp[j].high,
 					   pageset->pcp[j].batch);
 			}
+#ifdef CONFIG_SMP
+			seq_printf(m, "\n  vm stats threshold: %d",
+					pageset->stat_threshold);
+#endif
 		}
 		seq_printf(m,
 			   "\n  all_unreclaimable: %u"
@@ -612,3 +670,35 @@ struct seq_operations vmstat_op = {
 
 #endif /* CONFIG_PROC_FS */
 
+#ifdef CONFIG_SMP
+/*
+ * Use the cpu notifier to ensure that the thresholds are recalculated
+ * when necessary.
+ */
+static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	switch (action) {
+		case CPU_UP_PREPARE:
+		case CPU_UP_CANCELED:
+		case CPU_DEAD:
+			refresh_zone_stat_thresholds();
+			break;
+		default:
+			break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata vmstat_notifier =
+	{ &vmstat_cpuup_callback, NULL, 0 };
+
+int __init setup_vmstat(void)
+{
+	refresh_zone_stat_thresholds();
+	register_cpu_notifier(&vmstat_notifier);
+	return 0;
+}
+module_init(setup_vmstat)
+#endif