From dc9aa5b9d65fd11b1f5246b46ec610ee8b83c6dd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:50 -0800 Subject: [PATCH] Swap Migration V5: MPOL_MF_MOVE interface Add page migration support via swap to the NUMA policy layer. This patch adds page migration support to the NUMA policy layer. An additional flag MPOL_MF_MOVE is introduced for mbind. If MPOL_MF_MOVE is specified, then pages that do not conform to the memory policy will be evicted from memory. When they are later paged back in, new pages will be allocated following the NUMA policy. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 135 insertions(+), 20 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0f1d2b8a952..9cc6d962831 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -83,9 +83,14 @@ #include #include #include +#include <linux/swap.h> + #include #include +/* Internal MPOL_MF_xxx flags */ +#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ + static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) return policy; } +/* Check if we are the only process mapping the page in question */ +static inline int single_mm_mapping(struct mm_struct *mm, + struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int rc = 1; + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } +out: + spin_unlock(&mapping->i_mmap_lock); + return rc; +} + +/* + * Add a page to be migrated to the pagelist + */ +static void migrate_page_add(struct vm_area_struct *vma, + struct page *page, struct list_head *pagelist, unsigned long flags) +{ + /* + * Avoid migrating a page that is shared by others and not writable. + */ + if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || + mapping_writably_mapped(page->mapping) || + single_mm_mapping(vma->vm_mm, page->mapping)) { + int rc = isolate_lru_page(page); + + if (rc == 1) + list_add(&page->lru, pagelist); + /* + * If the isolate attempt was not successful then we just + * encountered an unswappable page. Something must be wrong. + */ + WARN_ON(rc == 0); + } +} + /* Ensure all existing pages follow the policy.
*/ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pte_t *orig_pte; pte_t *pte; @@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page) continue; nid = page_to_nid(page); - if (!node_isset(nid, *nodes)) - break; + if (!node_isset(nid, *nodes)) { + if (pagelist) + migrate_page_add(vma, page, pagelist, flags); + else + break; + } } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); return addr != end; } static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pmd_t *pmd; unsigned long next; @@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (check_pte_range(vma, pmd, addr, next, nodes)) + if (check_pte_range(vma, pmd, addr, next, nodes, + flags, pagelist)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pud_t *pud; unsigned long next; @@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(vma, pud, addr, next, nodes)) + if (check_pmd_range(vma, pud, addr, next, nodes, + flags, pagelist)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } static inline int check_pgd_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pgd_t *pgd; unsigned long next; @@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(vma, pgd, addr, next, nodes)) + if (check_pud_range(vma, pgd, addr, next, nodes, + flags, pagelist)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; } -/* Step 1: check the range */ +/* Check if a vma is migratable */ +static inline int vma_migratable(struct vm_area_struct *vma) +{ + if (vma->vm_flags & ( + VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP)) + return 0; + return 1; +} + +/* + * Check if all pages in a range are on a set of nodes. + * If pagelist != NULL then isolate pages from the LRU and + * put them on the pagelist. 
+ */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - nodemask_t *nodes, unsigned long flags) + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { int err; struct vm_area_struct *first, *vma, *prev; @@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { - if (!vma->vm_next && vma->vm_end < end) - return ERR_PTR(-EFAULT); - if (prev && prev->vm_end < vma->vm_start) - return ERR_PTR(-EFAULT); - if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return ERR_PTR(-EFAULT); + if (prev && prev->vm_end < vma->vm_start) + return ERR_PTR(-EFAULT); + } + if (!is_vm_hugetlb_page(vma) && + ((flags & MPOL_MF_STRICT) || + ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && + vma_migratable(vma)))) { unsigned long endvma = vma->vm_end; + if (endvma > end) endvma = end; if (vma->vm_start > start) start = vma->vm_start; - err = check_pgd_range(vma, start, endvma, nodes); + err = check_pgd_range(vma, start, endvma, nodes, + flags, pagelist); if (err) { first = ERR_PTR(err); break; @@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len, struct mempolicy *new; unsigned long end; int err; + LIST_HEAD(pagelist); - if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) + if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + || mode > MPOL_MAX) return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + if (start & ~PAGE_MASK) return -EINVAL; + if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; + len = (len + PAGE_SIZE - 1) & PAGE_MASK; end = start + len; + if (end < start) return -EINVAL; if (end == start) return 0; + if (mpol_check_policy(mode, nmask)) return -EINVAL; + new = mpol_new(mode, nmask); if (IS_ERR(new)) return PTR_ERR(new); + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, flags); + vma = check_range(mm, start, end, nmask, flags, + (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL); err = PTR_ERR(vma); - if (!IS_ERR(vma)) + if (!IS_ERR(vma)) { err = mbind_range(vma, start, end, new); + if (!list_empty(&pagelist)) + migrate_pages(&pagelist, NULL); + if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT)) + err = -EIO; + } + if (!list_empty(&pagelist)) + putback_lru_pages(&pagelist); + up_write(&mm->mmap_sem); mpol_free(new); return err; -- cgit v1.2.3-70-g09d2 From 39743889aaf76725152f16aa90ca3c45f6d52da3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:51 -0800 Subject: [PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate_pages is to migrate the memory of a process. A process may have migrated to another node, and its memory was allocated optimally for the prior context. sys_migrate_pages allows shifting that memory to the new node.
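Illustration (not part of the patch): userspace reaches the new syscall through syscall(2), since no glibc wrapper existed at the time. A minimal sketch assuming the i386 syscall number 294 added by this patch; pid 0 targets the calling process, and the node masks use the same bitmap convention as the other mempolicy syscalls.

/* Illustration only: move the calling process's pages off node 0 so
 * they are faulted back in on node 1 (swap-based migration). */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_migrate_pages
#define __NR_migrate_pages 294		/* i386 slot added by this patch */
#endif

int main(void)
{
	unsigned long old_nodes = 1UL << 0;	/* source: node 0 */
	unsigned long new_nodes = 1UL << 1;	/* target: node 1 */
	long ret;

	/* pid 0 means the current process; maxnode is the highest node
	 * number representable in the mask plus one. */
	ret = syscall(__NR_migrate_pages, 0, 8 * sizeof(unsigned long) + 1,
		      &old_nodes, &new_nodes);
	if (ret < 0)
		perror("migrate_pages");
	else
		printf("pages that could not be moved: %ld\n", ret);
	return 0;
}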
sys_migrate_pages is also useful if a process's available memory nodes have changed through cpuset operations and its memory needs to be moved manually. Paul Jackson is working on an automated mechanism that will allow automatic migration if the cpuset of a process is changed. However, a user may decide to control the migration manually. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. In contrast to Ray's implementation, sys_migrate_pages does not modify policies. The current code here is based on the swap-based page migration capability and thus is not able to preserve the physical layout relative to its containing nodeset (which may be a cpuset). When direct page migration becomes available, the implementation needs to be changed to do an isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in the source nodeset that are not in the target nodeset. The patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/syscall_table.S | 1 + arch/ia64/kernel/entry.S | 1 + arch/x86_64/ia32/ia32entry.S | 1 + include/asm-i386/unistd.h | 3 +- include/asm-ia64/unistd.h | 3 +- include/asm-x86_64/ia32_unistd.h | 3 +- include/asm-x86_64/unistd.h | 4 +- include/linux/mempolicy.h | 3 ++ include/linux/syscalls.h | 2 + kernel/sys_ni.c | 1 + mm/mempolicy.c | 94 +++++++++++++++++++++++++++++++++++++++- 11 files changed, 111 insertions(+), 5 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index f7ba4acc20e..6ff3e524322 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -293,3 +293,4 @@ ENTRY(sys_call_table) .long sys_inotify_init .long sys_inotify_add_watch .long sys_inotify_rm_watch + .long sys_migrate_pages diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 0741b066b98..7a6ffd61378 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1600,5 +1600,6 @@ sys_call_table: data8 sys_inotify_init data8 sys_inotify_add_watch data8 sys_inotify_rm_watch + data8 sys_migrate_pages // 1280 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index df0773c9bdb..1f0ff5adc80 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -643,6 +643,7 @@ ia32_sys_call_table: .quad sys_inotify_init .quad sys_inotify_add_watch .quad sys_inotify_rm_watch + .quad sys_migrate_pages ia32_syscall_end: .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 .quad ni_syscall diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fe38b9a9623..481c3c0ea72 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -299,8 +299,9 @@ #define __NR_inotify_init 291 #define __NR_inotify_add_watch 292 #define __NR_inotify_rm_watch 293 +#define __NR_migrate_pages 294 -#define NR_syscalls 294 +#define NR_syscalls 295 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index 2bf543493cb..962f9bd1bdf 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -269,12 +269,13 @@ #define __NR_inotify_init 1277 #define
__NR_inotify_add_watch 1278 #define __NR_inotify_rm_watch 1279 +#define __NR_migrate_pages 1280 #ifdef __KERNEL__ #include -#define NR_syscalls 256 /* length of syscall table */ +#define NR_syscalls 270 /* length of syscall table */ #define __ARCH_WANT_SYS_RT_SIGACTION diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h index d5166ec3868..e8843362a6c 100644 --- a/include/asm-x86_64/ia32_unistd.h +++ b/include/asm-x86_64/ia32_unistd.h @@ -299,7 +299,8 @@ #define __NR_ia32_inotify_init 291 #define __NR_ia32_inotify_add_watch 292 #define __NR_ia32_inotify_rm_watch 293 +#define __NR_ia32_migrate_pages 294 -#define IA32_NR_syscalls 294 /* must be > than biggest syscall! */ +#define IA32_NR_syscalls 295 /* must be > than biggest syscall! */ #endif /* _ASM_X86_64_IA32_UNISTD_H_ */ diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 2c42150bce0..e6f896161c1 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -571,8 +571,10 @@ __SYSCALL(__NR_inotify_init, sys_inotify_init) __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) #define __NR_inotify_rm_watch 255 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) +#define __NR_migrate_pages 256 +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) -#define __NR_syscall_max __NR_inotify_rm_watch +#define __NR_syscall_max __NR_migrate_pages #ifndef __NO_STUBS /* user-visible error numbers are in the range -1 - -4095 */ diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 05443a766cb..3e61e829681 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -162,6 +162,9 @@ static inline void check_highest_zone(int k) policy_zone = k; } +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); + #else struct mempolicy {}; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c7007b1db91..e910d1a481d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -511,5 +511,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio); asmlinkage long sys_ioprio_get(int which, int who); asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, unsigned long maxnode); +asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *from, const unsigned long __user *to); #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1ab2370e2ef..7a8bc7f60d9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -82,6 +82,7 @@ cond_syscall(compat_sys_socketcall); cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); +cond_syscall(sys_migrate_pages); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9cc6d962831..20d5ad39fa4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -614,12 +614,42 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +/* + * For now migrate_pages simply swaps out the pages from nodes that are in + * the source set but not in the target set. In the future, we would + * want a function that moves pages between the two nodesets in such + * a way as to preserve the physical layout as much as possible. + * + * Returns the number of page that could not be moved. 
+ */ +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + LIST_HEAD(pagelist); + int count = 0; + nodemask_t nodes; + + nodes_andnot(nodes, *from_nodes, *to_nodes); + nodes_complement(nodes, nodes); + + down_read(&mm->mmap_sem); + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (!list_empty(&pagelist)) { + migrate_pages(&pagelist, NULL); + if (!list_empty(&pagelist)) + count = putback_lru_pages(&pagelist); + } + up_read(&mm->mmap_sem); + return count; +} + /* * User space interface with variable sized bitmaps for nodelists. */ /* Copy a node mask from user space. */ -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, +static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; @@ -708,6 +738,68 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, &nodes); } +/* Macro needed until Paul implements this function in kernel/cpusets.c */ +#define cpuset_mems_allowed(task) node_online_map + +asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) +{ + struct mm_struct *mm; + struct task_struct *task; + nodemask_t old; + nodemask_t new; + nodemask_t task_nodes; + int err; + + err = get_nodes(&old, old_nodes, maxnode); + if (err) + return err; + + err = get_nodes(&new, new_nodes, maxnode); + if (err) + return err; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser priviledges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + task_nodes = cpuset_mems_allowed(task); + /* Is the user allowed to access the target nodes? */ + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); +out: + mmput(mm); + return err; +} + + /* Retrieve NUMA policy */ asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long __user *nmask, -- cgit v1.2.3-70-g09d2 From d498471133ff1f9586a06820beaeebc575fe2814 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:55 -0800 Subject: [PATCH] SwapMig: Extend parameters for migrate_pages() Extend the parameters of migrate_pages() to allow the caller control over the fate of successfully migrated or impossible to migrate pages. Swap migration and direct migration will have the same interface after this patch so that patches can be independently applied to the policy layer and the core migration code. 
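A note on the previous patch before the next diff: the nodemask arithmetic in do_migrate_pages() above is easy to misread. check_range() selects pages whose node is *not* set in the mask it is given, so the code complements "from andnot to" to make the walk pick up exactly the pages sitting on source-only nodes; a later patch in this series drops the complement in favor of an explicit MPOL_MF_INVERT flag. A worked example with plain bitmasks, illustration only:

/* Simulate the nodemask math from do_migrate_pages() with plain longs:
 * from = {0,1}, to = {2}. */
#include <stdio.h>

int main(void)
{
	unsigned long from = (1UL << 0) | (1UL << 1);
	unsigned long to = 1UL << 2;
	unsigned long nodes = from & ~to;	/* nodes_andnot: {0,1} */

	nodes = ~nodes;		/* nodes_complement: everything but {0,1} */

	/* check_range() isolates pages whose node bit is NOT set in
	 * 'nodes', i.e. exactly the pages on nodes 0 and 1. */
	printf("page on node 0 selected: %s\n", !(nodes & (1UL << 0)) ? "yes" : "no");
	printf("page on node 2 selected: %s\n", !(nodes & (1UL << 2)) ? "yes" : "no");
	return 0;
}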
Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- mm/mempolicy.c | 27 ++++++++++++++++++++++----- mm/vmscan.c | 17 ++++++++--------- 3 files changed, 32 insertions(+), 15 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index eb591eaad1b..389d1c382e2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -178,7 +178,8 @@ extern int vm_swappiness; #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p); extern int putback_lru_pages(struct list_head *l); -extern int migrate_pages(struct list_head *l, struct list_head *t); +extern int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed); #endif #ifdef CONFIG_MMU diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 20d5ad39fa4..30bdafba52d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -429,6 +429,19 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } +static int swap_pages(struct list_head *pagelist) +{ + LIST_HEAD(moved); + LIST_HEAD(failed); + int n; + + n = migrate_pages(pagelist, NULL, &moved, &failed); + putback_lru_pages(&failed); + putback_lru_pages(&moved); + + return n; +} + long do_mbind(unsigned long start, unsigned long len, unsigned long mode, nodemask_t *nmask, unsigned long flags) { @@ -481,10 +494,13 @@ long do_mbind(unsigned long start, unsigned long len, (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL); err = PTR_ERR(vma); if (!IS_ERR(vma)) { + int nr_failed = 0; + err = mbind_range(vma, start, end, new); if (!list_empty(&pagelist)) - migrate_pages(&pagelist, NULL); - if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT)) + nr_failed = swap_pages(&pagelist); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } if (!list_empty(&pagelist)) @@ -635,11 +651,12 @@ int do_migrate_pages(struct mm_struct *mm, down_read(&mm->mmap_sem); check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (!list_empty(&pagelist)) { - migrate_pages(&pagelist, NULL); - if (!list_empty(&pagelist)) - count = putback_lru_pages(&pagelist); + count = swap_pages(&pagelist); + putback_lru_pages(&pagelist); } + up_read(&mm->mmap_sem); return count; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 73ba4046ed2..5eecb514cce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -670,10 +670,10 @@ retry: * list. The direct migration patchset * extends this function to avoid the use of swap. */ -int migrate_pages(struct list_head *l, struct list_head *t) +int migrate_pages(struct list_head *from, struct list_head *to, + struct list_head *moved, struct list_head *failed) { int retry; - LIST_HEAD(failed); int nr_failed = 0; int pass = 0; struct page *page; @@ -686,12 +686,12 @@ int migrate_pages(struct list_head *l, struct list_head *t) redo: retry = 0; - list_for_each_entry_safe(page, page2, l, lru) { + list_for_each_entry_safe(page, page2, from, lru) { cond_resched(); if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ - move_to_lru(page); + list_move(&page->lru, moved); continue; } /* @@ -722,7 +722,7 @@ redo: if (PageAnon(page) && !PageSwapCache(page)) { if (!add_to_swap(page, GFP_KERNEL)) { unlock_page(page); - list_move(&page->lru, &failed); + list_move(&page->lru, failed); nr_failed++; continue; } @@ -732,8 +732,10 @@ redo: * Page is properly locked and writeback is complete. 
* Try to migrate the page. */ - if (!swap_page(page)) + if (!swap_page(page)) { + list_move(&page->lru, moved); continue; + } retry_later: retry++; } @@ -743,9 +745,6 @@ retry_later: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - if (!list_empty(&failed)) - list_splice(&failed, l); - return nr_failed + retry; } -- cgit v1.2.3-70-g09d2 From 38e35860dbe6197a4b42eb6e8b47da940b7695dd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:01 -0800 Subject: [PATCH] mempolicies: private pointer in check_range and MPOL_MF_INVERT This was first posted at http://marc.theaimsgroup.com/?l=linux-mm&m=113149240227584&w=2 (Part of this functionality is also contained in the direct migration patchset. The functionality here is more generic and independent of that patchset.) - Add an internal flag MPOL_MF_INVERT to control check_range() behavior. - Replace the pagelist passed through check_range() with a general private pointer that may be used for other purposes. (The following patches will use that to merge numa_maps into mempolicy.c and to better group the page migration code in the policy layer.) - Improve some comments. Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 30bdafba52d..270e9a39ec1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -88,8 +88,9 @@ #include #include -/* Internal MPOL_MF_xxx flags */ +/* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -227,11 +228,11 @@ static void migrate_page_add(struct vm_area_struct *vma, } } -/* Ensure all existing pages follow the policy. */ +/* Scan through pages checking if pages follow certain conditions.
*/ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pte_t *orig_pte; pte_t *pte; @@ -248,12 +249,13 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page) continue; nid = page_to_nid(page); - if (!node_isset(nid, *nodes)) { - if (pagelist) - migrate_page_add(vma, page, pagelist, flags); - else - break; - } + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + continue; + + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(vma, page, private, flags); + else + break; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); return addr != end; @@ -262,7 +264,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pmd_t *pmd; unsigned long next; @@ -273,7 +275,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, if (pmd_none_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; @@ -282,7 +284,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pud_t *pud; unsigned long next; @@ -293,7 +295,7 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, if (pud_none_or_clear_bad(pud)) continue; if (check_pmd_range(vma, pud, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pud++, addr = next, addr != end); return 0; @@ -302,7 +304,7 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, static inline int check_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pgd_t *pgd; unsigned long next; @@ -313,7 +315,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; if (check_pud_range(vma, pgd, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; @@ -335,8 +337,7 @@ static inline int vma_migratable(struct vm_area_struct *vma) */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + const nodemask_t *nodes, unsigned long flags, void *private) { int err; struct vm_area_struct *first, *vma, *prev; @@ -363,7 +364,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, if (vma->vm_start > start) start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, - flags, pagelist); + flags, private); if (err) { first = ERR_PTR(err); break; @@ -452,7 +453,8 @@ long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + if ((flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || mode > 
MPOL_MAX) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) @@ -490,8 +492,9 @@ long do_mbind(unsigned long start, unsigned long len, mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, flags, - (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + err = PTR_ERR(vma); if (!IS_ERR(vma)) { int nr_failed = 0; @@ -646,7 +649,6 @@ int do_migrate_pages(struct mm_struct *mm, nodemask_t nodes; nodes_andnot(nodes, *from_nodes, *to_nodes); - nodes_complement(nodes, nodes); down_read(&mm->mmap_sem); check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, -- cgit v1.2.3-70-g09d2 From 1a75a6c825c17249ca49f050a872a04ce0997ce3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:02 -0800 Subject: [PATCH] Fold numa_maps into mempolicies.c First discussed at http://marc.theaimsgroup.com/?t=113149255100001&r=1&w=2 - Use the check_range() in mempolicy.c to gather statistics. - Improve the numa_maps code in general and fix some comments. Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 127 ++---------------------------------------------- mm/mempolicy.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 142 insertions(+), 123 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 50bd5a8f044..0eaad41f465 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -390,129 +390,12 @@ struct seq_operations proc_pid_smaps_op = { }; #ifdef CONFIG_NUMA - -struct numa_maps { - unsigned long pages; - unsigned long anon; - unsigned long mapped; - unsigned long mapcount_max; - unsigned long node[MAX_NUMNODES]; -}; - -/* - * Calculate numa node maps for a vma - */ -static struct numa_maps *get_numa_maps(struct vm_area_struct *vma) -{ - int i; - struct page *page; - unsigned long vaddr; - struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL); - - if (!md) - return NULL; - md->pages = 0; - md->anon = 0; - md->mapped = 0; - md->mapcount_max = 0; - for_each_node(i) - md->node[i] =0; - - for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { - page = follow_page(vma, vaddr, 0); - if (page) { - int count = page_mapcount(page); - - if (count) - md->mapped++; - if (count > md->mapcount_max) - md->mapcount_max = count; - md->pages++; - if (PageAnon(page)) - md->anon++; - md->node[page_to_nid(page)]++; - } - cond_resched(); - } - return md; -} - -static int show_numa_map(struct seq_file *m, void *v) -{ - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - struct mempolicy *pol; - struct numa_maps *md; - struct zone **z; - int n; - int first; - - if (!vma->vm_mm) - return 0; - - md = get_numa_maps(vma); - if (!md) - return 0; - - seq_printf(m, "%08lx", vma->vm_start); - pol = get_vma_policy(task, vma, vma->vm_start); - /* Print policy */ - switch (pol->policy) { - case MPOL_PREFERRED: - seq_printf(m, " prefer=%d", pol->v.preferred_node); - break; - case MPOL_BIND: - seq_printf(m, " bind={"); - first = 1; - for (z = pol->v.zonelist->zones; *z; z++) { - - if (!first) - seq_putc(m, ','); - else - first = 0; - seq_printf(m, "%d/%s", (*z)->zone_pgdat->node_id, - (*z)->name); - } - seq_putc(m, '}'); - break; - case MPOL_INTERLEAVE: - seq_printf(m, " interleave={"); - first = 1; - for_each_node(n) { - if (node_isset(n, pol->v.nodes)) { - if 
(!first) - seq_putc(m,','); - else - first = 0; - seq_printf(m, "%d",n); - } - } - seq_putc(m, '}'); - break; - default: - seq_printf(m," default"); - break; - } - seq_printf(m, " MaxRef=%lu Pages=%lu Mapped=%lu", - md->mapcount_max, md->pages, md->mapped); - if (md->anon) - seq_printf(m," Anon=%lu",md->anon); - - for_each_online_node(n) { - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); - } - seq_putc(m, '\n'); - kfree(md); - if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; - return 0; -} +extern int show_numa_map(struct seq_file *m, void *v); struct seq_operations proc_pid_numa_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_numa_map + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_numa_map }; #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 270e9a39ec1..44b9d69900b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -84,6 +84,8 @@ #include #include #include +#include +#include #include #include @@ -91,6 +93,7 @@ /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -228,6 +231,8 @@ static void migrate_page_add(struct vm_area_struct *vma, } } +static void gather_stats(struct page *, void *); + /* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -252,7 +257,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) continue; - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & MPOL_MF_STATS) + gather_stats(page, private); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) migrate_page_add(vma, page, private, flags); else break; @@ -1460,3 +1467,132 @@ void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) { rebind_policy(current->mempolicy, old, new); } + +/* + * Display pages allocated per node and memory policy via /proc. + */ + +static const char *policy_types[] = { "default", "prefer", "bind", + "interleave" }; + +/* + * Convert a mempolicy into a string. + * Returns the number of characters in buffer (if positive) + * or an error (negative) + */ +static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +{ + char *p = buffer; + int l; + nodemask_t nodes; + int mode = pol ? 
pol->policy : MPOL_DEFAULT; + + switch (mode) { + case MPOL_DEFAULT: + nodes_clear(nodes); + break; + + case MPOL_PREFERRED: + nodes_clear(nodes); + node_set(pol->v.preferred_node, nodes); + break; + + case MPOL_BIND: + get_zonemask(pol, &nodes); + break; + + case MPOL_INTERLEAVE: + nodes = pol->v.nodes; + break; + + default: + BUG(); + return -EFAULT; + } + + l = strlen(policy_types[mode]); + if (buffer + maxlen < p + l + 1) + return -ENOSPC; + + strcpy(p, policy_types[mode]); + p += l; + + if (!nodes_empty(nodes)) { + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = '='; + p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); + } + return p - buffer; +} + +struct numa_maps { + unsigned long pages; + unsigned long anon; + unsigned long mapped; + unsigned long mapcount_max; + unsigned long node[MAX_NUMNODES]; +}; + +static void gather_stats(struct page *page, void *private) +{ + struct numa_maps *md = private; + int count = page_mapcount(page); + + if (count) + md->mapped++; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->pages++; + + if (PageAnon(page)) + md->anon++; + + md->node[page_to_nid(page)]++; + cond_resched(); +} + +int show_numa_map(struct seq_file *m, void *v) +{ + struct task_struct *task = m->private; + struct vm_area_struct *vma = v; + struct numa_maps *md; + int n; + char buffer[50]; + + if (!vma->vm_mm) + return 0; + + md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); + if (!md) + return 0; + + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_online_map, MPOL_MF_STATS, md); + + if (md->pages) { + mpol_to_str(buffer, sizeof(buffer), + get_vma_policy(task, vma, vma->vm_start)); + + seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", + vma->vm_start, buffer, md->pages, + md->mapped, md->mapcount_max); + + if (md->anon) + seq_printf(m," anon=%lu",md->anon); + + for_each_online_node(n) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); + + seq_putc(m, '\n'); + } + kfree(md); + + if (m->count < m->size) + m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + return 0; +} + -- cgit v1.2.3-70-g09d2 From 132beacf9785d2e6e8aecb59aa078f3ca5668fa6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:02 -0800 Subject: [PATCH] Drop page table lock before calling migrate_page_add() migrate_page_add cannot be called with a spinlock held (it calls isolate_lru_page, which calls schedule_on_each_cpu). Drop the ptl lock in check_pte_range() before calling migrate_page_add(). Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 44b9d69900b..4c0510e9e7f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -259,8 +259,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (flags & MPOL_MF_STATS) gather_stats(page, private); - else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + spin_unlock(ptl); + migrate_page_add(vma, page, private, flags); + spin_lock(ptl); + } else break; } while (pte++, addr += PAGE_SIZE, addr != end); -- cgit v1.2.3-70-g09d2 From 48fce3429df84a94766fbbc845fa8450d0715b48 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:03 -0800 Subject: [PATCH] mempolicies: unexport get_vma_policy() Since the numa_maps functionality is now in mempolicy.c we no longer need to export get_vma_policy().
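For illustration (not part of the patch): with the numa_maps fold above, the seq_printf() calls in show_numa_map() emit one /proc/<pid>/numa_maps line per vma of the following shape; the values here are invented:

2000000000 interleave=0-3 pages=1024 mapped=1000 maxref=2 anon=512 N0=256 N1=256 N2=256 N3=256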
Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 3 --- mm/mempolicy.c | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 66247eff24a..05fddd5bee5 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -144,9 +144,6 @@ void mpol_free_shared_policy(struct shared_policy *p); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx); -struct mempolicy *get_vma_policy(struct task_struct *task, - struct vm_area_struct *vma, unsigned long addr); - extern void numa_default_policy(void); extern void numa_policy_init(void); extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4c0510e9e7f..4b077ec6c00 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,8 +935,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif /* Return effective policy for a VMA */ -struct mempolicy * -get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) +static struct mempolicy * get_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; -- cgit v1.2.3-70-g09d2 From 6ce3c4c0ff62ca6391019b7832fb41a7f28b9e26 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:04 -0800 Subject: [PATCH] Move page migration related functions near do_migrate_pages() Group page migration functions in mempolicy.c Add a forward declaration for migrate_page_add (like gather_stats()) and use our new found mobility to group all page migration related function around do_migrate_pages(). Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 270 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 138 insertions(+), 132 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4b077ec6c00..7051fe450e9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -183,55 +183,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) return policy; } -/* Check if we are the only process mapping the page in question */ -static inline int single_mm_mapping(struct mm_struct *mm, - struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - int rc = 1; - - spin_lock(&mapping->i_mmap_lock); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } -out: - spin_unlock(&mapping->i_mmap_lock); - return rc; -} - -/* - * Add a page to be migrated to the pagelist - */ -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags) -{ - /* - * Avoid migrating a page that is shared by others and not writable. - */ - if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || - mapping_writably_mapped(page->mapping) || - single_mm_mapping(vma->vm_mm, page->mapping)) { - int rc = isolate_lru_page(page); - - if (rc == 1) - list_add(&page->lru, pagelist); - /* - * If the isolate attempt was not successful then we just - * encountered an unswappable page. 
Something must be wrong. - */ - WARN_ON(rc == 0); - } -} - static void gather_stats(struct page *, void *); +static void migrate_page_add(struct vm_area_struct *vma, + struct page *page, struct list_head *pagelist, unsigned long flags); /* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, @@ -440,90 +394,6 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } -static int swap_pages(struct list_head *pagelist) -{ - LIST_HEAD(moved); - LIST_HEAD(failed); - int n; - - n = migrate_pages(pagelist, NULL, &moved, &failed); - putback_lru_pages(&failed); - putback_lru_pages(&moved); - - return n; -} - -long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, unsigned long flags) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - struct mempolicy *new; - unsigned long end; - int err; - LIST_HEAD(pagelist); - - if ((flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - || mode > MPOL_MAX) - return -EINVAL; - if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - if (start & ~PAGE_MASK) - return -EINVAL; - - if (mode == MPOL_DEFAULT) - flags &= ~MPOL_MF_STRICT; - - len = (len + PAGE_SIZE - 1) & PAGE_MASK; - end = start + len; - - if (end < start) - return -EINVAL; - if (end == start) - return 0; - - if (mpol_check_policy(mode, nmask)) - return -EINVAL; - - new = mpol_new(mode, nmask); - if (IS_ERR(new)) - return PTR_ERR(new); - - /* - * If we are using the default policy then operation - * on discontinuous address spaces is okay after all - */ - if (!new) - flags |= MPOL_MF_DISCONTIG_OK; - - PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes_addr(nodes)[0]); - - down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist); - - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - - err = mbind_range(vma, start, end, new); - if (!list_empty(&pagelist)) - nr_failed = swap_pages(&pagelist); - - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) - err = -EIO; - } - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); - - up_write(&mm->mmap_sem); - mpol_free(new); - return err; -} - /* Set the process memory policy */ long do_set_mempolicy(int mode, nodemask_t *nodes) { @@ -643,6 +513,71 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +/* + * page migration + */ + +/* Check if we are the only process mapping the page in question */ +static inline int single_mm_mapping(struct mm_struct *mm, + struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int rc = 1; + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } +out: + spin_unlock(&mapping->i_mmap_lock); + return rc; +} + +/* + * Add a page to be migrated to the pagelist + */ +static void migrate_page_add(struct vm_area_struct *vma, + struct page *page, struct list_head *pagelist, unsigned long flags) +{ + /* + * Avoid migrating a page that is shared by others and not writable. 
+ */ + if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || + mapping_writably_mapped(page->mapping) || + single_mm_mapping(vma->vm_mm, page->mapping)) { + int rc = isolate_lru_page(page); + + if (rc == 1) + list_add(&page->lru, pagelist); + /* + * If the isolate attempt was not successful then we just + * encountered an unswappable page. Something must be wrong. + */ + WARN_ON(rc == 0); + } +} + +static int swap_pages(struct list_head *pagelist) +{ + LIST_HEAD(moved); + LIST_HEAD(failed); + int n; + + n = migrate_pages(pagelist, NULL, &moved, &failed); + putback_lru_pages(&failed); + putback_lru_pages(&moved); + + return n; +} + /* * For now migrate_pages simply swaps out the pages from nodes that are in * the source set but not in the target set. In the future, we would @@ -673,6 +608,77 @@ int do_migrate_pages(struct mm_struct *mm, return count; } +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + int err; + LIST_HEAD(pagelist); + + if ((flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + || mode > MPOL_MAX) + return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (start & ~PAGE_MASK) + return -EINVAL; + + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + if (mpol_check_policy(mode, nmask)) + return -EINVAL; + + new = mpol_new(mode, nmask); + if (IS_ERR(new)) + return PTR_ERR(new); + + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode,nodes_addr(nodes)[0]); + + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + + err = PTR_ERR(vma); + if (!IS_ERR(vma)) { + int nr_failed = 0; + + err = mbind_range(vma, start, end, new); + if (!list_empty(&pagelist)) + nr_failed = swap_pages(&pagelist); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; + } + if (!list_empty(&pagelist)) + putback_lru_pages(&pagelist); + + up_write(&mm->mmap_sem); + mpol_free(new); + return err; +} + /* * User space interface with variable sized bitmaps for nodelists. */ -- cgit v1.2.3-70-g09d2 From 5966514db662fb24c9bb43226a80106bcffd51f8 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:47 -0800 Subject: [PATCH] cpuset: mempolicy one more nodemask conversion Finish converting mm/mempolicy.c from bitmaps to nodemasks. The previous conversion had left one routine using bitmaps, since it involved a corresponding change to kernel/cpuset.c Fix that interface by replacing with a simple macro that calls nodes_subset(), or if !CONFIG_CPUSET, returns (1). 
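A sketch of what the new macro tests (not kernel code): nodes_subset(a, b) is true iff every node set in a is also set in b, which for a single-word mask reduces to a two-operation bitmap check.

#include <stdio.h>

/* subset(a, b): every bit set in a is also set in b. */
static int subset(unsigned long a, unsigned long b)
{
	return (a & ~b) == 0;
}

int main(void)
{
	unsigned long mems_allowed = 0xf;		/* nodes 0-3 */

	printf("%d\n", subset(0x5, mems_allowed));	/* {0,2} allowed: 1 */
	printf("%d\n", subset(0x30, mems_allowed));	/* {4,5} not allowed: 0 */
	return 0;
}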
Signed-off-by: Paul Jackson Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 5 +++-- kernel/cpuset.c | 10 ---------- mm/mempolicy.c | 5 ++--- 3 files changed, 5 insertions(+), 15 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 6e2deef96b3..8b21786490e 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -21,7 +21,8 @@ extern void cpuset_exit(struct task_struct *p); extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p); void cpuset_init_current_mems_allowed(void); void cpuset_update_current_mems_allowed(void); -void cpuset_restrict_to_mems_allowed(unsigned long *nodes); +#define cpuset_nodes_subset_current_mems_allowed(nodes) \ + nodes_subset((nodes), current->mems_allowed) int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask); extern int cpuset_excl_nodes_overlap(const struct task_struct *p); @@ -42,7 +43,7 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) static inline void cpuset_init_current_mems_allowed(void) {} static inline void cpuset_update_current_mems_allowed(void) {} -static inline void cpuset_restrict_to_mems_allowed(unsigned long *nodes) {} +#define cpuset_nodes_subset_current_mems_allowed(nodes) (1) static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f63383e01ec..6503c6da4c4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1749,16 +1749,6 @@ done: refresh_mems(); } -/** - * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed - * @nodes: pointer to a node bitmap that is and-ed with mems_allowed - */ -void cpuset_restrict_to_mems_allowed(unsigned long *nodes) -{ - bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), - MAX_NUMNODES); -} - /** * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed * @zl: the zonelist to be checked diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7051fe450e9..9dea2b8a7d4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -387,10 +387,9 @@ static int contextualize_policy(int mode, nodemask_t *nodes) if (!nodes) return 0; - /* Update current mems_allowed */ cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - cpuset_restrict_to_mems_allowed(nodes->bits); + if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) + return -EINVAL; return mpol_check_policy(mode, nodes); } -- cgit v1.2.3-70-g09d2 From cf2a473c4089aa41c26f653200673f5a4cc25047 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:54 -0800 Subject: [PATCH] cpuset: combine refresh_mems and update_mems The important code paths through alloc_pages_current() and alloc_page_vma(), by which most kernel page allocations go, both called cpuset_update_current_mems_allowed(), which in turn called refresh_mems(). -Both- of these latter two routines did a tasklock, got the tasks cpuset pointer, and checked for out of date cpuset->mems_generation. That was a silly duplication of code and waste of CPU cycles on an important code path. Consolidated those two routines into a single routine, called cpuset_update_task_memory_state(), since it updates more than just mems_allowed. Changed all callers of either routine to call the new consolidated routine. 
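The mechanism being consolidated here is a generation-counter cache: each task remembers the mems_generation of its cpuset and only takes the locks to recopy its memory placement when the two numbers diverge. A minimal illustration of the pattern with invented names and all locking omitted (the real code is in the diff below):

#include <stdio.h>

struct cpuset_like { int mems_generation; int mems_allowed; };
struct task_like { int cached_generation; int mems_allowed; };

static void update_task_memory_state(struct task_like *tsk, struct cpuset_like *cs)
{
	if (tsk->cached_generation == cs->mems_generation)
		return;		/* fast path: placement unchanged */
	/* slow path: recopy placement (done under locks in the real code) */
	tsk->mems_allowed = cs->mems_allowed;
	tsk->cached_generation = cs->mems_generation;
}

int main(void)
{
	struct cpuset_like cs = { 1, 0x3 };
	struct task_like tsk = { 0, 0 };

	update_task_memory_state(&tsk, &cs);	/* refreshes: gen 0 != 1 */
	cs.mems_allowed = 0xc;			/* cpuset changed behind our back */
	cs.mems_generation++;
	update_task_memory_state(&tsk, &cs);	/* mismatch detected, refreshes */
	printf("task mems_allowed now %#x\n", tsk.mems_allowed);
	return 0;
}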
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 4 +-- kernel/cpuset.c | 95 ++++++++++++++++++++++---------------------------- mm/mempolicy.c | 10 +++--- 3 files changed, 48 insertions(+), 61 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 736d73801cb..1feebf16ab0 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -20,7 +20,7 @@ extern void cpuset_fork(struct task_struct *p); extern void cpuset_exit(struct task_struct *p); extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p); void cpuset_init_current_mems_allowed(void); -void cpuset_update_current_mems_allowed(void); +void cpuset_update_task_memory_state(void); #define cpuset_nodes_subset_current_mems_allowed(nodes) \ nodes_subset((nodes), current->mems_allowed) int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); @@ -51,7 +51,7 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) } static inline void cpuset_init_current_mems_allowed(void) {} -static inline void cpuset_update_current_mems_allowed(void) {} +static inline void cpuset_update_task_memory_state(void) {} #define cpuset_nodes_subset_current_mems_allowed(nodes) (1) static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d9349cc48b9..e9917d71628 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -584,13 +584,26 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) BUG_ON(!nodes_intersects(*pmask, node_online_map)); } -/* - * Refresh current tasks mems_allowed and mems_generation from current - * tasks cpuset. +/** + * cpuset_update_task_memory_state - update task memory placement * - * Call without callback_sem or task_lock() held. May be called with - * or without manage_sem held. Will acquire task_lock() and might - * acquire callback_sem during call. + * If the current tasks cpusets mems_allowed changed behind our + * backs, update current->mems_allowed, mems_generation and task NUMA + * mempolicy to the new value. + * + * Task mempolicy is updated by rebinding it relative to the + * current->cpuset if a task has its memory placement changed. + * Do not call this routine if in_interrupt(). + * + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Except in early boot or + * an exiting task, when tsk->cpuset is NULL, this routine will + * acquire task_lock(). We don't need to use task_lock to guard + * against another task changing a non-NULL cpuset pointer to NULL, + * as that is only done by a task on itself, and if the current task + * is here, it is not simultaneously in the exit code NULL'ing its + * cpuset pointer. This routine also might acquire callback_sem and + * current->mm->mmap_sem during call. * * The task_lock() is required to dereference current->cpuset safely. * Without it, we could pick up the pointer value of current->cpuset @@ -605,32 +618,36 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * task has been modifying its cpuset. 
*/ -static void refresh_mems(void) +void cpuset_update_task_memory_state() { int my_cpusets_mem_gen; + struct task_struct *tsk = current; + struct cpuset *cs = tsk->cpuset; - task_lock(current); - my_cpusets_mem_gen = current->cpuset->mems_generation; - task_unlock(current); + if (unlikely(!cs)) + return; + + task_lock(tsk); + my_cpusets_mem_gen = cs->mems_generation; + task_unlock(tsk); - if (current->cpuset_mems_generation != my_cpusets_mem_gen) { - struct cpuset *cs; - nodemask_t oldmem = current->mems_allowed; + if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { + nodemask_t oldmem = tsk->mems_allowed; int migrate; down(&callback_sem); - task_lock(current); - cs = current->cpuset; + task_lock(tsk); + cs = tsk->cpuset; /* Maybe changed when task not locked */ migrate = is_memory_migrate(cs); - guarantee_online_mems(cs, ¤t->mems_allowed); - current->cpuset_mems_generation = cs->mems_generation; - task_unlock(current); + guarantee_online_mems(cs, &tsk->mems_allowed); + tsk->cpuset_mems_generation = cs->mems_generation; + task_unlock(tsk); up(&callback_sem); - if (!nodes_equal(oldmem, current->mems_allowed)) { - numa_policy_rebind(&oldmem, ¤t->mems_allowed); + numa_policy_rebind(&oldmem, &tsk->mems_allowed); + if (!nodes_equal(oldmem, tsk->mems_allowed)) { if (migrate) { - do_migrate_pages(current->mm, &oldmem, - ¤t->mems_allowed, + do_migrate_pages(tsk->mm, &oldmem, + &tsk->mems_allowed, MPOL_MF_MOVE_ALL); } } @@ -1630,7 +1647,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) return -ENOMEM; down(&manage_sem); - refresh_mems(); + cpuset_update_task_memory_state(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1688,7 +1705,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ down(&manage_sem); - refresh_mems(); + cpuset_update_task_memory_state(); if (atomic_read(&cs->count) > 0) { up(&manage_sem); return -EBUSY; @@ -1872,36 +1889,6 @@ void cpuset_init_current_mems_allowed(void) current->mems_allowed = NODE_MASK_ALL; } -/** - * cpuset_update_current_mems_allowed - update mems parameters to new values - * - * If the current tasks cpusets mems_allowed changed behind our backs, - * update current->mems_allowed and mems_generation to the new value. - * Do not call this routine if in_interrupt(). - * - * Call without callback_sem or task_lock() held. May be called - * with or without manage_sem held. Unless exiting, it will acquire - * task_lock(). Also might acquire callback_sem during call to - * refresh_mems(). - */ - -void cpuset_update_current_mems_allowed(void) -{ - struct cpuset *cs; - int need_to_refresh = 0; - - task_lock(current); - cs = current->cpuset; - if (!cs) - goto done; - if (current->cpuset_mems_generation != cs->mems_generation) - need_to_refresh = 1; -done: - task_unlock(current); - if (need_to_refresh) - refresh_mems(); -} - /** * cpuset_zonelist_valid_mems_allowed - check zonelist vs. 
current mems_allowed * @zl: the zonelist to be checked diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9dea2b8a7d4..515bfeee027 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -387,7 +387,7 @@ static int contextualize_policy(int mode, nodemask_t *nodes) if (!nodes) return 0; - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) return -EINVAL; return mpol_check_policy(mode, nodes); @@ -461,7 +461,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; if (flags & MPOL_F_ADDR) { @@ -1089,7 +1089,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; @@ -1115,7 +1115,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. * - * Don't call cpuset_update_current_mems_allowed() unless + * Don't call cpuset_update_task_memory_state() unless * 1) it's ok to take cpuset_sem (can WAIT), and * 2) allocating for current task (not interrupt). */ @@ -1124,7 +1124,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) struct mempolicy *pol = current->mempolicy; if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (!pol || in_interrupt()) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) -- cgit v1.2.3-70-g09d2 From 909d75a3b77bdd8baa9429bad3b69a654d2954ce Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:55 -0800 Subject: [PATCH] cpuset: implement cpuset_mems_allowed Provide a cpuset_mems_allowed() method, which the sys_migrate_pages() code needs, to obtain the mems_allowed vector of a cpuset, and replace the workaround in sys_migrate_pages() with a call to this new method. 
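To make the intended call pattern concrete, here is a minimal kernel-style sketch (an editorial illustration, not a hunk from this patch) of how a caller such as sys_migrate_pages() might consult the new helper. The names 'task' and 'new_nodes', and the exact permission rule, are assumptions made for the example:

	nodemask_t task_nodes;

	/* Ask the cpuset layer which nodes the target task may use. */
	task_nodes = cpuset_mems_allowed(task);

	/*
	 * Illustrative check only: refuse to migrate pages to nodes
	 * outside the target task's cpuset unless suitably privileged.
	 */
	if (!nodes_subset(new_nodes, task_nodes) && !capable(CAP_SYS_ADMIN))
		return -EPERM;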
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 8 +++++++- kernel/cpuset.c | 29 ++++++++++++++++++++++++++--- mm/mempolicy.c | 3 --- 3 files changed, 33 insertions(+), 7 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 1feebf16ab0..37d2dd7ca3e 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -18,7 +18,8 @@ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_fork(struct task_struct *p); extern void cpuset_exit(struct task_struct *p); -extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p); +extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); +extern nodemask_t cpuset_mems_allowed(struct task_struct *p); void cpuset_init_current_mems_allowed(void); void cpuset_update_task_memory_state(void); #define cpuset_nodes_subset_current_mems_allowed(nodes) \ @@ -50,6 +51,11 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) return cpu_possible_map; } +static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) +{ + return node_possible_map; +} + static inline void cpuset_init_current_mems_allowed(void) {} static inline void cpuset_update_task_memory_state(void) {} #define cpuset_nodes_subset_current_mems_allowed(nodes) (1) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e9917d71628..0d0dbbd6560 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1871,14 +1871,14 @@ void cpuset_exit(struct task_struct *tsk) * tasks cpuset. **/ -cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) +cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) { cpumask_t mask; down(&callback_sem); - task_lock((struct task_struct *)tsk); + task_lock(tsk); guarantee_online_cpus(tsk->cpuset, &mask); - task_unlock((struct task_struct *)tsk); + task_unlock(tsk); up(&callback_sem); return mask; @@ -1889,6 +1889,29 @@ void cpuset_init_current_mems_allowed(void) current->mems_allowed = NODE_MASK_ALL; } +/** + * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. + * + * Description: Returns the nodemask_t mems_allowed of the cpuset + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of node_online_map, even if this means going outside the + * tasks cpuset. + **/ + +nodemask_t cpuset_mems_allowed(struct task_struct *tsk) +{ + nodemask_t mask; + + down(&callback_sem); + task_lock(tsk); + guarantee_online_mems(tsk->cpuset, &mask); + task_unlock(tsk); + up(&callback_sem); + + return mask; +} + /** * cpuset_zonelist_valid_mems_allowed - check zonelist vs. 
current mems_allowed * @zl: the zonelist to be checked diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 515bfeee027..34d566ac147 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -772,9 +772,6 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, &nodes); } -/* Macro needed until Paul implements this function in kernel/cpusets.c */ -#define cpuset_mems_allowed(task) node_online_map - asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) -- cgit v1.2.3-70-g09d2 From 74cb21553f4bf244185b9bec4c26e4e3169ad55e Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:56 -0800 Subject: [PATCH] cpuset: numa_policy_rebind cleanup Clean up, reorganize and make more robust the mempolicy.c code that rebinds mempolicies relative to the containing cpuset after a tasks memory placement changes. The real motivator for this cleanup patch is to lay more groundwork for the upcoming patch to correctly rebind NUMA mempolicies that are attached to vma's after the containing cpuset memory placement changes. NUMA mempolicies are constrained by the cpuset their task is a member of. When either (1) a task is moved to a different cpuset, or (2) the 'mems' mems_allowed of a cpuset is changed, then the NUMA mempolicies have embedded node numbers (for MPOL_BIND, MPOL_INTERLEAVE and MPOL_PREFERRED) that need to be recalculated, relative to their new cpuset placement. The old code used an unreliable method of determining what was the old mems_allowed constraining the mempolicy: it just looked at the tasks mems_allowed value. This sort of worked with the present code, which just rebinds the -task- mempolicy and leaves any -vma- mempolicies broken, referring to the old nodes. But in an upcoming patch, the vma mempolicies will be rebound as well. Then the order in which the various task and vma mempolicies are updated will no longer be deterministic, and one can no longer count on task->mems_allowed holding the old value for as long as needed. It's not even clear that the current code was guaranteed to work reliably for task mempolicies. So I added a mems_allowed field to each mempolicy, stating exactly what mems_allowed the policy is relative to, updated synchronously and reliably any time the mempolicy is rebound. Also removed a useless wrapper routine, numa_policy_rebind(), had its caller, cpuset_update_task_memory_state(), call the rewritten rebind_policy() routine directly, made that rebind routine extern instead of static, and added a "mpol_" prefix to its name, making it mpol_rebind_policy(). 
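The heart of the rebind is a relative renumbering of node masks via nodes_remap(). A small hedged illustration of the semantics the patch relies on follows; the node numbers are invented for the example, not taken from the patch:

	/*
	 * A policy that used the 1st and 3rd nodes of a cpuset placed on
	 * nodes {4,5,6} keeps that cpuset-relative meaning when the
	 * cpuset is moved to nodes {8,9,10}.
	 */
	nodemask_t oldmems = NODE_MASK_NONE, newmems = NODE_MASK_NONE;
	nodemask_t polnodes = NODE_MASK_NONE, tmp;

	node_set(4, oldmems); node_set(5, oldmems); node_set(6, oldmems);
	node_set(8, newmems); node_set(9, newmems); node_set(10, newmems);
	node_set(4, polnodes); node_set(6, polnodes);

	nodes_remap(tmp, polnodes, oldmems, newmems);
	/* tmp is now {8,10}: same relative positions, new node numbers */

Recording cpuset_mems_allowed in the policy itself means the 'old' mask fed to this remap is always known exactly, rather than guessed from task->mems_allowed.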
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 12 ++++++++++-- kernel/cpuset.c | 2 +- mm/mempolicy.c | 31 +++++++++++++++++++------------ 3 files changed, 30 insertions(+), 15 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 05fddd5bee5..74357cb9bc7 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -68,6 +68,7 @@ struct mempolicy { nodemask_t nodes; /* interleave */ /* undefined for default */ } v; + nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */ }; /* @@ -146,7 +147,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, extern void numa_default_policy(void); extern void numa_policy_init(void); -extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new); +extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new); +extern void mpol_rebind_task(struct task_struct *tsk, + const nodemask_t *new); extern struct mempolicy default_policy; extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr); @@ -221,7 +224,12 @@ static inline void numa_default_policy(void) { } -static inline void numa_policy_rebind(const nodemask_t *old, +static inline void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *new) +{ +} + +static inline void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0d0dbbd6560..8f764de3a9e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -643,7 +643,7 @@ void cpuset_update_task_memory_state() tsk->cpuset_mems_generation = cs->mems_generation; task_unlock(tsk); up(&callback_sem); - numa_policy_rebind(&oldmem, &tsk->mems_allowed); + mpol_rebind_task(tsk, &tsk->mems_allowed); if (!nodes_equal(oldmem, tsk->mems_allowed)) { if (migrate) { do_migrate_pages(tsk->mm, &oldmem, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 34d566ac147..c39bd86f4ea 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -180,6 +180,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) break; } policy->policy = mode; + policy->cpuset_mems_allowed = cpuset_mems_allowed(current); return policy; } @@ -1411,25 +1412,31 @@ void numa_default_policy(void) } /* Migrate a policy to a different set of nodes */ -static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, - const nodemask_t *new) +void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { + nodemask_t *mpolmask; nodemask_t tmp; if (!pol) return; + mpolmask = &pol->cpuset_mems_allowed; + if (nodes_equal(*mpolmask, *newmask)) + return; switch (pol->policy) { case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: - nodes_remap(tmp, pol->v.nodes, *old, *new); + nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; - current->il_next = node_remap(current->il_next, *old, *new); + *mpolmask = *newmask; + current->il_next = node_remap(current->il_next, + *mpolmask, *newmask); break; case MPOL_PREFERRED: pol->v.preferred_node = node_remap(pol->v.preferred_node, - *old, *new); + *mpolmask, *newmask); + *mpolmask = *newmask; break; case MPOL_BIND: { nodemask_t nodes; @@ -1439,7 +1446,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, nodes_clear(nodes); for (z = pol->v.zonelist->zones; *z; z++) node_set((*z)->zone_pgdat->node_id, nodes); - nodes_remap(tmp, nodes, *old, *new); + nodes_remap(tmp, nodes, *mpolmask, *newmask); nodes = 
tmp; zonelist = bind_zonelist(&nodes); @@ -1454,6 +1461,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, kfree(pol->v.zonelist); pol->v.zonelist = zonelist; } + *mpolmask = *newmask; break; } default: @@ -1463,14 +1471,13 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, } /* - * Someone moved this task to different nodes. Fixup mempolicies. - * - * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, - * once we have a cpuset mechanism to mark which cpuset subtree is migrating. + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. */ -void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { - rebind_policy(current->mempolicy, old, new); + mpol_rebind_policy(tsk->mempolicy, new); } /* -- cgit v1.2.3-70-g09d2 From 4225399a66b315d4d1fb1cb61b75dda201c832e3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:59 -0800 Subject: [PATCH] cpuset: rebind vma mempolicies fix Fix more of a longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a tasks cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the tasks mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a tasks address space). Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The tasks mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpusets mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a tasks mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. Its mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. 
This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 18 ++++++++++ kernel/cpuset.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++ mm/mempolicy.c | 29 +++++++++++++++ 3 files changed, 137 insertions(+) (limited to 'mm/mempolicy.c') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 74357cb9bc7..c7ac77e873b 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -150,6 +150,16 @@ extern void numa_policy_init(void); extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); +extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); +#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x)) + +#ifdef CONFIG_CPUSET +#define current_cpuset_is_being_rebound() \ + (cpuset_being_rebound == current->cpuset) +#else +#define current_cpuset_is_being_rebound() 0 +#endif + extern struct mempolicy default_policy; extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr); @@ -165,6 +175,8 @@ static inline void check_highest_zone(int k) int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); +extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */ + #else struct mempolicy {}; @@ -234,6 +246,12 @@ static inline void mpol_rebind_task(struct task_struct *tsk, { } +static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ +} + +#define set_cpuset_being_rebound(x) do {} while (0) + static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6004719f26e..19f87565be1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf) } /* + * Handle user request to change the 'mems' memory placement + * of a cpuset. Needs to validate the request, update the + * cpusets mems_allowed and mems_generation, and for each + * task in the cpuset, rebind any vma mempolicies. + * * Call with manage_sem held. May take callback_sem during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; + struct task_struct *g, *p; + struct mm_struct **mmarray; + int i, n, ntasks; + int fudge; int retval; trialcs = *cs; @@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf) cs->mems_generation = atomic_read(&cpuset_mems_generation); up(&callback_sem); + set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ + + fudge = 10; /* spare mmarray[] slots */ + fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + retval = -ENOMEM; + + /* + * Allocate mmarray[] to hold mm reference for each task + * in cpuset cs. Can't kmalloc GFP_KERNEL while holding + * tasklist_lock. We could use GFP_ATOMIC, but with a + * few more lines of code, we can retry until we get a big + * enough mmarray[] w/o using GFP_ATOMIC. 
+ */ + while (1) { + ntasks = atomic_read(&cs->count); /* guess */ + ntasks += fudge; + mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); + if (!mmarray) + goto done; + write_lock_irq(&tasklist_lock); /* block fork */ + if (atomic_read(&cs->count) <= ntasks) + break; /* got enough */ + write_unlock_irq(&tasklist_lock); /* try again */ + kfree(mmarray); + } + + n = 0; + + /* Load up mmarray[] with mm reference for each task in cpuset. */ + do_each_thread(g, p) { + struct mm_struct *mm; + + if (n >= ntasks) { + printk(KERN_WARNING + "Cpuset mempolicy rebind incomplete.\n"); + continue; + } + if (p->cpuset != cs) + continue; + mm = get_task_mm(p); + if (!mm) + continue; + mmarray[n++] = mm; + } while_each_thread(g, p); + write_unlock_irq(&tasklist_lock); + + /* + * Now that we've dropped the tasklist spinlock, we can + * rebind the vma mempolicies of each mm in mmarray[] to their + * new cpuset, and release that mm. The mpol_rebind_mm() + * call takes mmap_sem, which we couldn't take while holding + * tasklist_lock. Forks can happen again now - the mpol_copy() + * cpuset_being_rebound check will catch such forks, and rebind + * their vma mempolicies too. Because we still hold the global + * cpuset manage_sem, we know that no other rebind effort will + * be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. + */ + for (i = 0; i < n; i++) { + struct mm_struct *mm = mmarray[i]; + + mpol_rebind_mm(mm, &cs->mems_allowed); + mmput(mm); + } + + /* We're done rebinding vma's to this cpusets new mems_allowed. */ + kfree(mmarray); + set_cpuset_being_rebound(NULL); + retval = 0; done: return retval; } @@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) struct cpuset *oldcs; cpumask_t cpus; nodemask_t from, to; + struct mm_struct *mm; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) to = cs->mems_allowed; up(&callback_sem); + + mm = get_task_mm(tsk); + if (mm) { + mpol_rebind_mm(mm, &to); + mmput(mm); + } + if (is_memory_migrate(cs)) do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c39bd86f4ea..1850d0aef4a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1131,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages_current); +/* + * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy it is copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This + * keeps mempolicies cpuset relative after their cpuset moves. See + * further kernel/cpuset.c update_nodemask(). + */ +void *cpuset_being_rebound; + /* Slow path of a mempolicy copy */ struct mempolicy *__mpol_copy(struct mempolicy *old) { @@ -1138,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + if (current_cpuset_is_being_rebound()) { + nodemask_t mems = cpuset_mems_allowed(current); + mpol_rebind_policy(old, &mems); + } *new = *old; atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { @@ -1480,6 +1493,22 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) mpol_rebind_policy(tsk->mempolicy, new); } +/* + * Rebind each vma in mm to new nodemask. + * + * Call holding a reference to mm. Takes mm->mmap_sem during call. 
+ */ + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new); + up_write(&mm->mmap_sem); +} + /* * Display pages allocated per node and memory policy via /proc. */ -- cgit v1.2.3-70-g09d2
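As a closing illustration of what these patches make visible from userspace, here is a small hedged demo (an editorial addition, not part of the series) that attaches an MPOL_INTERLEAVE policy to a vma with the raw mbind syscall and then shows it via the /proc numa_maps interface mentioned in the final hunk above. The MPOL_INTERLEAVE constant mirrors <numaif.h>, and the program assumes a NUMA kernel with nodes 0 and 1 online:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>

	#define MPOL_INTERLEAVE 3		/* as in <numaif.h> */

	int main(void)
	{
		size_t len = 16 * 4096;
		unsigned long nodes = 0x3;	/* interleave over nodes 0 and 1 */
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		/* Give this vma an interleave mempolicy... */
		if (syscall(SYS_mbind, p, len, MPOL_INTERLEAVE,
			    &nodes, 8 * sizeof(nodes), 0) < 0)
			perror("mbind");
		memset(p, 0, len);		/* fault pages in under the policy */
		/* ...this is the vma mempolicy that mpol_rebind_mm() would
		 * renumber if the task's cpuset placement later changed. */
		return system("grep interleave /proc/self/numa_maps");
	}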