From 802f192e4a600f7ef84ca25c8b818c8830acef5a Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Sat, 3 Sep 2005 15:54:26 -0700 Subject: [PATCH] SPARSEMEM EXTREME A new option for SPARSEMEM is ARCH_SPARSEMEM_EXTREME. Architecture platforms with a very sparse physical address space would likely want to select this option. For those architecture platforms that don't select the option, the code generated is equivalent to SPARSEMEM currently in -mm. I'll be posting a patch on the ia64 mailing list which uses this new SPARSEMEM feature. ARCH_SPARSEMEM_EXTREME makes mem_section a one dimensional array of pointers to mem_sections. This two level layout scheme is able to achieve smaller memory requirements for SPARSEMEM with the tradeoff of an additional shift and load when fetching the memory section. The current SPARSEMEM -mm implementation is a one dimensional array of mem_sections which is the default SPARSEMEM configuration. The patch attempts to isolate the implementation details of the physical layout of the sparsemem section array. ARCH_SPARSEMEM_EXTREME depends on 64BIT and is by default boolean false. I've boot tested ia64 configured for ARCH_SPARSEMEM_EXTREME under an aim load. I've also boot tested a 4-way Opteron machine with !ARCH_SPARSEMEM_EXTREME and tested it with aim. Signed-off-by: Andy Whitcroft Signed-off-by: Bob Picco Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 9 +++++++++ mm/sparse.c | 38 ++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index cd379936cac..fc644c5c065 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -89,3 +89,12 @@ config NEED_MULTIPLE_NODES config HAVE_MEMORY_PRESENT def_bool y depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM + +# +# Architecture platforms which require a two level mem_section in SPARSEMEM +# must select this option. This is usually for architecture platforms with +# an extremely sparse physical address space. +# +config ARCH_SPARSEMEM_EXTREME + def_bool n + depends on SPARSEMEM && 64BIT diff --git a/mm/sparse.c b/mm/sparse.c index b54e304df4a..b2b456bf0a5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -13,7 +13,26 @@ * * 1) mem_section - memory sections, mem_map's for valid memory */ -struct mem_section mem_section[NR_MEM_SECTIONS]; +#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME +struct mem_section *mem_section[NR_SECTION_ROOTS] + ____cacheline_maxaligned_in_smp; + +static void sparse_index_init(unsigned long section, int nid) +{ + unsigned long root = SECTION_TO_ROOT(section); + + if (mem_section[root]) + return; + mem_section[root] = alloc_bootmem_node(NODE_DATA(nid), PAGE_SIZE); + if (mem_section[root]) + memset(mem_section[root], 0, PAGE_SIZE); + else + panic("memory_present: NO MEMORY\n"); +} +#else +struct mem_section mem_section[NR_MEM_SECTIONS] + ____cacheline_maxaligned_in_smp; +#endif EXPORT_SYMBOL(mem_section); /* Record a memory area against a node.
*/ @@ -24,8 +43,13 @@ void memory_present(int nid, unsigned long start, unsigned long end) start &= PAGE_SECTION_MASK; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { unsigned long section = pfn_to_section_nr(pfn); - if (!mem_section[section].section_mem_map) - mem_section[section].section_mem_map = SECTION_MARKED_PRESENT; + struct mem_section *ms; + + sparse_index_init(section, nid); + + ms = __nr_to_section(section); + if (!ms->section_mem_map) + ms->section_mem_map = SECTION_MARKED_PRESENT; } } @@ -85,6 +109,7 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; int nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); + struct mem_section *ms = __nr_to_section(pnum); map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) @@ -96,7 +121,7 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum) return map; printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); - mem_section[pnum].section_mem_map = 0; + ms->section_mem_map = 0; return NULL; } @@ -114,8 +139,9 @@ void sparse_init(void) continue; map = sparse_early_mem_map_alloc(pnum); - if (map) - sparse_init_one_section(&mem_section[pnum], pnum, map); + if (!map) + continue; + sparse_init_one_section(__nr_to_section(pnum), pnum, map); } } -- cgit v1.2.3-70-g09d2 From 3e347261a80b57df792ab9464b5f0ed59add53a8 Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Sat, 3 Sep 2005 15:54:28 -0700 Subject: [PATCH] sparsemem extreme implementation With cleanups from Dave Hansen SPARSEMEM_EXTREME makes mem_section a one dimensional array of pointers to mem_sections. This two level layout scheme is able to achieve smaller memory requirements for SPARSEMEM with the tradeoff of an additional shift and load when fetching the memory section. The current SPARSEMEM implementation is a one dimensional array of mem_sections which is the default SPARSEMEM configuration. The patch attempts isolates the implementation details of the physical layout of the sparsemem section array. SPARSEMEM_EXTREME requires bootmem to be functioning at the time of memory_present() calls. This is not always feasible, so architectures which do not need it may allocate everything statically by using SPARSEMEM_STATIC. Signed-off-by: Andy Whitcroft Signed-off-by: Bob Picco Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 1 + include/linux/mmzone.h | 40 +++++++++++++++------------------------- mm/Kconfig | 19 ++++++++++++++++--- mm/sparse.c | 26 +++++++++++++++++--------- 4 files changed, 49 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 619d843ba23..dcb0ad098c6 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -754,6 +754,7 @@ config NUMA depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) + select SPARSEMEM_STATIC # Need comments to help the hapless user trying to turn on NUMA support comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b97054bbc39..79cf578e21b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -487,39 +487,29 @@ struct mem_section { unsigned long section_mem_map; }; -#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME -/* - * Should we ever require GCC 4 or later then the flat array scheme - * can be eliminated and a uniform solution for EXTREME and !EXTREME can - * be arrived at. 
- */ -#define SECTION_ROOT_SHIFT (PAGE_SHIFT-3) -#define SECTION_ROOT_MASK ((1UL<<SECTION_ROOT_SHIFT) - 1) -#define SECTION_TO_ROOT(_sec) ((_sec) >> SECTION_ROOT_SHIFT) -#define NR_SECTION_ROOTS (NR_MEM_SECTIONS >> SECTION_ROOT_SHIFT) +#ifdef CONFIG_SPARSEMEM_EXTREME +#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) +#else +#define SECTIONS_PER_ROOT 1 +#endif -extern struct mem_section *mem_section[NR_SECTION_ROOTS]; - -static inline struct mem_section *__nr_to_section(unsigned long nr) -{ - if (!mem_section[SECTION_TO_ROOT(nr)]) - return NULL; - return &mem_section[SECTION_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; -} +#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) +#define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT) +#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) +#ifdef CONFIG_SPARSEMEM_EXTREME +extern struct mem_section *mem_section[NR_SECTION_ROOTS]; #else - -extern struct mem_section mem_section[NR_MEM_SECTIONS]; +extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; +#endif static inline struct mem_section *__nr_to_section(unsigned long nr) { - return &mem_section[nr]; + if (!mem_section[SECTION_NR_TO_ROOT(nr)]) + return NULL; + return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } -#define sparse_index_init(_sec, _nid) do {} while (0) - -#endif - /* * We use the lower bits of the mem_map pointer to store * a little bit of information. There should be at least diff --git a/mm/Kconfig b/mm/Kconfig index fc644c5c065..4e9937ac352 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -90,11 +90,24 @@ config HAVE_MEMORY_PRESENT def_bool y depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM +# +# SPARSEMEM_EXTREME (which is the default) does some bootmem +# allocations when memory_present() is called. If this can not +# be done on your architecture, select this option. However, +# statically allocating the mem_section[] array can potentially +# consume vast quantities of .bss, so be careful. +# +# This option will also potentially produce smaller runtime code +# with gcc 3.4 and later. +# +config SPARSEMEM_STATIC + def_bool n + # # Architecture platforms which require a two level mem_section in SPARSEMEM # must select this option. This is usually for architecture platforms with # an extremely sparse physical address space.
# -config ARCH_SPARSEMEM_EXTREME - def_bool n - depends on SPARSEMEM && 64BIT +config SPARSEMEM_EXTREME + def_bool y + depends on SPARSEMEM && !SPARSEMEM_STATIC diff --git a/mm/sparse.c b/mm/sparse.c index b2b456bf0a5..fa01292157a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -13,28 +13,36 @@ * * 1) mem_section - memory sections, mem_map's for valid memory */ -#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME +#ifdef CONFIG_SPARSEMEM_EXTREME struct mem_section *mem_section[NR_SECTION_ROOTS] ____cacheline_maxaligned_in_smp; +#else +struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] + ____cacheline_maxaligned_in_smp; +#endif +EXPORT_SYMBOL(mem_section); + +static void sparse_alloc_root(unsigned long root, int nid) +{ +#ifdef CONFIG_SPARSEMEM_EXTREME + mem_section[root] = alloc_bootmem_node(NODE_DATA(nid), PAGE_SIZE); +#endif +} static void sparse_index_init(unsigned long section, int nid) { - unsigned long root = SECTION_TO_ROOT(section); + unsigned long root = SECTION_NR_TO_ROOT(section); if (mem_section[root]) return; - mem_section[root] = alloc_bootmem_node(NODE_DATA(nid), PAGE_SIZE); + + sparse_alloc_root(root, nid); + if (mem_section[root]) memset(mem_section[root], 0, PAGE_SIZE); else panic("memory_present: NO MEMORY\n"); } -#else -struct mem_section mem_section[NR_MEM_SECTIONS] - ____cacheline_maxaligned_in_smp; -#endif -EXPORT_SYMBOL(mem_section); - /* Record a memory area against a node. */ void memory_present(int nid, unsigned long start, unsigned long end) { -- cgit v1.2.3-70-g09d2 From 28ae55c98e4d16eac9a05a8a259d7763ef3aeb18 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 3 Sep 2005 15:54:29 -0700 Subject: [PATCH] sparsemem extreme: hotplug preparation This splits up sparse_index_alloc() into two pieces. This is needed because we'll allocate the memory for the second level in a different place from where we actually consume it to keep the allocation from happening underneath a lock Signed-off-by: Dave Hansen Signed-off-by: Bob Picco Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + mm/sparse.c | 53 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 79cf578e21b..5ed471b58f4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -588,6 +588,7 @@ static inline int pfn_valid(unsigned long pfn) void sparse_init(void); #else #define sparse_init() do {} while (0) +#define sparse_index_init(_sec, _nid) do {} while (0) #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_NODES_SPAN_OTHER_NODES diff --git a/mm/sparse.c b/mm/sparse.c index fa01292157a..347249a4917 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -6,6 +6,7 @@ #include #include #include +#include #include /* @@ -22,27 +23,55 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] #endif EXPORT_SYMBOL(mem_section); -static void sparse_alloc_root(unsigned long root, int nid) -{ #ifdef CONFIG_SPARSEMEM_EXTREME - mem_section[root] = alloc_bootmem_node(NODE_DATA(nid), PAGE_SIZE); -#endif +static struct mem_section *sparse_index_alloc(int nid) +{ + struct mem_section *section = NULL; + unsigned long array_size = SECTIONS_PER_ROOT * + sizeof(struct mem_section); + + section = alloc_bootmem_node(NODE_DATA(nid), array_size); + + if (section) + memset(section, 0, array_size); + + return section; } -static void sparse_index_init(unsigned long section, int nid) +static int sparse_index_init(unsigned long section_nr, 
int nid) { - unsigned long root = SECTION_NR_TO_ROOT(section); + static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; + unsigned long root = SECTION_NR_TO_ROOT(section_nr); + struct mem_section *section; + int ret = 0; if (mem_section[root]) - return; + return -EEXIST; - sparse_alloc_root(root, nid); + section = sparse_index_alloc(nid); + /* + * This lock keeps two different sections from + * reallocating for the same index + */ + spin_lock(&index_init_lock); - if (mem_section[root]) - memset(mem_section[root], 0, PAGE_SIZE); - else - panic("memory_present: NO MEMORY\n"); + if (mem_section[root]) { + ret = -EEXIST; + goto out; + } + + mem_section[root] = section; +out: + spin_unlock(&index_init_lock); + return ret; } +#else /* !SPARSEMEM_EXTREME */ +static inline int sparse_index_init(unsigned long section_nr, int nid) +{ + return 0; +} +#endif + /* Record a memory area against a node. */ void memory_present(int nid, unsigned long start, unsigned long end) { -- cgit v1.2.3-70-g09d2 From b0d9bcd4bb79a7834f8492f2ae5c2655a551f23d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:31 -0700 Subject: [PATCH] swap: update swapfile i_sem comment Update swap extents comment: nowadays we guard with S_SWAPFILE not i_sem. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 60cd24a5520..9f46d83b4ec 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -926,7 +926,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, * requirements, they are simply tossed out - we will never use those blocks * for swapping. * - * For S_ISREG swapfiles we hold i_sem across the life of the swapon. This + * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This * prevents root from shooting her foot off by ftruncating an in-use swapfile, * which will scribble on the fs. * -- cgit v1.2.3-70-g09d2 From e2244ec2efa4ee1edf391d0001d314933e2b2974 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:32 -0700 Subject: [PATCH] swap: correct swapfile nr_good_pages If a regular swapfile lies on a filesystem whose blocksize is less than PAGE_SIZE, then setup_swap_extents may have to cut the number of usable swap pages; but sys_swapon's nr_good_pages was not expecting that. Also, setup_swap_extents takes no account of badpages listed in the swap header: not worth doing so, but ensure nr_badpages is 0 for a regular swapfile. 
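As a rough standalone illustration of the arithmetic behind this fix (the values below are invented for the example; this is not kernel code): a swap page is only usable when all PAGE_SIZE/blocksize of its blocks sit contiguously and page-aligned on disk, and the extent scan drops any page that fails that test, so the usable count can fall below what the swap header advertised.

#include <stdio.h>

int main(void)
{
        unsigned int page_size = 4096;
        unsigned int block_size = 1024;          /* e.g. a 1k-block filesystem */
        unsigned int blocks_per_page = page_size / block_size;
        unsigned int header_pages = 1024;        /* pages promised by the swap signature */
        unsigned int fragmented_pages = 7;       /* pages whose blocks are not contiguous */

        /* Every swap page needs blocks_per_page contiguous, aligned blocks;
         * pages failing the test are tossed, so the extent scan may keep
         * fewer pages than the header promised. */
        printf("%u blocks must line up per swap page\n", blocks_per_page);
        printf("header promises %u pages, extent scan keeps %u\n",
               header_pages, header_pages - fragmented_pages);
        return 0;
}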
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 9f46d83b4ec..5ac5333f37a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1008,8 +1008,9 @@ reprobe: } ret = 0; if (page_no == 0) - ret = -EINVAL; + page_no = 1; /* force Empty message */ sis->max = page_no; + sis->pages = page_no - 1; sis->highest_bit = page_no - 1; done: sis->curr_swap_extent = list_entry(sis->extent_list.prev, @@ -1446,6 +1447,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->highest_bit = maxpages - 1; error = -EINVAL; + if (!maxpages) + goto bad_swap; + if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) + goto bad_swap; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) goto bad_swap; @@ -1470,25 +1475,27 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) if (error) goto bad_swap; } - + if (swapfilesize && maxpages > swapfilesize) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); error = -EINVAL; goto bad_swap; } + if (nr_good_pages) { + p->swap_map[0] = SWAP_MAP_BAD; + p->max = maxpages; + p->pages = nr_good_pages; + error = setup_swap_extents(p); + if (error) + goto bad_swap; + nr_good_pages = p->pages; + } if (!nr_good_pages) { printk(KERN_WARNING "Empty swap-file\n"); error = -EINVAL; goto bad_swap; } - p->swap_map[0] = SWAP_MAP_BAD; - p->max = maxpages; - p->pages = nr_good_pages; - - error = setup_swap_extents(p); - if (error) - goto bad_swap; down(&swapon_sem); swap_list_lock(); -- cgit v1.2.3-70-g09d2 From 4cd3bb10ff0b21b77b5a4cd13b4bd36694e054c4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:33 -0700 Subject: [PATCH] swap: move destroy_swap_extents calls sys_swapon's call to destroy_swap_extents on failure is made after the final swap_list_unlock, which is faintly unsafe: another sys_swapon might already be setting up that swap_info_struct. Calling it earlier, before taking swap_list_lock, is safe. sys_swapoff's call to destroy_swap_extents was safe, but likewise move it earlier, before taking the locks (once try_to_unuse has completed, nothing can be needing the swap extents). 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 5ac5333f37a..4b39e9501d4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1129,6 +1129,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_list_unlock(); goto out_dput; } + destroy_swap_extents(p); down(&swapon_sem); swap_list_lock(); drain_mmlist(); @@ -1139,7 +1140,6 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; - destroy_swap_extents(p); swap_device_unlock(p); swap_list_unlock(); up(&swapon_sem); @@ -1531,6 +1531,7 @@ bad_swap: set_blocksize(bdev, p->old_block_size); bd_release(bdev); } + destroy_swap_extents(p); bad_swap_2: swap_list_lock(); swap_map = p->swap_map; @@ -1540,7 +1541,6 @@ bad_swap_2: if (!(swap_flags & SWAP_FLAG_PREFER)) ++least_priority; swap_list_unlock(); - destroy_swap_extents(p); vfree(swap_map); if (swap_file) filp_close(swap_file, NULL); -- cgit v1.2.3-70-g09d2 From 11d31886dbcb61039ed3789e583d21c6e70960fd Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:34 -0700 Subject: [PATCH] swap: swap extent list is ordered There are several comments that swap's extent_list.prev points to the lowest extent: that's not so, it's extent_list.next which points to it, as you'd expect. And a couple of loops in add_swap_extent which go all the way through the list, when they should just add to the other end. Fix those up, and let map_swap_page search the list forwards: profiles shows it to be twice as quick that way - because prefetch works better on how the structs are typically kmalloc'ed? or because usually more is written to than read from swap, and swap is allocated ascendingly? Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 -- mm/swapfile.c | 27 +++++++++------------------ 2 files changed, 9 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index bfe3e763ccf..38f288475e6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -116,8 +116,6 @@ enum { /* * The in-memory structure used to track swap areas. - * extent_list.prev points at the lowest-index extent. That list is - * sorted. */ struct swap_info_struct { unsigned int flags; diff --git a/mm/swapfile.c b/mm/swapfile.c index 4b39e9501d4..73521d39e98 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -832,9 +832,9 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) offset < (se->start_page + se->nr_pages)) { return se->start_block + (offset - se->start_page); } - lh = se->list.prev; + lh = se->list.next; if (lh == &sis->extent_list) - lh = lh->prev; + lh = lh->next; se = list_entry(lh, struct swap_extent, list); sis->curr_swap_extent = se; BUG_ON(se == start_se); /* It *must* be present */ @@ -859,10 +859,9 @@ static void destroy_swap_extents(struct swap_info_struct *sis) /* * Add a block range (and the corresponding page range) into this swapdev's - * extent list. The extent list is kept sorted in block order. + * extent list. The extent list is kept sorted in page order. * - * This function rather assumes that it is called in ascending sector_t order. - * It doesn't look for extent coalescing opportunities. + * This function rather assumes that it is called in ascending page order. 
*/ static int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, @@ -872,16 +871,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, struct swap_extent *new_se; struct list_head *lh; - lh = sis->extent_list.next; /* The highest-addressed block */ - while (lh != &sis->extent_list) { + lh = sis->extent_list.prev; /* The highest page extent */ + if (lh != &sis->extent_list) { se = list_entry(lh, struct swap_extent, list); - if (se->start_block + se->nr_pages == start_block && - se->start_page + se->nr_pages == start_page) { + BUG_ON(se->start_page + se->nr_pages != start_page); + if (se->start_block + se->nr_pages == start_block) { /* Merge it */ se->nr_pages += nr_pages; return 0; } - lh = lh->next; } /* @@ -894,14 +892,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, new_se->nr_pages = nr_pages; new_se->start_block = start_block; - lh = sis->extent_list.prev; /* The lowest block */ - while (lh != &sis->extent_list) { - se = list_entry(lh, struct swap_extent, list); - if (se->start_block > start_block) - break; - lh = lh->prev; - } - list_add_tail(&new_se->list, lh); + list_add_tail(&new_se->list, &sis->extent_list); sis->nr_extents++; return 0; } -- cgit v1.2.3-70-g09d2 From 53092a7402f227151a681b0c92ec8598c5618b1a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:34 -0700 Subject: [PATCH] swap: show span of swap extents The "Adding %dk swap" message shows the number of swap extents, as a guide to how fragmented the swapfile may be. But a useful further guide is what total extent they span across (sometimes scarily large). And there's no need to keep nr_extents in swap_info: it's unused after the initial message, so save a little space by keeping it on stack. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - mm/swapfile.c | 44 ++++++++++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 38f288475e6..f2b16ac0b53 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -123,7 +123,6 @@ struct swap_info_struct { struct file *swap_file; struct block_device *bdev; struct list_head extent_list; - int nr_extents; struct swap_extent *curr_swap_extent; unsigned old_block_size; unsigned short * swap_map; diff --git a/mm/swapfile.c b/mm/swapfile.c index 73521d39e98..d4da84ee392 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -854,7 +854,6 @@ static void destroy_swap_extents(struct swap_info_struct *sis) list_del(&se->list); kfree(se); } - sis->nr_extents = 0; } /* @@ -893,8 +892,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, new_se->start_block = start_block; list_add_tail(&new_se->list, &sis->extent_list); - sis->nr_extents++; - return 0; + return 1; } /* @@ -928,7 +926,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, * This is extremely effective. The average number of iterations in * map_swap_page() has been measured at about 0.3 per page. - akpm. 
*/ -static int setup_swap_extents(struct swap_info_struct *sis) +static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { struct inode *inode; unsigned blocks_per_page; @@ -936,11 +934,15 @@ static int setup_swap_extents(struct swap_info_struct *sis) unsigned blkbits; sector_t probe_block; sector_t last_block; + sector_t lowest_block = -1; + sector_t highest_block = 0; + int nr_extents = 0; int ret; inode = sis->swap_file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); + *span = sis->pages; goto done; } @@ -985,19 +987,28 @@ static int setup_swap_extents(struct swap_info_struct *sis) } } + first_block >>= (PAGE_SHIFT - blkbits); + if (page_no) { /* exclude the header page */ + if (first_block < lowest_block) + lowest_block = first_block; + if (first_block > highest_block) + highest_block = first_block; + } + /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ - ret = add_swap_extent(sis, page_no, 1, - first_block >> (PAGE_SHIFT - blkbits)); - if (ret) + ret = add_swap_extent(sis, page_no, 1, first_block); + if (ret < 0) goto out; + nr_extents += ret; page_no++; probe_block += blocks_per_page; reprobe: continue; } - ret = 0; + ret = nr_extents; + *span = 1 + highest_block - lowest_block; if (page_no == 0) page_no = 1; /* force Empty message */ sis->max = page_no; @@ -1265,6 +1276,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) union swap_header *swap_header = NULL; int swap_header_version; int nr_good_pages = 0; + int nr_extents; + sector_t span; unsigned long maxpages = 1; int swapfilesize; unsigned short *swap_map; @@ -1300,7 +1313,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) nr_swapfiles = type+1; INIT_LIST_HEAD(&p->extent_list); p->flags = SWP_USED; - p->nr_extents = 0; p->swap_file = NULL; p->old_block_size = 0; p->swap_map = NULL; @@ -1477,9 +1489,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; p->pages = nr_good_pages; - error = setup_swap_extents(p); - if (error) + nr_extents = setup_swap_extents(p, &span); + if (nr_extents < 0) { + error = nr_extents; goto bad_swap; + } nr_good_pages = p->pages; } if (!nr_good_pages) { @@ -1494,9 +1508,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->flags = SWP_ACTIVE; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; - printk(KERN_INFO "Adding %dk swap on %s. Priority:%d extents:%d\n", - nr_good_pages<<(PAGE_SHIFT-10), name, - p->prio, p->nr_extents); + + printk(KERN_INFO "Adding %dk swap on %s. " + "Priority:%d extents:%d across:%lluk\n", + nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); /* insert swap space into swap_list: */ prev = -1; -- cgit v1.2.3-70-g09d2 From 6eb396dc4a9781c5e7951143ab56ce5710687ab3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:35 -0700 Subject: [PATCH] swap: swap unsigned int consistency The swap header's unsigned int last_page determines the range of swap pages, but swap_info has been using int or unsigned long in some cases: use unsigned int throughout (except, in several places a local unsigned long is useful to avoid overflows when adding). 
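A minimal standalone illustration of the wrap-around being guarded against (hypothetical values, not the kernel's actual computation): the header's last_page is an unsigned int, so arithmetic on it wraps unless it is first widened into a larger local.

#include <stdio.h>
#include <limits.h>

int main(void)
{
        unsigned int last_page = UINT_MAX;  /* largest page number the header can carry */

        unsigned int narrow = last_page + 1;                           /* wraps around to 0 */
        unsigned long long wide = (unsigned long long)last_page + 1;   /* widened first: no wrap */

        printf("narrow: %u  wide: %llu\n", narrow, wide);
        return 0;
}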
Signed-off-by: Hugh Dickins Signed-off-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 +++--- mm/swapfile.c | 19 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index f2b16ac0b53..93f0eca7f91 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -130,10 +130,10 @@ struct swap_info_struct { unsigned int highest_bit; unsigned int cluster_next; unsigned int cluster_nr; + unsigned int pages; + unsigned int max; + unsigned int inuse_pages; int prio; /* swap priority */ - int pages; - unsigned long max; - unsigned long inuse_pages; int next; /* next entry on swap list */ }; diff --git a/mm/swapfile.c b/mm/swapfile.c index d4da84ee392..6cc6dfb4d27 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -84,7 +84,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) up_read(&swap_unplug_sem); } -static inline int scan_swap_map(struct swap_info_struct *si) +static inline unsigned long scan_swap_map(struct swap_info_struct *si) { unsigned long offset; /* @@ -531,10 +531,11 @@ static int unuse_mm(struct mm_struct *mm, * Scan swap_map from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. */ -static int find_next_to_unuse(struct swap_info_struct *si, int prev) +static unsigned int find_next_to_unuse(struct swap_info_struct *si, + unsigned int prev) { - int max = si->max; - int i = prev; + unsigned int max = si->max; + unsigned int i = prev; int count; /* @@ -577,7 +578,7 @@ static int try_to_unuse(unsigned int type) unsigned short swcount; struct page *page; swp_entry_t entry; - int i = 0; + unsigned int i = 0; int retval = 0; int reset_overflow = 0; int shmem; @@ -1216,7 +1217,7 @@ static int swap_show(struct seq_file *swap, void *v) file = ptr->swap_file; len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); - seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n", + seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file->f_dentry->d_inode->i_mode) ? "partition" : "file\t", @@ -1275,8 +1276,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) static int least_priority; union swap_header *swap_header = NULL; int swap_header_version; - int nr_good_pages = 0; - int nr_extents; + unsigned int nr_good_pages = 0; + int nr_extents = 0; sector_t span; unsigned long maxpages = 1; int swapfilesize; @@ -1509,7 +1510,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; - printk(KERN_INFO "Adding %dk swap on %s. " + printk(KERN_INFO "Adding %uk swap on %s. " "Priority:%d extents:%d across:%lluk\n", nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); -- cgit v1.2.3-70-g09d2 From 89d09a2c80ea6baafb559b86d545fada05e14ab5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:36 -0700 Subject: [PATCH] swap: freeing update swap_list.next This makes negligible difference in practice: but swap_list.next should not be updated to a higher prio in the general helper swap_info_get, but rather in swap_entry_free; and then only in the case when entry is actually freed. 
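A toy model of the cursor being moved here (a sketch under assumed names and values, not the kernel code): swap_list.next remembers which device the next allocation tries first, and steering it back to a higher-priority device only makes sense at the moment a slot on that device actually becomes free.

#include <stdio.h>

struct toy_swap_dev {
        int prio;               /* higher value = preferred device */
        int free_slots;
};

static struct toy_swap_dev devs[2] = {
        { .prio = 10, .free_slots = 0 },        /* preferred, currently full */
        { .prio = 5,  .free_slots = 100 },      /* fallback */
};
static int next_dev = 1;                        /* allocation cursor: start at the fallback */

static void toy_free_slot(int dev)
{
        devs[dev].free_slots++;
        /* Only now, when a slot is really freed, pull the cursor back. */
        if (devs[dev].prio > devs[next_dev].prio)
                next_dev = dev;
}

int main(void)
{
        toy_free_slot(0);
        printf("next allocation will try device %d\n", next_dev);
        return 0;
}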
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cc6dfb4d27..62e0da8f7e6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -215,8 +215,6 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) if (!p->swap_map[offset]) goto bad_free; swap_list_lock(); - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = type; swap_device_lock(p); return p; @@ -253,6 +251,8 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = p - swap_info; nr_swap_pages++; p->inuse_pages--; } -- cgit v1.2.3-70-g09d2 From fb4f88dcabdc716c7c350e09cf4a38a419b007e1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:37 -0700 Subject: [PATCH] swap: get_swap_page drop swap_list_lock Rewrite get_swap_page to allocate in just the same sequence as before, but without holding swap_list_lock across its scan_swap_map. Decrement nr_swap_pages and update swap_list.next in advance, while still holding swap_list_lock. Skip full devices by testing highest_bit. Swapoff hold swap_device_lock as well as swap_list_lock to clear SWP_WRITEOK. Reduces lock contention when there are parallel swap devices of the same priority. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 75 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 36 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 62e0da8f7e6..e54d60af6b5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -139,7 +139,6 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) } si->swap_map[offset] = 1; si->inuse_pages++; - nr_swap_pages--; si->cluster_next = offset+1; return offset; } @@ -150,50 +149,45 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) swp_entry_t get_swap_page(void) { - struct swap_info_struct * p; - unsigned long offset; - swp_entry_t entry; - int type, wrapped = 0; + struct swap_info_struct *si; + pgoff_t offset; + int type, next; + int wrapped = 0; - entry.val = 0; /* Out of memory */ swap_list_lock(); - type = swap_list.next; - if (type < 0) - goto out; if (nr_swap_pages <= 0) - goto out; - - while (1) { - p = &swap_info[type]; - if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { - swap_device_lock(p); - offset = scan_swap_map(p); - swap_device_unlock(p); - if (offset) { - entry = swp_entry(type,offset); - type = swap_info[type].next; - if (type < 0 || - p->prio != swap_info[type].prio) { - swap_list.next = swap_list.head; - } else { - swap_list.next = type; - } - goto out; - } + goto noswap; + nr_swap_pages--; + + for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { + si = swap_info + type; + next = si->next; + if (next < 0 || + (!wrapped && si->prio != swap_info[next].prio)) { + next = swap_list.head; + wrapped++; } - type = p->next; - if (!wrapped) { - if (type < 0 || p->prio != swap_info[type].prio) { - type = swap_list.head; - wrapped = 1; - } - } else - if (type < 0) - goto out; /* out of swap space */ + + if (!si->highest_bit) + continue; + if (!(si->flags & SWP_WRITEOK)) + continue; + + swap_list.next = next; + swap_device_lock(si); + swap_list_unlock(); + offset = scan_swap_map(si); + 
swap_device_unlock(si); + if (offset) + return swp_entry(type, offset); + swap_list_lock(); + next = swap_list.next; } -out: + + nr_swap_pages++; +noswap: swap_list_unlock(); - return entry; + return (swp_entry_t) {0}; } static struct swap_info_struct * swap_info_get(swp_entry_t entry) @@ -1105,8 +1099,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile) } nr_swap_pages -= p->pages; total_swap_pages -= p->pages; + swap_device_lock(p); p->flags &= ~SWP_WRITEOK; + swap_device_unlock(p); swap_list_unlock(); + current->flags |= PF_SWAPOFF; err = try_to_unuse(type); current->flags &= ~PF_SWAPOFF; -- cgit v1.2.3-70-g09d2 From 7dfad4183bf9cd92f977caa3c12cc74f0eefc0e6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:38 -0700 Subject: [PATCH] swap: scan_swap_map restyled Rewrite scan_swap_map to allocate in just the same way as before (taking the next free entry SWAPFILE_CLUSTER-1 times, then restarting at the lowest wholly empty cluster, falling back to lowest entry if none), but with a view towards dropping the lock in the next patch. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 93 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 48 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index e54d60af6b5..c70248aab53 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -86,64 +86,67 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) static inline unsigned long scan_swap_map(struct swap_info_struct *si) { - unsigned long offset; + unsigned long offset, last_in_cluster; + /* - * We try to cluster swap pages by allocating them - * sequentially in swap. Once we've allocated - * SWAPFILE_CLUSTER pages this way, however, we resort to - * first-free allocation, starting a new cluster. This - * prevents us from scattering swap pages all over the entire - * swap partition, so that we reduce overall disk seek times - * between swap pages. -- sct */ - if (si->cluster_nr) { - while (si->cluster_next <= si->highest_bit) { - offset = si->cluster_next++; + * We try to cluster swap pages by allocating them sequentially + * in swap. Once we've allocated SWAPFILE_CLUSTER pages this + * way, however, we resort to first-free allocation, starting + * a new cluster. This prevents us from scattering swap pages + * all over the entire swap partition, so that we reduce + * overall disk seek times between swap pages. -- sct + * But we do now try to find an empty cluster. -Andrea + */ + + if (unlikely(!si->cluster_nr)) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) + goto lowest; + + offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; + + /* Locate the first empty (unaligned) cluster */ + for (; last_in_cluster <= si->highest_bit; offset++) { if (si->swap_map[offset]) - continue; - si->cluster_nr--; - goto got_page; - } - } - si->cluster_nr = SWAPFILE_CLUSTER; - - /* try to find an empty (even not aligned) cluster. 
*/ - offset = si->lowest_bit; - check_next_cluster: - if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) - { - unsigned long nr; - for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) - if (si->swap_map[nr]) - { - offset = nr+1; - goto check_next_cluster; + last_in_cluster = offset + SWAPFILE_CLUSTER; + else if (offset == last_in_cluster) { + si->cluster_next = offset-SWAPFILE_CLUSTER-1; + goto cluster; } - /* We found a completly empty cluster, so start - * using it. - */ - goto got_page; + } + goto lowest; } - /* No luck, so now go finegrined as usual. -Andrea */ - for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { - if (si->swap_map[offset]) - continue; - si->lowest_bit = offset+1; - got_page: - if (offset == si->lowest_bit) + + si->cluster_nr--; +cluster: + offset = si->cluster_next; + if (offset > si->highest_bit) +lowest: offset = si->lowest_bit; + if (!si->highest_bit) + goto no_page; + if (!si->swap_map[offset]) { +got_page: if (offset == si->lowest_bit) si->lowest_bit++; if (offset == si->highest_bit) si->highest_bit--; - if (si->lowest_bit > si->highest_bit) { + si->inuse_pages++; + if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; } si->swap_map[offset] = 1; - si->inuse_pages++; - si->cluster_next = offset+1; + si->cluster_next = offset + 1; return offset; } - si->lowest_bit = si->max; - si->highest_bit = 0; + + while (++offset <= si->highest_bit) { + if (!si->swap_map[offset]) + goto got_page; + } + goto lowest; + +no_page: return 0; } -- cgit v1.2.3-70-g09d2 From 52b7efdbe5f5696fc80338560a3fc51e0b0a993c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:39 -0700 Subject: [PATCH] swap: scan_swap_map drop swap_device_lock get_swap_page has often shown up on latency traces, doing lengthy scans while holding two spinlocks. swap_list_lock is already dropped, now scan_swap_map drop swap_device_lock before scanning the swap_map. While scanning for an empty cluster, don't worry that racing tasks may allocate what was free and free what was allocated; but when allocating an entry, check it's still free after retaking the lock. Avoid dropping the lock in the expected common path. No barriers beyond the locks, just let the cookie crumble; highest_bit limit is volatile, but benign. Guard against swapoff: must check SWP_WRITEOK before allocating, must raise SWP_SCANNING reference count while in scan_swap_map, swapoff wait for that to fall - just use schedule_timeout, we don't want to burden scan_swap_map itself, and it's very unlikely that anyone can really still be in scan_swap_map once swapoff gets this far. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 ++ mm/swapfile.c | 42 +++++++++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 93f0eca7f91..db3b5de7c92 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -107,6 +107,8 @@ enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), + /* add others here before... 
*/ + SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32 diff --git a/mm/swapfile.c b/mm/swapfile.c index c70248aab53..fdee145afc6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -98,10 +98,12 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) * But we do now try to find an empty cluster. -Andrea */ + si->flags += SWP_SCANNING; if (unlikely(!si->cluster_nr)) { si->cluster_nr = SWAPFILE_CLUSTER - 1; if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) goto lowest; + swap_device_unlock(si); offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; @@ -111,10 +113,12 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { + swap_device_lock(si); si->cluster_next = offset-SWAPFILE_CLUSTER-1; goto cluster; } } + swap_device_lock(si); goto lowest; } @@ -123,10 +127,12 @@ cluster: offset = si->cluster_next; if (offset > si->highest_bit) lowest: offset = si->lowest_bit; +checks: if (!(si->flags & SWP_WRITEOK)) + goto no_page; if (!si->highest_bit) goto no_page; if (!si->swap_map[offset]) { -got_page: if (offset == si->lowest_bit) + if (offset == si->lowest_bit) si->lowest_bit++; if (offset == si->highest_bit) si->highest_bit--; @@ -137,16 +143,22 @@ got_page: if (offset == si->lowest_bit) } si->swap_map[offset] = 1; si->cluster_next = offset + 1; + si->flags -= SWP_SCANNING; return offset; } + swap_device_unlock(si); while (++offset <= si->highest_bit) { - if (!si->swap_map[offset]) - goto got_page; + if (!si->swap_map[offset]) { + swap_device_lock(si); + goto checks; + } } + swap_device_lock(si); goto lowest; no_page: + si->flags -= SWP_SCANNING; return 0; } @@ -1111,10 +1123,6 @@ asmlinkage long sys_swapoff(const char __user * specialfile) err = try_to_unuse(type); current->flags &= ~PF_SWAPOFF; - /* wait for any unplug function to finish */ - down_write(&swap_unplug_sem); - up_write(&swap_unplug_sem); - if (err) { /* re-insert swap space back into swap_list */ swap_list_lock(); @@ -1128,10 +1136,28 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_info[prev].next = p - swap_info; nr_swap_pages += p->pages; total_swap_pages += p->pages; + swap_device_lock(p); p->flags |= SWP_WRITEOK; + swap_device_unlock(p); swap_list_unlock(); goto out_dput; } + + /* wait for any unplug function to finish */ + down_write(&swap_unplug_sem); + up_write(&swap_unplug_sem); + + /* wait for anyone still in scan_swap_map */ + swap_device_lock(p); + p->highest_bit = 0; /* cuts scans short */ + while (p->flags >= SWP_SCANNING) { + swap_device_unlock(p); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1); + swap_device_lock(p); + } + swap_device_unlock(p); + destroy_swap_extents(p); down(&swapon_sem); swap_list_lock(); @@ -1431,6 +1457,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } p->lowest_bit = 1; + p->cluster_next = 1; + /* * Find out how many pages are allowed for a single swap * device. There are two limiting factors: 1) the number of -- cgit v1.2.3-70-g09d2 From 048c27fd72816b44e096997d1c6901c3abbfd45b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:40 -0700 Subject: [PATCH] swap: scan_swap_map latency breaks The get_swap_page/scan_swap_map latency can be so bad that even those without preemption configured deserve relief: periodically cond_resched. 
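A userspace analogue of the pattern this patch introduces (a sketch, not the kernel code; sched_yield() stands in for cond_resched() and the map/limit names are invented): a long linear scan gives up the CPU every LATENCY_LIMIT probes, so even a non-preemptible kernel does not stall other tasks for the whole scan.

#include <sched.h>
#include <stdio.h>
#include <stddef.h>

#define LATENCY_LIMIT 256

static size_t find_free_slot(const unsigned char *map, size_t max)
{
        int latency_ration = LATENCY_LIMIT;
        size_t offset;

        for (offset = 0; offset < max; offset++) {
                if (!map[offset])
                        return offset;                  /* free slot found */
                if (--latency_ration < 0) {
                        sched_yield();                  /* stand-in for cond_resched() */
                        latency_ration = LATENCY_LIMIT;
                }
        }
        return max;                                     /* nothing free */
}

int main(void)
{
        unsigned char map[1024];
        size_t i;

        for (i = 0; i < 512; i++)
                map[i] = 1;                             /* first half already in use */
        for (; i < sizeof(map); i++)
                map[i] = 0;

        printf("first free slot: %zu\n", find_free_slot(map, sizeof(map)));
        return 0;
}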
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index fdee145afc6..e675ae55f87 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -56,8 +56,6 @@ static DECLARE_MUTEX(swapon_sem); */ static DECLARE_RWSEM(swap_unplug_sem); -#define SWAPFILE_CLUSTER 256 - void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) { swp_entry_t entry; @@ -84,9 +82,13 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) up_read(&swap_unplug_sem); } +#define SWAPFILE_CLUSTER 256 +#define LATENCY_LIMIT 256 + static inline unsigned long scan_swap_map(struct swap_info_struct *si) { unsigned long offset, last_in_cluster; + int latency_ration = LATENCY_LIMIT; /* * We try to cluster swap pages by allocating them sequentially @@ -117,6 +119,10 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) si->cluster_next = offset-SWAPFILE_CLUSTER-1; goto cluster; } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } } swap_device_lock(si); goto lowest; @@ -153,6 +159,10 @@ checks: if (!(si->flags & SWP_WRITEOK)) swap_device_lock(si); goto checks; } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } } swap_device_lock(si); goto lowest; -- cgit v1.2.3-70-g09d2 From 5d337b9194b1ce3b6fd5f3cb2799455ed2f9a3d1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:41 -0700 Subject: [PATCH] swap: swap_lock replace list+device The idea of a swap_device_lock per device, and a swap_list_lock over them all, is appealing; but in practice almost every holder of swap_device_lock must already hold swap_list_lock, which defeats the purpose of the split. The only exceptions have been swap_duplicate, valid_swaphandles and an untrodden path in try_to_unuse (plus a few places added in this series). valid_swaphandles doesn't show up high in profiles, but swap_duplicate does demand attention. However, with the hold time in get_swap_pages so much reduced, I've not yet found a load and set of swap device priorities to show even swap_duplicate benefitting from the split. Certainly the split is mere overhead in the common case of a single swap device. So, replace swap_list_lock and swap_device_lock by spinlock_t swap_lock (generally we seem to prefer an _ in the name, and not hide in a macro). If someone can show a regression in swap_duplicate, then probably we should add a hashlock for the swap_map entries alone (shorts being anatomic), so as to help the case of the single swap device too. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/locking | 15 +++--- include/linux/swap.h | 11 +---- mm/filemap.c | 7 ++- mm/rmap.c | 3 +- mm/swapfile.c | 125 ++++++++++++++++++++--------------------------- 5 files changed, 66 insertions(+), 95 deletions(-) (limited to 'mm') diff --git a/Documentation/vm/locking b/Documentation/vm/locking index c3ef09ae3bb..f366fa95617 100644 --- a/Documentation/vm/locking +++ b/Documentation/vm/locking @@ -83,19 +83,18 @@ single address space optimization, so that the zap_page_range (from vmtruncate) does not lose sending ipi's to cloned threads that might be spawned underneath it and go to user mode to drag in pte's into tlbs. 
-swap_list_lock/swap_device_lock -------------------------------- +swap_lock +-------------- The swap devices are chained in priority order from the "swap_list" header. The "swap_list" is used for the round-robin swaphandle allocation strategy. The #free swaphandles is maintained in "nr_swap_pages". These two together -are protected by the swap_list_lock. +are protected by the swap_lock. -The swap_device_lock, which is per swap device, protects the reference -counts on the corresponding swaphandles, maintained in the "swap_map" -array, and the "highest_bit" and "lowest_bit" fields. +The swap_lock also protects all the device reference counts on the +corresponding swaphandles, maintained in the "swap_map" array, and the +"highest_bit" and "lowest_bit" fields. -Both of these are spinlocks, and are never acquired from intr level. The -locking hierarchy is swap_list_lock -> swap_device_lock. +The swap_lock is a spinlock, and is never acquired from intr level. To prevent races between swap space deletion or async readahead swapins deciding whether a swap handle is being used, ie worthy of being read in diff --git a/include/linux/swap.h b/include/linux/swap.h index db3b5de7c92..3c9ff004815 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -121,7 +121,7 @@ enum { */ struct swap_info_struct { unsigned int flags; - spinlock_t sdev_lock; + int prio; /* swap priority */ struct file *swap_file; struct block_device *bdev; struct list_head extent_list; @@ -135,7 +135,6 @@ struct swap_info_struct { unsigned int pages; unsigned int max; unsigned int inuse_pages; - int prio; /* swap priority */ int next; /* next entry on swap list */ }; @@ -221,13 +220,7 @@ extern int can_share_swap_page(struct page *); extern int remove_exclusive_swap_page(struct page *); struct backing_dev_info; -extern struct swap_list_t swap_list; -extern spinlock_t swaplock; - -#define swap_list_lock() spin_lock(&swaplock) -#define swap_list_unlock() spin_unlock(&swaplock) -#define swap_device_lock(p) spin_lock(&p->sdev_lock) -#define swap_device_unlock(p) spin_unlock(&p->sdev_lock) +extern spinlock_t swap_lock; /* linux/mm/thrash.c */ extern struct mm_struct * swap_token_mm; diff --git a/mm/filemap.c b/mm/filemap.c index c11418dd94e..edc54436fa9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -54,9 +54,8 @@ * * ->i_mmap_lock (vmtruncate) * ->private_lock (__free_pte->__set_page_dirty_buffers) - * ->swap_list_lock - * ->swap_device_lock (exclusive_swap_page, others) - * ->mapping->tree_lock + * ->swap_lock (exclusive_swap_page, others) + * ->mapping->tree_lock * * ->i_sem * ->i_mmap_lock (truncate->unmap_mapping_range) @@ -86,7 +85,7 @@ * ->page_table_lock (anon_vma_prepare and various) * * ->page_table_lock - * ->swap_device_lock (try_to_unmap_one) + * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) diff --git a/mm/rmap.c b/mm/rmap.c index 08ac5c7fa91..facb8cdca66 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -34,9 +34,8 @@ * anon_vma->lock * mm->page_table_lock * zone->lru_lock (in mark_page_accessed) - * swap_list_lock (in swap_free etc's swap_info_get) + * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) - * swap_device_lock (in swap_duplicate, swap_info_get) * mapping->private_lock (in __set_page_dirty_buffers) * inode_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) diff --git a/mm/swapfile.c b/mm/swapfile.c index 
e675ae55f87..4b6e8bf986b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -31,7 +31,7 @@ #include #include -DEFINE_SPINLOCK(swaplock); +DEFINE_SPINLOCK(swap_lock); unsigned int nr_swapfiles; long total_swap_pages; static int swap_overflow; @@ -51,7 +51,7 @@ static DECLARE_MUTEX(swapon_sem); /* * We need this because the bdev->unplug_fn can sleep and we cannot - * hold swap_list_lock while calling the unplug_fn. And swap_list_lock + * hold swap_lock while calling the unplug_fn. And swap_lock * cannot be turned into a semaphore. */ static DECLARE_RWSEM(swap_unplug_sem); @@ -105,7 +105,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) si->cluster_nr = SWAPFILE_CLUSTER - 1; if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) goto lowest; - swap_device_unlock(si); + spin_unlock(&swap_lock); offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; @@ -115,7 +115,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { - swap_device_lock(si); + spin_lock(&swap_lock); si->cluster_next = offset-SWAPFILE_CLUSTER-1; goto cluster; } @@ -124,7 +124,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) latency_ration = LATENCY_LIMIT; } } - swap_device_lock(si); + spin_lock(&swap_lock); goto lowest; } @@ -153,10 +153,10 @@ checks: if (!(si->flags & SWP_WRITEOK)) return offset; } - swap_device_unlock(si); + spin_unlock(&swap_lock); while (++offset <= si->highest_bit) { if (!si->swap_map[offset]) { - swap_device_lock(si); + spin_lock(&swap_lock); goto checks; } if (unlikely(--latency_ration < 0)) { @@ -164,7 +164,7 @@ checks: if (!(si->flags & SWP_WRITEOK)) latency_ration = LATENCY_LIMIT; } } - swap_device_lock(si); + spin_lock(&swap_lock); goto lowest; no_page: @@ -179,7 +179,7 @@ swp_entry_t get_swap_page(void) int type, next; int wrapped = 0; - swap_list_lock(); + spin_lock(&swap_lock); if (nr_swap_pages <= 0) goto noswap; nr_swap_pages--; @@ -199,19 +199,17 @@ swp_entry_t get_swap_page(void) continue; swap_list.next = next; - swap_device_lock(si); - swap_list_unlock(); offset = scan_swap_map(si); - swap_device_unlock(si); - if (offset) + if (offset) { + spin_unlock(&swap_lock); return swp_entry(type, offset); - swap_list_lock(); + } next = swap_list.next; } nr_swap_pages++; noswap: - swap_list_unlock(); + spin_unlock(&swap_lock); return (swp_entry_t) {0}; } @@ -233,8 +231,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) goto bad_offset; if (!p->swap_map[offset]) goto bad_free; - swap_list_lock(); - swap_device_lock(p); + spin_lock(&swap_lock); return p; bad_free: @@ -252,12 +249,6 @@ out: return NULL; } -static void swap_info_put(struct swap_info_struct * p) -{ - swap_device_unlock(p); - swap_list_unlock(); -} - static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) { int count = p->swap_map[offset]; @@ -290,7 +281,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { swap_entry_free(p, swp_offset(entry)); - swap_info_put(p); + spin_unlock(&swap_lock); } } @@ -308,7 +299,7 @@ static inline int page_swapcount(struct page *page) if (p) { /* Subtract the 1 for the swap cache itself */ count = p->swap_map[swp_offset(entry)] - 1; - swap_info_put(p); + spin_unlock(&swap_lock); } return count; } @@ -365,7 +356,7 @@ int remove_exclusive_swap_page(struct page *page) } write_unlock_irq(&swapper_space.tree_lock); } - swap_info_put(p); + 
spin_unlock(&swap_lock); if (retval) { swap_free(entry); @@ -388,7 +379,7 @@ void free_swap_and_cache(swp_entry_t entry) if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) page = find_trylock_page(&swapper_space, entry.val); - swap_info_put(p); + spin_unlock(&swap_lock); } if (page) { int one_user; @@ -558,10 +549,10 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, int count; /* - * No need for swap_device_lock(si) here: we're just looking + * No need for swap_lock here: we're just looking * for whether an entry is in use, not modifying it; false * hits are okay, and sys_swapoff() has already prevented new - * allocations from this area (while holding swap_list_lock()). + * allocations from this area (while holding swap_lock). */ for (;;) { if (++i >= max) { @@ -751,9 +742,9 @@ static int try_to_unuse(unsigned int type) * report them; but do report if we reset SWAP_MAP_MAX. */ if (*swap_map == SWAP_MAP_MAX) { - swap_device_lock(si); + spin_lock(&swap_lock); *swap_map = 1; - swap_device_unlock(si); + spin_unlock(&swap_lock); reset_overflow = 1; } @@ -817,9 +808,9 @@ static int try_to_unuse(unsigned int type) } /* - * After a successful try_to_unuse, if no swap is now in use, we know we - * can empty the mmlist. swap_list_lock must be held on entry and exit. - * Note that mmlist_lock nests inside swap_list_lock, and an mm must be + * After a successful try_to_unuse, if no swap is now in use, we know + * we can empty the mmlist. swap_lock must be held on entry and exit. + * Note that mmlist_lock nests inside swap_lock, and an mm must be * added to the mmlist just after page_duplicate - before would be racy. */ static void drain_mmlist(void) @@ -1092,7 +1083,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) mapping = victim->f_mapping; prev = -1; - swap_list_lock(); + spin_lock(&swap_lock); for (type = swap_list.head; type >= 0; type = swap_info[type].next) { p = swap_info + type; if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { @@ -1103,14 +1094,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile) } if (type < 0) { err = -EINVAL; - swap_list_unlock(); + spin_unlock(&swap_lock); goto out_dput; } if (!security_vm_enough_memory(p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; - swap_list_unlock(); + spin_unlock(&swap_lock); goto out_dput; } if (prev < 0) { @@ -1124,10 +1115,8 @@ asmlinkage long sys_swapoff(const char __user * specialfile) } nr_swap_pages -= p->pages; total_swap_pages -= p->pages; - swap_device_lock(p); p->flags &= ~SWP_WRITEOK; - swap_device_unlock(p); - swap_list_unlock(); + spin_unlock(&swap_lock); current->flags |= PF_SWAPOFF; err = try_to_unuse(type); @@ -1135,7 +1124,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) if (err) { /* re-insert swap space back into swap_list */ - swap_list_lock(); + spin_lock(&swap_lock); for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) if (p->prio >= swap_info[i].prio) break; @@ -1146,10 +1135,8 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_info[prev].next = p - swap_info; nr_swap_pages += p->pages; total_swap_pages += p->pages; - swap_device_lock(p); p->flags |= SWP_WRITEOK; - swap_device_unlock(p); - swap_list_unlock(); + spin_unlock(&swap_lock); goto out_dput; } @@ -1157,30 +1144,27 @@ asmlinkage long sys_swapoff(const char __user * specialfile) down_write(&swap_unplug_sem); up_write(&swap_unplug_sem); + destroy_swap_extents(p); + down(&swapon_sem); + spin_lock(&swap_lock); + drain_mmlist(); + /* wait for 
anyone still in scan_swap_map */ - swap_device_lock(p); p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { - swap_device_unlock(p); + spin_unlock(&swap_lock); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(1); - swap_device_lock(p); + spin_lock(&swap_lock); } - swap_device_unlock(p); - destroy_swap_extents(p); - down(&swapon_sem); - swap_list_lock(); - drain_mmlist(); - swap_device_lock(p); swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; - swap_device_unlock(p); - swap_list_unlock(); + spin_unlock(&swap_lock); up(&swapon_sem); vfree(swap_map); inode = mapping->host; @@ -1324,7 +1308,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - swap_list_lock(); + spin_lock(&swap_lock); p = swap_info; for (type = 0 ; type < nr_swapfiles ; type++,p++) if (!(p->flags & SWP_USED)) @@ -1343,7 +1327,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) * swp_entry_t or the architecture definition of a swap pte. */ if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { - swap_list_unlock(); + spin_unlock(&swap_lock); goto out; } if (type >= nr_swapfiles) @@ -1357,7 +1341,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->highest_bit = 0; p->cluster_nr = 0; p->inuse_pages = 0; - spin_lock_init(&p->sdev_lock); p->next = -1; if (swap_flags & SWAP_FLAG_PREFER) { p->prio = @@ -1365,7 +1348,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } else { p->prio = --least_priority; } - swap_list_unlock(); + spin_unlock(&swap_lock); name = getname(specialfile); error = PTR_ERR(name); if (IS_ERR(name)) { @@ -1542,8 +1525,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } down(&swapon_sem); - swap_list_lock(); - swap_device_lock(p); + spin_lock(&swap_lock); p->flags = SWP_ACTIVE; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; @@ -1567,8 +1549,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } else { swap_info[prev].next = p - swap_info; } - swap_device_unlock(p); - swap_list_unlock(); + spin_unlock(&swap_lock); up(&swapon_sem); error = 0; goto out; @@ -1579,14 +1560,14 @@ bad_swap: } destroy_swap_extents(p); bad_swap_2: - swap_list_lock(); + spin_lock(&swap_lock); swap_map = p->swap_map; p->swap_file = NULL; p->swap_map = NULL; p->flags = 0; if (!(swap_flags & SWAP_FLAG_PREFER)) ++least_priority; - swap_list_unlock(); + spin_unlock(&swap_lock); vfree(swap_map); if (swap_file) filp_close(swap_file, NULL); @@ -1610,7 +1591,7 @@ void si_swapinfo(struct sysinfo *val) unsigned int i; unsigned long nr_to_be_unused = 0; - swap_list_lock(); + spin_lock(&swap_lock); for (i = 0; i < nr_swapfiles; i++) { if (!(swap_info[i].flags & SWP_USED) || (swap_info[i].flags & SWP_WRITEOK)) @@ -1619,7 +1600,7 @@ void si_swapinfo(struct sysinfo *val) } val->freeswap = nr_swap_pages + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; - swap_list_unlock(); + spin_unlock(&swap_lock); } /* @@ -1640,7 +1621,7 @@ int swap_duplicate(swp_entry_t entry) p = type + swap_info; offset = swp_offset(entry); - swap_device_lock(p); + spin_lock(&swap_lock); if (offset < p->max && p->swap_map[offset]) { if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { p->swap_map[offset]++; @@ -1652,7 +1633,7 @@ int swap_duplicate(swp_entry_t entry) result = 1; } } - swap_device_unlock(p); 
+ spin_unlock(&swap_lock); out: return result; @@ -1668,7 +1649,7 @@ get_swap_info_struct(unsigned type) } /* - * swap_device_lock prevents swap_map being freed. Don't grab an extra + * swap_lock prevents swap_map being freed. Don't grab an extra * reference on the swaphandle, it doesn't matter if it becomes unused. */ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) @@ -1684,7 +1665,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) toff++, i--; *offset = toff; - swap_device_lock(swapdev); + spin_lock(&swap_lock); do { /* Don't read-ahead past the end of the swap area */ if (toff >= swapdev->max) @@ -1697,6 +1678,6 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) toff++; ret++; } while (--i); - swap_device_unlock(swapdev); + spin_unlock(&swap_lock); return ret; } -- cgit v1.2.3-70-g09d2 From 3279ffd97f1b3962e40d3c5f09495ef8320b180b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:43 -0700 Subject: [PATCH] delete from_swap_cache BUG_ONs Three of the four BUG_ONs in delete_from_swap_cache are immediately repeated in __delete_from_swap_cache: delete those and add the one. But perhaps mm/ is altogether overprovisioned with historic BUGs? Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index 4f251775ef9..029e56eb5e7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -124,6 +124,7 @@ void __delete_from_swap_cache(struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(!PageSwapCache(page)); BUG_ON(PageWriteback(page)); + BUG_ON(PagePrivate(page)); radix_tree_delete(&swapper_space.page_tree, page->private); page->private = 0; @@ -196,11 +197,6 @@ void delete_from_swap_cache(struct page *page) { swp_entry_t entry; - BUG_ON(!PageSwapCache(page)); - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(PagePrivate(page)); - entry.val = page->private; write_lock_irq(&swapper_space.tree_lock); -- cgit v1.2.3-70-g09d2 From 839b9685e80592809d6dfdd865986cd1b5ddc2fb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:43 -0700 Subject: [PATCH] rmap: don't test rss Remove the three get_mm_counter(mm, rss) tests from rmap.c: there was a time when testing rss was important to avoid a particular race between dup_mmap and the anonmm rmap; but now it's just a rather silly pseudo- optimization, made even more obscure by the get_mm_counter macro. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index facb8cdca66..28c6cf96d3c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -289,8 +289,6 @@ static int page_referenced_one(struct page *page, pte_t *pte; int referenced = 0; - if (!get_mm_counter(mm, rss)) - goto out; address = vma_address(page, vma); if (address == -EFAULT) goto out; @@ -517,8 +515,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) pte_t pteval; int ret = SWAP_AGAIN; - if (!get_mm_counter(mm, rss)) - goto out; address = vma_address(page, vma); if (address == -EFAULT) goto out; @@ -766,8 +762,7 @@ static int try_to_unmap_file(struct page *page) if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) continue; cursor = (unsigned long) vma->vm_private_data; - while (get_mm_counter(vma->vm_mm, rss) && - cursor < max_nl_cursor && + while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { try_to_unmap_cluster(cursor, &mapcount, vma); cursor += CLUSTER_SIZE; -- cgit v1.2.3-70-g09d2 From 6e21c8f145f5052c1c2fb4a4b41bee01c848159b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 3 Sep 2005 15:54:45 -0700 Subject: [PATCH] /proc/<pid>/numa_maps to show on which nodes pages reside This patch was recently discussed on linux-mm: http://marc.theaimsgroup.com/?t=112085728500002&r=1&w=2 I inherited a large code base from Ray for page migration. There was a small patch in there that I find to be very useful since it allows the display of the locality of the pages in use by a process. I reworked that patch and came up with a /proc/<pid>/numa_maps that gives more information about the vmas of a process. numa_maps is indexed by the start address found in /proc/<pid>/maps. F.e.
with this patch you can see the page use of the "getty" process: margin:/proc/12008 # cat maps 00000000-00004000 r--p 00000000 00:00 0 2000000000000000-200000000002c000 r-xp 00000000 08:04 516 /lib/ld-2.3.3.so 2000000000038000-2000000000040000 rw-p 00028000 08:04 516 /lib/ld-2.3.3.so 2000000000040000-2000000000044000 rw-p 2000000000040000 00:00 0 2000000000058000-2000000000260000 r-xp 00000000 08:04 54707842 /lib/tls/libc.so.6.1 2000000000260000-2000000000268000 ---p 00208000 08:04 54707842 /lib/tls/libc.so.6.1 2000000000268000-2000000000274000 rw-p 00200000 08:04 54707842 /lib/tls/libc.so.6.1 2000000000274000-2000000000280000 rw-p 2000000000274000 00:00 0 2000000000280000-20000000002b4000 r--p 00000000 08:04 9126923 /usr/lib/locale/en_US.utf8/LC_CTYPE 2000000000300000-2000000000308000 r--s 00000000 08:04 60071467 /usr/lib/gconv/gconv-modules.cache 2000000000318000-2000000000328000 rw-p 2000000000318000 00:00 0 4000000000000000-4000000000008000 r-xp 00000000 08:04 29576399 /sbin/mingetty 6000000000004000-6000000000008000 rw-p 00004000 08:04 29576399 /sbin/mingetty 6000000000008000-600000000002c000 rw-p 6000000000008000 00:00 0 [heap] 60000fff7fffc000-60000fff80000000 rw-p 60000fff7fffc000 00:00 0 60000ffffff44000-60000ffffff98000 rw-p 60000ffffff44000 00:00 0 [stack] a000000000000000-a000000000020000 ---p 00000000 00:00 0 [vdso] cat numa_maps 2000000000000000 default MaxRef=43 Pages=11 Mapped=11 N0=4 N1=3 N2=2 N3=2 2000000000038000 default MaxRef=1 Pages=2 Mapped=2 Anon=2 N0=2 2000000000040000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N0=1 2000000000058000 default MaxRef=43 Pages=61 Mapped=61 N0=14 N1=15 N2=16 N3=16 2000000000268000 default MaxRef=1 Pages=2 Mapped=2 Anon=2 N0=2 2000000000274000 default MaxRef=1 Pages=3 Mapped=3 Anon=3 N0=3 2000000000280000 default MaxRef=8 Pages=3 Mapped=3 N0=3 2000000000300000 default MaxRef=8 Pages=2 Mapped=2 N0=2 2000000000318000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N2=1 4000000000000000 default MaxRef=6 Pages=2 Mapped=2 N1=2 6000000000004000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N0=1 6000000000008000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N0=1 60000fff7fffc000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N0=1 60000ffffff44000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N0=1 getty uses ld.so. The first vma is the code segment which is used by 43 other processes and the pages are evenly distributed over the 4 nodes. The second vma is the process specific data portion for ld.so. This is only one page. The display format is: the vma start address (which indexes the corresponding entry in /proc/<pid>/maps), the memory policy (this can be "default", "interleave={<nodes>}", "prefer=<node>" or "bind={<nodes>}"), then MaxRef=<maximum references to a single page>, Pages=<number of pages in the range>, Mapped=<number of mapped pages>, Anon=<number of anonymous pages> and Nx=<number of pages on node x>. The content of the proc-file is self-evident. If this were tied into the sparsemem system then the contents of this file would not be too useful.
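For illustration, a minimal userspace reader for the new file could look like the sketch below (hypothetical example, not part of this patch; the pid is taken from the command line and each line is simply echoed):

    /* Hypothetical sketch: dump /proc/<pid>/numa_maps for a given pid. */
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        char path[64], line[1024];
        FILE *f;

        if (argc != 2) {
            fprintf(stderr, "usage: %s <pid>\n", argv[0]);
            return 1;
        }
        snprintf(path, sizeof(path), "/proc/%s/numa_maps", argv[1]);
        f = fopen(path, "r");
        if (!f) {
            perror(path);
            return 1;
        }
        /* Each line: vma start address, policy, then the MaxRef=, Pages=,
         * Mapped=, Anon= and N<node>= counters described above. */
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);
        fclose(f);
        return 0;
    }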
Signed-off-by: Christoph Lameter Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 35 ++++++++++++ fs/proc/task_mmu.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mempolicy.h | 3 ++ mm/mempolicy.c | 12 ++--- 4 files changed, 176 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/fs/proc/base.c b/fs/proc/base.c index 491f2d9f89a..b796bf90a0b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -65,6 +65,7 @@ enum pid_directory_inos { PROC_TGID_STAT, PROC_TGID_STATM, PROC_TGID_MAPS, + PROC_TGID_NUMA_MAPS, PROC_TGID_MOUNTS, PROC_TGID_WCHAN, #ifdef CONFIG_SCHEDSTATS @@ -102,6 +103,7 @@ enum pid_directory_inos { PROC_TID_STAT, PROC_TID_STATM, PROC_TID_MAPS, + PROC_TID_NUMA_MAPS, PROC_TID_MOUNTS, PROC_TID_WCHAN, #ifdef CONFIG_SCHEDSTATS @@ -144,6 +146,9 @@ static struct pid_entry tgid_base_stuff[] = { E(PROC_TGID_STAT, "stat", S_IFREG|S_IRUGO), E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO), +#ifdef CONFIG_NUMA + E(PROC_TGID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO), +#endif E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), #ifdef CONFIG_SECCOMP E(PROC_TGID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), @@ -180,6 +185,9 @@ static struct pid_entry tid_base_stuff[] = { E(PROC_TID_STAT, "stat", S_IFREG|S_IRUGO), E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO), +#ifdef CONFIG_NUMA + E(PROC_TID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO), +#endif E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), #ifdef CONFIG_SECCOMP E(PROC_TID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), @@ -515,6 +523,27 @@ static struct file_operations proc_maps_operations = { .release = seq_release, }; +#ifdef CONFIG_NUMA +extern struct seq_operations proc_pid_numa_maps_op; +static int numa_maps_open(struct inode *inode, struct file *file) +{ + struct task_struct *task = proc_task(inode); + int ret = seq_open(file, &proc_pid_numa_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = task; + } + return ret; +} + +static struct file_operations proc_numa_maps_operations = { + .open = numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + extern struct seq_operations mounts_op; static int mounts_open(struct inode *inode, struct file *file) { @@ -1524,6 +1553,12 @@ static struct dentry *proc_pident_lookup(struct inode *dir, case PROC_TGID_MAPS: inode->i_fop = &proc_maps_operations; break; +#ifdef CONFIG_NUMA + case PROC_TID_NUMA_MAPS: + case PROC_TGID_NUMA_MAPS: + inode->i_fop = &proc_numa_maps_operations; + break; +#endif case PROC_TID_MEM: case PROC_TGID_MEM: inode->i_op = &proc_mem_inode_operations; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 28b4a0253a9..64e84cadfa3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include #include #include "internal.h" @@ -233,3 +235,133 @@ struct seq_operations proc_pid_maps_op = { .stop = m_stop, .show = show_map }; + +#ifdef CONFIG_NUMA + +struct numa_maps { + unsigned long pages; + unsigned long anon; + unsigned long mapped; + unsigned long mapcount_max; + unsigned long node[MAX_NUMNODES]; +}; + +/* + * Calculate numa node maps for a vma + */ +static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) +{ + struct page *page; + unsigned long vaddr; + struct mm_struct *mm = vma->vm_mm; + int i; + struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL); + + if 
(!md) + return NULL; + md->pages = 0; + md->anon = 0; + md->mapped = 0; + md->mapcount_max = 0; + for_each_node(i) + md->node[i] =0; + + spin_lock(&mm->page_table_lock); + for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { + page = follow_page(mm, vaddr, 0); + if (page) { + int count = page_mapcount(page); + + if (count) + md->mapped++; + if (count > md->mapcount_max) + md->mapcount_max = count; + md->pages++; + if (PageAnon(page)) + md->anon++; + md->node[page_to_nid(page)]++; + } + } + spin_unlock(&mm->page_table_lock); + return md; +} + +static int show_numa_map(struct seq_file *m, void *v) +{ + struct task_struct *task = m->private; + struct vm_area_struct *vma = v; + struct mempolicy *pol; + struct numa_maps *md; + struct zone **z; + int n; + int first; + + if (!vma->vm_mm) + return 0; + + md = get_numa_maps(vma); + if (!md) + return 0; + + seq_printf(m, "%08lx", vma->vm_start); + pol = get_vma_policy(task, vma, vma->vm_start); + /* Print policy */ + switch (pol->policy) { + case MPOL_PREFERRED: + seq_printf(m, " prefer=%d", pol->v.preferred_node); + break; + case MPOL_BIND: + seq_printf(m, " bind={"); + first = 1; + for (z = pol->v.zonelist->zones; *z; z++) { + + if (!first) + seq_putc(m, ','); + else + first = 0; + seq_printf(m, "%d/%s", (*z)->zone_pgdat->node_id, + (*z)->name); + } + seq_putc(m, '}'); + break; + case MPOL_INTERLEAVE: + seq_printf(m, " interleave={"); + first = 1; + for_each_node(n) { + if (test_bit(n, pol->v.nodes)) { + if (!first) + seq_putc(m,','); + else + first = 0; + seq_printf(m, "%d",n); + } + } + seq_putc(m, '}'); + break; + default: + seq_printf(m," default"); + break; + } + seq_printf(m, " MaxRef=%lu Pages=%lu Mapped=%lu", + md->mapcount_max, md->pages, md->mapped); + if (md->anon) + seq_printf(m," Anon=%lu",md->anon); + + for_each_online_node(n) { + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); + } + seq_putc(m, '\n'); + kfree(md); + if (m->count < m->size) /* vma is copied successfully */ + m->version = (vma != get_gate_vma(task)) ? 
vma->vm_start : 0; + return 0; +} + +struct seq_operations proc_pid_numa_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_numa_map +}; +#endif diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 8480aef10e6..94a46f38c53 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -150,6 +150,9 @@ void mpol_free_shared_policy(struct shared_policy *p); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx); +struct mempolicy *get_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, unsigned long addr); + extern void numa_default_policy(void); extern void numa_policy_init(void); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b4eababc819..13492d66b7c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -664,10 +664,10 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif /* Return effective policy for a VMA */ -static struct mempolicy * -get_vma_policy(struct vm_area_struct *vma, unsigned long addr) +struct mempolicy * +get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = task->mempolicy; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) @@ -786,7 +786,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or struct page * alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = get_vma_policy(vma, addr); + struct mempolicy *pol = get_vma_policy(current, vma, addr); cpuset_update_current_mems_allowed(); @@ -908,7 +908,7 @@ void __mpol_free(struct mempolicy *p) /* Find first node suitable for an allocation */ int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = get_vma_policy(vma, addr); + struct mempolicy *pol = get_vma_policy(current, vma, addr); switch (pol->policy) { case MPOL_DEFAULT: @@ -928,7 +928,7 @@ int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) /* Find secondary valid nodes for an allocation */ int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = get_vma_policy(vma, addr); + struct mempolicy *pol = get_vma_policy(current, vma, addr); switch (pol->policy) { case MPOL_PREFERRED: -- cgit v1.2.3-70-g09d2 From c3dce2d89c269d5373a120d4a22fc2426ec992b0 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 3 Sep 2005 15:54:46 -0700 Subject: [PATCH] mm: comment rmap Just be clear that VM_RESERVED pages here are a bug, and the test is not there because they are expected. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 28c6cf96d3c..f5a6966b7eb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -527,6 +527,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. + * + * Pages belonging to VM_RESERVED regions should not happen here. 
*/ if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || ptep_clear_flush_young(vma, address, pte)) { -- cgit v1.2.3-70-g09d2 From 2822c1aa574d277b9ba0130b1e71c1a5874bc04a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 3 Sep 2005 15:54:47 -0700 Subject: [PATCH] mm: micro-optimise rmap Microoptimise page_add_anon_rmap. Although these expressions are used only in the taken branch of the if() statement, the compiler can't reorder them inside because atomic_inc_and_test is a barrier. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index f5a6966b7eb..7e975ca24c7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -439,22 +439,23 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - struct anon_vma *anon_vma = vma->anon_vma; - pgoff_t index; - BUG_ON(PageReserved(page)); - BUG_ON(!anon_vma); inc_mm_counter(vma->vm_mm, anon_rss); - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - index = (address - vma->vm_start) >> PAGE_SHIFT; - index += vma->vm_pgoff; - index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (atomic_inc_and_test(&page->_mapcount)) { - page->index = index; + struct anon_vma *anon_vma = vma->anon_vma; + pgoff_t index; + + BUG_ON(!anon_vma); + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; + + index = (address - vma->vm_start) >> PAGE_SHIFT; + index += vma->vm_pgoff; + index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + page->index = index; + inc_page_state(nr_mapped); } /* else checking page index and mapping is racy */ -- cgit v1.2.3-70-g09d2 From 4d7670e0f649f9e6e6ea6c8bb9f52441fa00f92b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 3 Sep 2005 15:54:48 -0700 Subject: [PATCH] mm: cleanup rmap Thanks to Bill Irwin for pointing this out. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 7e975ca24c7..450f5241b5a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -445,16 +445,12 @@ void page_add_anon_rmap(struct page *page, if (atomic_inc_and_test(&page->_mapcount)) { struct anon_vma *anon_vma = vma->anon_vma; - pgoff_t index; BUG_ON(!anon_vma); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; - index = (address - vma->vm_start) >> PAGE_SHIFT; - index += vma->vm_pgoff; - index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - page->index = index; + page->index = linear_page_index(vma, address); inc_page_state(nr_mapped); } -- cgit v1.2.3-70-g09d2 From 9a61c349b28ec5aef7e929236571fd770fdef0bb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 3 Sep 2005 15:54:49 -0700 Subject: [PATCH] mm: remap ZERO_PAGE mappings filemap_xip's nopage routine maps the ZERO_PAGE into readonly mappings, if it has no data page to map there: then if the hole in the file is later filled, __xip_unmap uses an rmap technique to replace the ZERO_PAGEs mapped for that offset by the newly allocated file page, so that established mappings will see the newly written data. 
However, on MIPS (alone) there's not one but as many as eight ZERO_PAGEs, chosen for coloring by user virtual address; and if mremap has meanwhile been used to move a mapping containing a ZERO_PAGE, it will generally not match the ZERO_PAGE(address) __xip_unmap is looking for. To maintain XIP's established mappings correctly on MIPS, we need Nick's fix to mremap's move_one_page (originally presented as an optimization), to replace the ZERO_PAGE appropriate to the old address by the ZERO_PAGE appropriate to the new address. (But when I first saw this, I was thinking the ZERO_PAGEs themselves would get corrupted, very bad. Now I think it's the other way round, that the established mappings will fail to see the newly written data: incorrect, but not corrupting everything else. Whether filemap_xip's technique is generally safe, I'd hesitate to say in a hurry: it's interesting, but we've never tried to do that in tmpfs.) Signed-off-by: Hugh Dickins Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index fc45dc9a617..a32fed454bd 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -141,6 +141,10 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr, if (dst) { pte_t pte; pte = ptep_clear_flush(vma, old_addr, src); + /* ZERO_PAGE can be dependant on virtual addr */ + if (pfn_valid(pte_pfn(pte)) && + pte_page(pte) == ZERO_PAGE(old_addr)) + pte = pte_wrprotect(mk_pte(ZERO_PAGE(new_addr), new_vma->vm_page_prot)); set_pte_at(mm, new_addr, dst, pte); } else error = -ENOMEM; -- cgit v1.2.3-70-g09d2 From 242e54686257493f0b10ac557e730419d9af7d24 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 3 Sep 2005 15:54:50 -0700 Subject: [PATCH] mm: remove atomic This bitop does not need to be atomic because it is performed when there will be no references to the page (ie. the page is being freed). Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 1 + mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f5a6695d4d2..99f7cc49506 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,6 +194,7 @@ extern void __mod_page_state(unsigned long offset, unsigned long delta); #define SetPageDirty(page) set_bit(PG_dirty, &(page)->flags) #define TestSetPageDirty(page) test_and_set_bit(PG_dirty, &(page)->flags) #define ClearPageDirty(page) clear_bit(PG_dirty, &(page)->flags) +#define __ClearPageDirty(page) __clear_bit(PG_dirty, &(page)->flags) #define TestClearPageDirty(page) test_and_clear_bit(PG_dirty, &(page)->flags) #define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d088371196..620aa11b24e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -329,7 +329,7 @@ static inline void free_pages_check(const char *function, struct page *page) 1 << PG_writeback ))) bad_page(function, page); if (PageDirty(page)) - ClearPageDirty(page); + __ClearPageDirty(page); } /* -- cgit v1.2.3-70-g09d2 From bce5f6ba340b09d8b29902add204bb95a6d3d88b Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Sat, 3 Sep 2005 15:54:50 -0700 Subject: [PATCH] VM: add capabilites check to set_zone_reclaim Add a capability check to sys_set_zone_reclaim(). This syscall is not something that should be available to a user. 
Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/capability.h | 1 + mm/vmscan.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'mm') diff --git a/include/linux/capability.h b/include/linux/capability.h index 8d139f4acf2..6b4618902d3 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -233,6 +233,7 @@ typedef __u32 kernel_cap_t; /* Allow enabling/disabling tagged queuing on SCSI controllers and sending arbitrary SCSI commands */ /* Allow setting encryption key on loopback filesystem */ +/* Allow setting zone reclaim policy */ #define CAP_SYS_ADMIN 21 diff --git a/mm/vmscan.c b/mm/vmscan.c index cfffe5098d5..ab631a3c62c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1375,6 +1375,9 @@ asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, struct zone *z; int i; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (node >= MAX_NUMNODES || !node_online(node)) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 53e9a6159fdc6419874ce4d86d3577dbedc77b62 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Sat, 3 Sep 2005 15:54:51 -0700 Subject: [PATCH] VM: zone reclaim atomic ops cleanup Christoph Lameter and Marcelo Tosatti asked to get rid of the atomic_inc_and_test() to cleanup the atomic ops in the zone reclaim code. Signed-off-by: Martin Hicks Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/vmscan.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 620aa11b24e..d157dae8c9f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1909,7 +1909,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->nr_scan_inactive = 0; zone->nr_active = 0; zone->nr_inactive = 0; - atomic_set(&zone->reclaim_in_progress, -1); + atomic_set(&zone->reclaim_in_progress, 0); if (!size) continue; diff --git a/mm/vmscan.c b/mm/vmscan.c index ab631a3c62c..0095533cdde 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -822,6 +822,8 @@ shrink_zone(struct zone *zone, struct scan_control *sc) unsigned long nr_active; unsigned long nr_inactive; + atomic_inc(&zone->reclaim_in_progress); + /* * Add one to `nr_to_scan' just to make sure that the kernel will * slowly sift through the active list. @@ -861,6 +863,8 @@ shrink_zone(struct zone *zone, struct scan_control *sc) } throttle_vm_writeout(); + + atomic_dec(&zone->reclaim_in_progress); } /* @@ -900,9 +904,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - atomic_inc(&zone->reclaim_in_progress); shrink_zone(zone, sc); - atomic_dec(&zone->reclaim_in_progress); } } @@ -1358,14 +1360,13 @@ int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order) sc.swap_cluster_max = SWAP_CLUSTER_MAX; /* Don't reclaim the zone if there are other reclaimers active */ - if (!atomic_inc_and_test(&zone->reclaim_in_progress)) + if (atomic_read(&zone->reclaim_in_progress) > 0) goto out; shrink_zone(zone, &sc); total_reclaimed = sc.nr_reclaimed; out: - atomic_dec(&zone->reclaim_in_progress); return total_reclaimed; } -- cgit v1.2.3-70-g09d2 From 836d5ffd34550901ea024347693e689273ded8aa Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:53 -0700 Subject: [PATCH] mm: fix madvise vma merging Better late than never, I've at last reviewed the madvise vma merging going into 2.6.13. 
Remove a pointless check and fix two little bugs - a simple test (with /proc/<pid>/maps hacked to show ReadHints) showed both mismerges in practice: though being madvise, neither was disastrous. 1. Correct placement of the success label in madvise_behavior: as in mprotect_fixup and mlock_fixup, it is necessary to update vm_flags when vma_merge succeeds (to handle the exceptional Case 8 noted in the comments above vma_merge itself). 2. Correct initial value of prev when starting part way into a vma: as in sys_mprotect and do_mlock, it needs to be set to vma in this case (vma_merge handles only that minimum of cases shown in its comments). 3. If find_vma_prev sets prev, then the vma it returns is prev->vm_next, so it's pointless to make that same assignment again in sys_madvise. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index c8c01a12fea..4454936f87d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -37,7 +37,7 @@ static long madvise_behavior(struct vm_area_struct * vma, if (new_flags == vma->vm_flags) { *prev = vma; - goto success; + goto out; } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -62,6 +62,7 @@ static long madvise_behavior(struct vm_area_struct * vma, goto out; } +success: /* * vm_flags is protected by the mmap_sem held in write mode. */ @@ -70,7 +71,6 @@ static long madvise_behavior(struct vm_area_struct * vma, out: if (error == -ENOMEM) error = -EAGAIN; -success: return error; } @@ -237,8 +237,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) * - different from the way of handling in mlock etc. */ vma = find_vma_prev(current->mm, start, &prev); - if (!vma && prev) - vma = prev->vm_next; + if (vma && start > vma->vm_start) + prev = vma; + for (;;) { /* Still start < end. */ error = -ENOMEM; -- cgit v1.2.3-70-g09d2 From 0abf40c1ac3f25d264c019e1cfe155d590defb87 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Sat, 3 Sep 2005 15:54:54 -0700 Subject: [PATCH] vm: slab.c spelling correction Fix a small spelling mistake. subtile->subtle Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c9e706db463..ae6cca04de4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -600,7 +600,7 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, csizep++; /* - * Really subtile: The last entry with cs->cs_size==ULONG_MAX + * Really subtle: The last entry with cs->cs_size==ULONG_MAX * has cs_{dma,}cachep==NULL. Thus no special case * for large kmalloc calls required. */ -- cgit v1.2.3-70-g09d2 From d44ed4f86892e350f4b16a3489b7e7c1a9bb7ead Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Sat, 3 Sep 2005 15:54:55 -0700 Subject: [PATCH] shmem_populate: avoid a useless check, and some comments Either shmem_getpage returns a failure, or it found a page, or it was told it couldn't do any I/O. So it's useless to check nonblock in the else branch. We could add a BUG() there but I preferred to comment the offending function. This was taken out from one of Ingo Molnar's old patches I'm resurrecting.
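For reference, the nonblock case discussed here is the one reached when a populated mapping is set up with MAP_NONBLOCK; a rough userspace sketch that exercises that path on a tmpfs file might look as follows (hypothetical example, not part of this patch; the file path and size are assumptions):

    /* Hypothetical sketch: exercise the populate path on a tmpfs file with
     * MAP_POPULATE | MAP_NONBLOCK, i.e. the nonblock case discussed above.
     * The file under /dev/shm and the 1MB length are assumptions. */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "/dev/shm/populate-test";
        size_t len = 1UL << 20;
        void *p;
        int fd;

        fd = open(path, O_RDWR | O_CREAT, 0600);
        if (fd < 0 || ftruncate(fd, len) < 0) {
            perror(path);
            return 1;
        }
        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                 MAP_SHARED | MAP_POPULATE | MAP_NONBLOCK, fd, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        munmap(p, len);
        close(fd);
        unlink(path);
        return 0;
    }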
Signed-off-by: Paolo 'Blaisorblade' Giarrusso Cc: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 +++++++ mm/shmem.c | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index edc54436fa9..88611928e71 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1504,8 +1504,12 @@ repeat: return -EINVAL; page = filemap_getpage(file, pgoff, nonblock); + + /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as + * done in shmem_populate calling shmem_getpage */ if (!page && !nonblock) return -ENOMEM; + if (page) { err = install_page(mm, vma, addr, page, prot); if (err) { @@ -1513,6 +1517,9 @@ repeat: return err; } } else { + /* No page was found just because we can't read it in now (being + * here implies nonblock != 0), but the page may exist, so set + * the PTE to fault it in later. */ err = install_file_pte(mm, vma, addr, pgoff, prot); if (err) return err; diff --git a/mm/shmem.c b/mm/shmem.c index 5a81b1ee4f7..08a3bc2fba6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1195,6 +1195,7 @@ static int shmem_populate(struct vm_area_struct *vma, err = shmem_getpage(inode, pgoff, &page, sgp, NULL); if (err) return err; + /* Page may still be null, but only if nonblock was set. */ if (page) { mark_page_accessed(page); err = install_page(mm, vma, addr, page, prot); @@ -1202,7 +1203,10 @@ static int shmem_populate(struct vm_area_struct *vma, page_cache_release(page); return err; } - } else if (nonblock) { + } else { + /* No page was found just because we can't read it in + * now (being here implies nonblock != 0), but the page + * may exist, so set the PTE to fault it in later. */ err = install_file_pte(mm, vma, addr, pgoff, prot); if (err) return err; -- cgit v1.2.3-70-g09d2 From 4944e76d81801b8e60ed3e7789443f210c16ed65 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Sat, 3 Sep 2005 15:54:56 -0700 Subject: [PATCH] mm: remove implied vm_ops check If !vma->vm-ops we already BUG above, so retesting it is useless. The compiler cannot optimize this because BUG is a macro and is not thus marked noreturn; that should possibly be fixed. Signed-off-by: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index a596c117224..b25f5e58a14 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1955,7 +1955,7 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, * Fall back to the linear mapping if the fs does not support * ->populate: */ - if (!vma->vm_ops || !vma->vm_ops->populate || + if (!vma->vm_ops->populate || (write_access && !(vma->vm_flags & VM_SHARED))) { pte_clear(mm, address, pte); return do_no_page(mm, vma, address, write_access, pte, pmd); -- cgit v1.2.3-70-g09d2 From fd195c49fb17a21e232f50bddb2267150053cf34 Mon Sep 17 00:00:00 2001 From: Deepak Saxena Date: Sat, 3 Sep 2005 15:54:58 -0700 Subject: [PATCH] arm: allow for arch-specific IOREMAP_MAX_ORDER Version 6 of the ARM architecture introduces the concept of 16MB pages (supersections) and 36-bit (40-bit actually, but nobody uses this) physical addresses. 36-bit addressed memory and I/O and ARMv6 can only be mapped using supersections and the requirement on these is that both virtual and physical addresses be 16MB aligned. 
In trying to add support for ioremap() of 36-bit I/O, we run into the issue that get_vm_area() allows for a maximum of 512K alignment via the IOREMAP_MAX_ORDER constant. To work around this, we can: - Allocate a larger VM area than needed (size + (1ul << IOREMAP_MAX_ORDER)) and then align the pointer ourselves, but this ends up with 512K of wasted VM per ioremap(). - Provide a new __get_vm_area_aligned() API and make __get_vm_area() sit on top of this. I did this and it works but I don't like the idea adding another VM API just for this one case. - My preferred solution which is to allow the architecture to override the IOREMAP_MAX_ORDER constant with it's own version. Signed-off-by: Deepak Saxena Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 8 ++++++++ mm/vmalloc.c | 2 -- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6409d9cf596..b244f69ef68 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -10,6 +10,14 @@ #define VM_MAP 0x00000004 /* vmap()ed pages */ /* bits [20..32] reserved for arch specific ioremap internals */ +/* + * Maximum alignment for ioremap() regions. + * Can be overriden by arch-specific value. + */ +#ifndef IOREMAP_MAX_ORDER +#define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ +#endif + struct vm_struct { void *addr; unsigned long size; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8ff16a1eee6..67b358e57ef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -158,8 +158,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) return err; } -#define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ - struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { -- cgit v1.2.3-70-g09d2 From 7bf07f3d4b4358aa6d99a26d7a0165f1e91c3fcc Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Sat, 3 Sep 2005 15:55:00 -0700 Subject: [PATCH] hugetlb: move stale pte check into huge_pte_alloc() Initial Post (Wed, 17 Aug 2005) This patch moves the if (! pte_none(*pte)) hugetlb_clean_stale_pgtable(pte); logic into huge_pte_alloc() so all of its callers can be immune to the bug described by Kenneth Chen at http://lkml.org/lkml/2004/6/16/246 > It turns out there is a bug in hugetlb_prefault(): with 3 level page table, > huge_pte_alloc() might return a pmd that points to a PTE page. It happens > if the virtual address for hugetlb mmap is recycled from previously used > normal page mmap. free_pgtables() might not scrub the pmd entry on > munmap and hugetlb_prefault skips on any pmd presence regardless what type > it is. Unless I am missing something, it seems more correct to place the check inside huge_pte_alloc() to prevent a the same bug wherever a huge pte is allocated. It also allows checking for this condition when lazily faulting huge pages later in the series. 
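For reference, the address recycling described in the quoted report corresponds roughly to the userspace sequence sketched below (hypothetical example, not part of this patch; the hugetlbfs path, the mapping size and the alignment handling are all assumptions):

    /* Hypothetical sketch: reuse a virtual address range that previously held
     * a normal anonymous mapping for a hugetlb mapping - the recycling case
     * the stale pte check is meant to handle. Assumes a hugetlbfs mount at
     * /mnt/huge with reserved huge pages; alignment of the recycled address
     * to a huge page boundary is glossed over here. */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        size_t len = 4UL << 20; /* illustrative; must be a multiple of the huge page size */
        void *addr;
        int fd;

        /* Populate normal page tables in some address range... */
        addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (addr == MAP_FAILED)
            return 1;
        memset(addr, 0, len);
        /* ...then drop the mapping and reuse the same range for hugetlb. */
        munmap(addr, len);

        fd = open("/mnt/huge/test", O_CREAT | O_RDWR, 0600);
        if (fd < 0) {
            perror("/mnt/huge/test");
            return 1;
        }
        addr = mmap(addr, len, PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_FIXED, fd, 0);
        if (addr == MAP_FAILED)
            perror("hugetlb mmap");
        close(fd);
        return 0;
    }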
Signed-off-by: Adam Litke Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/hugetlbpage.c | 13 +++++++++++-- mm/hugetlb.c | 2 -- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 3b099f32b94..57c486f0e89 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -22,12 +22,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd = NULL; + pmd_t *pmd; + pte_t *pte = NULL; pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); pmd = pmd_alloc(mm, pud, addr); - return (pte_t *) pmd; + + if (!pmd) + goto out; + + pte = (pte_t *) pmd; + if (!pte_none(*pte) && !pte_huge(*pte)) + hugetlb_clean_stale_pgtable(pte); +out: + return pte; } pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6bf720bc662..901ac523a1c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -360,8 +360,6 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) ret = -ENOMEM; goto out; } - if (! pte_none(*pte)) - hugetlb_clean_stale_pgtable(pte); idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); -- cgit v1.2.3-70-g09d2 From fa5b08d5f818063d18433194f20359ef2ae50254 Mon Sep 17 00:00:00 2001 From: Kyle Moffett Date: Sat, 3 Sep 2005 15:55:03 -0700 Subject: [PATCH] sab: consolidate kmem_bufctl_t This is used only in slab.c and each architecture gets to define whcih underlying type is to be used. Seems a bit silly - move it to slab.c and use the same type for all architectures: unsigned int. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/types.h | 2 -- include/asm-arm/types.h | 2 -- include/asm-arm26/types.h | 2 -- include/asm-cris/types.h | 2 -- include/asm-frv/types.h | 2 -- include/asm-h8300/types.h | 2 -- include/asm-i386/types.h | 2 -- include/asm-ia64/types.h | 2 -- include/asm-m32r/types.h | 2 -- include/asm-m68k/types.h | 2 -- include/asm-mips/types.h | 2 -- include/asm-parisc/types.h | 2 -- include/asm-ppc/types.h | 2 -- include/asm-ppc64/types.h | 1 - include/asm-s390/types.h | 2 -- include/asm-sh/types.h | 2 -- include/asm-sh64/types.h | 2 -- include/asm-sparc/types.h | 2 -- include/asm-sparc64/types.h | 2 -- include/asm-v850/types.h | 2 -- include/asm-x86_64/types.h | 2 -- include/asm-xtensa/types.h | 2 -- mm/slab.c | 1 + 23 files changed, 1 insertion(+), 43 deletions(-) (limited to 'mm') diff --git a/include/asm-alpha/types.h b/include/asm-alpha/types.h index 43264d21924..f5716139ec8 100644 --- a/include/asm-alpha/types.h +++ b/include/asm-alpha/types.h @@ -56,8 +56,6 @@ typedef unsigned long u64; typedef u64 dma_addr_t; typedef u64 dma64_addr_t; -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ALPHA_TYPES_H */ diff --git a/include/asm-arm/types.h b/include/asm-arm/types.h index f4c92e4c8c0..22992ee0627 100644 --- a/include/asm-arm/types.h +++ b/include/asm-arm/types.h @@ -52,8 +52,6 @@ typedef unsigned long long u64; typedef u32 dma_addr_t; typedef u32 dma64_addr_t; -typedef unsigned int kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-arm26/types.h b/include/asm-arm26/types.h index 56cbe573a23..81bd357ada0 100644 --- a/include/asm-arm26/types.h +++ b/include/asm-arm26/types.h @@ -52,8 +52,6 @@ typedef unsigned long long u64; typedef u32 dma_addr_t; typedef u32 
dma64_addr_t; -typedef unsigned int kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-cris/types.h b/include/asm-cris/types.h index 8fa6d6c7afc..84557c9bac9 100644 --- a/include/asm-cris/types.h +++ b/include/asm-cris/types.h @@ -52,8 +52,6 @@ typedef unsigned long long u64; typedef u32 dma_addr_t; typedef u32 dma64_addr_t; -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-frv/types.h b/include/asm-frv/types.h index 1a5b6546bb4..50605df6d8a 100644 --- a/include/asm-frv/types.h +++ b/include/asm-frv/types.h @@ -65,8 +65,6 @@ typedef u64 u_quad_t; typedef u32 dma_addr_t; -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-h8300/types.h b/include/asm-h8300/types.h index 21f4fc07ac0..bf91e0d4dde 100644 --- a/include/asm-h8300/types.h +++ b/include/asm-h8300/types.h @@ -58,8 +58,6 @@ typedef u32 dma_addr_t; #define HAVE_SECTOR_T typedef u64 sector_t; -typedef unsigned int kmem_bufctl_t; - #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/include/asm-i386/types.h b/include/asm-i386/types.h index 901b77c42b8..ced00fe8fe6 100644 --- a/include/asm-i386/types.h +++ b/include/asm-i386/types.h @@ -63,8 +63,6 @@ typedef u64 sector_t; #define HAVE_SECTOR_T #endif -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-ia64/types.h b/include/asm-ia64/types.h index a677565aa95..902850d1242 100644 --- a/include/asm-ia64/types.h +++ b/include/asm-ia64/types.h @@ -67,8 +67,6 @@ typedef __u64 u64; typedef u64 dma_addr_t; -typedef unsigned short kmem_bufctl_t; - # endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/include/asm-m32r/types.h b/include/asm-m32r/types.h index ca0a887d223..fcf24c64c3b 100644 --- a/include/asm-m32r/types.h +++ b/include/asm-m32r/types.h @@ -55,8 +55,6 @@ typedef unsigned long long u64; typedef u32 dma_addr_t; typedef u64 dma64_addr_t; -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-m68k/types.h b/include/asm-m68k/types.h index f391cbe39b9..b5a1febc97d 100644 --- a/include/asm-m68k/types.h +++ b/include/asm-m68k/types.h @@ -60,8 +60,6 @@ typedef unsigned long long u64; typedef u32 dma_addr_t; typedef u32 dma64_addr_t; -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-mips/types.h b/include/asm-mips/types.h index d2f0c76b00a..b949ab33e8e 100644 --- a/include/asm-mips/types.h +++ b/include/asm-mips/types.h @@ -99,8 +99,6 @@ typedef u64 sector_t; #define HAVE_SECTOR_T #endif -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-parisc/types.h b/include/asm-parisc/types.h index 8fe7a44ea20..d21b9d0d63e 100644 --- a/include/asm-parisc/types.h +++ b/include/asm-parisc/types.h @@ -56,8 +56,6 @@ typedef unsigned long long u64; typedef u32 dma_addr_t; typedef u64 dma64_addr_t; -typedef unsigned int kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-ppc/types.h b/include/asm-ppc/types.h index a787bc03258..77dc24d7d2a 100644 --- a/include/asm-ppc/types.h +++ b/include/asm-ppc/types.h @@ -62,8 +62,6 @@ typedef u64 sector_t; #define HAVE_SECTOR_T #endif -typedef unsigned int kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-ppc64/types.h b/include/asm-ppc64/types.h index 
5b8c2cfa113..bf294c1761b 100644 --- a/include/asm-ppc64/types.h +++ b/include/asm-ppc64/types.h @@ -72,7 +72,6 @@ typedef struct { unsigned long env; } func_descr_t; -typedef unsigned int kmem_bufctl_t; #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-s390/types.h b/include/asm-s390/types.h index 3fefd61416a..d0be3e47701 100644 --- a/include/asm-s390/types.h +++ b/include/asm-s390/types.h @@ -79,8 +79,6 @@ typedef unsigned long u64; typedef u32 dma_addr_t; -typedef unsigned int kmem_bufctl_t; - #ifndef __s390x__ typedef union { unsigned long long pair; diff --git a/include/asm-sh/types.h b/include/asm-sh/types.h index c4dc126c562..cb7e183a0a6 100644 --- a/include/asm-sh/types.h +++ b/include/asm-sh/types.h @@ -58,8 +58,6 @@ typedef u64 sector_t; #define HAVE_SECTOR_T #endif -typedef unsigned int kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-sh64/types.h b/include/asm-sh64/types.h index 41d4d2f82aa..8d41db2153b 100644 --- a/include/asm-sh64/types.h +++ b/include/asm-sh64/types.h @@ -65,8 +65,6 @@ typedef u32 dma_addr_t; #endif typedef u64 dma64_addr_t; -typedef unsigned int kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #define BITS_PER_LONG 32 diff --git a/include/asm-sparc/types.h b/include/asm-sparc/types.h index 9eabf6e61cc..42fc6ed9815 100644 --- a/include/asm-sparc/types.h +++ b/include/asm-sparc/types.h @@ -54,8 +54,6 @@ typedef unsigned long long u64; typedef u32 dma_addr_t; typedef u32 dma64_addr_t; -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-sparc64/types.h b/include/asm-sparc64/types.h index 6248ed1a9a7..d0ee7f10583 100644 --- a/include/asm-sparc64/types.h +++ b/include/asm-sparc64/types.h @@ -56,8 +56,6 @@ typedef unsigned long u64; typedef u32 dma_addr_t; typedef u64 dma64_addr_t; -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-v850/types.h b/include/asm-v850/types.h index e7cfe5b33a1..dcef5719687 100644 --- a/include/asm-v850/types.h +++ b/include/asm-v850/types.h @@ -59,8 +59,6 @@ typedef unsigned long long u64; typedef u32 dma_addr_t; -typedef unsigned int kmem_bufctl_t; - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-x86_64/types.h b/include/asm-x86_64/types.h index 32bd1426b52..c86c2e6793e 100644 --- a/include/asm-x86_64/types.h +++ b/include/asm-x86_64/types.h @@ -51,8 +51,6 @@ typedef u64 dma_addr_t; typedef u64 sector_t; #define HAVE_SECTOR_T -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-xtensa/types.h b/include/asm-xtensa/types.h index ebac0046985..9d99a8e9e33 100644 --- a/include/asm-xtensa/types.h +++ b/include/asm-xtensa/types.h @@ -58,8 +58,6 @@ typedef unsigned long long u64; typedef u32 dma_addr_t; -typedef unsigned int kmem_bufctl_t; - #endif /* __KERNEL__ */ #endif diff --git a/mm/slab.c b/mm/slab.c index ae6cca04de4..59d382fbca1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -189,6 +189,7 @@ * is less than 512 (PAGE_SIZE<<3), but greater than 256. 
*/ +typedef unsigned int kmem_bufctl_t; #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) -- cgit v1.2.3-70-g09d2 From a600388d28419305aad3c4c0af52c223cf6fa0af Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:55:04 -0700 Subject: [PATCH] x86: ptep_clear optimization Add a new accessor for PTEs, which passes the full hint from the mmu_gather struct; this allows architectures with hardware pagetables to optimize away atomic PTE operations when destroying an address space. Removing the locked operation should allow better pipelining of memory access in this loop. I measured an average savings of 30-35 cycles per zap_pte_range on the first 500 destructions on Pentium-M, but I believe the optimization would win more on older processors which still assert the bus lock on xchg for an exclusive cacheline. Update: I made some new measurements, and this saves exactly 26 cycles over ptep_get_and_clear on Pentium M. On P4, with a PAE kernel, this saves 180 cycles per ptep_get_and_clear, for a whopping 92160 cycles savings for a full address space destruction. pte_clear_full is not yet used, but is provided for future optimizations (in particular, when running inside of a hypervisor that queues page table updates, the full hint allows us to avoid queueing unnecessary page table update for an address space in the process of being destroyed. This is not a huge win, but it does help a bit, and sets the stage for further hypervisor optimization of the mm layer on all architectures. Signed-off-by: Zachary Amsden Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 16 ++++++++++++++++ include/asm-i386/pgtable.h | 13 +++++++++++++ mm/memory.c | 5 +++-- 3 files changed, 32 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index f4059356517..f86c1e54946 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -101,6 +101,22 @@ do { \ }) #endif +#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL +#define ptep_get_and_clear_full(__mm, __address, __ptep, __full) \ +({ \ + pte_t __pte; \ + __pte = ptep_get_and_clear((__mm), (__address), (__ptep)); \ + __pte; \ +}) +#endif + +#ifndef __HAVE_ARCH_PTE_CLEAR_FULL +#define pte_clear_full(__mm, __address, __ptep, __full) \ +do { \ + pte_clear((__mm), (__address), (__ptep)); \ +} while (0) +#endif + #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH #define ptep_clear_flush(__vma, __address, __ptep) \ ({ \ diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index f51fd2c956b..d74185aee15 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -260,6 +260,18 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); } +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) +{ + pte_t pte; + if (full) { + pte = *ptep; + *ptep = __pte(0); + } else { + pte = ptep_get_and_clear(mm, addr, ptep); + } + return pte; +} + static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { clear_bit(_PAGE_BIT_RW, &ptep->pte_low); @@ -417,6 +429,7 @@ extern void noexec_setup(const char *str); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY #define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define 
__HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define __HAVE_ARCH_PTE_SAME #include <asm-generic/pgtable.h> diff --git a/mm/memory.c b/mm/memory.c index b25f5e58a14..788a6281034 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -562,7 +562,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear(tlb->mm, addr, pte); + ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, + tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; @@ -590,7 +591,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear(tlb->mm, addr, pte); + pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); } -- cgit v1.2.3-70-g09d2 From 34342e863c3143640c031760140d640a06c6a5f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 3 Sep 2005 15:55:06 -0700 Subject: [PATCH] mm/slab.c: prefetchw the start of newly allocated objects Most objects returned by __cache_alloc() will be written by the caller (though not all callers want to write the whole object, just the beginning). prefetchw() tells the modern CPU to think about the future writes, i.e. start some memory transactions in advance. Signed-off-by: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 59d382fbca1..75127a6f1fd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2166,7 +2166,9 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast fl objp = cache_alloc_refill(cachep, flags); } local_irq_restore(save_flags); - objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0)); + objp = cache_alloc_debugcheck_after(cachep, flags, objp, + __builtin_return_address(0)); + prefetchw(objp); return objp; } -- cgit v1.2.3-70-g09d2 From 00e145b6d59a16dd7740197a18f7abdb3af004a9 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sat, 3 Sep 2005 15:55:07 -0700 Subject: [PATCH] slab: removes local_irq_save()/local_irq_restore() pair Proposed by and based on a patch from Eric Dumazet: This patch removes an unnecessary critical section in the ksize() function, as cli/sti are rather expensive on modern CPUs. It additionally adds a docbook entry for ksize() and further simplifies the code. Signed-Off-By: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 75127a6f1fd..a9ff4f7f986 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3076,20 +3076,24 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, } #endif +/** + * ksize - get the actual amount of memory allocated for a given object + * @objp: Pointer to the object + * + * kmalloc may internally round up allocations and return more memory + * than requested. ksize() can be used to determine the actual amount of + * memory allocated. The caller may use this additional memory, even though + * a smaller amount of memory was initially specified with the kmalloc call. + * The caller must guarantee that objp points to a valid object previously + * allocated with either kmalloc() or kmem_cache_alloc(). The object + * must not be freed during the duration of the call.
+ */ unsigned int ksize(const void *objp) { - kmem_cache_t *c; - unsigned long flags; - unsigned int size = 0; - - if (likely(objp != NULL)) { - local_irq_save(flags); - c = GET_PAGE_CACHE(virt_to_page(objp)); - size = kmem_cache_size(c); - local_irq_restore(flags); - } + if (unlikely(objp == NULL)) + return 0; - return size; + return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp))); } -- cgit v1.2.3-70-g09d2 From c07e02db76940c75fc92f2f2c9adcdbb09ed70d0 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Sat, 3 Sep 2005 15:55:11 -0700 Subject: [PATCH] VM: add page_state info to per-node meminfo Add page_state info to the per-node meminfo file in sysfs. This is mostly just for informational purposes. The lack of this information was brought up recently during a discussion regarding pagecache clearing, and I put this patch together to test out one of the suggestions. It seems like interesting info to have, so I'm submitting the patch. Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 24 ++++++++++++++++++++++-- include/linux/page-flags.h | 1 + mm/page_alloc.c | 25 ++++++++++++++++++++----- 3 files changed, 43 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/drivers/base/node.c b/drivers/base/node.c index 904b27caf69..16c513aa4d4 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -39,13 +39,25 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) int n; int nid = dev->id; struct sysinfo i; + struct page_state ps; unsigned long inactive; unsigned long active; unsigned long free; si_meminfo_node(&i, nid); + get_page_state_node(&ps, nid); __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid)); + /* Check for negative values in these approximate counters */ + if ((long)ps.nr_dirty < 0) + ps.nr_dirty = 0; + if ((long)ps.nr_writeback < 0) + ps.nr_writeback = 0; + if ((long)ps.nr_mapped < 0) + ps.nr_mapped = 0; + if ((long)ps.nr_slab < 0) + ps.nr_slab = 0; + n = sprintf(buf, "\n" "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" @@ -55,7 +67,11 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) "Node %d HighTotal: %8lu kB\n" "Node %d HighFree: %8lu kB\n" "Node %d LowTotal: %8lu kB\n" - "Node %d LowFree: %8lu kB\n", + "Node %d LowFree: %8lu kB\n" + "Node %d Dirty: %8lu kB\n" + "Node %d Writeback: %8lu kB\n" + "Node %d Mapped: %8lu kB\n" + "Node %d Slab: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), nid, K(i.totalram - i.freeram), @@ -64,7 +80,11 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(i.totalhigh), nid, K(i.freehigh), nid, K(i.totalram - i.totalhigh), - nid, K(i.freeram - i.freehigh)); + nid, K(i.freeram - i.freehigh), + nid, K(ps.nr_dirty), + nid, K(ps.nr_writeback), + nid, K(ps.nr_mapped), + nid, K(ps.nr_slab)); n += hugetlb_report_node_meminfo(nid, buf + n); return n; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 99f7cc49506..f34767c5fc7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -134,6 +134,7 @@ struct page_state { }; extern void get_page_state(struct page_state *ret); +extern void get_page_state_node(struct page_state *ret, int node); extern void get_full_page_state(struct page_state *ret); extern unsigned long __read_page_state(unsigned long offset); extern void __mod_page_state(unsigned long offset, unsigned long delta); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d157dae8c9f..b06a9636d97 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c 
@@ -1130,19 +1130,20 @@ EXPORT_SYMBOL(nr_pagecache); DEFINE_PER_CPU(long, nr_pagecache_local) = 0; #endif -void __get_page_state(struct page_state *ret, int nr) +void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; memset(ret, 0, sizeof(*ret)); + cpus_and(*cpumask, *cpumask, cpu_online_map); - cpu = first_cpu(cpu_online_map); + cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { unsigned long *in, *out, off; in = (unsigned long *)&per_cpu(page_states, cpu); - cpu = next_cpu(cpu, cpu_online_map); + cpu = next_cpu(cpu, *cpumask); if (cpu < NR_CPUS) prefetch(&per_cpu(page_states, cpu)); @@ -1153,19 +1154,33 @@ void __get_page_state(struct page_state *ret, int nr) } } +void get_page_state_node(struct page_state *ret, int node) +{ + int nr; + cpumask_t mask = node_to_cpumask(node); + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr+1, &mask); +} + void get_page_state(struct page_state *ret) { int nr; + cpumask_t mask = CPU_MASK_ALL; nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); nr /= sizeof(unsigned long); - __get_page_state(ret, nr + 1); + __get_page_state(ret, nr + 1, &mask); } void get_full_page_state(struct page_state *ret) { - __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); + cpumask_t mask = CPU_MASK_ALL; + + __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); } unsigned long __read_page_state(unsigned long offset) -- cgit v1.2.3-70-g09d2 From f549d6c18c0e8e6cf1bf0e7a47acc1daf7e2cec1 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Sat, 3 Sep 2005 15:55:18 -0700 Subject: [PATCH] Generic VFS fallback for security xattrs This patch modifies the VFS setxattr, getxattr, and listxattr code to fall back to the security module for security xattrs if the filesystem does not support xattrs natively. This allows security modules to export the incore inode security label information to userspace even if the filesystem does not provide xattr storage, and eliminates the need to individually patch various pseudo filesystem types to provide such access. The patch removes the existing xattr code from devpts and tmpfs as it is then no longer needed. The patch restructures the code flow slightly to reduce duplication between the normal path and the fallback path, but this should only have one user-visible side effect - a program may get -EACCES rather than -EOPNOTSUPP if policy denied access but the filesystem didn't support the operation anyway. Note that the post_setxattr hook call is not needed in the fallback case, as the inode_setsecurity hook call handles the incore inode security state update directly. In contrast, we do call fsnotify in both cases. Signed-off-by: Stephen Smalley Acked-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 43 ----------------------- fs/devpts/Makefile | 1 - fs/devpts/inode.c | 21 ------------ fs/devpts/xattr_security.c | 47 ------------------------- fs/xattr.c | 80 ++++++++++++++++++++++++++----------------- mm/shmem.c | 85 ---------------------------------------------- 6 files changed, 49 insertions(+), 228 deletions(-) delete mode 100644 fs/devpts/xattr_security.c (limited to 'mm') diff --git a/fs/Kconfig b/fs/Kconfig index e54be705835..ed78d24ee42 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -783,28 +783,6 @@ config SYSFS Designers of embedded systems may wish to say N here to conserve space. 
-config DEVPTS_FS_XATTR - bool "/dev/pts Extended Attributes" - depends on UNIX98_PTYS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - for details). - - If unsure, say N. - -config DEVPTS_FS_SECURITY - bool "/dev/pts Security Labels" - depends on DEVPTS_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the /dev/pts filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - config TMPFS bool "Virtual memory file system support (former shm fs)" help @@ -817,27 +795,6 @@ config TMPFS See for details. -config TMPFS_XATTR - bool "tmpfs Extended Attributes" - depends on TMPFS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - for details). - - If unsure, say N. - -config TMPFS_SECURITY - bool "tmpfs Security Labels" - depends on TMPFS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the tmpfs filesystem. - If you are not using a security module that requires using - extended attributes for file security labels, say N. - config HUGETLBFS bool "HugeTLB file system support" depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || X86_64 || BROKEN diff --git a/fs/devpts/Makefile b/fs/devpts/Makefile index 5800df2e50c..236696efcba 100644 --- a/fs/devpts/Makefile +++ b/fs/devpts/Makefile @@ -5,4 +5,3 @@ obj-$(CONFIG_UNIX98_PTYS) += devpts.o devpts-$(CONFIG_UNIX98_PTYS) := inode.o -devpts-$(CONFIG_DEVPTS_FS_SECURITY) += xattr_security.o diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 1571c8d6c23..f2be44d4491 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -18,28 +18,9 @@ #include #include #include -#include #define DEVPTS_SUPER_MAGIC 0x1cd1 -extern struct xattr_handler devpts_xattr_security_handler; - -static struct xattr_handler *devpts_xattr_handlers[] = { -#ifdef CONFIG_DEVPTS_FS_SECURITY - &devpts_xattr_security_handler, -#endif - NULL -}; - -static struct inode_operations devpts_file_inode_operations = { -#ifdef CONFIG_DEVPTS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, -#endif -}; - static struct vfsmount *devpts_mnt; static struct dentry *devpts_root; @@ -102,7 +83,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; s->s_op = &devpts_sops; - s->s_xattr = devpts_xattr_handlers; s->s_time_gran = 1; inode = new_inode(s); @@ -175,7 +155,6 @@ int devpts_pty_new(struct tty_struct *tty) inode->i_gid = config.setgid ? config.gid : current->fsgid; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; init_special_inode(inode, S_IFCHR|config.mode, device); - inode->i_op = &devpts_file_inode_operations; inode->u.generic_ip = tty; dentry = get_node(number); diff --git a/fs/devpts/xattr_security.c b/fs/devpts/xattr_security.c deleted file mode 100644 index 864cb5c79ba..00000000000 --- a/fs/devpts/xattr_security.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Security xattr support for devpts. 
- * - * Author: Stephen Smalley - * Copyright (c) 2004 Red Hat, Inc., James Morris - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - */ -#include -#include -#include -#include - -static size_t -devpts_xattr_security_list(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) -{ - return security_inode_listsecurity(inode, list, list_len); -} - -static int -devpts_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return security_inode_getsecurity(inode, name, buffer, size); -} - -static int -devpts_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return security_inode_setsecurity(inode, name, value, size, flags); -} - -struct xattr_handler devpts_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .list = devpts_xattr_security_list, - .get = devpts_xattr_security_get, - .set = devpts_xattr_security_set, -}; diff --git a/fs/xattr.c b/fs/xattr.c index 6acd5c63da9..dc8bc7624f2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -51,20 +51,29 @@ setxattr(struct dentry *d, char __user *name, void __user *value, } } + down(&d->d_inode->i_sem); + error = security_inode_setxattr(d, kname, kvalue, size, flags); + if (error) + goto out; error = -EOPNOTSUPP; if (d->d_inode->i_op && d->d_inode->i_op->setxattr) { - down(&d->d_inode->i_sem); - error = security_inode_setxattr(d, kname, kvalue, size, flags); - if (error) - goto out; - error = d->d_inode->i_op->setxattr(d, kname, kvalue, size, flags); + error = d->d_inode->i_op->setxattr(d, kname, kvalue, + size, flags); if (!error) { fsnotify_xattr(d); - security_inode_post_setxattr(d, kname, kvalue, size, flags); + security_inode_post_setxattr(d, kname, kvalue, + size, flags); } -out: - up(&d->d_inode->i_sem); + } else if (!strncmp(kname, XATTR_SECURITY_PREFIX, + sizeof XATTR_SECURITY_PREFIX - 1)) { + const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1; + error = security_inode_setsecurity(d->d_inode, suffix, kvalue, + size, flags); + if (!error) + fsnotify_xattr(d); } +out: + up(&d->d_inode->i_sem); if (kvalue) kfree(kvalue); return error; @@ -139,20 +148,25 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size) return -ENOMEM; } + error = security_inode_getxattr(d, kname); + if (error) + goto out; error = -EOPNOTSUPP; - if (d->d_inode->i_op && d->d_inode->i_op->getxattr) { - error = security_inode_getxattr(d, kname); - if (error) - goto out; + if (d->d_inode->i_op && d->d_inode->i_op->getxattr) error = d->d_inode->i_op->getxattr(d, kname, kvalue, size); - if (error > 0) { - if (size && copy_to_user(value, kvalue, error)) - error = -EFAULT; - } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { - /* The file system tried to returned a value bigger - than XATTR_SIZE_MAX bytes. Not possible. 
*/ - error = -E2BIG; - } + else if (!strncmp(kname, XATTR_SECURITY_PREFIX, + sizeof XATTR_SECURITY_PREFIX - 1)) { + const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1; + error = security_inode_getsecurity(d->d_inode, suffix, kvalue, + size); + } + if (error > 0) { + if (size && copy_to_user(value, kvalue, error)) + error = -EFAULT; + } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { + /* The file system tried to returned a value bigger + than XATTR_SIZE_MAX bytes. Not possible. */ + error = -E2BIG; } out: if (kvalue) @@ -221,20 +235,24 @@ listxattr(struct dentry *d, char __user *list, size_t size) return -ENOMEM; } + error = security_inode_listxattr(d); + if (error) + goto out; error = -EOPNOTSUPP; if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { - error = security_inode_listxattr(d); - if (error) - goto out; error = d->d_inode->i_op->listxattr(d, klist, size); - if (error > 0) { - if (size && copy_to_user(list, klist, error)) - error = -EFAULT; - } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { - /* The file system tried to returned a list bigger - than XATTR_LIST_MAX bytes. Not possible. */ - error = -E2BIG; - } + } else { + error = security_inode_listsecurity(d->d_inode, klist, size); + if (size && error >= size) + error = -ERANGE; + } + if (error > 0) { + if (size && copy_to_user(list, klist, error)) + error = -EFAULT; + } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { + /* The file system tried to returned a list bigger + than XATTR_LIST_MAX bytes. Not possible. */ + error = -E2BIG; } out: if (klist) diff --git a/mm/shmem.c b/mm/shmem.c index 08a3bc2fba6..bdc4bbb6ddb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include @@ -179,7 +178,6 @@ static struct address_space_operations shmem_aops; static struct file_operations shmem_file_operations; static struct inode_operations shmem_inode_operations; static struct inode_operations shmem_dir_inode_operations; -static struct inode_operations shmem_special_inode_operations; static struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info = { @@ -1300,7 +1298,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) switch (mode & S_IFMT) { default: - inode->i_op = &shmem_special_inode_operations; init_special_inode(inode, mode, dev); break; case S_IFREG: @@ -1804,12 +1801,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co static struct inode_operations shmem_symlink_inline_operations = { .readlink = generic_readlink, .follow_link = shmem_follow_link_inline, -#ifdef CONFIG_TMPFS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, -#endif }; static struct inode_operations shmem_symlink_inode_operations = { @@ -1817,12 +1808,6 @@ static struct inode_operations shmem_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = shmem_follow_link, .put_link = shmem_put_link, -#ifdef CONFIG_TMPFS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, -#endif }; static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) @@ -1942,12 +1927,6 @@ static void shmem_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -#ifdef CONFIG_TMPFS_XATTR -static struct xattr_handler *shmem_xattr_handlers[]; -#else 
-#define shmem_xattr_handlers NULL -#endif - static int shmem_fill_super(struct super_block *sb, void *data, int silent) { @@ -1998,7 +1977,6 @@ static int shmem_fill_super(struct super_block *sb, sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; - sb->s_xattr = shmem_xattr_handlers; inode = shmem_get_inode(sb, S_IFDIR | mode, 0); if (!inode) @@ -2087,12 +2065,6 @@ static struct file_operations shmem_file_operations = { static struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, .setattr = shmem_notify_change, -#ifdef CONFIG_TMPFS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, -#endif }; static struct inode_operations shmem_dir_inode_operations = { @@ -2106,21 +2078,6 @@ static struct inode_operations shmem_dir_inode_operations = { .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename, -#ifdef CONFIG_TMPFS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, -#endif -#endif -}; - -static struct inode_operations shmem_special_inode_operations = { -#ifdef CONFIG_TMPFS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, #endif }; @@ -2146,48 +2103,6 @@ static struct vm_operations_struct shmem_vm_ops = { }; -#ifdef CONFIG_TMPFS_SECURITY - -static size_t shmem_xattr_security_list(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) -{ - return security_inode_listsecurity(inode, list, list_len); -} - -static int shmem_xattr_security_get(struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return security_inode_getsecurity(inode, name, buffer, size); -} - -static int shmem_xattr_security_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return security_inode_setsecurity(inode, name, value, size, flags); -} - -static struct xattr_handler shmem_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .list = shmem_xattr_security_list, - .get = shmem_xattr_security_get, - .set = shmem_xattr_security_set, -}; - -#endif /* CONFIG_TMPFS_SECURITY */ - -#ifdef CONFIG_TMPFS_XATTR - -static struct xattr_handler *shmem_xattr_handlers[] = { -#ifdef CONFIG_TMPFS_SECURITY - &shmem_xattr_security_handler, -#endif - NULL -}; - -#endif /* CONFIG_TMPFS_XATTR */ - static struct super_block *shmem_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { -- cgit v1.2.3-70-g09d2 From c3d8c1414573be8cf7c8fdc1e076935697c7f6af Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 6 Sep 2005 15:16:33 -0700 Subject: [PATCH] More __read_mostly variables Move some more frequently read variables that showed up during some of our performance tests as sometimes ending up in hot cachelines to the read_mostly section. Fix: Move the __read_mostly from before hpet_usec_quotient to follow the variable like the other uses of __read_mostly. 
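For readers unfamiliar with the annotation, a minimal sketch of the pattern this series applies (the variable name below is made up, not from the patch): __read_mostly, typically made available through <linux/cache.h>, places a global in the .data.read_mostly section so data that is read on hot paths but rarely written does not share cache lines with write-hot variables.

#include <linux/cache.h>

/* Read on every lookup fast path, written only during initialization;
 * keeping it in .data.read_mostly avoids false sharing with write-hot data. */
static unsigned long example_hash_mask __read_mostly = 255;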
Signed-off-by: Alok N Kataria Signed-off-by: Christoph Lameter Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/setup.c | 2 +- arch/i386/kernel/timers/timer_hpet.c | 2 +- mm/mmap.c | 2 +- mm/page_alloc.c | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 294bcca985a..e29fd5aeaf8 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL(efi_enabled); /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index 001de97c9e4..6dbb29f834e 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -18,7 +18,7 @@ #include "mach_timer.h" #include -static unsigned long __read_mostly hpet_usec_quotient; /* convert hpet clks to usec */ +static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ static unsigned long tsc_hpet_quotient; /* convert tsc to hpet clks */ static unsigned long hpet_last; /* hpet counter value at last tick*/ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ diff --git a/mm/mmap.c b/mm/mmap.c index 404319477e7..bb43340d397 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -61,7 +61,7 @@ pgprot_t protection_map[16] = { int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ -int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; atomic_t vm_committed_space = ATOMIC_INIT(0); /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b06a9636d97..34bba8f1144 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -42,11 +42,11 @@ * MCD - HACK: Find somewhere to initialize this EARLY, or make this * initializer cleaner */ -nodemask_t node_online_map = { { [0] = 1UL } }; +nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; EXPORT_SYMBOL(node_online_map); -nodemask_t node_possible_map = NODE_MASK_ALL; +nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); -struct pglist_data *pgdat_list; +struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages; unsigned long totalhigh_pages; long nr_swap_pages; @@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages); * Used by page_zone() to look up the address of the struct zone whose * id is encoded in the upper bits of page->flags */ -struct zone *zone_table[1 << ZONETABLE_SHIFT]; +struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -- cgit v1.2.3-70-g09d2 From cdb3826b9958c204bc8ffda2cf9bbe2d899ef90c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Sep 2005 15:16:57 -0700 Subject: [PATCH] remove misleading comment above sys_brk Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 
bb43340d397..12334aecf8a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -203,13 +203,6 @@ static void remove_vm_struct(struct vm_area_struct *vma) kmem_cache_free(vm_area_cachep, vma); } -/* - * sys_brk() for the most part doesn't need the global kernel - * lock, except when an application is doing something nasty - * like trying to un-brk an area that has already been mapped - * to a regular file. in this case, the unmapping will need - * to invoke file system routines that need the global lock. - */ asmlinkage unsigned long sys_brk(unsigned long brk) { unsigned long rlim, retval; -- cgit v1.2.3-70-g09d2 From 3b30bbd963ac2606b0377b39c9d148d6eeef7dce Mon Sep 17 00:00:00 2001 From: Steven Pratt Date: Tue, 6 Sep 2005 15:17:06 -0700 Subject: [PATCH] readahead: reset cache_hit earlier We don't reset the cache hit count until after readahead does a successful readahead. This seems to leave a corner case open where we miss in cache, but don't restart the readhead right away. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index b840e7c6ea7..d0b50034e24 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -540,6 +540,7 @@ void handle_ra_miss(struct address_space *mapping, { ra->flags |= RA_FLAG_MISS; ra->flags &= ~RA_FLAG_INCACHE; + ra->cache_hit = 0; } /* -- cgit v1.2.3-70-g09d2 From 6c231b7bab0aa6860cd9da2de8a064eddc34c146 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Tue, 6 Sep 2005 15:17:45 -0700 Subject: [PATCH] Additions to .data.read_mostly section Mark variables which are usually accessed for reads with __readmostly. Signed-off-by: Alok N Kataria Signed-off-by: Shai Fultheim Signed-off-by: Ravikiran Thirumalai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 10 +++++----- arch/i386/kernel/timers/timer_hpet.c | 2 +- arch/i386/mm/discontig.c | 8 ++++---- arch/i386/mm/init.c | 2 +- arch/x86_64/kernel/genapic.c | 2 +- arch/x86_64/kernel/io_apic.c | 10 +++++----- arch/x86_64/kernel/setup.c | 2 +- arch/x86_64/kernel/setup64.c | 2 +- arch/x86_64/kernel/smpboot.c | 10 +++++----- arch/x86_64/mm/numa.c | 6 +++--- fs/namespace.c | 2 +- mm/page_alloc.c | 4 ++-- mm/shmem.c | 2 +- 13 files changed, 31 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 4a594043157..0e727e6da5c 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -78,7 +78,7 @@ static struct irq_pin_list { int apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; #ifdef CONFIG_PCI_MSI #define vector_to_irq(vector) \ (platform_legacy_irq(vector) ? vector : vector_irq[vector]) @@ -1119,7 +1119,7 @@ static inline int IO_APIC_irq_trigger(int irq) } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; int assign_irq_vector(int irq) { @@ -1990,7 +1990,7 @@ static void set_ioapic_affinity_vector (unsigned int vector, * edge-triggered handler, without risking IRQ storms and other ugly * races. 
*/ -static struct hw_interrupt_type ioapic_edge_type = { +static struct hw_interrupt_type ioapic_edge_type __read_mostly = { .typename = "IO-APIC-edge", .startup = startup_edge_ioapic, .shutdown = shutdown_edge_ioapic, @@ -2003,7 +2003,7 @@ static struct hw_interrupt_type ioapic_edge_type = { #endif }; -static struct hw_interrupt_type ioapic_level_type = { +static struct hw_interrupt_type ioapic_level_type __read_mostly = { .typename = "IO-APIC-level", .startup = startup_level_ioapic, .shutdown = shutdown_level_ioapic, @@ -2076,7 +2076,7 @@ static void ack_lapic_irq (unsigned int irq) static void end_lapic_irq (unsigned int i) { /* nothing */ } -static struct hw_interrupt_type lapic_irq_type = { +static struct hw_interrupt_type lapic_irq_type __read_mostly = { .typename = "local-APIC-edge", .startup = NULL, /* startup_irq() not used for IRQ0 */ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index 6dbb29f834e..d973a8b681f 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -19,7 +19,7 @@ #include static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ -static unsigned long tsc_hpet_quotient; /* convert tsc to hpet clks */ +static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */ static unsigned long hpet_last; /* hpet counter value at last tick*/ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index 6711ce3f691..244d8ec66be 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -37,7 +37,7 @@ #include #include -struct pglist_data *node_data[MAX_NUMNODES]; +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); bootmem_data_t node0_bdata; @@ -49,8 +49,8 @@ bootmem_data_t node0_bdata; * 2) node_start_pfn - the starting page frame number for a node * 3) node_end_pfn - the ending page fram number for a node */ -unsigned long node_start_pfn[MAX_NUMNODES]; -unsigned long node_end_pfn[MAX_NUMNODES]; +unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; +unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; #ifdef CONFIG_DISCONTIGMEM @@ -66,7 +66,7 @@ unsigned long node_end_pfn[MAX_NUMNODES]; * physnode_map[4-7] = 1; * physnode_map[8- ] = -1; */ -s8 physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1}; +s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; EXPORT_SYMBOL(physnode_map); void memory_present(int nid, unsigned long start, unsigned long end) diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 9edfc058b89..2ebaf75f732 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -393,7 +393,7 @@ void zap_low_mappings (void) } static int disable_nx __initdata = 0; -u64 __supported_pte_mask = ~_PAGE_NX; +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; /* * noexec = on|off diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 30c843a5efd..f062aa03bab 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -25,7 +25,7 @@ #endif /* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(x86_cpu_to_apicid); u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 76bcc4e6979..ba1a744e9bf 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -70,7 +70,7 @@ static struct irq_pin_list { short apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; #ifdef CONFIG_PCI_MSI #define vector_to_irq(vector) \ (platform_legacy_irq(vector) ? vector : vector_irq[vector]) @@ -683,7 +683,7 @@ static inline int IO_APIC_irq_trigger(int irq) } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; int assign_irq_vector(int irq) { @@ -1443,7 +1443,7 @@ static void set_ioapic_affinity_vector (unsigned int vector, * races. */ -static struct hw_interrupt_type ioapic_edge_type = { +static struct hw_interrupt_type ioapic_edge_type __read_mostly = { .typename = "IO-APIC-edge", .startup = startup_edge_ioapic, .shutdown = shutdown_edge_ioapic, @@ -1456,7 +1456,7 @@ static struct hw_interrupt_type ioapic_edge_type = { #endif }; -static struct hw_interrupt_type ioapic_level_type = { +static struct hw_interrupt_type ioapic_level_type __read_mostly = { .typename = "IO-APIC-level", .startup = startup_level_ioapic, .shutdown = shutdown_level_ioapic, @@ -1529,7 +1529,7 @@ static void ack_lapic_irq (unsigned int irq) static void end_lapic_irq (unsigned int i) { /* nothing */ } -static struct hw_interrupt_type lapic_irq_type = { +static struct hw_interrupt_type lapic_irq_type __read_mostly = { .typename = "local-APIC-edge", .startup = NULL, /* startup_irq() not used for IRQ0 */ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 116a491e296..b356f8e6adf 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -65,7 +65,7 @@ * Machine setup.. */ -struct cpuinfo_x86 boot_cpu_data; +struct cpuinfo_x86 boot_cpu_data __read_mostly; unsigned long mmu_cr4_features; diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 34082c1cc41..e3ffcacc8c9 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -36,7 +36,7 @@ struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); -unsigned long __supported_pte_mask = ~0UL; +unsigned long __supported_pte_mask __read_mostly = ~0UL; static int do_not_nx __initdata = 0; /* noexec=on|off diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index fa25e39fe54..90aeccd1519 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -62,13 +62,13 @@ /* Number of siblings per CPU package */ int smp_num_siblings = 1; /* Package ID of each logical CPU */ -u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(phys_proc_id); EXPORT_SYMBOL(cpu_core_id); /* Bitmask of currently online CPUs */ -cpumask_t cpu_online_map; +cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); @@ -88,8 +88,8 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; /* Set when the idlers are all forked */ int smp_threads_ready; -cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; -cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; +cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; +cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); /* diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 6a156f5692a..04f7a33e144 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -22,14 +22,14 @@ #define Dprintk(x...) #endif -struct pglist_data *node_data[MAX_NUMNODES]; +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; bootmem_data_t plat_node_bdata[MAX_NUMNODES]; int memnode_shift; u8 memnodemap[NODEMAPSIZE]; -unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask[MAX_NUMNODES]; +unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; +cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; diff --git a/fs/namespace.c b/fs/namespace.c index 79bd8a46e1e..29b70669435 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -40,7 +40,7 @@ static inline int sysfs_init(void) __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); static struct list_head *mount_hashtable; -static int hash_mask, hash_bits; +static int hash_mask __read_mostly, hash_bits __read_mostly; static kmem_cache_t *mnt_cache; static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34bba8f1144..14d7032c1d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -47,8 +47,8 @@ EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); struct pglist_data *pgdat_list __read_mostly; -unsigned long totalram_pages; -unsigned long totalhigh_pages; +unsigned long totalram_pages __read_mostly; +unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; /* diff --git a/mm/shmem.c b/mm/shmem.c index bdc4bbb6ddb..db2c9e8d990 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -180,7 +180,7 @@ static struct inode_operations shmem_inode_operations; static struct inode_operations shmem_dir_inode_operations; static struct vm_operations_struct shmem_vm_ops; -static struct backing_dev_info shmem_backing_dev_info = { +static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, .unplug_io_fn = default_unplug_io_fn, -- cgit v1.2.3-70-g09d2 From a49335cceab8afb6603152fcc3f7d3b6677366ca Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Tue, 6 Sep 2005 15:18:09 -0700 Subject: [PATCH] cpusets: oom_kill tweaks This patch series extends the use of the cpuset attribute 'mem_exclusive' to support cpuset configurations that: 1) allow GFP_KERNEL allocations to come from a potentially larger set of memory nodes than GFP_USER allocations, and 2) can constrain the oom killer to tasks running in cpusets in a specified subtree of the cpuset hierarchy. Here's an example usage scenario. 
For a few hours or more, a large NUMA system at a University is to be divided in two halves, with a bunch of student jobs running in half the system under some form of batch manager, and with a big research project running in the other half. Each of the student jobs is placed in a small cpuset, but should share the classic Unix time share facilities, such as buffered pages of files in /bin and /usr/lib. The big research project wants no interference whatsoever from the student jobs, and has highly tuned, unusual memory and i/o patterns that intend to make full use of all the main memory on the nodes available to it.

In this example, we have two big sibling cpusets, one of which is further divided into a more dynamic set of child cpusets. We want kernel memory allocations constrained by the two big cpusets, and user allocations constrained by the smaller child cpusets where present. And we require that the oom killer not operate across the two halves of this system, or else the first time a student job runs amuck, the big research project will likely be first in line to get shot. Tweaking /proc/<pid>/oom_adj is not ideal -- if the big research project really does run amuck allocating memory, it should be shot, not some other task outside the research project's mem_exclusive cpuset.

I propose to extend the use of the 'mem_exclusive' flag of cpusets to manage such scenarios. Let memory allocations for user space (GFP_USER) be constrained by a task's current cpuset, but memory allocations for kernel space (GFP_KERNEL) be constrained by the nearest mem_exclusive ancestor of the current cpuset, even though kernel space allocations will still _prefer_ to remain within the current task's cpuset, if memory is easily available. Let the oom killer be constrained to consider only tasks that are in overlapping mem_exclusive cpusets (it won't help much to kill a task that normally cannot allocate memory on any of the same nodes as the ones on which the current task can allocate).

The current constraints imposed on setting mem_exclusive are unchanged. A cpuset may only be mem_exclusive if its parent is also mem_exclusive, and a mem_exclusive cpuset may not overlap any of its siblings' memory nodes.

This patch was presented on linux-mm in early July 2005, though did not generate much feedback at that time. It has been built for a variety of arch's using cross tools, and built, booted and tested for function on SN2 (ia64).

There are 4 patches in this set:
1) Some minor cleanup, and some improvements to the code layout of one routine to make subsequent patches cleaner.
2) Add another GFP flag - __GFP_HARDWALL. It marks memory requests for USER space, which are tightly confined by the current task's cpuset.
3) Now memory requests (such as KERNEL) that are not marked HARDWALL can, if short on memory, look in the potentially larger pool of memory defined by the nearest mem_exclusive ancestor cpuset of the current task's cpuset.
4) Finally, modify the oom killer to skip any task whose mem_exclusive cpuset doesn't overlap ours.

Patch (1), the one time I looked on an SN2 (ia64) build, actually saved 32 bytes of kernel text space. Patch (2) has no effect on the size of kernel text space (it just adds a preprocessor flag). Patches (3) and (4) added about 600 bytes each of kernel text space, mostly in kernel/cpuset.c, which matters only if CONFIG_CPUSET is enabled.

This patch:

This patch applies a few comment and code cleanups to mm/oom_kill.c prior to applying a few small patches to improve cpuset management of memory placement.
The comment changed in oom_kill.c was seriously misleading. The code layout change in select_bad_process() makes room for adding another condition on which a process can be spared the oom killer (see the subsequent cpuset_nodes_overlap patch for this addition). Also a couple typos and spellos that bugged me, while I was here. This patch should have no material affect. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 57 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e56076672f..3a1d4650293 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -6,8 +6,8 @@ * for goading me into coding this file... * * The routines in this file are used to kill a process when - * we're seriously out of memory. This gets called from kswapd() - * in linux/mm/vmscan.c when we really run out of memory. + * we're seriously out of memory. This gets called from __alloc_pages() + * in mm/page_alloc.c when we really run out of memory. * * Since we won't call these routines often (on a well-configured * machine) this file will double as a 'coding guide' and a signpost @@ -26,7 +26,7 @@ /** * oom_badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate - * @p: current uptime in seconds + * @uptime: current uptime in seconds * * The formula used is relatively simple and documented inline in the * function. The main rationale is that we want to select a good task @@ -57,9 +57,9 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) /* * Processes which fork a lot of child processes are likely - * a good choice. We add the vmsize of the childs if they + * a good choice. We add the vmsize of the children if they * have an own mm. This prevents forking servers to flood the - * machine with an endless amount of childs + * machine with an endless amount of children */ list_for_each(tsk, &p->children) { struct task_struct *chld; @@ -143,28 +143,32 @@ static struct task_struct * select_bad_process(void) struct timespec uptime; do_posix_clock_monotonic_gettime(&uptime); - do_each_thread(g, p) + do_each_thread(g, p) { + unsigned long points; + int releasing; + /* skip the init task with pid == 1 */ - if (p->pid > 1 && p->oomkilladj != OOM_DISABLE) { - unsigned long points; - - /* - * This is in the process of releasing memory so wait it - * to finish before killing some other task by mistake. - */ - if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) && - !(p->flags & PF_DEAD)) - return ERR_PTR(-1UL); - if (p->flags & PF_SWAPOFF) - return p; - - points = badness(p, uptime.tv_sec); - if (points > maxpoints || !chosen) { - chosen = p; - maxpoints = points; - } + if (p->pid == 1) + continue; + if (p->oomkilladj == OOM_DISABLE) + continue; + /* + * This is in the process of releasing memory so for wait it + * to finish before killing some other task by mistake. 
+ */ + releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || + p->flags & PF_EXITING; + if (releasing && !(p->flags & PF_DEAD)) + return ERR_PTR(-1UL); + if (p->flags & PF_SWAPOFF) + return p; + + points = badness(p, uptime.tv_sec); + if (points > maxpoints || !chosen) { + chosen = p; + maxpoints = points; } - while_each_thread(g, p); + } while_each_thread(g, p); return chosen; } @@ -189,7 +193,8 @@ static void __oom_kill_task(task_t *p) return; } task_unlock(p); - printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); + printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", + p->pid, p->comm); /* * We give our sacrificial lamb high priority and access to -- cgit v1.2.3-70-g09d2 From 9bf2229f8817677127a60c177aefce1badd22d7b Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Tue, 6 Sep 2005 15:18:12 -0700 Subject: [PATCH] cpusets: formalize intermediate GFP_KERNEL containment This patch makes use of the previously underutilized cpuset flag 'mem_exclusive' to provide what amounts to another layer of memory placement resolution. With this patch, there are now the following four layers of memory placement available: 1) The whole system (interrupt and GFP_ATOMIC allocations can use this), 2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use), 3) The current tasks cpuset (GFP_USER allocations constrained to here), and 4) Specific node placement, using mbind and set_mempolicy. These nest - each layer is a subset (same or within) of the previous. Layer (2) above is new, with this patch. The call used to check whether a zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is extended to take a gfp_mask argument, and its logic is extended, in the case that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if placement is allowed. The definition of GFP_USER, which used to be identical to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous cpuset_gfp_hardwall_flag patch. GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks cpuset, so long as any node therein is not too tight on memory, but will escape to the larger layer, if need be. The intended use is to allow something like a batch manager to handle several jobs, each job in its own cpuset, but using common kernel memory for caches and such. Swapper and oom_kill activity is also constrained to Layer (2). A task in or below one mem_exclusive cpuset should not cause swapping on nodes in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a task in another such cpuset. Heavy use of kernel memory for i/o caching and such by one job should not impact the memory available to jobs in other non-overlapping mem_exclusive cpusets. This patch enables providing hardwall, inescapable cpusets for memory allocations of each job, while sharing kernel memory allocations between several jobs, in an enclosing mem_exclusive cpuset. Like Dinakar's patch earlier to enable administering sched domains using the cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag that had previously done nothing much useful other than restrict what cpuset configurations were allowed. 
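To make the new layering concrete, a small caller-side sketch (not part of the patch; the function name and sizes are illustrative only) of what the __GFP_HARDWALL distinction means in practice:

#include <linux/slab.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int example_alloc(void)
{
        void *user_buf, *kernel_buf;

        /* GFP_USER now carries __GFP_HARDWALL: this allocation may only be
         * satisfied from nodes in the current task's own cpuset (layer 3). */
        user_buf = kmalloc(4096, GFP_USER);

        /* GFP_KERNEL is not hardwalled: if every node in the task's cpuset
         * is short of memory, this allocation may fall back to nodes of the
         * nearest mem_exclusive ancestor cpuset (layer 2). */
        kernel_buf = kmalloc(64, GFP_KERNEL);

        if (!user_buf || !kernel_buf) {
                kfree(user_buf);
                kfree(kernel_buf);
                return -ENOMEM;
        }

        kfree(user_buf);
        kfree(kernel_buf);
        return 0;
}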
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cpusets.txt | 12 +++++++ include/linux/cpuset.h | 5 +-- kernel/cpuset.c | 80 ++++++++++++++++++++++++++++++++++++++++++----- mm/page_alloc.c | 16 ++++++---- mm/vmscan.c | 8 ++--- 5 files changed, 101 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index ad944c06031..47f4114fbf5 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -60,6 +60,18 @@ all of the cpus in the system. This removes any overhead due to load balancing code trying to pull tasks outside of the cpu exclusive cpuset only to be prevented by the tasks' cpus_allowed mask. +A cpuset that is mem_exclusive restricts kernel allocations for +page, buffer and other data commonly shared by the kernel across +multiple users. All cpusets, whether mem_exclusive or not, restrict +allocations of memory for user space. This enables configuring a +system so that several independent jobs can share common kernel +data, such as file system pages, while isolating each jobs user +allocation in its own cpuset. To do this, construct a large +mem_exclusive cpuset to hold all the jobs, and construct child, +non-mem_exclusive cpusets for each individual job. Only a small +amount of typical kernel memory, such as requests from interrupt +handlers, is allowed to be taken outside even a mem_exclusive cpuset. + User level code may create and destroy cpusets by name in the cpuset virtual file system, manage the attributes and permissions of these cpusets and which CPUs and Memory Nodes are assigned to each cpuset, diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 3438233305a..1fe1c3ebad3 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -23,7 +23,7 @@ void cpuset_init_current_mems_allowed(void); void cpuset_update_current_mems_allowed(void); void cpuset_restrict_to_mems_allowed(unsigned long *nodes); int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); -int cpuset_zone_allowed(struct zone *z); +extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask); extern struct file_operations proc_cpuset_operations; extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer); @@ -48,7 +48,8 @@ static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) return 1; } -static inline int cpuset_zone_allowed(struct zone *z) +static inline int cpuset_zone_allowed(struct zone *z, + unsigned int __nocast gfp_mask) { return 1; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8ab1b4e518b..214806deca9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1611,17 +1611,81 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) return 0; } +/* + * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive + * ancestor to the specified cpuset. Call while holding cpuset_sem. + * If no ancestor is mem_exclusive (an unusual configuration), then + * returns the root cpuset. + */ +static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) +{ + while (!is_mem_exclusive(cs) && cs->parent) + cs = cs->parent; + return cs; +} + /** - * cpuset_zone_allowed - is zone z allowed in current->mems_allowed - * @z: zone in question + * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? + * @z: is this zone on an allowed node? 
+ * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) * - * Is zone z allowed in current->mems_allowed, or is - * the CPU in interrupt context? (zone is always allowed in this case) - */ -int cpuset_zone_allowed(struct zone *z) + * If we're in interrupt, yes, we can always allocate. If zone + * z's node is in our tasks mems_allowed, yes. If it's not a + * __GFP_HARDWALL request and this zone's nodes is in the nearest + * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * Otherwise, no. + * + * GFP_USER allocations are marked with the __GFP_HARDWALL bit, + * and do not allow allocations outside the current tasks cpuset. + * GFP_KERNEL allocations are not so marked, so can escape to the + * nearest mem_exclusive ancestor cpuset. + * + * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() + * routine only calls here with __GFP_HARDWALL bit _not_ set if + * it's a GFP_KERNEL allocation, and all nodes in the current tasks + * mems_allowed came up empty on the first pass over the zonelist. + * So only GFP_KERNEL allocations, if all nodes in the cpuset are + * short of memory, might require taking the cpuset_sem semaphore. + * + * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() + * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing + * hardwall cpusets - no allocation on a node outside the cpuset is + * allowed (unless in interrupt, of course). + * + * The second loop doesn't even call here for GFP_ATOMIC requests + * (if the __alloc_pages() local variable 'wait' is set). That check + * and the checks below have the combined affect in the second loop of + * the __alloc_pages() routine that: + * in_interrupt - any node ok (current task context irrelevant) + * GFP_ATOMIC - any node ok + * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok + * GFP_USER - only nodes in current tasks mems allowed ok. + **/ + +int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) { - return in_interrupt() || - node_isset(z->zone_pgdat->node_id, current->mems_allowed); + int node; /* node that zone z is on */ + const struct cpuset *cs; /* current cpuset ancestors */ + int allowed = 1; /* is allocation in zone z allowed? */ + + if (in_interrupt()) + return 1; + node = z->zone_pgdat->node_id; + if (node_isset(node, current->mems_allowed)) + return 1; + if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ + return 0; + + /* Not hardwall and node outside mems_allowed: scan up cpusets */ + down(&cpuset_sem); + cs = current->cpuset; + if (!cs) + goto done; /* current task exiting */ + cs = nearest_exclusive_ancestor(cs); + allowed = node_isset(node, cs->mems_allowed); +done: + up(&cpuset_sem); + return allowed; } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 14d7032c1d1..3974fd81d27 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -806,11 +806,14 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, classzone_idx = zone_idx(zones[0]); restart: - /* Go through the zonelist once, looking for a zone with enough free */ + /* + * Go through the zonelist once, looking for a zone with enough free. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ for (i = 0; (z = zones[i]) != NULL; i++) { int do_reclaim = should_reclaim_zone(z, gfp_mask); - if (!cpuset_zone_allowed(z)) + if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) continue; /* @@ -845,6 +848,7 @@ zone_reclaim_retry: * * This is the last chance, in general, before the goto nopage. 
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ for (i = 0; (z = zones[i]) != NULL; i++) { if (!zone_watermark_ok(z, order, z->pages_min, @@ -852,7 +856,7 @@ zone_reclaim_retry: gfp_mask & __GFP_HIGH)) continue; - if (wait && !cpuset_zone_allowed(z)) + if (wait && !cpuset_zone_allowed(z, gfp_mask)) continue; page = buffered_rmqueue(z, order, gfp_mask); @@ -867,7 +871,7 @@ zone_reclaim_retry: if (!(gfp_mask & __GFP_NOMEMALLOC)) { /* go through the zonelist yet again, ignoring mins */ for (i = 0; (z = zones[i]) != NULL; i++) { - if (!cpuset_zone_allowed(z)) + if (!cpuset_zone_allowed(z, gfp_mask)) continue; page = buffered_rmqueue(z, order, gfp_mask); if (page) @@ -903,7 +907,7 @@ rebalance: gfp_mask & __GFP_HIGH)) continue; - if (!cpuset_zone_allowed(z)) + if (!cpuset_zone_allowed(z, gfp_mask)) continue; page = buffered_rmqueue(z, order, gfp_mask); @@ -922,7 +926,7 @@ rebalance: classzone_idx, 0, 0)) continue; - if (!cpuset_zone_allowed(z)) + if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) continue; page = buffered_rmqueue(z, order, gfp_mask); diff --git a/mm/vmscan.c b/mm/vmscan.c index 0095533cdde..a740778f688 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -894,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) if (zone->present_pages == 0) continue; - if (!cpuset_zone_allowed(zone)) + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; zone->temp_priority = sc->priority; @@ -940,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask) for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - if (!cpuset_zone_allowed(zone)) + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; zone->temp_priority = DEF_PRIORITY; @@ -986,7 +986,7 @@ out: for (i = 0; zones[i] != 0; i++) { struct zone *zone = zones[i]; - if (!cpuset_zone_allowed(zone)) + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; zone->prev_priority = zone->temp_priority; @@ -1256,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; - if (!cpuset_zone_allowed(zone)) + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) return; if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) return; -- cgit v1.2.3-70-g09d2 From ef08e3b4981aebf2ba9bd7025ef7210e8eec07ce Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Tue, 6 Sep 2005 15:18:13 -0700 Subject: [PATCH] cpusets: confine oom_killer to mem_exclusive cpuset Now the real motivation for this cpuset mem_exclusive patch series seems trivial. This patch keeps a task in or under one mem_exclusive cpuset from provoking an oom kill of a task under a non-overlapping mem_exclusive cpuset. Since only interrupt and GFP_ATOMIC allocations are allowed to escape mem_exclusive containment, there is little to gain from oom killing a task under a non-overlapping mem_exclusive cpuset, as almost all kernel and user memory allocation must come from disjoint memory nodes. This patch enables configuring a system so that a runaway job under one mem_exclusive cpuset cannot cause the killing of a job in another such cpuset that might be using very high compute and memory resources for a prolonged time. 
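Condensed from the mm/oom_kill.c hunk below, a sketch (simplified; surrounding locking, scoring and header locations approximate) of the filter a candidate task must now pass before badness() scores it:

#include <linux/sched.h>
#include <linux/cpuset.h>

/* Return non-zero if task p is a legitimate oom-kill candidate. */
static int oom_candidate(struct task_struct *p)
{
        if (p->pid == 1)                        /* never pick the init task */
                return 0;
        if (p->oomkilladj == OOM_DISABLE)       /* userspace opted this task out */
                return 0;
        /* New in this patch: if p's nearest mem_exclusive cpuset shares no
         * memory nodes with ours, killing p cannot free memory the current
         * task is able to allocate from, so skip it. */
        if (!cpuset_excl_nodes_overlap(p))
                return 0;
        return 1;
}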
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 6 ++++++ kernel/cpuset.c | 33 +++++++++++++++++++++++++++++++++ mm/oom_kill.c | 5 +++++ 3 files changed, 44 insertions(+) (limited to 'mm') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 1fe1c3ebad3..24062a1dbf6 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -24,6 +24,7 @@ void cpuset_update_current_mems_allowed(void); void cpuset_restrict_to_mems_allowed(unsigned long *nodes); int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask); +extern int cpuset_excl_nodes_overlap(const struct task_struct *p); extern struct file_operations proc_cpuset_operations; extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer); @@ -54,6 +55,11 @@ static inline int cpuset_zone_allowed(struct zone *z, return 1; } +static inline int cpuset_excl_nodes_overlap(const struct task_struct *p) +{ + return 1; +} + static inline char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 214806deca9..40c6d801dd6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1688,6 +1688,39 @@ done: return allowed; } +/** + * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? + * @p: pointer to task_struct of some other task. + * + * Description: Return true if the nearest mem_exclusive ancestor + * cpusets of tasks @p and current overlap. Used by oom killer to + * determine if task @p's memory usage might impact the memory + * available to the current task. + * + * Acquires cpuset_sem - not suitable for calling from a fast path. + **/ + +int cpuset_excl_nodes_overlap(const struct task_struct *p) +{ + const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ + int overlap = 0; /* do cpusets overlap? */ + + down(&cpuset_sem); + cs1 = current->cpuset; + if (!cs1) + goto done; /* current task exiting */ + cs2 = p->cpuset; + if (!cs2) + goto done; /* task p is exiting */ + cs1 = nearest_exclusive_ancestor(cs1); + cs2 = nearest_exclusive_ancestor(cs2); + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); +done: + up(&cpuset_sem); + + return overlap; +} + /* * proc_cpuset_show() * - Print tasks cpuset path into seq_file. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3a1d4650293..5ec8da12cfd 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -20,6 +20,7 @@ #include #include #include +#include /* #define DEBUG */ @@ -152,6 +153,10 @@ static struct task_struct * select_bad_process(void) continue; if (p->oomkilladj == OOM_DISABLE) continue; + /* If p's nodes don't overlap ours, it won't help to kill p. */ + if (!cpuset_excl_nodes_overlap(p)) + continue; + /* * This is in the process of releasing memory so for wait it * to finish before killing some other task by mistake. -- cgit v1.2.3-70-g09d2 From dd3927105b6f65afb7dac17682172cdfb86d3f00 Mon Sep 17 00:00:00 2001 From: Pekka J Enberg Date: Tue, 6 Sep 2005 15:18:31 -0700 Subject: [PATCH] introduce and use kzalloc This patch introduces a kzalloc wrapper and converts kernel/ to use it. It saves a little program text. 
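The text saving comes from collapsing the usual allocate-then-clear pair into a single call; the hunks below apply the same mechanical conversion throughout kernel/, roughly of this shape (a sketch, struct and variable names hypothetical):

	/* before */
	struct foo *f = kmalloc(sizeof(*f), GFP_KERNEL);
	if (f)
		memset(f, 0, sizeof(*f));

	/* after */
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);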
Signed-off-by: Pekka Enberg Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 16 +++++++++++++++- kernel/intermodule.c | 3 +-- kernel/params.c | 4 ++-- kernel/power/pm.c | 3 +-- kernel/resource.c | 3 +-- kernel/workqueue.c | 3 +-- mm/slab.c | 18 ++++++------------ 7 files changed, 27 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/include/linux/slab.h b/include/linux/slab.h index 80b2dfde2e8..42a6bea58af 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -99,7 +99,21 @@ found: return __kmalloc(size, flags); } -extern void *kcalloc(size_t, size_t, unsigned int __nocast); +extern void *kzalloc(size_t, unsigned int __nocast); + +/** + * kcalloc - allocate memory for an array. The memory is set to zero. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate. + */ +static inline void *kcalloc(size_t n, size_t size, unsigned int __nocast flags) +{ + if (n != 0 && size > INT_MAX / n) + return NULL; + return kzalloc(n * size, flags); +} + extern void kfree(const void *); extern unsigned int ksize(const void *); diff --git a/kernel/intermodule.c b/kernel/intermodule.c index 388977f3e9b..0cbe633420f 100644 --- a/kernel/intermodule.c +++ b/kernel/intermodule.c @@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void struct list_head *tmp; struct inter_module_entry *ime, *ime_new; - if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { + if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { /* Overloaded kernel, not fatal */ printk(KERN_ERR "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", @@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void kmalloc_failed = 1; return; } - memset(ime_new, 0, sizeof(*ime_new)); ime_new->im_name = im_name; ime_new->owner = owner; ime_new->userdata = userdata; diff --git a/kernel/params.c b/kernel/params.c index d586c35ef8f..fbf173215fd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name, { struct module_kobject *mk; - mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); - memset(mk, 0, sizeof(struct module_kobject)); + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + BUG_ON(!mk); mk->mod = THIS_MODULE; kobj_set_kset_s(mk, module_subsys); diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 61deda04e39..159149321b3 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type, unsigned long id, pm_callback callback) { - struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); + struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); if (dev) { - memset(dev, 0, sizeof(*dev)); dev->type = type; dev->id = id; dev->callback = callback; diff --git a/kernel/resource.c b/kernel/resource.c index 26967e04220..92285d822de 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource); */ struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) { - struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); if (res) { - memset(res, 0, sizeof(*res)); res->name = name; res->start = start; res->end = start + n - 1; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a3de837a8dd..91bacb13a7e 100644 --- a/kernel/workqueue.c +++ 
b/kernel/workqueue.c @@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name, struct workqueue_struct *wq; struct task_struct *p; - wq = kmalloc(sizeof(*wq), GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; - memset(wq, 0, sizeof(*wq)); wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ diff --git a/mm/slab.c b/mm/slab.c index a9ff4f7f986..d7c4443991f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2558,24 +2558,18 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp) EXPORT_SYMBOL(kmem_cache_free); /** - * kcalloc - allocate memory for an array. The memory is set to zero. - * @n: number of elements. - * @size: element size. + * kzalloc - allocate memory. The memory is set to zero. + * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. */ -void *kcalloc(size_t n, size_t size, unsigned int __nocast flags) +void *kzalloc(size_t size, unsigned int __nocast flags) { - void *ret = NULL; - - if (n != 0 && size > INT_MAX / n) - return ret; - - ret = kmalloc(n * size, flags); + void *ret = kmalloc(size, flags); if (ret) - memset(ret, 0, n * size); + memset(ret, 0, size); return ret; } -EXPORT_SYMBOL(kcalloc); +EXPORT_SYMBOL(kzalloc); /** * kfree - free previously allocated memory -- cgit v1.2.3-70-g09d2
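One design note on the slab.h/slab.c hunks above: kcalloc() is no longer an out-of-line exported function in slab.c; it becomes the static inline shown earlier, built on the new kzalloc(), and it keeps its multiplication-overflow guard. Because "n != 0 && size > INT_MAX / n" is evaluated before n * size is computed, an oversized element count fails cleanly with NULL instead of wrapping around and returning a buffer shorter than the caller expects. A minimal usage sketch (the function and count here are hypothetical):

	#include <linux/slab.h>

	/* allocate a zeroed table of nslots pointer-sized entries */
	static void **alloc_slot_table(unsigned int nslots)
	{
		/* NULL if nslots * sizeof(void *) would exceed INT_MAX */
		return kcalloc(nslots, sizeof(void *), GFP_KERNEL);
	}

Keeping kcalloc() as an inline over kzalloc() also leaves the overflow check visible to the compiler at every call site, while the zeroing allocation itself stays out of line.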