From ddd588b5dd55f14320379961e47683db4e4c1d90 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Tue, 22 Mar 2011 16:30:46 -0700
Subject: oom: suppress nodes that are not allowed from meminfo on oom kill

The oom killer is extremely verbose for machines with a large number of
cpus and/or nodes.  This verbosity can often be harmful if it causes other
important messages to be scrolled from the kernel log and incurs a
significant time delay, specifically for kernels with
CONFIG_NODES_SHIFT > 8.

This patch causes only memory information to be displayed for nodes that
are allowed by current's cpuset when dumping the VM state.  Information
for all other nodes is irrelevant to the oom condition; we don't care if
there's an abundance of memory elsewhere if we can't access it.

This only affects the behavior of dumping memory information when an oom
is triggered.  Other dumps, such as for sysrq+m, still display the
unfiltered form when using the existing show_mem() interface.

Additionally, the per-cpu pageset statistics are extremely verbose in oom
killer output, so they are now suppressed.  This removes
nodes_weight(current->mems_allowed) * (1 + nr_cpus) lines from the oom
killer output.

Callers may use __show_mem(SHOW_MEM_FILTER_NODES) to filter disallowed
nodes.

Signed-off-by: David Rientjes
Cc: Mel Gorman
Cc: KAMEZAWA Hiroyuki
Cc: KOSAKI Motohiro
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7945247b1e5..36be3ba4bbe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2411,19 +2411,42 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 }
 #endif

+/*
+ * Determine whether the zone's node should be displayed or not, depending on
+ * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas().
+ */
+static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
+{
+	bool ret = false;
+
+	if (!(flags & SHOW_MEM_FILTER_NODES))
+		goto out;
+
+	get_mems_allowed();
+	ret = !node_isset(zone->zone_pgdat->node_id,
+				cpuset_current_mems_allowed);
+	put_mems_allowed();
+out:
+	return ret;
+}
+
 #define K(x) ((x) << (PAGE_SHIFT-10))

 /*
  * Show free area list (used inside shift_scroll-lock stuff)
  * We also calculate the percentage fragmentation. We do this by counting the
  * memory on each free list with the exception of the first item on the list.
+ * Suppresses nodes that are not allowed by current's cpuset if
+ * SHOW_MEM_FILTER_NODES is passed.
  */
-void show_free_areas(void)
+void __show_free_areas(unsigned int filter)
 {
 	int cpu;
 	struct zone *zone;

 	for_each_populated_zone(zone) {
+		if (skip_free_areas_zone(filter, zone))
+			continue;
 		show_node(zone);
 		printk("%s per-cpu:\n", zone->name);

@@ -2465,6 +2488,8 @@ void show_free_areas(void)
 	for_each_populated_zone(zone) {
 		int i;

+		if (skip_free_areas_zone(filter, zone))
+			continue;
 		show_node(zone);
 		printk("%s"
 			" free:%lukB"
@@ -2532,6 +2557,8 @@ void show_free_areas(void)
 	for_each_populated_zone(zone) {
 		unsigned long nr[MAX_ORDER], flags, order, total = 0;

+		if (skip_free_areas_zone(filter, zone))
+			continue;
 		show_node(zone);
 		printk("%s: ", zone->name);

@@ -2551,6 +2578,11 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }

+void show_free_areas(void)
+{
+	__show_free_areas(0);
+}
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
 	zoneref->zone = zone;
-- cgit v1.2.3-70-g09d2
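An aside on the mechanics: the filter reduces to "skip a zone iff filtering
was requested and the zone's node is outside the task's allowed mask".  A
minimal standalone C sketch of that rule (the raw bitmask arithmetic stands
in for node_isset()/cpuset_current_mems_allowed; names and values here are
illustrative, not kernel APIs):

#include <stdbool.h>
#include <stdio.h>

#define SHOW_MEM_FILTER_NODES	0x0001u
#define BITS_PER_LONG		(8 * sizeof(unsigned long))

static bool skip_node(unsigned int flags, int nid,
		      const unsigned long *mems_allowed)
{
	if (!(flags & SHOW_MEM_FILTER_NODES))
		return false;	/* unfiltered dump: show every node */
	return !(mems_allowed[nid / BITS_PER_LONG] &
		 (1UL << (nid % BITS_PER_LONG)));
}

int main(void)
{
	unsigned long allowed[1] = { 0x5 };	/* nodes 0 and 2 allowed */
	int nid;

	for (nid = 0; nid < 4; nid++)
		printf("node %d: %s\n", nid,
		       skip_node(SHOW_MEM_FILTER_NODES, nid, allowed) ?
		       "suppressed" : "shown");
	return 0;
}

Passing 0 instead of SHOW_MEM_FILTER_NODES reproduces the unfiltered sysrq+m
behavior, which is exactly the split the patch makes between show_free_areas()
and __show_free_areas().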
From 29423e77c06cee7d4e335ef4a7cbd949da978c91 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Tue, 22 Mar 2011 16:30:47 -0700
Subject: oom: suppress show_mem() for many nodes in irq context on page alloc failure

When a page allocation failure occurs, show_mem() is called to dump the
state of the VM so users may understand what happened to get into that
condition.

This output, however, can be extremely verbose.  In irq context, it may
result in significant delays that incur NMI watchdog timeouts when the
machine is large (we use CONFIG_NODES_SHIFT > 8 here to define a "large"
machine since the length of the show_mem() output is proportional to the
number of possible nodes).

This patch suppresses the show_mem() call in irq context when the kernel
has CONFIG_NODES_SHIFT > 8.

Signed-off-by: David Rientjes
Cc: Mel Gorman
Cc: KAMEZAWA Hiroyuki
Cc: KOSAKI Motohiro
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 36be3ba4bbe..2aaafe82f51 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1714,6 +1714,20 @@ try_next_zone:
 	return page;
 }

+/*
+ * Large machines with many possible nodes should not always dump per-node
+ * meminfo in irq context.
+ */
+static inline bool should_suppress_show_mem(void)
+{
+	bool ret = false;
+
+#if NODES_SHIFT > 8
+	ret = in_interrupt();
+#endif
+	return ret;
+}
+
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
 				unsigned long pages_reclaimed)
@@ -2161,7 +2175,8 @@ nopage:
 			" order:%d, mode:0x%x\n",
 			current->comm, order, gfp_mask);
 		dump_stack();
-		show_mem();
+		if (!should_suppress_show_mem())
+			show_mem();
 	}
 	return page;
 got_pg:
-- cgit v1.2.3-70-g09d2
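For a sense of the cutoff: the kernel defines MAX_NUMNODES as
1 << NODES_SHIFT, so a shift above 8 means more than 256 possible nodes, and
the dump grows linearly with that count.  A trivial standalone illustration:

#include <stdio.h>

int main(void)
{
	int shift;

	/* NODES_SHIFT > 8 is the "large machine" threshold used above. */
	for (shift = 6; shift <= 10; shift++)
		printf("NODES_SHIFT=%2d -> MAX_NUMNODES=%4d\n",
		       shift, 1 << shift);
	return 0;
}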
From cbf978bfb12d7deca97d7333f65eda0381a072de Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Tue, 22 Mar 2011 16:30:48 -0700
Subject: oom: suppress nodes that are not allowed from meminfo on page alloc failure

Displaying extremely verbose meminfo for all nodes on the system is
overkill for page allocation failures when the context restricts that
allocation to only a subset of nodes.  We don't particularly care about
the state of all nodes when some are not allowed in the current context;
they may have an abundance of memory, but we can't allocate from it.

This patch suppresses disallowed nodes from the meminfo dump on a page
allocation failure if the context requires it.

Signed-off-by: David Rientjes
Cc: Mel Gorman
Cc: KAMEZAWA Hiroyuki
Cc: KOSAKI Motohiro
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2aaafe82f51..36a168e383b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2171,12 +2171,25 @@ rebalance:
 nopage:
 	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
-		printk(KERN_WARNING "%s: page allocation failure."
-			" order:%d, mode:0x%x\n",
+		unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+		/*
+		 * This documents exceptions given to allocations in certain
+		 * contexts that are allowed to allocate outside current's set
+		 * of allowed nodes.
+		 */
+		if (!(gfp_mask & __GFP_NOMEMALLOC))
+			if (test_thread_flag(TIF_MEMDIE) ||
+			    (current->flags & (PF_MEMALLOC | PF_EXITING)))
+				filter &= ~SHOW_MEM_FILTER_NODES;
+		if (in_interrupt() || !wait)
+			filter &= ~SHOW_MEM_FILTER_NODES;
+
+		pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n",
 			current->comm, order, gfp_mask);
 		dump_stack();
 		if (!should_suppress_show_mem())
-			show_mem();
+			__show_mem(filter);
 	}
 	return page;
 got_pg:
-- cgit v1.2.3-70-g09d2
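The decision above can be read as a small predicate: filter by node unless
the task has been granted memory-reserve access (an oom-killed task,
reclaim-context allocation, or exiting task, absent __GFP_NOMEMALLOC), or the
allocation isn't bound to current's cpuset at all (interrupt or atomic
context).  A hedged standalone restatement — flag values and struct are
illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define GFP_NOMEMALLOC	(1u << 0)	/* stand-in for __GFP_NOMEMALLOC */

struct alloc_ctx {
	unsigned int gfp_mask;
	bool tif_memdie;	/* task was oom killed */
	bool pf_memalloc;	/* allocating for reclaim itself */
	bool pf_exiting;	/* task is exiting */
	bool in_interrupt;
	bool can_wait;		/* allocation may sleep */
};

/* Returns true when the failure dump should hide disallowed nodes. */
static bool filter_disallowed_nodes(const struct alloc_ctx *c)
{
	/* Tasks with reserve access may allocate outside their cpuset,
	 * so their dump must show every node. */
	if (!(c->gfp_mask & GFP_NOMEMALLOC) &&
	    (c->tif_memdie || c->pf_memalloc || c->pf_exiting))
		return false;
	/* Interrupt/atomic allocations aren't tied to current's cpuset. */
	if (c->in_interrupt || !c->can_wait)
		return false;
	return true;
}

int main(void)
{
	struct alloc_ctx normal = { .can_wait = true };
	struct alloc_ctx atomic_alloc = { .can_wait = false };

	printf("normal: %s\n",
	       filter_disallowed_nodes(&normal) ? "filter" : "show all");
	printf("atomic: %s\n",
	       filter_disallowed_nodes(&atomic_alloc) ? "filter" : "show all");
	return 0;
}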
From 1d16871d8c96deadc5f9753b6b096074f2cbcbe1 Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Tue, 22 Mar 2011 16:32:45 -0700
Subject: mm: batch-free pcp list if possible

free_pcppages_bulk() frees pages from pcp lists in a round-robin fashion
by keeping a batch_free counter.  But it doesn't need to spin if there is
only one non-empty list.  This can be checked by batch_free ==
MIGRATE_PCPTYPES.

[akpm@linux-foundation.org: fix comment]
Signed-off-by: Namhyung Kim
Acked-by: Johannes Weiner
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 36a168e383b..426056aff12 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -614,6 +614,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			list = &pcp->lists[migratetype];
 		} while (list_empty(list));

+		/* This is the only non-empty list. Free them all. */
+		if (batch_free == MIGRATE_PCPTYPES)
+			batch_free = to_free;
+
 		do {
 			page = list_entry(list->prev, struct page, lru);
 			/* must delete as __free_one_page list manipulates */
-- cgit v1.2.3-70-g09d2
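The reasoning: batch_free counts how many list slots were skipped before a
non-empty list was found, so batch_free == MIGRATE_PCPTYPES means every other
list was empty and the remaining count can be drained from this one list
without further round-robin passes.  A toy model of the loop (plain counters
standing in for the pcp lists; not kernel code):

#include <stdio.h>

#define PCPTYPES 3			/* stand-in for MIGRATE_PCPTYPES */

int main(void)
{
	int lists[PCPTYPES] = { 0, 5, 0 };	/* only list 1 is non-empty */
	int to_free = 4, migratetype = 0, batch_free;

	while (to_free) {
		/* Advance round-robin to the next non-empty list,
		 * counting how many slots we had to skip. */
		batch_free = 0;
		do {
			batch_free++;
			migratetype = (migratetype + 1) % PCPTYPES;
		} while (lists[migratetype] == 0);

		/* The new early exit: we cycled through every type, so
		 * this is the only non-empty list - drain the rest. */
		if (batch_free == PCPTYPES)
			batch_free = to_free;

		do {
			lists[migratetype]--;
			printf("freed one page from list %d\n", migratetype);
		} while (--to_free && --batch_free && lists[migratetype]);
	}
	return 0;
}

With the early exit, the second outer pass frees the remaining three pages at
once instead of re-scanning the two empty lists before each single free.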
From 11bc82d67d1150767901bca54a24466621d763d7 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Tue, 22 Mar 2011 16:33:11 -0700
Subject: mm: compaction: Use async migration for __GFP_NO_KSWAPD and enforce no writeback

__GFP_NO_KSWAPD allocations are usually very expensive and not mandatory
to succeed as they have graceful fallback.  Waiting for I/O in those tends
to be overkill in terms of latencies, so we can reduce their latency by
disabling sync migrate.

Unfortunately, even with async migration it's still possible for the
process to be blocked waiting for a request slot (e.g. get_request_wait
in the block layer) when ->writepage is called.  To prevent
__GFP_NO_KSWAPD blocking, this patch prevents ->writepage being called on
dirty page cache for asynchronous migration.

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=31142

[mel@csn.ul.ie: Avoid writebacks for NFS, retry locked pages, use bool]
Signed-off-by: Andrea Arcangeli
Signed-off-by: Mel Gorman
Cc: Arthur Marsh
Cc: Clemens Ladisch
Cc: Johannes Weiner
Cc: KAMEZAWA Hiroyuki
Cc: Minchan Kim
Reported-by: Alex Villacis Lasso
Tested-by: Alex Villacis Lasso
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/migrate.c    | 48 +++++++++++++++++++++++++++++++++---------------
 mm/page_alloc.c |  2 +-
 2 files changed, 34 insertions(+), 16 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/migrate.c b/mm/migrate.c
index 7d2983f3783..89e5c3fe8bb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -564,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  *  == 0 - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-					int remap_swapcache)
+				int remap_swapcache, bool sync)
 {
 	struct address_space *mapping;
 	int rc;
@@ -586,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 	mapping = page_mapping(page);
 	if (!mapping)
 		rc = migrate_page(mapping, newpage, page);
-	else if (mapping->a_ops->migratepage)
+	else {
 		/*
-		 * Most pages have a mapping and most filesystems
-		 * should provide a migration function. Anonymous
-		 * pages are part of swap space which also has its
-		 * own migration function. This is the most common
-		 * path for page migration.
+		 * Do not writeback pages if !sync and migratepage is
+		 * not pointing to migrate_page() which is nonblocking
+		 * (swapcache/tmpfs uses migratepage = migrate_page).
 		 */
-		rc = mapping->a_ops->migratepage(mapping,
-						newpage, page);
-	else
-		rc = fallback_migrate_page(mapping, newpage, page);
+		if (PageDirty(page) && !sync &&
+		    mapping->a_ops->migratepage != migrate_page)
+			rc = -EBUSY;
+		else if (mapping->a_ops->migratepage)
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * should provide a migration function. Anonymous
+			 * pages are part of swap space which also has its
+			 * own migration function. This is the most common
+			 * path for page migration.
+			 */
+			rc = mapping->a_ops->migratepage(mapping,
+							newpage, page);
+		else
+			rc = fallback_migrate_page(mapping, newpage, page);
+	}

 	if (rc) {
 		newpage->mapping = NULL;
@@ -641,7 +651,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	rc = -EAGAIN;

 	if (!trylock_page(page)) {
-		if (!force)
+		if (!force || !sync)
 			goto move_newpage;

 		/*
@@ -686,7 +696,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	BUG_ON(charge);

 	if (PageWriteback(page)) {
-		if (!force || !sync)
+		/*
+		 * For !sync, there is no point retrying as the retry loop
+		 * is expected to be too short for PageWriteback to be cleared
+		 */
+		if (!sync) {
+			rc = -EBUSY;
+			goto uncharge;
+		}
+		if (!force)
 			goto uncharge;
 		wait_on_page_writeback(page);
 	}
@@ -757,7 +775,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,

 skip_unmap:
 	if (!page_mapped(page))
-		rc = move_to_new_page(newpage, page, remap_swapcache);
+		rc = move_to_new_page(newpage, page, remap_swapcache, sync);

 	if (rc && remap_swapcache)
 		remove_migration_ptes(page, page);
@@ -850,7 +868,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

 	if (!page_mapped(hpage))
-		rc = move_to_new_page(new_hpage, hpage, 1);
+		rc = move_to_new_page(new_hpage, hpage, 1, sync);

 	if (rc)
 		remove_migration_ptes(hpage, hpage);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 426056aff12..6d0032bdb5d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2103,7 +2103,7 @@ rebalance:
 						sync_migration);
 	if (page)
 		goto got_pg;
-	sync_migration = true;
+	sync_migration = !(gfp_mask & __GFP_NO_KSWAPD);

 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
-- cgit v1.2.3-70-g09d2
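Condensed, asynchronous migration now refuses to block at three points: it
only trylocks the page, it skips pages under writeback, and it skips dirty
pages whose migratepage callback could itself issue writeback.  A compact
restatement of that policy (illustrative types, not the kernel source):

#include <stdbool.h>
#include <stdio.h>

enum mig_result { MIG_OK, MIG_EAGAIN, MIG_EBUSY };

struct mig_page {
	bool locked, writeback, dirty;
	bool nonblocking_migratepage;	/* e.g. swapcache/tmpfs migrate_page() */
};

/* Decide whether an async (!sync) migration attempt must bail out early. */
static enum mig_result async_migrate_check(const struct mig_page *p)
{
	if (p->locked)
		return MIG_EAGAIN;	/* never sleep on the page lock */
	if (p->writeback)
		return MIG_EBUSY;	/* never wait for writeback */
	if (p->dirty && !p->nonblocking_migratepage)
		return MIG_EBUSY;	/* migratepage could call ->writepage */
	return MIG_OK;
}

int main(void)
{
	struct mig_page dirty_file_page = { .dirty = true };

	printf("dirty file page, async: %s\n",
	       async_migrate_check(&dirty_file_page) == MIG_EBUSY ?
	       "skipped (-EBUSY)" : "migrated");
	return 0;
}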
From 78afd5612deb8268bafc8b6507d72341d5ed9aac Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 22 Mar 2011 16:33:12 -0700
Subject: mm: add __GFP_OTHER_NODE flag

Add a new __GFP_OTHER_NODE flag to tell the low-level NUMA statistics in
zone_statistics() that an allocation is on behalf of another thread.  This
way the local and remote counters can still be correct, even when
background daemons like khugepaged are changing memory mappings.

This only affects the accounting, but I think it's worth doing that right
to avoid confusing users.

I first tried to just pass down the right node, but this required a lot of
changes to pass down this parameter and at least one addition of a 10th
argument to a 9 argument function.  Using the flag is a lot less
intrusive.

Open: should this also be used for migration?

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Andi Kleen
Cc: Andrea Arcangeli
Reviewed-by: KAMEZAWA Hiroyuki
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/gfp.h    | 2 ++
 include/linux/vmstat.h | 4 ++--
 mm/page_alloc.c        | 2 +-
 mm/vmstat.c            | 9 +++++++--
 4 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index dca31761b31..bfb8f934521 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -35,6 +35,7 @@ struct vm_area_struct;
 #define ___GFP_NOTRACK		0
 #endif
 #define ___GFP_NO_KSWAPD	0x400000u
+#define ___GFP_OTHER_NODE	0x800000u

 /*
  * GFP bitmasks..
@@ -83,6 +84,7 @@ struct vm_area_struct;
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */

 #define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
+#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */

 /*
  * This may seem redundant, but it's a way of annotating false positives vs.
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 833e676d6d9..461c0119664 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -220,12 +220,12 @@ static inline unsigned long node_page_state(int node,
 		zone_page_state(&zones[ZONE_MOVABLE], item);
 }

-extern void zone_statistics(struct zone *, struct zone *);
+extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);

 #else

 #define node_page_state(node, item) global_page_state(item)
-#define zone_statistics(_zl,_z) do { } while (0)
+#define zone_statistics(_zl, _z, gfp) do { } while (0)

 #endif /* CONFIG_NUMA */

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d0032bdb5d..136a547262a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1337,7 +1337,7 @@ again:
 	}

 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
-	zone_statistics(preferred_zone, zone);
+	zone_statistics(preferred_zone, zone, gfp_flags);
 	local_irq_restore(flags);

 	VM_BUG_ON(bad_range(zone, page));
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0c3b5048773..772b39b87d9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -500,8 +500,12 @@ void refresh_cpu_vm_stats(int cpu)
  * z 	    = the zone from which the allocation occurred.
  *
  * Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
  */
-void zone_statistics(struct zone *preferred_zone, struct zone *z)
+void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
 {
 	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
 		__inc_zone_state(z, NUMA_HIT);
@@ -509,7 +513,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
 		__inc_zone_state(z, NUMA_MISS);
 		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
 	}
-	if (z->node == numa_node_id())
+	if (z->node == ((flags & __GFP_OTHER_NODE) ?
+			preferred_zone->node : numa_node_id()))
 		__inc_zone_state(z, NUMA_LOCAL);
 	else
 		__inc_zone_state(z, NUMA_OTHER);
-- cgit v1.2.3-70-g09d2
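The accounting rule, stated standalone: NUMA_HIT/NUMA_MISS compare the
allocating zone's node with the preferred node, while NUMA_LOCAL/NUMA_OTHER
compare it with "home" — normally the calling CPU's node, but the preferred
node when __GFP_OTHER_NODE is passed.  A compact model (function and names
are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* Report which NUMA counters a single allocation bumps, per the rule above. */
static void account(int preferred_nid, int zone_nid, int cpu_nid,
		    bool other_node)
{
	int home_nid;

	printf("%s ", zone_nid == preferred_nid ? "NUMA_HIT" : "NUMA_MISS");

	/* __GFP_OTHER_NODE: treat the preferred node as "local", since the
	 * allocation is made on behalf of a task living there. */
	home_nid = other_node ? preferred_nid : cpu_nid;
	printf("%s\n", zone_nid == home_nid ? "NUMA_LOCAL" : "NUMA_OTHER");
}

int main(void)
{
	/* A daemon running on node 0 allocates node-1 memory for a node-1
	 * task.  Without the flag this is misreported as remote. */
	account(1, 1, 0, false);	/* NUMA_HIT NUMA_OTHER */
	account(1, 1, 0, true);		/* NUMA_HIT NUMA_LOCAL */
	return 0;
}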
From 84be48d84a53044e13aa8816aab201ab5480815d Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Tue, 22 Mar 2011 16:33:41 -0700
Subject: mm/page_alloc.c: use list_move() instead of list_del()/list_add() combination

Signed-off-by: Kirill A. Shutemov
Cc: Mel Gorman
Cc: Rik van Riel
Cc: KAMEZAWA Hiroyuki
Reviewed-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 136a547262a..3a58221f4c2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -867,9 +867,8 @@ static int move_freepages(struct zone *zone,
 		}

 		order = page_order(page);
-		list_del(&page->lru);
-		list_add(&page->lru,
-			&zone->free_area[order].free_list[migratetype]);
+		list_move(&page->lru,
+			  &zone->free_area[order].free_list[migratetype]);
 		page += 1 << order;
 		pages_moved += 1 << order;
 	}
-- cgit v1.2.3-70-g09d2
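This is a pure readability cleanup: list_move() in include/linux/list.h is
itself just the unlink-then-add pair, implemented along these lines (a
sketch; the internal helper names have varied across kernel versions):

static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);	/* unlink from current list */
	list_add(list, head);			/* re-insert after head */
}

So the generated code is the same; the call site just says what it means.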
From f212ad7cf9c73f8a7fa160e223dcb3f074441a72 Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura
Date: Wed, 23 Mar 2011 16:42:25 -0700
Subject: memcg: add memcg sanity checks at allocating and freeing pages

Add checks at page allocation and freeing time for whether the page is
used (i.e., charged) from the memcg point of view.  This check may be
useful in debugging a problem, and we did similar checks before commit
52d4b9ac ("memcg: allocate all page_cgroup at boot").

This patch adds some overhead at allocating or freeing memory, so it's
enabled only when CONFIG_DEBUG_VM is enabled.

Signed-off-by: Daisuke Nishimura
Signed-off-by: Johannes Weiner
Acked-by: KAMEZAWA Hiroyuki
Cc: Balbir Singh
Cc: Minchan Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 17 +++++++++++++++++
 mm/memcontrol.c            | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c            |  8 ++++++--
 3 files changed, 69 insertions(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5bb7be2628c..5a5ce705583 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -151,6 +151,10 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail);
 #endif

+#ifdef CONFIG_DEBUG_VM
+bool mem_cgroup_bad_page_check(struct page *page);
+void mem_cgroup_print_bad_page(struct page *page);
+#endif
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;

@@ -352,5 +356,18 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head,

 #endif /* CONFIG_CGROUP_MEM_CONT */

+#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
+static inline bool
+mem_cgroup_bad_page_check(struct page *page)
+{
+	return false;
+}
+
+static inline void
+mem_cgroup_print_bad_page(struct page *page)
+{
+}
+#endif
+
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3a2d54bdf07..0356cb6c950 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3046,6 +3046,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
 	return ret;
 }

+#ifdef CONFIG_DEBUG_VM
+static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
+{
+	struct page_cgroup *pc;
+
+	pc = lookup_page_cgroup(page);
+	if (likely(pc) && PageCgroupUsed(pc))
+		return pc;
+	return NULL;
+}
+
+bool mem_cgroup_bad_page_check(struct page *page)
+{
+	if (mem_cgroup_disabled())
+		return false;
+
+	return lookup_page_cgroup_used(page) != NULL;
+}
+
+void mem_cgroup_print_bad_page(struct page *page)
+{
+	struct page_cgroup *pc;
+
+	pc = lookup_page_cgroup_used(page);
+	if (pc) {
+		int ret = -1;
+		char *path;
+
+		printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
+		       pc, pc->flags, pc->mem_cgroup);
+
+		path = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (path) {
+			rcu_read_lock();
+			ret = cgroup_path(pc->mem_cgroup->css.cgroup,
+					  path, PATH_MAX);
+			rcu_read_unlock();
+		}
+
+		printk(KERN_CONT "(%s)\n",
+		       (ret < 0) ? "cannot get the path" : path);
+		kfree(path);
+	}
+}
+#endif
+
 static DEFINE_MUTEX(set_limit_mutex);

 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3a58221f4c2..8e5726ab0d8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@
 #include
 #include
 #include
+#include <linux/memcontrol.h>
 #include
 #include

@@ -565,7 +566,8 @@ static inline int free_pages_check(struct page *page)
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
 		(atomic_read(&page->_count) != 0) |
-		(page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
+		(page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
+		(mem_cgroup_bad_page_check(page)))) {
 		bad_page(page);
 		return 1;
 	}
@@ -754,7 +756,8 @@ static inline int check_new_page(struct page *page)
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
 		(atomic_read(&page->_count) != 0)  |
-		(page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
+		(page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
+		(mem_cgroup_bad_page_check(page)))) {
 		bad_page(page);
 		return 1;
 	}
@@ -5684,4 +5687,5 @@ void dump_page(struct page *page)
 		page, atomic_read(&page->_count), page_mapcount(page),
 		page->mapping, page->index);
 	dump_page_flags(page->flags);
+	mem_cgroup_print_bad_page(page);
 }
-- cgit v1.2.3-70-g09d2
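The pattern here is a paired-metadata invariant: a page crossing the
allocator boundary in either direction must not still be charged to any
cgroup, so the charge state is folded into the existing bad_page() checks
under a debug option.  A userspace analogue of the same invariant check
(purely illustrative):

#include <assert.h>
#include <stdbool.h>

#define NPAGES		16
#define DEBUG_VM	1		/* stand-in for CONFIG_DEBUG_VM */

static bool charged[NPAGES];		/* stand-in for PageCgroupUsed */

/* A page being allocated or freed must not be charged. */
static bool bad_page_check(int pfn)
{
	return charged[pfn];
}

int main(void)
{
	int pfn = 3;

	charged[pfn] = true;		/* charged while in use */
	charged[pfn] = false;		/* must be uncharged before freeing */
#if DEBUG_VM
	assert(!bad_page_check(pfn));	/* trips if an uncharge was missed */
#endif
	return 0;
}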
From b2b755b5f10eb32fbdc73a9907c07006b17f714b Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Thu, 24 Mar 2011 15:18:15 -0700
Subject: lib, arch: add filter argument to show_mem and fix private implementations

Commit ddd588b5dd55 ("oom: suppress nodes that are not allowed from
meminfo on oom kill") moved lib/show_mem.o out of lib/lib.a, which
resulted in build warnings on all architectures that implement their own
versions of show_mem():

  lib/lib.a(show_mem.o): In function `show_mem':
  show_mem.c:(.text+0x1f4): multiple definition of `show_mem'
  arch/sparc/mm/built-in.o:(.text+0xd70): first defined here

The fix is to remove __show_mem() and add its argument to show_mem() in
all implementations to prevent this breakage.

Architectures that implement their own show_mem() actually don't do
anything with the argument yet, but they could be made to filter nodes
that aren't allowed in the current context in the future just like the
generic implementation.

Reported-by: Stephen Rothwell
Reported-by: James Bottomley
Suggested-by: Andrew Morton
Signed-off-by: David Rientjes
Signed-off-by: Linus Torvalds
---
 arch/arm/mm/init.c        | 2 +-
 arch/ia64/mm/contig.c     | 2 +-
 arch/ia64/mm/discontig.c  | 2 +-
 arch/parisc/mm/init.c     | 2 +-
 arch/powerpc/xmon/xmon.c  | 2 +-
 arch/sparc/mm/init_32.c   | 2 +-
 arch/tile/mm/pgtable.c    | 2 +-
 arch/unicore32/mm/init.c  | 2 +-
 drivers/tty/sysrq.c       | 2 +-
 drivers/tty/vt/keyboard.c | 2 +-
 include/linux/mm.h        | 5 ++---
 lib/show_mem.c            | 7 +------
 mm/oom_kill.c             | 2 +-
 mm/page_alloc.c           | 2 +-
 14 files changed, 15 insertions(+), 21 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index b3b0f0f5053..e5f6fc42834 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -78,7 +78,7 @@ __tagtable(ATAG_INITRD2, parse_tag_initrd2);
  */
 struct meminfo meminfo;

-void show_mem(void)
+void show_mem(unsigned int filter)
 {
 	int free = 0, total = 0, reserved = 0;
 	int shared = 0, cached = 0, slab = 0, i;
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index 54bf5405981..9a018cde5d8 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -36,7 +36,7 @@ static unsigned long max_gap;
  * Shows a simple page count of reserved and used pages in the system.
  * For discontig machines, it does this on a per-pgdat basis.
  */
-void show_mem(void)
+void show_mem(unsigned int filter)
 {
 	int i, total_reserved = 0;
 	int total_shared = 0, total_cached = 0;
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 61620323bb6..82ab1bc6afb 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -614,7 +614,7 @@ void __cpuinit *per_cpu_init(void)
  * Shows a simple page count of reserved and used pages in the system.
  * For discontig machines, it does this on a per-pgdat basis.
  */
-void show_mem(void)
+void show_mem(unsigned int filter)
 {
 	int i, total_reserved = 0;
 	int total_shared = 0, total_cached = 0;
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index f4f4d700833..b7ed8d7a9b3 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -544,7 +544,7 @@ void __init mem_init(void)
 unsigned long *empty_zero_page __read_mostly;
 EXPORT_SYMBOL(empty_zero_page);

-void show_mem(void)
+void show_mem(unsigned int filter)
 {
 	int i,free = 0,total = 0,reserved = 0;
 	int shared = 0, cached = 0;
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index d17d04cfb2c..33794c1d92c 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -821,7 +821,7 @@ cmds(struct pt_regs *excp)
 			memzcan();
 			break;
 		case 'i':
-			show_mem();
+			show_mem(0);
 			break;
 		default:
 			termch = cmd;
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index 6d0e02c4fe0..4c31e2b6e71 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -75,7 +75,7 @@ void __init kmap_init(void)
 	kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE);
 }

-void show_mem(void)
+void show_mem(unsigned int filter)
 {
 	printk("Mem-info:\n");
 	show_free_areas();
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 1a2b36f8866..de7d8e21e01 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -41,7 +41,7 @@
  * The normal show_free_areas() is too verbose on Tile, with dozens
  * of processors and often four NUMA zones each with high and lowmem.
  */
-void show_mem(void)
+void show_mem(unsigned int filter)
 {
 	struct zone *zone;

diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index 3dbe3709b69..1fc02633f70 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -55,7 +55,7 @@ early_param("initrd", early_initrd);
  */
 struct meminfo meminfo;

-void show_mem(void)
+void show_mem(unsigned int filter)
 {
 	int free = 0, total = 0, reserved = 0;
 	int shared = 0, cached = 0, slab = 0, i;
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 81f13958e75..43db715f150 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -306,7 +306,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = {

 static void sysrq_handle_showmem(int key)
 {
-	show_mem();
+	show_mem(0);
 }
 static struct sysrq_key_op sysrq_showmem_op = {
 	.handler	= sysrq_handle_showmem,
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 6dd3c68c13a..d6b342b5b42 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -600,7 +600,7 @@ static void fn_scroll_back(struct vc_data *vc)

 static void fn_show_mem(struct vc_data *vc)
 {
-	show_mem();
+	show_mem(0);
 }

 static void fn_show_state(struct vc_data *vc)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f9535b2c955..7606d7db96c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -861,7 +861,7 @@ extern void pagefault_out_of_memory(void);
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)

 /*
- * Flags passed to __show_mem() and __show_free_areas() to suppress output in
+ * Flags passed to show_mem() and __show_free_areas() to suppress output in
  * various contexts.
  */
 #define SHOW_MEM_FILTER_NODES	(0x0001u)	/* filter disallowed nodes */
@@ -1360,8 +1360,7 @@ extern void setup_per_zone_wmarks(void);
 extern void calculate_zone_inactive_ratio(struct zone *zone);
 extern void mem_init(void);
 extern void __init mmap_init(void);
-extern void show_mem(void);
-extern void __show_mem(unsigned int flags);
+extern void show_mem(unsigned int flags);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
 extern int after_bootmem;
diff --git a/lib/show_mem.c b/lib/show_mem.c
index d8d602b58c3..90cbe4bb596 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -9,7 +9,7 @@
 #include
 #include

-void __show_mem(unsigned int filter)
+void show_mem(unsigned int filter)
 {
 	pg_data_t *pgdat;
 	unsigned long total = 0, reserved = 0, shared = 0,
@@ -61,8 +61,3 @@ void __show_mem(unsigned int filter)
 			quicklist_total_size());
 #endif
 }
-
-void show_mem(void)
-{
-	__show_mem(0);
-}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 62a5cec08a1..6a819d1b2c7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -406,7 +406,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 	task_unlock(current);
 	dump_stack();
 	mem_cgroup_print_oom_info(mem, p);
-	__show_mem(SHOW_MEM_FILTER_NODES);
+	show_mem(SHOW_MEM_FILTER_NODES);
 	if (sysctl_oom_dump_tasks)
 		dump_tasks(mem, nodemask);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8e5726ab0d8..d6e7ba7373b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2195,7 +2195,7 @@ nopage:
 			current->comm, order, gfp_mask);
 		dump_stack();
 		if (!should_suppress_show_mem())
-			__show_mem(filter);
+			show_mem(filter);
 	}
 	return page;
 got_pg:
-- cgit v1.2.3-70-g09d2
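With the unified prototype, any private show_mem() could adopt the same node
filtering as the generic version.  A sketch of what that would look like —
hypothetical, since as the message notes no architecture used the argument at
the time; the helpers shown (for_each_online_pgdat(), node_isset(),
cpuset_current_mems_allowed) are the ones the generic code already uses:

void show_mem(unsigned int filter)
{
	pg_data_t *pgdat;

	printk("Mem-info:\n");
	show_free_areas();
	for_each_online_pgdat(pgdat) {
		/* Honor SHOW_MEM_FILTER_NODES the way lib/show_mem.c could:
		 * skip nodes outside current's cpuset. */
		if ((filter & SHOW_MEM_FILTER_NODES) &&
		    !node_isset(pgdat->node_id, cpuset_current_mems_allowed))
			continue;
		/* ... per-node reporting as before ... */
	}
}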