From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 10 Mar 2011 08:52:07 +0100
Subject: block: remove per-queue plugging

Code has been converted over to the new explicit on-stack plugging,
and delay users have been converted to use the new API for that.
So lets kill off the old plugging along with aops->sync_page().

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 17497d0cd8b..251bed73ac0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -358,7 +358,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
 static void handle_write_error(struct address_space *mapping,
 				struct page *page, int error)
 {
-	lock_page_nosync(page);
+	lock_page(page);
 	if (page_mapping(page) == mapping)
 		mapping_set_error(mapping, error);
 	unlock_page(page);
-- 
cgit v1.2.3-70-g09d2


From d527caf22e48480b102c7c6ee5b9ba12170148f7 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 22 Mar 2011 16:30:38 -0700
Subject: mm: compaction: prevent kswapd compacting memory to reduce CPU usage

This patch reverts 5a03b051 ("thp: use compaction in kswapd for GFP_ATOMIC
order > 0") due to reports stating that kswapd CPU usage was higher and
IRQs were being disabled more frequently.  This was reported at
http://www.spinics.net/linux/fedora/alsa-user/msg09885.html.

Without this patch applied, CPU usage by kswapd hovers around the 20% mark
according to the tester (Arthur Marsh:
http://www.spinics.net/linux/fedora/alsa-user/msg09899.html).  With this
patch applied, it's around 2%.

The problem is not related to THP which specifies __GFP_NO_KSWAPD but is
triggered by high-order allocations hitting the low watermark for their
order and waking kswapd on kernels with CONFIG_COMPACTION set.  The most
common trigger for this is network cards configured for jumbo frames but
it's also possible it'll be triggered by fork-heavy workloads (order-1)
and some wireless cards which depend on order-1 allocations.

The symptoms for the user will be high CPU usage by kswapd in low-memory
situations which could be confused with another writeback problem.  While
a patch like 5a03b051 may be reintroduced in the future, this patch plays
it safe for now and reverts it.

[mel@csn.ul.ie: Beefed up the changelog]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reported-by: Arthur Marsh <arthur.marsh@internode.on.net>
Tested-by: Arthur Marsh <arthur.marsh@internode.on.net>
Cc: <stable@kernel.org>		[2.6.38.1]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h |  9 ++-------
 mm/compaction.c            | 24 +++---------------------
 mm/vmscan.c                | 18 +-----------------
 3 files changed, 6 insertions(+), 45 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index dfa2ed4c0d2..cc9f7a42864 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -11,9 +11,6 @@
 /* The full zone was compacted */
 #define COMPACT_COMPLETE	3
 
-#define COMPACT_MODE_DIRECT_RECLAIM	0
-#define COMPACT_MODE_KSWAPD		1
-
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
@@ -28,8 +25,7 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			bool sync);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
 extern unsigned long compact_zone_order(struct zone *zone, int order,
-					gfp_t gfp_mask, bool sync,
-					int compact_mode);
+					gfp_t gfp_mask, bool sync);
 
 /* Do not skip compaction more than 64 times */
 #define COMPACT_MAX_DEFER_SHIFT 6
@@ -74,8 +70,7 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order)
 }
 
 static inline unsigned long compact_zone_order(struct zone *zone, int order,
-					       gfp_t gfp_mask, bool sync,
-					       int compact_mode)
+					       gfp_t gfp_mask, bool sync)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/mm/compaction.c b/mm/compaction.c
index 8be430b812d..dcb058bd76c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,8 +42,6 @@ struct compact_control {
 	unsigned int order;		/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
-
-	int compact_mode;
 };
 
 static unsigned long release_freepages(struct list_head *freelist)
@@ -397,10 +395,7 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_COMPLETE;
 
 	/* Compaction run is not finished if the watermark is not met */
-	if (cc->compact_mode != COMPACT_MODE_KSWAPD)
-		watermark = low_wmark_pages(zone);
-	else
-		watermark = high_wmark_pages(zone);
+	watermark = low_wmark_pages(zone);
 	watermark += (1 << cc->order);
 
 	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
@@ -413,15 +408,6 @@ static int compact_finished(struct zone *zone,
 	if (cc->order == -1)
 		return COMPACT_CONTINUE;
 
-	/*
-	 * Generating only one page of the right order is not enough
-	 * for kswapd, we must continue until we're above the high
-	 * watermark as a pool for high order GFP_ATOMIC allocations
-	 * too.
-	 */
-	if (cc->compact_mode == COMPACT_MODE_KSWAPD)
-		return COMPACT_CONTINUE;
-
 	/* Direct compactor: Is a suitable page free? */
 	for (order = cc->order; order < MAX_ORDER; order++) {
 		/* Job done if page is free of the right migratetype */
@@ -543,8 +529,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync,
-				 int compact_mode)
+				 bool sync)
 {
 	struct compact_control cc = {
 		.nr_freepages = 0,
@@ -553,7 +538,6 @@ unsigned long compact_zone_order(struct zone *zone,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.compact_mode = compact_mode,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -599,8 +583,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 								nodemask) {
 		int status;
 
-		status = compact_zone_order(zone, order, gfp_mask, sync,
-					    COMPACT_MODE_DIRECT_RECLAIM);
+		status = compact_zone_order(zone, order, gfp_mask, sync);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
@@ -631,7 +614,6 @@ static int compact_node(int nid)
 			.nr_freepages = 0,
 			.nr_migratepages = 0,
 			.order = -1,
-			.compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
 		};
 
 		zone = &pgdat->node_zones[zoneid];
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6771ea70bfe..3b4a41d7248 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2397,7 +2397,6 @@ loop_again:
 		 * cause too much scanning of the lower zones.
 		 */
 		for (i = 0; i <= end_zone; i++) {
-			int compaction;
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
 
@@ -2428,24 +2427,9 @@ loop_again:
 			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
 
-			compaction = 0;
-			if (order &&
-			    zone_watermark_ok(zone, 0,
-					       high_wmark_pages(zone),
-					      end_zone, 0) &&
-			    !zone_watermark_ok(zone, order,
-					       high_wmark_pages(zone),
-					       end_zone, 0)) {
-				compact_zone_order(zone,
-						   order,
-						   sc.gfp_mask, false,
-						   COMPACT_MODE_KSWAPD);
-				compaction = 1;
-			}
-
 			if (zone->all_unreclaimable)
 				continue;
-			if (!compaction && nr_slab == 0 &&
+			if (nr_slab == 0 &&
 			    !zone_reclaimable(zone))
 				zone->all_unreclaimable = 1;
 			/*
-- 
cgit v1.2.3-70-g09d2


From e64a782fec684c29a8204c51b3cb554dce588592 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Tue, 22 Mar 2011 16:32:44 -0700
Subject: mm: change __remove_from_page_cache()

Now we renamed remove_from_page_cache with delete_from_page_cache.  As
consistency of __remove_from_swap_cache and remove_from_swap_cache, we
change internal page cache handling function name, too.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 2 +-
 mm/filemap.c            | 8 ++++----
 mm/memory-failure.c     | 2 +-
 mm/truncate.c           | 2 +-
 mm/vmscan.c             | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ea36ff29a51..29ebba54c23 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -456,7 +456,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
-extern void __remove_from_page_cache(struct page *page);
+extern void __delete_from_page_cache(struct page *page);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 1cfb8fd84b2..cb476e70cf1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -108,11 +108,11 @@
  */
 
 /*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
  * is safe.  The caller must hold the mapping's tree_lock.
  */
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
@@ -154,7 +154,7 @@ void delete_from_page_cache(struct page *page)
 
 	freepage = mapping->a_ops->freepage;
 	spin_lock_irq(&mapping->tree_lock);
-	__remove_from_page_cache(page);
+	__delete_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
 	mem_cgroup_uncharge_cache_page(page);
 
@@ -444,7 +444,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		new->index = offset;
 
 		spin_lock_irq(&mapping->tree_lock);
-		__remove_from_page_cache(old);
+		__delete_from_page_cache(old);
 		error = radix_tree_insert(&mapping->page_tree, offset, new);
 		BUG_ON(error);
 		mapping->nrpages++;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 99ccb447262..e0af336530c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1130,7 +1130,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 
 	/*
 	 * Now take care of user space mappings.
-	 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
+	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
 	 */
 	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
 		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
diff --git a/mm/truncate.c b/mm/truncate.c
index 7c617b5f03e..3d2ae1f423d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -388,7 +388,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 
 	clear_page_mlock(page);
 	BUG_ON(page_has_private(page));
-	__remove_from_page_cache(page);
+	__delete_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
 	mem_cgroup_uncharge_cache_page(page);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3b4a41d7248..665b090b6c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -514,7 +514,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
 
 		freepage = mapping->a_ops->freepage;
 
-		__remove_from_page_cache(page);
+		__delete_from_page_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
 		mem_cgroup_uncharge_cache_page(page);
 
-- 
cgit v1.2.3-70-g09d2


From 8afdcece4911e51cfff2b50a269418914cab8a3f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Tue, 22 Mar 2011 16:33:04 -0700
Subject: mm: vmscan: kswapd should not free an excessive number of pages when
 balancing small zones

When reclaiming for order-0 pages, kswapd requires that all zones be
balanced.  Each cycle through balance_pgdat() does background ageing on
all zones if necessary and applies equal pressure on the inactive zone
unless a lot of pages are free already.

A "lot of free pages" is defined as a "balance gap" above the high
watermark which is currently 7*high_watermark.  Historically this was
reasonable as min_free_kbytes was small.  However, on systems using huge
pages, it is recommended that min_free_kbytes is higher and it is tuned
with hugeadm --set-recommended-min_free_kbytes.  With the introduction of
transparent huge page support, this recommended value is also applied.  On
X86-64 with 4G of memory, min_free_kbytes becomes 67584 so one would
expect around 68M of memory to be free.  The Normal zone is approximately
35000 pages so under even normal memory pressure such as copying a large
file, it gets exhausted quickly.  As it is getting exhausted, kswapd
applies pressure equally to all zones, including the DMA32 zone.  DMA32 is
approximately 700,000 pages with a high watermark of around 23,000 pages.
In this situation, kswapd will reclaim around (23000*8 where 8 is the high
watermark + balance gap of 7 * high watermark) pages or 718M of pages
before the zone is ignored.  What the user sees is that free memory far
higher than it should be.

To avoid an excessive number of pages being reclaimed from the larger
zones, explicitely defines the "balance gap" to be either 1% of the zone
or the low watermark for the zone, whichever is smaller.  While kswapd
will check all zones to apply pressure, it'll ignore zones that meets the
(high_wmark + balance_gap) watermark.

To test this, 80G were copied from a partition and the amount of memory
being used was recorded.  A comparison of a patch and unpatched kernel can
be seen at
http://www.csn.ul.ie/~mel/postings/minfree-20110222/memory-usage-hydra.ps
and shows that kswapd is not reclaiming as much memory with the patch
applied.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: "Chen, Tim C" <tim.c.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  9 +++++++++
 mm/vmscan.c          | 16 +++++++++++++---
 2 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index c335055c425..ed6ebe690f4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -155,6 +155,15 @@ enum {
 #define SWAP_CLUSTER_MAX 32
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
 
+/*
+ * Ratio between the present memory in the zone and the "gap" that
+ * we're allowing kswapd to shrink in addition to the per-zone high
+ * wmark, even for zones that already have the high wmark satisfied,
+ * in order to provide better per-zone lru behavior. We are ok to
+ * spend not more than 1% of the memory for this zone balancing "gap".
+ */
+#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100
+
 #define SWAP_MAP_MAX	0x3e	/* Max duplication count, in first swap_map */
 #define SWAP_MAP_BAD	0x3f	/* Note pageblock is bad, in first swap_map */
 #define SWAP_HAS_CACHE	0x40	/* Flag page is cached, in first swap_map */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 665b090b6c7..060e4c19140 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2399,6 +2399,7 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
+			unsigned long balance_gap;
 
 			if (!populated_zone(zone))
 				continue;
@@ -2415,11 +2416,20 @@ loop_again:
 			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
 
 			/*
-			 * We put equal pressure on every zone, unless one
-			 * zone has way too many pages free already.
+			 * We put equal pressure on every zone, unless
+			 * one zone has way too many pages free
+			 * already. The "too many pages" is defined
+			 * as the high wmark plus a "gap" where the
+			 * gap is either the low watermark or 1%
+			 * of the zone, whichever is smaller.
 			 */
+			balance_gap = min(low_wmark_pages(zone),
+				(zone->present_pages +
+					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+				KSWAPD_ZONE_BALANCE_GAP_RATIO);
 			if (!zone_watermark_ok_safe(zone, order,
-					8*high_wmark_pages(zone), end_zone, 0))
+					high_wmark_pages(zone) + balance_gap,
+					end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
-- 
cgit v1.2.3-70-g09d2