From 5bfd7560979062ad75c9805c1719cec990b5db29 Mon Sep 17 00:00:00 2001
From: Kevin Cernekee <cernekee@gmail.com>
Date: Sun, 5 Jul 2009 12:08:19 -0700
Subject: Fix virt_to_phys() warnings

These warnings were observed on MIPS32 using 2.6.31-rc1 and gcc-4.2.0:

mm/page_alloc.c: In function 'alloc_pages_exact':
mm/page_alloc.c:1986: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast

drivers/usb/mon/mon_bin.c: In function 'mon_alloc_buff':
drivers/usb/mon/mon_bin.c:1264: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast

[akpm@linux-foundation.org: fix kernel/perf_counter.c too]
Signed-off-by: Kevin Cernekee <cernekee@gmail.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e0f2cdf9d8b..ad7cd1c56b0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1983,7 +1983,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
 		unsigned long alloc_end = addr + (PAGE_SIZE << order);
 		unsigned long used = addr + PAGE_ALIGN(size);
 
-		split_page(virt_to_page(addr), order);
+		split_page(virt_to_page((void *)addr), order);
 		while (used < alloc_end) {
 			free_page(used);
 			used += PAGE_SIZE;
-- 
cgit v1.2.3-70-g09d2


From 264ef8a904943ed7d0b04fa958894d7a5c2b2c61 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 7 Jul 2009 10:33:01 +0100
Subject: kmemleak: Remove alloc_bootmem annotations introduced in the past

kmemleak_alloc() calls were added in some places where alloc_bootmem was
called. Since now kmemleak tracks bootmem allocations, these explicit
calls should be run.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 kernel/pid.c    |  7 -------
 mm/page_alloc.c | 14 +++-----------
 2 files changed, 3 insertions(+), 18 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/kernel/pid.c b/kernel/pid.c
index 5fa1db48d8b..31310b5d3f5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,7 +36,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/init_task.h>
 #include <linux/syscalls.h>
-#include <linux/kmemleak.h>
 
 #define pid_hashfn(nr, ns)	\
 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -513,12 +512,6 @@ void __init pidhash_init(void)
 	pid_hash = alloc_bootmem(pidhash_size *	sizeof(*(pid_hash)));
 	if (!pid_hash)
 		panic("Could not alloc pidhash!\n");
-	/*
-	 * pid_hash contains references to allocated struct pid objects and it
-	 * must be scanned by kmemleak to avoid false positives.
-	 */
-	kmemleak_alloc(pid_hash, pidhash_size *	sizeof(*(pid_hash)), 0,
-		       GFP_KERNEL);
 	for (i = 0; i < pidhash_size; i++)
 		INIT_HLIST_HEAD(&pid_hash[i]);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ad7cd1c56b0..3ef628845f0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4745,8 +4745,10 @@ void *__init alloc_large_system_hash(const char *tablename,
 			 * some pages at the end of hash table which
 			 * alloc_pages_exact() automatically does
 			 */
-			if (get_order(size) < MAX_ORDER)
+			if (get_order(size) < MAX_ORDER) {
 				table = alloc_pages_exact(size, GFP_ATOMIC);
+				kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+			}
 		}
 	} while (!table && size > PAGE_SIZE && --log2qty);
 
@@ -4764,16 +4766,6 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (_hash_mask)
 		*_hash_mask = (1 << log2qty) - 1;
 
-	/*
-	 * If hashdist is set, the table allocation is done with __vmalloc()
-	 * which invokes the kmemleak_alloc() callback. This function may also
-	 * be called before the slab and kmemleak are initialised when
-	 * kmemleak simply buffers the request to be executed later
-	 * (GFP_ATOMIC flag ignored in this case).
-	 */
-	if (!hashdist)
-		kmemleak_alloc(table, size, 1, GFP_ATOMIC);
-
 	return table;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 8aa7e847d834ed937a9ad37a0f2ad5b8584c1ab0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 9 Jul 2009 14:52:32 +0200
Subject: Fix congestion_wait() sync/async vs read/write confusion

Commit 1faa16d22877f4839bd433547d770c676d1d964c accidentally broke
the bdi congestion wait queue logic, causing us to wait on congestion
for WRITE (== 1) when we really wanted BLK_RW_ASYNC (== 0) instead.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/x86/lib/usercopy_32.c  |  2 +-
 drivers/block/pktcdvd.c     | 10 ++++++----
 drivers/md/dm-crypt.c       |  2 +-
 fs/fat/file.c               |  2 +-
 fs/fuse/dev.c               |  8 ++++----
 fs/nfs/write.c              |  8 +++++---
 fs/reiserfs/journal.c       |  2 +-
 fs/xfs/linux-2.6/kmem.c     |  4 ++--
 fs/xfs/linux-2.6/xfs_buf.c  |  2 +-
 include/linux/backing-dev.h |  6 +++---
 include/linux/blkdev.h      |  8 ++++----
 mm/backing-dev.c            |  7 +++----
 mm/memcontrol.c             |  2 +-
 mm/page-writeback.c         |  8 ++++----
 mm/page_alloc.c             |  4 ++--
 mm/vmscan.c                 |  8 ++++----
 16 files changed, 43 insertions(+), 40 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7c8ca91bb9e..1f118d462ac 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -751,7 +751,7 @@ survive:
 
 			if (retval == -ENOMEM && is_global_init(current)) {
 				up_read(&current->mm->mmap_sem);
-				congestion_wait(WRITE, HZ/50);
+				congestion_wait(BLK_RW_ASYNC, HZ/50);
 				goto survive;
 			}
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 83650e00632..99a506f619b 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1372,8 +1372,10 @@ try_next_bio:
 	wakeup = (pd->write_congestion_on > 0
 	 		&& pd->bio_queue_size <= pd->write_congestion_off);
 	spin_unlock(&pd->lock);
-	if (wakeup)
-		clear_bdi_congested(&pd->disk->queue->backing_dev_info, WRITE);
+	if (wakeup) {
+		clear_bdi_congested(&pd->disk->queue->backing_dev_info,
+					BLK_RW_ASYNC);
+	}
 
 	pkt->sleep_time = max(PACKET_WAIT_TIME, 1);
 	pkt_set_state(pkt, PACKET_WAITING_STATE);
@@ -2592,10 +2594,10 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 	spin_lock(&pd->lock);
 	if (pd->write_congestion_on > 0
 	    && pd->bio_queue_size >= pd->write_congestion_on) {
-		set_bdi_congested(&q->backing_dev_info, WRITE);
+		set_bdi_congested(&q->backing_dev_info, BLK_RW_ASYNC);
 		do {
 			spin_unlock(&pd->lock);
-			congestion_wait(WRITE, HZ);
+			congestion_wait(BLK_RW_ASYNC, HZ);
 			spin_lock(&pd->lock);
 		} while(pd->bio_queue_size > pd->write_congestion_off);
 	}
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9933eb861c7..529e2ba505c 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -776,7 +776,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		 * But don't wait if split was due to the io size restriction
 		 */
 		if (unlikely(out_of_pages))
-			congestion_wait(WRITE, HZ/100);
+			congestion_wait(BLK_RW_ASYNC, HZ/100);
 
 		/*
 		 * With async crypto it is unsafe to share the crypto context
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b28ea646ff6..f042b965c95 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -134,7 +134,7 @@ static int fat_file_release(struct inode *inode, struct file *filp)
 	if ((filp->f_mode & FMODE_WRITE) &&
 	     MSDOS_SB(inode->i_sb)->options.flush) {
 		fat_flush_inodes(inode->i_sb, inode, NULL);
-		congestion_wait(WRITE, HZ/10);
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
 	}
 	return 0;
 }
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f58ecbc416c..6484eb75acd 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -286,8 +286,8 @@ __releases(&fc->lock)
 		}
 		if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
 		    fc->connected && fc->bdi_initialized) {
-			clear_bdi_congested(&fc->bdi, READ);
-			clear_bdi_congested(&fc->bdi, WRITE);
+			clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
+			clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
 		}
 		fc->num_background--;
 		fc->active_background--;
@@ -414,8 +414,8 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 		fc->blocked = 1;
 	if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
 	    fc->bdi_initialized) {
-		set_bdi_congested(&fc->bdi, READ);
-		set_bdi_congested(&fc->bdi, WRITE);
+		set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
+		set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
 	}
 	list_add_tail(&req->list, &fc->bg_queue);
 	flush_bg_queue(fc);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ce728829f79..0a0a2ff767c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -202,8 +202,10 @@ static int nfs_set_page_writeback(struct page *page)
 		struct nfs_server *nfss = NFS_SERVER(inode);
 
 		if (atomic_long_inc_return(&nfss->writeback) >
-				NFS_CONGESTION_ON_THRESH)
-			set_bdi_congested(&nfss->backing_dev_info, WRITE);
+				NFS_CONGESTION_ON_THRESH) {
+			set_bdi_congested(&nfss->backing_dev_info,
+						BLK_RW_ASYNC);
+		}
 	}
 	return ret;
 }
@@ -215,7 +217,7 @@ static void nfs_end_page_writeback(struct page *page)
 
 	end_page_writeback(page);
 	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
-		clear_bdi_congested(&nfss->backing_dev_info, WRITE);
+		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
 
 /*
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 77f5bb746bf..90622200b39 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -997,7 +997,7 @@ static int reiserfs_async_progress_wait(struct super_block *s)
 	DEFINE_WAIT(wait);
 	struct reiserfs_journal *j = SB_JOURNAL(s);
 	if (atomic_read(&j->j_async_throttle))
-		congestion_wait(WRITE, HZ / 10);
+		congestion_wait(BLK_RW_ASYNC, HZ / 10);
 	return 0;
 }
 
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 1cd3b55ee3d..2d3f90afe5f 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -53,7 +53,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 			printk(KERN_ERR "XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
 					__func__, lflags);
-		congestion_wait(WRITE, HZ/50);
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
 
@@ -130,7 +130,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 			printk(KERN_ERR "XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
 					__func__, lflags);
-		congestion_wait(WRITE, HZ/50);
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 1418b916fc2..0c93c7ef3d1 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -412,7 +412,7 @@ _xfs_buf_lookup_pages(
 
 			XFS_STATS_INC(xb_page_retries);
 			xfsbufd_wakeup(0, gfp_mask);
-			congestion_wait(WRITE, HZ/50);
+			congestion_wait(BLK_RW_ASYNC, HZ/50);
 			goto retry;
 		}
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0ec2c594868..3a52a63c135 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -229,9 +229,9 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
 				  (1 << BDI_async_congested));
 }
 
-void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
-void set_bdi_congested(struct backing_dev_info *bdi, int rw);
-long congestion_wait(int rw, long timeout);
+void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
+void set_bdi_congested(struct backing_dev_info *bdi, int sync);
+long congestion_wait(int sync, long timeout);
 
 
 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 49ae07951d5..bb3d3997870 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -779,18 +779,18 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
  * congested queues, and wake up anyone who was waiting for requests to be
  * put back.
  */
-static inline void blk_clear_queue_congested(struct request_queue *q, int rw)
+static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
 {
-	clear_bdi_congested(&q->backing_dev_info, rw);
+	clear_bdi_congested(&q->backing_dev_info, sync);
 }
 
 /*
  * A queue has just entered congestion.  Flag that in the queue's VM-visible
  * state flags and increment the global gounter of congested queues.
  */
-static inline void blk_set_queue_congested(struct request_queue *q, int rw)
+static inline void blk_set_queue_congested(struct request_queue *q, int sync)
 {
-	set_bdi_congested(&q->backing_dev_info, rw);
+	set_bdi_congested(&q->backing_dev_info, sync);
 }
 
 extern void blk_start_queue(struct request_queue *q);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 493b468a503..c86edd24429 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -283,7 +283,6 @@ static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 	};
 
-
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
 	enum bdi_state bit;
@@ -308,18 +307,18 @@ EXPORT_SYMBOL(set_bdi_congested);
 
 /**
  * congestion_wait - wait for a backing_dev to become uncongested
- * @rw: READ or WRITE
+ * @sync: SYNC or ASYNC IO
  * @timeout: timeout in jiffies
  *
  * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
  * write congestion.  If no backing_devs are congested then just wait for the
  * next write to be completed.
  */
-long congestion_wait(int rw, long timeout)
+long congestion_wait(int sync, long timeout)
 {
 	long ret;
 	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
+	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 	ret = io_schedule_timeout(timeout);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2fa20dadf4..e717964cb5a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1973,7 +1973,7 @@ try_to_free:
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
-			congestion_wait(WRITE, HZ/10);
+			congestion_wait(BLK_RW_ASYNC, HZ/10);
 		}
 
 	}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7687879253b..81627ebcd31 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -575,7 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
-		congestion_wait(WRITE, HZ/10);
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
 	}
 
 	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -669,7 +669,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
                 if (global_page_state(NR_UNSTABLE_NFS) +
 			global_page_state(NR_WRITEBACK) <= dirty_thresh)
                         	break;
-                congestion_wait(WRITE, HZ/10);
+                congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		/*
 		 * The caller might hold locks which can prevent IO completion
@@ -715,7 +715,7 @@ static void background_writeout(unsigned long _min_pages)
 		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 			/* Wrote less than expected */
 			if (wbc.encountered_congestion || wbc.more_io)
-				congestion_wait(WRITE, HZ/10);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
 			else
 				break;
 		}
@@ -787,7 +787,7 @@ static void wb_kupdate(unsigned long arg)
 		writeback_inodes(&wbc);
 		if (wbc.nr_to_write > 0) {
 			if (wbc.encountered_congestion || wbc.more_io)
-				congestion_wait(WRITE, HZ/10);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
 			else
 				break;	/* All the old data is written */
 		}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ad7cd1c56b0..a35eeab2724 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1666,7 +1666,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 			preferred_zone, migratetype);
 
 		if (!page && gfp_mask & __GFP_NOFAIL)
-			congestion_wait(WRITE, HZ/50);
+			congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (!page && (gfp_mask & __GFP_NOFAIL));
 
 	return page;
@@ -1831,7 +1831,7 @@ rebalance:
 	pages_reclaimed += did_some_progress;
 	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
 		/* Wait for some write requests to complete then retry */
-		congestion_wait(WRITE, HZ/50);
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 54155268dfc..dea7abd3109 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1104,7 +1104,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		 */
 		if (nr_freed < nr_taken && !current_is_kswapd() &&
 		    lumpy_reclaim) {
-			congestion_wait(WRITE, HZ/10);
+			congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 			/*
 			 * The attempt at page out may have made some
@@ -1721,7 +1721,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 		/* Take a nap, wait for some writeback to complete */
 		if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
-			congestion_wait(WRITE, HZ/10);
+			congestion_wait(BLK_RW_ASYNC, HZ/10);
 	}
 	/* top priority shrink_zones still had more to do? don't OOM, then */
 	if (!sc->all_unreclaimable && scanning_global_lru(sc))
@@ -1960,7 +1960,7 @@ loop_again:
 		 * another pass across the zones.
 		 */
 		if (total_scanned && priority < DEF_PRIORITY - 2)
-			congestion_wait(WRITE, HZ/10);
+			congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		/*
 		 * We do this so kswapd doesn't build up large priorities for
@@ -2233,7 +2233,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 				goto out;
 
 			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
-				congestion_wait(WRITE, HZ / 10);
+				congestion_wait(BLK_RW_ASYNC, HZ / 10);
 		}
 	}
 
-- 
cgit v1.2.3-70-g09d2


From e084b2d95e48b31aa45f9c49ffc6cdae8bdb21d4 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 29 Jul 2009 15:02:04 -0700
Subject: page-allocator: preserve PFN ordering when __GFP_COLD is set

Fix a post-2.6.24 performace regression caused by
3dfa5721f12c3d5a441448086bee156887daa961 ("page-allocator: preserve PFN
ordering when __GFP_COLD is set").

Narayanan reports "The regression is around 15%.  There is no disk controller
as our setup is based on Samsung OneNAND used as a memory mapped device on a
OMAP2430 based board."

The page allocator tries to preserve contiguous PFN ordering when returning
pages such that repeated callers to the allocator have a strong chance of
getting physically contiguous pages, particularly when external fragmentation
is low.  However, of the bulk of the allocations have __GFP_COLD set as they
are due to aio_read() for example, then the PFNs are in reverse PFN order.
This can cause performance degration when used with IO controllers that could
have merged the requests.

This patch attempts to preserve the contiguous ordering of PFNs for users of
__GFP_COLD.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reported-by: Narayananu Gopalakrishnan <narayanan.g@samsung.com>
Tested-by: Narayanan Gopalakrishnan <narayanan.g@samsung.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index caa92689aac..ae28c22a7fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -882,7 +882,7 @@ retry_reserve:
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order, 
 			unsigned long count, struct list_head *list,
-			int migratetype)
+			int migratetype, int cold)
 {
 	int i;
 	
@@ -901,7 +901,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * merge IO requests if the physical pages are ordered
 		 * properly.
 		 */
-		list_add(&page->lru, list);
+		if (likely(cold == 0))
+			list_add(&page->lru, list);
+		else
+			list_add_tail(&page->lru, list);
 		set_page_private(page, migratetype);
 		list = &page->lru;
 	}
@@ -1119,7 +1122,8 @@ again:
 		local_irq_save(flags);
 		if (!pcp->count) {
 			pcp->count = rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
+					pcp->batch, &pcp->list,
+					migratetype, cold);
 			if (unlikely(!pcp->count))
 				goto failed;
 		}
@@ -1138,7 +1142,8 @@ again:
 		/* Allocate more to the pcp list if necessary */
 		if (unlikely(&page->lru == &pcp->list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
+					pcp->batch, &pcp->list,
+					migratetype, cold);
 			page = list_entry(pcp->list.next, struct page, lru);
 		}
 
-- 
cgit v1.2.3-70-g09d2


From 6583bb64fc370842b32a87c67750c26f6d559af0 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 29 Jul 2009 15:02:06 -0700
Subject: mm: avoid endless looping for oom killed tasks

If a task is oom killed and still cannot find memory when trying with
no watermarks, it's better to fail the allocation attempt than to loop
endlessly.  Direct reclaim has already failed and the oom killer will
be a no-op since current has yet to die, so there is no other
alternative for allocations that are not __GFP_NOFAIL.

Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ae28c22a7fd..2dbb2fc6883 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1794,6 +1794,10 @@ rebalance:
 	if (p->flags & PF_MEMALLOC)
 		goto nopage;
 
+	/* Avoid allocations with no watermarks from looping endlessly */
+	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+		goto nopage;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
-- 
cgit v1.2.3-70-g09d2


From 1fc28b70fe2dbf87e061b6ce5091a1f8e4e5d4e7 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 29 Jul 2009 15:04:08 -0700
Subject: page-allocator: allow too high-order warning messages to be
 suppressed with __GFP_NOWARN

The page allocator warns once when an order >= MAX_ORDER is specified.
This is to catch callers of the allocator that are always falling back to
their worst-case when it was not expected.  However, there are cases where
the caller is behaving correctly but cannot suppress the warning.  This
patch allows the warning to be suppressed by the callers by specifying
__GFP_NOWARN.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2dbb2fc6883..d052abbe306 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1745,8 +1745,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * be using allocators in order of preference for an area that is
 	 * too large.
 	 */
-	if (WARN_ON_ONCE(order >= MAX_ORDER))
+	if (order >= MAX_ORDER) {
+		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
 		return NULL;
+	}
 
 	/*
 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
-- 
cgit v1.2.3-70-g09d2