From f7e839dd36fd940b0202cfb7d39b2a1b2dc59b1b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:20 -0700 Subject: readahead: move max_sane_readahead() calls into force_page_cache_readahead() Impact: code simplification. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 1b60f30cebf..dcef9fd6b92 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1390,8 +1390,7 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; - force_page_cache_readahead(mapping, filp, index, - max_sane_readahead(nr)); + force_page_cache_readahead(mapping, filp, index, nr); return 0; } -- cgit v1.2.3-70-g09d2 From ef00e08e26dd5d84271ef706262506b82195e752 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 16 Jun 2009 15:31:25 -0700 Subject: readahead: clean up and simplify the code for filemap page fault readahead This shouldn't really change behavior all that much, but the single rather complex function with read-ahead inside a loop etc is broken up into more manageable pieces. The behaviour is also less subtle, with the read-ahead being done up-front rather than inside some subtle loop and thus avoiding the now unnecessary extra state variables (ie "did_readaround" is gone). Fengguang: the code split in fact fixed a bug reported by Pavel Levshin: the PGMAJFAULT accounting used to be bypassed when MADV_RANDOM is set, in which case the original code will directly jump to no_cached_page reading. Cc: Pavel Levshin Cc: Cc: Nick Piggin Signed-off-by: Wu Fengguang Signed-off-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 156 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 89 insertions(+), 67 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index dcef9fd6b92..82753648559 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1456,6 +1456,68 @@ static int page_cache_read(struct file *file, pgoff_t offset) #define MMAP_LOTSAMISS (100) +/* + * Synchronous readahead happens when we don't even find + * a page in the page cache at all. + */ +static void do_sync_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + pgoff_t offset) +{ + unsigned long ra_pages; + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + + if (VM_SequentialReadHint(vma)) { + page_cache_sync_readahead(mapping, ra, file, offset, 1); + return; + } + + if (ra->mmap_miss < INT_MAX) + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + return; + + ra_pages = max_sane_readahead(ra->ra_pages); + if (ra_pages) { + pgoff_t start = 0; + + if (offset > ra_pages / 2) + start = offset - ra_pages / 2; + do_page_cache_readahead(mapping, file, start, ra_pages); + } +} + +/* + * Asynchronous readahead happens when we find the page and PG_readahead, + * so we want to possibly extend the readahead further.. + */ +static void do_async_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + struct page *page, + pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + if (ra->mmap_miss > 0) + ra->mmap_miss--; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, ra, file, page, offset, 1); +} + /** * filemap_fault - read in file data for page fault handling * @vma: vma in which the fault was taken @@ -1475,78 +1537,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; + pgoff_t offset = vmf->pgoff; struct page *page; pgoff_t size; - int did_readaround = 0; int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) + if (offset >= size) return VM_FAULT_SIGBUS; - /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) - goto no_cached_page; - /* * Do we have something in the page cache already? */ -retry_find: - page = find_lock_page(mapping, vmf->pgoff); - /* - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(vma)) { - if (!page) { - page_cache_sync_readahead(mapping, ra, file, - vmf->pgoff, 1); - page = find_lock_page(mapping, vmf->pgoff); - if (!page) - goto no_cached_page; - } - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, ra, file, page, - vmf->pgoff, 1); - } - } - - if (!page) { - unsigned long ra_pages; - - ra->mmap_miss++; - + page = find_get_page(mapping, offset); + if (likely(page)) { /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. + * We found the page, so try async readahead before + * waiting for the lock. */ - if (ra->mmap_miss > MMAP_LOTSAMISS) - goto no_cached_page; + do_async_mmap_readahead(vma, ra, file, page, offset); + lock_page(page); - /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. - */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (vmf->pgoff > ra_pages / 2) - start = vmf->pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto no_cached_page; } - page = find_lock_page(mapping, vmf->pgoff); + } else { + /* No page in the page cache at all */ + do_sync_mmap_readahead(vma, ra, file, offset); + count_vm_event(PGMAJFAULT); + ret = VM_FAULT_MAJOR; +retry_find: + page = find_lock_page(mapping, offset); if (!page) goto no_cached_page; } - if (!did_readaround) - ra->mmap_miss--; - /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -1554,18 +1582,18 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; - /* Must recheck i_size under page lock */ + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. + */ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { + if (unlikely(offset >= size)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; } - /* - * Found the page and have a reference on it. - */ - ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -1574,7 +1602,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, vmf->pgoff); + error = page_cache_read(file, offset); /* * The page we want has now been added to the page cache. @@ -1594,12 +1622,6 @@ no_cached_page: return VM_FAULT_SIGBUS; page_not_uptodate: - /* IO error path */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, -- cgit v1.2.3-70-g09d2 From 70ac23cfa31f68289d4b720c6162b3929ab4de36 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:28 -0700 Subject: readahead: sequential mmap readahead Auto-detect sequential mmap reads and do readahead for them. The sequential mmap readahead will be triggered when - sync readahead: it's a major fault and (prev_offset == offset-1); - async readahead: minor fault on PG_readahead page with valid readahead state. The benefits of doing readahead instead of read-around: - less I/O wait thanks to async readahead - double real I/O size and no more cache hits The single stream case is improved a little. For 100,000 sequential mmap reads: user system cpu total (1-1) plain -mm, 128KB readaround: 3.224 2.554 48.40% 11.838 (1-2) plain -mm, 256KB readaround: 3.170 2.392 46.20% 11.976 (2) patched -mm, 128KB readahead: 3.117 2.448 47.33% 11.607 The patched (2) has smallest total time, since it has no cache hit overheads and less I/O block time(thanks to async readahead). Here the I/O size makes no much difference, since there's only one single stream. Note that (1-1)'s real I/O size is 64KB and (1-2)'s real I/O size is 128KB, since the half of the read-around pages will be readahead cache hits. This is going to make _real_ differences for _concurrent_ IO streams. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 82753648559..99977f0a94e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1472,7 +1472,8 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (VM_RandomReadHint(vma)) return; - if (VM_SequentialReadHint(vma)) { + if (VM_SequentialReadHint(vma) || + offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { page_cache_sync_readahead(mapping, ra, file, offset, 1); return; } -- cgit v1.2.3-70-g09d2 From 2fad6f5deee5556f511eab58da78737a23ddb35d Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:29 -0700 Subject: readahead: enforce full readahead size on async mmap readahead We need this in one particular case and two more general ones. Now we do async readahead for sequential mmap reads, and do it with the help of PG_readahead. For normal reads, PG_readahead is the sufficient condition to do a sequential readahead. But unfortunately, for mmap reads, there is a tiny nuisance: [11736.998347] readahead-init0(process: sh/23926, file: sda1/w3m, offset=0:4503599627370495, ra=0+4-3) = 4 [11737.014985] readahead-around(process: w3m/23926, file: sda1/w3m, offset=0:0, ra=290+32-0) = 17 [11737.019488] readahead-around(process: w3m/23926, file: sda1/w3m, offset=0:0, ra=118+32-0) = 32 [11737.024921] readahead-interleaved(process: w3m/23926, file: sda1/w3m, offset=0:2, ra=4+6-6) = 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~ An unfavorably small readahead. The original dumb read-around size could be more efficient. That happened because ld-linux.so does a read(832) in L1 before mmap(), which triggers a 4-page readahead, with the second page tagged PG_readahead. L0: open("/lib/libc.so.6", O_RDONLY) = 3 L1: read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340\342"..., 832) = 832 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ L2: fstat(3, {st_mode=S_IFREG|0755, st_size=1420624, ...}) = 0 L3: mmap(NULL, 3527256, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fac6e51d000 L4: mprotect(0x7fac6e671000, 2097152, PROT_NONE) = 0 L5: mmap(0x7fac6e871000, 20480, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x154000) = 0x7fac6e871000 L6: mmap(0x7fac6e876000, 16984, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fac6e876000 L7: close(3) = 0 In general, the PG_readahead flag will also be hit in cases - sequential reads - clustered random reads A full readahead size is desirable in both cases. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 99977f0a94e..5c0c6518f34 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1516,7 +1516,8 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, if (ra->mmap_miss > 0) ra->mmap_miss--; if (PageReadahead(page)) - page_cache_async_readahead(mapping, ra, file, page, offset, 1); + page_cache_async_readahead(mapping, ra, file, + page, offset, ra->ra_pages); } /** -- cgit v1.2.3-70-g09d2 From d30a11004e3411909f2448546f036a011978062e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:30 -0700 Subject: readahead: record mmap read-around states in file_ra_state Mmap read-around now shares the same code style and data structure with readahead code. This also removes do_page_cache_readahead(). Its last user, mmap read-around, has been changed to call ra_submit(). The no-readahead-if-congested logic is dumped by the way. Users will be pretty sensitive about the slow loading of executables. So it's unfavorable to disabled mmap read-around on a congested queue. [akpm@linux-foundation.org: coding-style fixes] Cc: Nick Piggin Signed-off-by: Fengguang Wu Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++-- mm/filemap.c | 12 +++++++----- mm/readahead.c | 23 ++--------------------- 3 files changed, 12 insertions(+), 28 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index ad613ed66ab..33da7f53884 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1178,8 +1178,6 @@ void task_dirty_inc(struct task_struct *tsk); #define VM_MAX_READAHEAD 128 /* kbytes */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); int force_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read); @@ -1197,6 +1195,9 @@ void page_cache_async_readahead(struct address_space *mapping, unsigned long size); unsigned long max_sane_readahead(unsigned long nr); +unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, + struct file *filp); /* Do stack extension */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/mm/filemap.c b/mm/filemap.c index 5c0c6518f34..734891d0663 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1488,13 +1488,15 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (ra->mmap_miss > MMAP_LOTSAMISS) return; + /* + * mmap read-around + */ ra_pages = max_sane_readahead(ra->ra_pages); if (ra_pages) { - pgoff_t start = 0; - - if (offset > ra_pages / 2) - start = offset - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + ra->start = max_t(long, 0, offset - ra_pages/2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); } } diff --git a/mm/readahead.c b/mm/readahead.c index d7c6e143a12..a7f01fcce9e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -133,15 +133,12 @@ out: } /* - * do_page_cache_readahead actually reads a chunk of disk. It allocates all + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. * * Returns the number of pages requested, or the maximum amount of I/O allowed. - * - * do_page_cache_readahead() returns -1 if it encountered request queue - * congestion. */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, @@ -231,22 +228,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return ret; } -/* - * This version skips the IO if the queue is read-congested, and will tell the - * block layer to abandon the readahead if request allocation would block. - * - * force_page_cache_readahead() will ignore queue congestion and will block on - * request queues. - */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) -{ - if (bdi_read_congested(mapping->backing_dev_info)) - return -1; - - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); -} - /* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. @@ -260,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) /* * Submit IO for the read-ahead request in file_ra_state. */ -static unsigned long ra_submit(struct file_ra_state *ra, +unsigned long ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { int actual; -- cgit v1.2.3-70-g09d2 From 61b7cbdba2f3c588a0cf3db574c562805454b09b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:36 -0700 Subject: readahead: remove redundant test in shrink_readahead_size_eio() Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 734891d0663..2e9bcc2e797 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); static void shrink_readahead_size_eio(struct file *filp, struct file_ra_state *ra) { - if (!ra->ra_pages) - return; - ra->ra_pages /= 4; } -- cgit v1.2.3-70-g09d2 From 7ffc59b4d0bdfa00e882339f85b8a969bb7021e2 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:38 -0700 Subject: readahead: enforce full sync mmap readahead size Now that we do readahead for sequential mmap reads, here is a simple evaluation of the impacts, and one further optimization. It's an NFS-root debian desktop system, readahead size = 60 pages. The numbers are grabbed after a fresh boot into console. approach pgmajfault RA miss ratio mmap IO count avg IO size(pages) A 383 31.6% 383 11 B 225 32.4% 390 11 C 224 32.6% 307 13 case A: mmap sync/async readahead disabled case B: mmap sync/async readahead enabled, with enforced full async readahead size case C: mmap sync/async readahead enabled, with enforced full sync/async readahead size or: A = vanilla 2.6.30-rc1 B = A plus mmap readahead C = B plus this patch The numbers show that - there are good possibilities for random mmap reads to trigger readahead - 'pgmajfault' is reduced by 1/3, due to the _async_ nature of readahead - case C can further reduce IO count by 1/4 - readahead miss ratios are not quite affected The theory is - readahead is _good_ for clustered random reads, and can perform _better_ than readaround because they could be _async_. - async readahead size is guaranteed to be larger than readaround size, and they are _async_, hence will mostly behave better However for B - sync readahead size could be smaller than readaround size, hence may make things worse by produce more smaller IOs which will be fixed by this patch. Final conclusion: - mmap readahead reduced major faults by 1/3 and no obvious overheads; - mmap io can be further reduced by 1/4 with this patch. Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 2e9bcc2e797..6846a902f5c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1471,7 +1471,8 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (VM_SequentialReadHint(vma) || offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { - page_cache_sync_readahead(mapping, ra, file, offset, 1); + page_cache_sync_readahead(mapping, ra, file, offset, + ra->ra_pages); return; } -- cgit v1.2.3-70-g09d2 From 6484eb3e2a81807722c5f28efef94d8338b7b996 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:54 -0700 Subject: page allocator: do not check NUMA node ID when the caller knows the node is valid Callers of alloc_pages_node() can optionally specify -1 as a node to mean "allocate from the current node". However, a number of the callers in fast paths know for a fact their node is valid. To avoid a comparison and branch, this patch adds alloc_pages_exact_node() that only checks the nid with VM_BUG_ON(). Callers that know their node is valid are then converted. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Acked-by: Paul Mundt [for the SLOB NUMA bits] Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/hp/common/sba_iommu.c | 2 +- arch/ia64/kernel/mca.c | 3 +-- arch/ia64/kernel/uncached.c | 3 ++- arch/ia64/sn/pci/pci_dma.c | 3 ++- arch/powerpc/platforms/cell/ras.c | 4 ++-- arch/x86/kvm/vmx.c | 2 +- drivers/misc/sgi-gru/grufile.c | 2 +- drivers/misc/sgi-xp/xpc_uv.c | 2 +- include/linux/gfp.h | 9 +++++++++ include/linux/mm.h | 1 - kernel/profile.c | 8 ++++---- mm/filemap.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- mm/slab.c | 4 ++-- mm/slob.c | 4 ++-- 17 files changed, 33 insertions(+), 24 deletions(-) (limited to 'mm/filemap.c') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 56ceb68eb99..fe63b2dc9d0 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1131,7 +1131,7 @@ sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp #ifdef CONFIG_NUMA { struct page *page; - page = alloc_pages_node(ioc->node == MAX_NUMNODES ? + page = alloc_pages_exact_node(ioc->node == MAX_NUMNODES ? numa_node_id() : ioc->node, flags, get_order(size)); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 8f33a884042..5b17bd40227 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1829,8 +1829,7 @@ ia64_mca_cpu_init(void *cpu_data) data = mca_bootmem(); first_time = 0; } else - data = page_address(alloc_pages_node(numa_node_id(), - GFP_KERNEL, get_order(sz))); + data = __get_free_pages(GFP_KERNEL, get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", cpu); diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 8eff8c1d40a..6ba72ab42fc 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -98,7 +98,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) /* attempt to allocate a granule's worth of cached memory pages */ - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); if (!page) { mutex_unlock(&uc_pool->add_chunk_mutex); diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index d876423e4e7..98b684928e1 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -90,7 +90,8 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, */ node = pcibus_to_node(pdev->bus); if (likely(node >=0)) { - struct page *p = alloc_pages_node(node, flags, get_order(size)); + struct page *p = alloc_pages_exact_node(node, + flags, get_order(size)); if (likely(p)) cpuaddr = page_address(p); diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 296b5268754..5e0a191764f 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -122,8 +122,8 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order) area->nid = nid; area->order = order; - area->pages = alloc_pages_node(area->nid, GFP_KERNEL | GFP_THISNODE, - area->order); + area->pages = alloc_pages_exact_node(area->nid, GFP_KERNEL|GFP_THISNODE, + area->order); if (!area->pages) { printk(KERN_WARNING "%s: no page on node %d\n", diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32d6ae8fb60..e770bf349ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1277,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index bbefe77c67a..3ce2920e2bf 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -302,7 +302,7 @@ static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr) pnode = uv_node_to_pnode(nid); if (bid < 0 || gru_base[bid]) continue; - page = alloc_pages_node(nid, GFP_KERNEL, order); + page = alloc_pages_exact_node(nid, GFP_KERNEL, order); if (!page) goto fail; gru_base[bid] = page_address(page); diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 9172fcdee4e..c76677afda1 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -232,7 +232,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, mq->mmr_blade = uv_cpu_to_blade_id(cpu); nid = cpu_to_node(cpu); - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, pg_order); if (page == NULL) { dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c2d3fe03b5d..4efa33088a8 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -5,6 +5,7 @@ #include #include #include +#include struct vm_area_struct; @@ -192,6 +193,14 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } +static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, + unsigned int order) +{ + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + + return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); +} + #ifdef CONFIG_NUMA extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); diff --git a/include/linux/mm.h b/include/linux/mm.h index a880161a385..7b548e7cfbd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/kernel/profile.c b/kernel/profile.c index 28cf26ad2d2..69911b5745e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -365,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, node = cpu_to_node(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -373,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -564,14 +564,14 @@ static int create_hash_tables(void) int node = cpu_to_node(cpu); struct page *page; - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) diff --git a/mm/filemap.c b/mm/filemap.c index 6846a902f5c..22396713feb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e83ad2c9228..2f8241f300f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -630,7 +630,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -649,7 +649,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * Use a helper variable to find the next node and then * copy it back to hugetlb_next_nid afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. Move nid forward in the mask even diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 46bdf9ddf2b..e08e2c4da63 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* diff --git a/mm/migrate.c b/mm/migrate.c index 068655d8f88..5a24923e7fd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } diff --git a/mm/slab.c b/mm/slab.c index 18e3164de09..bb3254c95cd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1707,7 +1707,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_exact_node(nodeid, flags, cachep->gfporder); if (!page) return NULL; @@ -3261,7 +3261,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, -1); + obj = kmem_getpages(cache, local_flags, numa_node_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { diff --git a/mm/slob.c b/mm/slob.c index 12f26149992..64f6db1943b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -46,7 +46,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. In the event that a node id is explicitly - * provided, alloc_pages_node() with the specified node id is used + * provided, alloc_pages_exact_node() with the specified node id is used * instead. The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). * @@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) #ifdef CONFIG_NUMA if (node != -1) - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_exact_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); -- cgit v1.2.3-70-g09d2