From 8dfcc9ba27e2ed257e5de9539f7f03e57c2c0e33 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:05 -0800 Subject: [PATCH] mm: split highorder pages Have an explicit mm call to split higher order pages into individual pages. Should help to avoid bugs and be more explicit about the code's intention. Signed-off-by: Nick Piggin Cc: Russell King Cc: David Howells Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Zankel Signed-off-by: Yoichi Yuasa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 85e80a57db2..6af555c1c42 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1221,9 +1221,7 @@ out: * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself - * (which is mainly an issue of doing "set_page_count(page, 1)" for - * each sub-page, and then freeing them one by one when you free - * them rather than freeing it as a compound page). + * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow -- cgit v1.2.3-70-g09d2 From b7ab795b7bec9997d4fde39f249d52823d36d98d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:42 -0800 Subject: [PATCH] mm: more CONFIG_DEBUG_VM Put a few more checks under CONFIG_DEBUG_VM Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 12 +++--------- mm/rmap.c | 9 ++++----- 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 6af555c1c42..71bc664efed 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -388,7 +388,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ { unsigned long pfn = pte_pfn(pte); - if (vma->vm_flags & VM_PFNMAP) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; @@ -396,18 +396,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ return NULL; } - /* - * Add some anal sanity checks for now. Eventually, - * we should just do "return pfn_to_page(pfn)", but - * in the meantime we check that we get a valid pfn, - * and that the resulting page looks ok. - * - * Remove this test eventually! - */ +#ifdef CONFIG_DEBUG_VM if (unlikely(!pfn_valid(pfn))) { print_bad_pte(vma, pte, addr); return NULL; } +#endif /* * NOTE! We still have PageReserved() pages in the page diff --git a/mm/rmap.c b/mm/rmap.c index 134aef9d66c..1963e269314 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,13 +56,11 @@ #include -//#define RMAP_DEBUG /* can be enabled only for debugging */ - struct kmem_cache *anon_vma_cachep; static inline void validate_anon_vma(struct vm_area_struct *find_vma) { -#ifdef RMAP_DEBUG +#ifdef CONFIG_DEBUG_VM struct anon_vma *anon_vma = find_vma->anon_vma; struct vm_area_struct *vma; unsigned int mapcount = 0; @@ -551,13 +549,14 @@ void page_add_file_rmap(struct page *page) void page_remove_rmap(struct page *page) { if (atomic_add_negative(-1, &page->_mapcount)) { - if (page_mapcount(page) < 0) { +#ifdef CONFIG_DEBUG_VM + if (unlikely(page_mapcount(page) < 0)) { printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); printk (KERN_EMERG " page->flags = %lx\n", page->flags); printk (KERN_EMERG " page->count = %x\n", page_count(page)); printk (KERN_EMERG " page->mapping = %p\n", page->mapping); } - +#endif BUG_ON(page_mapcount(page) < 0); /* * It would be tidy to reset the PageAnon mapping here, -- cgit v1.2.3-70-g09d2 From 9da61aef0fd5b17dd4bf4baf33db12c470def774 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:57 -0800 Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables() free_pgtables() has special logic to call hugetlb_free_pgd_range() instead of the normal free_pgd_range() on hugepage VMAs. However, the test it uses to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized range at the start of the vma. is_hugepage_only_range() will return true if the given range has any intersection with a hugepage address region, and in this case the given region need not be hugepage aligned. So, for example, this test can return true if called on, say, a 4k VMA immediately preceding a (nicely aligned) hugepage VMA. At present we get away with this because the powerpc version of hugetlb_free_pgd_range() is just a call to free_pgd_range(). On ia64 (the only other arch with a non-trivial is_hugepage_only_range()) we get away with it for a different reason; the hugepage area is not contiguous with the rest of the user address space, and VMAs are not permitted in between, so the test can't return a false positive there. Nonetheless this should be fixed. We do that in the patch below by replacing the is_hugepage_only_range() test with an explicit test of the VMA using is_vm_hugetlb_page(). This in turn changes behaviour for platforms where is_hugepage_only_range() returns false always (everything except powerpc and ia64). We address this by ensuring that hugetlb_free_pgd_range() is defined to be identical to free_pgd_range() (instead of a no-op) on everything except ia64. Even so, it will prevent some otherwise possible coalescing of calls down to free_pgd_range(). Since this only happens for hugepage VMAs, removing this small optimization seems unlikely to cause any trouble. This patch causes no regressions on the libhugetlbfs testsuite - ppc64 POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP). Signed-off-by: David Gibson Cc: William Lee Irwin III Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/page.h | 1 + include/asm-powerpc/pgtable.h | 5 ----- include/linux/hugetlb.h | 9 +++++---- mm/memory.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'mm/memory.c') diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h index 5e6362a786b..732cf308674 100644 --- a/include/asm-ia64/page.h +++ b/include/asm-ia64/page.h @@ -57,6 +57,7 @@ # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA # define ARCH_HAS_HUGEPAGE_ONLY_RANGE +# define ARCH_HAS_HUGETLB_FREE_PGD_RANGE #endif /* CONFIG_HUGETLB_PAGE */ #ifdef __ASSEMBLY__ diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h index e38931379a7..185ee15963a 100644 --- a/include/asm-powerpc/pgtable.h +++ b/include/asm-powerpc/pgtable.h @@ -468,11 +468,6 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -#ifdef CONFIG_HUGETLB_PAGE -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - free_pgd_range(tlb, addr, end, floor, ceiling) -#endif - /* * This gets called at the end of handling a page fault, when * the kernel has put a new PTE into the page table for the process. diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5d84c368ffe..e465fbf1ef5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -43,8 +43,10 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) +#endif + +#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE +#define hugetlb_free_pgd_range free_pgd_range #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE @@ -93,8 +95,7 @@ static inline unsigned long hugetlb_total_pages(void) #define prepare_hugepage_range(addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) +#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/mm/memory.c b/mm/memory.c index 71bc664efed..f6e3be9cbf5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, anon_vma_unlink(vma); unlink_file_vma(vma); - if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { @@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_hugepage_only_range(vma->vm_mm, next->vm_start, - HPAGE_SIZE)) { + && !is_vm_hugetlb_page(vma)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); -- cgit v1.2.3-70-g09d2 From 4866920b93fd7d5b520278c3c76e6f4d5a352d81 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:58 -0800 Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables() harder Turns out the hugepage logic in free_pgtables() was doubly broken. The loop coalescing multiple normal page VMAs into one call to free_pgd_range() had an off by one error, which could mean it would coalesce one hugepage VMA into the same bundle (checking 'vma' not 'next' in the loop). I transferred this bug into the new is_vm_hugetlb_page() based version. Here's the fix. This one didn't bite on powerpc previously for the same reason the is_hugepage_only_range() problem didn't: powerpc's hugetlb_free_pgd_range() is identical to free_pgd_range(). It didn't bite on ia64 because the hugepage region is distant enough from any other region that the separated PMD_SIZE distance test would always prevent coalescing the two together. No libhugetlbfs testsuite regressions (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index f6e3be9cbf5..80c3fb370f9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -285,7 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_vm_hugetlb_page(vma)) { + && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); -- cgit v1.2.3-70-g09d2 From 315ab19a6d12d6af7b6957090822f3057ab7e80f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Mar 2006 16:20:22 +0100 Subject: [PATCH] mm: restore vm_normal_page check Hugh is rightly concerned that the CONFIG_DEBUG_VM coverage has gone too far in vm_normal_page, considering that we expect production kernels to be shipped with the option turned off, and that the code has been under some large changes recently. Signed-off-by: Nick Piggin Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 80c3fb370f9..e347e106ca3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -395,12 +395,16 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ return NULL; } -#ifdef CONFIG_DEBUG_VM + /* + * Add some anal sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and that the resulting page looks ok. + */ if (unlikely(!pfn_valid(pfn))) { print_bad_pte(vma, pte, addr); return NULL; } -#endif /* * NOTE! We still have PageReserved() pages in the page -- cgit v1.2.3-70-g09d2 From 5bcb28b139cffc736177ceb775d1c8b5c5a411e2 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 26 Mar 2006 18:30:52 +0200 Subject: BUG_ON() Conversion in mm/memory.c this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- mm/memory.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index e347e106ca3..d90ff9d0495 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2352,10 +2352,8 @@ int make_pages_present(unsigned long addr, unsigned long end) if (!vma) return -1; write = (vma->vm_flags & VM_WRITE) != 0; - if (addr >= end) - BUG(); - if (end > vma->vm_end) - BUG(); + BUG_ON(addr >= end); + BUG_ON(end > vma->vm_end); len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); -- cgit v1.2.3-70-g09d2 From 03beb07664d768db97bf454ae5c9581cd4737bb4 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 26 Mar 2006 01:36:57 -0800 Subject: [PATCH] Add API for flushing Anon pages Currently, get_user_pages() returns fully coherent pages to the kernel for anything other than anonymous pages. This is a problem for things like fuse and the SCSI generic ioctl SG_IO which can potentially wish to do DMA to anonymous pages passed in by users. The fix is to add a new memory management API: flush_anon_page() which is used in get_user_pages() to make anonymous pages coherent. Signed-off-by: James Bottomley Cc: Russell King Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cachetlb.txt | 9 +++++++++ include/linux/highmem.h | 6 ++++++ mm/memory.c | 2 ++ 3 files changed, 17 insertions(+) (limited to 'mm/memory.c') diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index 4ae418889b8..1f312a9893d 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -362,6 +362,15 @@ maps this page at its virtual address. likely that you will need to flush the instruction cache for copy_to_user_page(). + void flush_anon_page(struct page *page, unsigned long vmaddr) + When the kernel needs to access the contents of an anonymous + page, it calls this function (currently only + get_user_pages()). Note: flush_dcache_page() deliberately + doesn't work for an anonymous page. The default + implementation is a nop (and should remain so for all coherent + architectures). For incoherent architectures, it should flush + the cache of the page at vmaddr in the current user process. + void flush_icache_range(unsigned long start, unsigned long end) When the kernel stores into addresses that it will execute out of (eg when loading modules), this function is called. diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6bece9280eb..7bd2593dbef 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -7,6 +7,12 @@ #include +#ifndef ARCH_HAS_FLUSH_ANON_PAGE +static inline void flush_anon_page(struct page *page, unsigned long vmaddr) +{ +} +#endif + #ifdef CONFIG_HIGHMEM #include diff --git a/mm/memory.c b/mm/memory.c index e347e106ca3..67686048f09 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1071,6 +1071,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } if (pages) { pages[i] = page; + + flush_anon_page(page, start); flush_dcache_page(page); } if (vmas) -- cgit v1.2.3-70-g09d2