From a482289d46587ffcda4c85aab109fb74910d7a48 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:08 -0800 Subject: [PATCH] hugepage allocator cleanup Insert "fresh" huge pages into the hugepage allocator by the same means as they are freed back into it. This reduces code size and allows enqueue_huge_page to be inlined into the hugepage free fastpath. Eliminate occurances of hugepages on the free list with non-zero refcount. This can allow stricter refcount checks in future. Also required for lockless pagecache. Signed-off-by: Nick Piggin "This patch also eliminates a leak "cleaned up" by re-clobbering the refcount on every allocation from the hugepage freelists. With respect to the lockless pagecache, the crucial aspect is to eliminate unconditional set_page_count() to 0 on pages with potentially nonzero refcounts, though closer inspection suggests the assignments removed are entirely spurious." Acked-by: William Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 508707704d2..39d49ecea8e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -64,7 +64,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, return page; } -static struct page *alloc_fresh_huge_page(void) +static int alloc_fresh_huge_page(void) { static int nid = 0; struct page *page; @@ -72,12 +72,15 @@ static struct page *alloc_fresh_huge_page(void) HUGETLB_PAGE_ORDER); nid = (nid + 1) % num_online_nodes(); if (page) { + page[1].lru.next = (void *)free_huge_page; /* dtor */ spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ + return 1; } - return page; + return 0; } void free_huge_page(struct page *page) @@ -85,7 +88,6 @@ void free_huge_page(struct page *page) BUG_ON(page_count(page)); INIT_LIST_HEAD(&page->lru); - page[1].lru.next = NULL; /* reset dtor */ spin_lock(&hugetlb_lock); enqueue_huge_page(page); @@ -105,7 +107,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) } spin_unlock(&hugetlb_lock); set_page_count(page, 1); - page[1].lru.next = (void *)free_huge_page; /* set dtor */ for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) clear_user_highpage(&page[i], addr); return page; @@ -114,7 +115,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) static int __init hugetlb_init(void) { unsigned long i; - struct page *page; if (HPAGE_SHIFT == 0) return 0; @@ -123,12 +123,8 @@ static int __init hugetlb_init(void) INIT_LIST_HEAD(&hugepage_freelists[i]); for (i = 0; i < max_huge_pages; ++i) { - page = alloc_fresh_huge_page(); - if (!page) + if (!alloc_fresh_huge_page()) break; - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); } max_huge_pages = free_huge_pages = nr_huge_pages = i; printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); @@ -154,8 +150,8 @@ static void update_and_free_page(struct page *page) page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); - set_page_count(&page[i], 0); } + page[1].lru.next = NULL; set_page_count(page, 1); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -188,12 +184,8 @@ static inline void try_to_free_low(unsigned long count) static unsigned long set_max_huge_pages(unsigned long count) { while (count > nr_huge_pages) { - struct page *page = alloc_fresh_huge_page(); - if (!page) + if (!alloc_fresh_huge_page()) return nr_huge_pages; - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); } if (count >= nr_huge_pages) return nr_huge_pages; -- cgit v1.2.3-70-g09d2 From 7835e98b2e3c66dba79cb0ff8ebb90a2fe030c29 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:40 -0800 Subject: [PATCH] remove set_page_count() outside mm/ set_page_count usage outside mm/ is limited to setting the refcount to 1. Remove set_page_count from outside mm/, and replace those users with init_page_count() and set_page_refcounted(). This allows more debug checking, and tighter control on how code is allowed to play around with page->_count. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 2 +- arch/arm/mm/init.c | 2 +- arch/arm26/mm/init.c | 2 +- arch/cris/mm/init.c | 2 +- arch/frv/mm/init.c | 6 +++--- arch/h8300/mm/init.c | 4 ++-- arch/i386/mm/init.c | 6 +++--- arch/ia64/mm/init.c | 6 +++--- arch/m32r/mm/init.c | 4 ++-- arch/m68k/mm/init.c | 2 +- arch/m68k/mm/memory.c | 2 +- arch/m68k/mm/motorola.c | 2 +- arch/m68knommu/mm/init.c | 4 ++-- arch/mips/arc/memory.c | 2 +- arch/mips/dec/prom/memory.c | 2 +- arch/mips/mips-boards/generic/memory.c | 2 +- arch/mips/mips-boards/sim/sim_mem.c | 2 +- arch/mips/mm/init.c | 6 +++--- arch/mips/sgi-ip27/ip27-memory.c | 2 +- arch/parisc/mm/init.c | 4 ++-- arch/powerpc/mm/init_32.c | 4 ++-- arch/powerpc/mm/init_64.c | 4 ++-- arch/powerpc/mm/mem.c | 4 ++-- arch/powerpc/platforms/cell/setup.c | 2 +- arch/ppc/mm/init.c | 6 +++--- arch/s390/mm/init.c | 4 ++-- arch/sh/mm/init.c | 4 ++-- arch/sh64/mm/init.c | 4 ++-- arch/sparc/kernel/sun4d_smp.c | 6 +++--- arch/sparc/kernel/sun4m_smp.c | 6 +++--- arch/sparc/mm/init.c | 6 +++--- arch/sparc64/mm/init.c | 4 ++-- arch/um/kernel/mem.c | 4 ++-- arch/x86_64/mm/init.c | 6 +++--- arch/xtensa/mm/init.c | 2 +- drivers/video/acornfb.c | 2 +- include/linux/mm.h | 11 +++++++++-- mm/hugetlb.c | 5 +++-- mm/internal.h | 13 ++++++++++++- mm/page_alloc.c | 14 ++++++-------- 40 files changed, 96 insertions(+), 79 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 486d7945583..544ac5dc09e 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -357,7 +357,7 @@ free_reserved_mem(void *start, void *end) void *__start = start; for (; __start < end; __start += PAGE_SIZE) { ClearPageReserved(virt_to_page(__start)); - set_page_count(virt_to_page(__start), 1); + init_page_count(virt_to_page(__start)); free_page((long)__start); totalram_pages++; } diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 8b276ee38ac..b0321e943b7 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -531,7 +531,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) for (; addr < end; addr += PAGE_SIZE) { struct page *page = virt_to_page(addr); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(addr); totalram_pages++; } diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c index 1f09a9d0fb8..e3ecaa45374 100644 --- a/arch/arm26/mm/init.c +++ b/arch/arm26/mm/init.c @@ -324,7 +324,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) for (; addr < end; addr += PAGE_SIZE) { struct page *page = virt_to_page(addr); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(addr); totalram_pages++; } diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 31a0018b525..b7842ff213a 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -216,7 +216,7 @@ free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index 765088ea8a5..8899aa1a4f0 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -169,7 +169,7 @@ void __init mem_init(void) struct page *page = &mem_map[pfn]; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; } @@ -210,7 +210,7 @@ void __init free_initmem(void) /* next to check that the page we free is not a partial page */ for (addr = start; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -230,7 +230,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 1e0929ddc8c..09efc4b1f03 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -196,7 +196,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; @@ -219,7 +219,7 @@ free_initmem() /* next to check that the page we free is not a partial page */ for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 2700f01994b..7ba55a6e2db 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -270,7 +270,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) static void __meminit free_new_highpage(struct page *page) { - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -727,7 +727,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); memset((void *)addr, 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -766,7 +766,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b38b6d213c1..08d94e6bfa1 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -197,7 +197,7 @@ free_initmem (void) eaddr = (unsigned long) ia64_imva(__init_end); while (addr < eaddr) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); ++totalram_pages; addr += PAGE_SIZE; @@ -252,7 +252,7 @@ free_initrd_mem (unsigned long start, unsigned long end) continue; page = virt_to_page(start); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(start); ++totalram_pages; } @@ -640,7 +640,7 @@ mem_init (void) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index 6facf15b04f..c9e7dad860b 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -226,7 +226,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -244,7 +244,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index c45beb95594..a190e39c907 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -137,7 +137,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index 559942ce0e1..d6d582a5abb 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -54,7 +54,7 @@ void __init init_pointer_table(unsigned long ptable) /* unreserve the page so it's possible to free that page */ PD_PAGE(dp)->flags &= ~(1 << PG_reserved); - set_page_count(PD_PAGE(dp), 1); + init_page_count(PD_PAGE(dp)); return; } diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index d855fec2631..afb57eeafdc 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -276,7 +276,7 @@ void free_initmem(void) addr = (unsigned long)&__init_begin; for (; addr < (unsigned long)&__init_end; addr += PAGE_SIZE) { virt_to_page(addr)->flags &= ~(1 << PG_reserved); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c index 89f0b554ffb..d79503fe6e4 100644 --- a/arch/m68knommu/mm/init.c +++ b/arch/m68knommu/mm/init.c @@ -195,7 +195,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; @@ -218,7 +218,7 @@ free_initmem() /* next to check that the page we free is not a partial page */ for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/mips/arc/memory.c b/arch/mips/arc/memory.c index 958d2eb7886..8a9ef58cc39 100644 --- a/arch/mips/arc/memory.c +++ b/arch/mips/arc/memory.c @@ -158,7 +158,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/dec/prom/memory.c b/arch/mips/dec/prom/memory.c index 81cb5a76cfb..1edaf3074ee 100644 --- a/arch/mips/dec/prom/memory.c +++ b/arch/mips/dec/prom/memory.c @@ -118,7 +118,7 @@ unsigned long __init prom_free_prom_memory(void) addr = PAGE_SIZE; while (addr < end) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; } diff --git a/arch/mips/mips-boards/generic/memory.c b/arch/mips/mips-boards/generic/memory.c index 2c8afd77a20..ee5e70c95cf 100644 --- a/arch/mips/mips-boards/generic/memory.c +++ b/arch/mips/mips-boards/generic/memory.c @@ -174,7 +174,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/mips-boards/sim/sim_mem.c b/arch/mips/mips-boards/sim/sim_mem.c index 0dbd7435bb2..1ec4e75656b 100644 --- a/arch/mips/mips-boards/sim/sim_mem.c +++ b/arch/mips/mips-boards/sim/sim_mem.c @@ -117,7 +117,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index a140da9732d..52f7d59fe61 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -245,7 +245,7 @@ void __init mem_init(void) #ifdef CONFIG_LIMITED_DMA set_page_address(page, lowmem_page_address(page)); #endif - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -292,7 +292,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } @@ -315,7 +315,7 @@ void free_initmem(void) page = addr; #endif ClearPageReserved(virt_to_page(page)); - set_page_count(virt_to_page(page), 1); + init_page_count(virt_to_page(page)); free_page(page); totalram_pages++; freed += PAGE_SIZE; diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index ed93a979295..e0d095daa5e 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -559,7 +559,7 @@ void __init mem_init(void) /* if (!page_is_ram(pgnr)) continue; */ /* commented out until page_is_ram works */ ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); totalram_pages++; } diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 7847ca13d6c..852eda3953d 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -398,7 +398,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); num_physpages++; totalram_pages++; @@ -1018,7 +1018,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); num_physpages++; totalram_pages++; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 7d0d75c1184..b57fb3a2b7b 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -216,7 +216,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name) while (start < end) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); cnt++; start += PAGE_SIZE; @@ -248,7 +248,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 81cfb0c2ec5..bacb71c8981 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -140,7 +140,7 @@ void free_initmem(void) for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { memset((void *)addr, 0xcc, PAGE_SIZE); ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -155,7 +155,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 6ae5c130d0d..454cac01d8c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -108,7 +108,7 @@ EXPORT_SYMBOL(phys_mem_access_prot); void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; @@ -376,7 +376,7 @@ void __init mem_init(void) struct page *page = pfn_to_page(pfn); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index b33a4443f5a..fec8e65b36e 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -115,7 +115,7 @@ static void __init cell_spuprop_present(struct device_node *spe, for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); set_page_links(page, ZONE_DMA, node_id, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c index 134db5c0420..cb1c294fb93 100644 --- a/arch/ppc/mm/init.c +++ b/arch/ppc/mm/init.c @@ -140,7 +140,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name) while (start < end) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); cnt++; start += PAGE_SIZE; @@ -172,7 +172,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } @@ -441,7 +441,7 @@ void __init mem_init(void) struct page *page = mem_map + pfn; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index df953383724..a055894f3bd 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -292,7 +292,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -307,7 +307,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index e342565f75f..77b4a838fe1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -273,7 +273,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -286,7 +286,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/sh64/mm/init.c b/arch/sh64/mm/init.c index a65e8bb2c3c..1169757fb38 100644 --- a/arch/sh64/mm/init.c +++ b/arch/sh64/mm/init.c @@ -173,7 +173,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -186,7 +186,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index 40d426cce82..4219dd2ce3a 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c @@ -266,19 +266,19 @@ void __init smp4d_boot_cpus(void) /* Free unneeded trap tables */ ClearPageReserved(virt_to_page(trapbase_cpu1)); - set_page_count(virt_to_page(trapbase_cpu1), 1); + init_page_count(virt_to_page(trapbase_cpu1)); free_page((unsigned long)trapbase_cpu1); totalram_pages++; num_physpages++; ClearPageReserved(virt_to_page(trapbase_cpu2)); - set_page_count(virt_to_page(trapbase_cpu2), 1); + init_page_count(virt_to_page(trapbase_cpu2)); free_page((unsigned long)trapbase_cpu2); totalram_pages++; num_physpages++; ClearPageReserved(virt_to_page(trapbase_cpu3)); - set_page_count(virt_to_page(trapbase_cpu3), 1); + init_page_count(virt_to_page(trapbase_cpu3)); free_page((unsigned long)trapbase_cpu3); totalram_pages++; num_physpages++; diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index a21f27d10e5..fbbd8a474c4 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c @@ -233,21 +233,21 @@ void __init smp4m_boot_cpus(void) /* Free unneeded trap tables */ if (!cpu_isset(i, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu1)); - set_page_count(virt_to_page(trapbase_cpu1), 1); + init_page_count(virt_to_page(trapbase_cpu1)); free_page((unsigned long)trapbase_cpu1); totalram_pages++; num_physpages++; } if (!cpu_isset(2, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu2)); - set_page_count(virt_to_page(trapbase_cpu2), 1); + init_page_count(virt_to_page(trapbase_cpu2)); free_page((unsigned long)trapbase_cpu2); totalram_pages++; num_physpages++; } if (!cpu_isset(3, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu3)); - set_page_count(virt_to_page(trapbase_cpu3), 1); + init_page_count(virt_to_page(trapbase_cpu3)); free_page((unsigned long)trapbase_cpu3); totalram_pages++; num_physpages++; diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c index c03babaa049..89866973246 100644 --- a/arch/sparc/mm/init.c +++ b/arch/sparc/mm/init.c @@ -383,7 +383,7 @@ void map_high_region(unsigned long start_pfn, unsigned long end_pfn) struct page *page = pfn_to_page(tmp); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -480,7 +480,7 @@ void free_initmem (void) p = virt_to_page(addr); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); totalram_pages++; num_physpages++; @@ -497,7 +497,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; } diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index c2b556106fc..2ae143ba50d 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -1461,7 +1461,7 @@ void free_initmem(void) p = virt_to_page(page); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; @@ -1477,7 +1477,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index fa4f915be5c..92cce96b5e2 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -57,7 +57,7 @@ static void setup_highmem(unsigned long highmem_start, for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){ page = &mem_map[highmem_pfn + i]; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); } } @@ -296,7 +296,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 7af1742aa95..40ed13d263c 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -486,7 +486,7 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; @@ -592,7 +592,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -632,7 +632,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 5a91d6c9e66..e1be4235f36 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -272,7 +272,7 @@ free_reserved_mem(void *start, void *end) { for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page((unsigned long)start); totalram_pages++; } diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c index b058273527b..76448d6ae89 100644 --- a/drivers/video/acornfb.c +++ b/drivers/video/acornfb.c @@ -1269,7 +1269,7 @@ free_unused_pages(unsigned int virtual_start, unsigned int virtual_end) */ page = virt_to_page(virtual_start); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(virtual_start); virtual_start += PAGE_SIZE; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d84b7a35e0..7d8c127daad 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -307,8 +307,6 @@ static inline int get_page_unless_zero(struct page *page) return atomic_inc_not_zero(&page->_count); } -#define set_page_count(p,v) atomic_set(&(p)->_count, (v)) - extern void FASTCALL(__page_cache_release(struct page *)); static inline int page_count(struct page *page) @@ -325,6 +323,15 @@ static inline void get_page(struct page *page) atomic_inc(&page->_count); } +/* + * Setup the page count before being freed into the page allocator for + * the first time (boot or memory hotplug) + */ +static inline void init_page_count(struct page *page) +{ + atomic_set(&page->_count, 1); +} + void put_page(struct page *page); void split_page(struct page *page, unsigned int order); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 39d49ecea8e..20117a4b8ab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include +#include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static unsigned long nr_huge_pages, free_huge_pages; @@ -106,7 +107,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) return NULL; } spin_unlock(&hugetlb_lock); - set_page_count(page, 1); + set_page_refcounted(page); for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) clear_user_highpage(&page[i], addr); return page; @@ -152,7 +153,7 @@ static void update_and_free_page(struct page *page) 1 << PG_private | 1<< PG_writeback); } page[1].lru.next = NULL; - set_page_count(page, 1); + set_page_refcounted(page); __free_pages(page, HUGETLB_PAGE_ORDER); } diff --git a/mm/internal.h b/mm/internal.h index 7bb33977981..d20e3cc4aef 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,8 +13,19 @@ #include -static inline void set_page_refs(struct page *page, int order) +static inline void set_page_count(struct page *page, int v) { + atomic_set(&page->_count, v); +} + +/* + * Turn a non-refcounted page (->_count == 0) into refcounted with + * a count of one. + */ +static inline void set_page_refcounted(struct page *page) +{ + BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); + BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e197818a7cf..7f65b5a63bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -442,7 +442,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) if (order == 0) { __ClearPageReserved(page); set_page_count(page, 0); - set_page_refs(page, 0); + set_page_refcounted(page); __free_page(page); } else { int loop; @@ -457,7 +457,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) set_page_count(p, 0); } - set_page_refs(page, order); + set_page_refcounted(page); __free_pages(page, order); } } @@ -525,7 +525,7 @@ static int prep_new_page(struct page *page, int order) 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); - set_page_refs(page, order); + set_page_refcounted(page); kernel_map_pages(page, 1 << order, 1); return 0; } @@ -755,10 +755,8 @@ void split_page(struct page *page, unsigned int order) BUG_ON(PageCompound(page)); BUG_ON(!page_count(page)); - for (i = 1; i < (1 << order); i++) { - BUG_ON(page_count(page + i)); - set_page_count(page + i, 1); - } + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); } /* @@ -1771,7 +1769,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); -- cgit v1.2.3-70-g09d2 From 8f860591ffb29738cf5539b6fbf27f50dcdeb380 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Wed, 22 Mar 2006 00:08:50 -0800 Subject: [PATCH] Enable mprotect on huge pages 2.6.16-rc3 uses hugetlb on-demand paging, but it doesn_t support hugetlb mprotect. From: David Gibson Remove a test from the mprotect() path which checks that the mprotect()ed range on a hugepage VMA is hugepage aligned (yes, really, the sense of is_aligned_hugepage_range() is the opposite of what you'd guess :-/). In fact, we don't need this test. If the given addresses match the beginning/end of a hugepage VMA they must already be suitably aligned. If they don't, then mprotect_fixup() will attempt to split the VMA. The very first test in split_vma() will check for a badly aligned address on a hugepage VMA and return -EINVAL if necessary. From: "Chen, Kenneth W" On i386 and x86-64, pte flag _PAGE_PSE collides with _PAGE_PROTNONE. The identify of hugetlb pte is lost when changing page protection via mprotect. A page fault occurs later will trigger a bug check in huge_pte_alloc(). The fix is to always make new pte a hugetlb pte and also to clean up legacy code where _PAGE_PRESENT is forced on in the pre-faulting day. Signed-off-by: Zhang Yanmin Cc: David Gibson Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: William Lee Irwin III Signed-off-by: Ken Chen Signed-off-by: Nishanth Aravamudan Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/pgtable.h | 5 ++--- include/asm-ia64/pgtable.h | 2 +- include/asm-x86_64/pgtable.h | 4 ++-- include/linux/hugetlb.h | 4 ++++ mm/hugetlb.c | 29 +++++++++++++++++++++++++++++ mm/mprotect.c | 12 +++++------- 6 files changed, 43 insertions(+), 13 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 088a945bf26..ee056c41a9f 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -219,13 +219,12 @@ extern unsigned long pg0[]; * The following only work if pte_present() is true. * Undefined behaviour if not.. */ -#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT) static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } -static inline int pte_huge(pte_t pte) { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; } +static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; } /* * The following only works if pte_present() is not true. @@ -242,7 +241,7 @@ static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= __LARGE_PTE; return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } #ifdef CONFIG_X86_PAE # include diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index e2560c58384..5890972a69b 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -314,7 +314,7 @@ ia64_phys_addr_valid (unsigned long addr) #define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_A)) #define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D)) #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D)) -#define pte_mkhuge(pte) (__pte(pte_val(pte) | _PAGE_P)) +#define pte_mkhuge(pte) (__pte(pte_val(pte))) /* * Macro to a page protection value as "uncacheable". Note that "protection" is really a diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 715fd94cf57..a617d364d08 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -273,7 +273,7 @@ static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } -static inline int pte_huge(pte_t pte) { return (pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; } +static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_PSE; } static inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } static inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } @@ -285,7 +285,7 @@ static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _ static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; } struct vm_area_struct; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 68d82ad6b17..fa83836b63d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -41,6 +41,8 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write); int is_aligned_hugepage_range(unsigned long addr, unsigned long len); int pmd_huge(pmd_t pmd); +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot); #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 @@ -101,6 +103,8 @@ static inline unsigned long hugetlb_total_pages(void) #define free_huge_page(p) ({ (void)(p); BUG(); }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) +#define hugetlb_change_protection(vma, address, end, newprot) + #ifndef HPAGE_MASK #define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */ #define HPAGE_SIZE PAGE_SIZE diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 20117a4b8ab..783098f6cf8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,3 +565,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i; } + +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long start = address; + pte_t *ptep; + pte_t pte; + + BUG_ON(address >= end); + flush_cache_range(vma, address, end); + + spin_lock(&mm->page_table_lock); + for (; address < end; address += HPAGE_SIZE) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + if (!pte_none(*ptep)) { + pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = pte_mkhuge(pte_modify(pte, newprot)); + set_huge_pte_at(mm, address, ptep, pte); + lazy_mmu_prot_update(pte); + } + } + spin_unlock(&mm->page_table_lock); + + flush_tlb_range(vma, start, end); +} + diff --git a/mm/mprotect.c b/mm/mprotect.c index 653b8571c1e..4c14d4289b6 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { charged = nrpages; if (security_vm_enough_memory(charged)) return -ENOMEM; @@ -166,7 +166,10 @@ success: */ vma->vm_flags = newflags; vma->vm_page_prot = newprot; - change_protection(vma, start, end, newprot); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, newprot); + else + change_protection(vma, start, end, newprot); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; @@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - if (is_vm_hugetlb_page(vma)) { - error = -EACCES; - goto out; - } - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ -- cgit v1.2.3-70-g09d2 From 79ac6ba40eb8d70f0d204e98ae9b63280ad1018c Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:51 -0800 Subject: [PATCH] hugepage: Small fixes to hugepage clear/copy path Move the loops used in mm/hugetlb.c to clear and copy hugepages to their own functions for clarity. As we do so, we add some checks of need_resched - we are, after all copying megabytes of memory here. We also add might_sleep() accordingly. We generally dropped locks around the clear and copy, already but not everyone has PREEMPT enabled, so we should still be checking explicitly. For this to work, we need to remove the clear_huge_page() from alloc_huge_page(), which is called with the page_table_lock held in the COW path. We move the clear_huge_page() to just after the alloc_huge_page() in the hugepage no-page path. In the COW path, the new page is about to be copied over, so clearing it was just a waste of time anyway. So as a side effect we also fix the fact that we held the page_table_lock for far too long in this path by calling alloc_huge_page() under it. It causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 783098f6cf8..41b1038f76d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -27,6 +27,29 @@ static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; +static void clear_huge_page(struct page *page, unsigned long addr) +{ + int i; + + might_sleep(); + for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + cond_resched(); + clear_user_highpage(page + i, addr); + } +} + +static void copy_huge_page(struct page *dst, struct page *src, + unsigned long addr) +{ + int i; + + might_sleep(); + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); + } +} + /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages */ @@ -98,7 +121,6 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { struct page *page; - int i; spin_lock(&hugetlb_lock); page = dequeue_huge_page(vma, addr); @@ -108,8 +130,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) } spin_unlock(&hugetlb_lock); set_page_refcounted(page); - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_user_highpage(&page[i], addr); return page; } @@ -367,7 +387,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte) { struct page *old_page, *new_page; - int i, avoidcopy; + int avoidcopy; old_page = pte_page(pte); @@ -388,9 +408,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } spin_unlock(&mm->page_table_lock); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) - copy_user_highpage(new_page + i, old_page + i, - address + i*PAGE_SIZE); + copy_huge_page(new_page, old_page, address); spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & HPAGE_MASK); @@ -435,6 +453,7 @@ retry: ret = VM_FAULT_OOM; goto out; } + clear_huge_page(page, address); if (vma->vm_flags & VM_SHARED) { int err; -- cgit v1.2.3-70-g09d2 From 3935baa9bcda3ccaee4f7849f5157d316e34412e Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:53 -0800 Subject: [PATCH] hugepage: serialize hugepage allocation and instantiation Currently, no lock or mutex is held between allocating a hugepage and inserting it into the pagetables / page cache. When we do go to insert the page into pagetables or page cache, we recheck and may free the newly allocated hugepage. However, since the number of hugepages in the system is strictly limited, and it's usualy to want to use all of them, this can still lead to spurious allocation failures. For example, suppose two processes are both mapping (MAP_SHARED) the same hugepage file, large enough to consume the entire available hugepage pool. If they race instantiating the last page in the mapping, they will both attempt to allocate the last available hugepage. One will fail, of course, returning OOM from the fault and thus causing the process to be killed, despite the fact that the entire mapping can, in fact, be instantiated. The patch fixes this race by the simple method of adding a (sleeping) mutex to serialize the hugepage fault path between allocation and insertion into pagetables and/or page cache. It would be possible to avoid the serialization by catching the allocation failures, waiting on some condition, then rechecking to see if someone else has instantiated the page for us. Given the likely frequency of hugepage instantiations, it seems very doubtful it's worth the extra complexity. This patch causes no regression on the libhugetlbfs testsuite, and one test, which can trigger this race now passes where it previously failed. Actually, the test still sometimes fails, though less often and only as a shmat() failure, rather processes getting OOM killed by the VM. The dodgy heuristic tests in fs/hugetlbfs/inode.c for whether there's enough hugepage space aren't protected by the new mutex, and would be ugly to do so, so there's still a race there. Another patch to replace those tests with something saner for this reason as well as others coming... Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41b1038f76d..d5987a87bbe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,10 @@ unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; +/* + * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + */ +static DEFINE_SPINLOCK(hugetlb_lock); static void clear_huge_page(struct page *page, unsigned long addr) { @@ -50,11 +55,6 @@ static void copy_huge_page(struct page *dst, struct page *src, } } -/* - * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages - */ -static DEFINE_SPINLOCK(hugetlb_lock); - static void enqueue_huge_page(struct page *page) { int nid = page_to_nid(page); @@ -508,14 +508,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + static DEFINE_MUTEX(hugetlb_instantiation_mutex); ptep = huge_pte_alloc(mm, address); if (!ptep) return VM_FAULT_OOM; + /* + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. + */ + mutex_lock(&hugetlb_instantiation_mutex); entry = *ptep; - if (pte_none(entry)) - return hugetlb_no_page(mm, vma, address, ptep, write_access); + if (pte_none(entry)) { + ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + mutex_unlock(&hugetlb_instantiation_mutex); + return ret; + } ret = VM_FAULT_MINOR; @@ -525,6 +535,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (write_access && !pte_write(entry)) ret = hugetlb_cow(mm, vma, address, ptep, entry); spin_unlock(&mm->page_table_lock); + mutex_unlock(&hugetlb_instantiation_mutex); return ret; } -- cgit v1.2.3-70-g09d2 From b45b5bd65f668a665db40d093e4e1fe563533608 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:55 -0800 Subject: [PATCH] hugepage: Strict page reservation for hugepage inodes These days, hugepages are demand-allocated at first fault time. There's a somewhat dubious (and racy) heuristic when making a new mmap() to check if there are enough available hugepages to fully satisfy that mapping. A particularly obvious case where the heuristic breaks down is where a process maps its hugepages not as a single chunk, but as a bunch of individually mmap()ed (or shmat()ed) blocks without touching and instantiating the pages in between allocations. In this case the size of each block is compared against the total number of available hugepages. It's thus easy for the process to become overcommitted, because each block mapping will succeed, although the total number of hugepages required by all blocks exceeds the number available. In particular, this defeats such a program which will detect a mapping failure and adjust its hugepage usage downward accordingly. The patch below addresses this problem, by strictly reserving a number of physical hugepages for hugepage inodes which have been mapped, but not instatiated. MAP_SHARED mappings are thus "safe" - they will fail on mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still trigger an OOM. (Actually SHARED mappings can technically still OOM, but only if the sysadmin explicitly reduces the hugepage pool between mapping and instantiation) This patch appears to address the problem at hand - it allows DB2 to start correctly, for instance, which previously suffered the failure described above. This patch causes no regressions on the libhugetblfs testsuite, and makes a test (designed to catch this problem) pass which previously failed (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 74 ++++++++------------------ include/linux/hugetlb.h | 8 ++- mm/hugetlb.c | 136 ++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 154 insertions(+), 64 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b3519528994..1a1c2fcb782 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } -/* - * huge_pages_needed tries to determine the number of new huge pages that - * will be required to fully populate this VMA. This will be equal to - * the size of the VMA in huge pages minus the number of huge pages - * (covered by this VMA) that are found in the page cache. - * - * Result is in bytes to be compatible with is_hugepage_mem_enough() - */ -static unsigned long -huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) -{ - int i; - struct pagevec pvec; - unsigned long start = vma->vm_start; - unsigned long end = vma->vm_end; - unsigned long hugepages = (end - start) >> HPAGE_SHIFT; - pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); - pgoff_t endpg = next + hugepages; - - pagevec_init(&pvec, 0); - while (next < endpg) { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) - break; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - if (page->index > next) - next = page->index; - if (page->index >= endpg) - break; - next++; - hugepages--; - } - huge_pagevec_release(&pvec); - } - return hugepages << HPAGE_SHIFT; -} - static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_dentry->d_inode; - struct address_space *mapping = inode->i_mapping; - unsigned long bytes; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; @@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start < HPAGE_SIZE) return -EINVAL; - bytes = huge_pages_needed(mapping, vma); - if (!is_hugepage_mem_enough(bytes)) - return -ENOMEM; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); mutex_lock(&inode->i_mutex); @@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) goto out; + if (vma->vm_flags & VM_MAYSHARE) + if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0) + goto out; + ret = 0; hugetlb_prefault_arch_hook(vma->vm_mm); if (inode->i_size < len) @@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page) put_page(page); } -static void truncate_hugepages(struct address_space *mapping, loff_t lstart) +static void truncate_hugepages(struct inode *inode, loff_t lstart) { + struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> HPAGE_SHIFT; struct pagevec pvec; pgoff_t next; int i; + hugetlb_truncate_reservation(HUGETLBFS_I(inode), + lstart >> HPAGE_SHIFT); + if (!mapping->nrpages) + return; pagevec_init(&pvec, 0); next = start; while (1) { @@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart) static void hugetlbfs_delete_inode(struct inode *inode) { - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); + truncate_hugepages(inode, 0); clear_inode(inode); } @@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode) inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); + truncate_hugepages(inode, 0); clear_inode(inode); destroy_inode(inode); } @@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); spin_unlock(&mapping->i_mmap_lock); - truncate_hugepages(mapping, offset); + truncate_hugepages(inode, offset); return 0; } @@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } + p->prereserved_hpages = 0; return &p->vfs_inode; } @@ -805,9 +771,6 @@ struct file *hugetlb_zero_setup(size_t size) if (!can_do_hugetlb_shm()) return ERR_PTR(-EPERM); - if (!is_hugepage_mem_enough(size)) - return ERR_PTR(-ENOMEM); - if (!user_shm_lock(size, current->user)) return ERR_PTR(-ENOMEM); @@ -831,6 +794,11 @@ struct file *hugetlb_zero_setup(size_t size) if (!inode) goto out_file; + error = -ENOMEM; + if (hugetlb_extend_reservation(HUGETLBFS_I(inode), + size >> HPAGE_SHIFT) != 0) + goto out_inode; + d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; @@ -841,6 +809,8 @@ struct file *hugetlb_zero_setup(size_t size) file->f_mode = FMODE_WRITE | FMODE_READ; return file; +out_inode: + iput(inode); out_file: put_filp(file); out_dentry: diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fa83836b63d..cafe73eecb0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,7 +20,6 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long) int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); -int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); void free_huge_page(struct page *); @@ -89,7 +88,6 @@ static inline unsigned long hugetlb_total_pages(void) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) #define unmap_hugepage_range(vma, start, end) BUG() -#define is_hugepage_mem_enough(size) 0 #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL @@ -132,6 +130,8 @@ struct hugetlbfs_sb_info { struct hugetlbfs_inode_info { struct shared_policy policy; + /* Protected by the (global) hugetlb_lock */ + unsigned long prereserved_hpages; struct inode vfs_inode; }; @@ -148,6 +148,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_zero_setup(size_t); +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast_hpages); +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost_hpages); int hugetlb_get_quota(struct address_space *mapping); void hugetlb_put_quota(struct address_space *mapping); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d5987a87bbe..27fad5d9bcf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,7 +22,7 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages; +static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; @@ -120,17 +120,136 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { + struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; + int use_reserve = 0; + unsigned long idx; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(vma, addr); - if (!page) { - spin_unlock(&hugetlb_lock); - return NULL; + + if (vma->vm_flags & VM_MAYSHARE) { + + /* idx = radix tree index, i.e. offset into file in + * HPAGE_SIZE units */ + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + /* The hugetlbfs specific inode info stores the number + * of "guaranteed available" (huge) pages. That is, + * the first 'prereserved_hpages' pages of the inode + * are either already instantiated, or have been + * pre-reserved (by hugetlb_reserve_for_inode()). Here + * we're in the process of instantiating the page, so + * we use this to determine whether to draw from the + * pre-reserved pool or the truly free pool. */ + if (idx < HUGETLBFS_I(inode)->prereserved_hpages) + use_reserve = 1; + } + + if (!use_reserve) { + if (free_huge_pages <= reserved_huge_pages) + goto fail; + } else { + BUG_ON(reserved_huge_pages == 0); + reserved_huge_pages--; } + + page = dequeue_huge_page(vma, addr); + if (!page) + goto fail; + spin_unlock(&hugetlb_lock); set_page_refcounted(page); return page; + + fail: + WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ + spin_unlock(&hugetlb_lock); + return NULL; +} + +/* hugetlb_extend_reservation() + * + * Ensure that at least 'atleast' hugepages are, and will remain, + * available to instantiate the first 'atleast' pages of the given + * inode. If the inode doesn't already have this many pages reserved + * or instantiated, set aside some hugepages in the reserved pool to + * satisfy later faults (or fail now if there aren't enough, rather + * than getting the SIGBUS later). + */ +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast) +{ + struct inode *inode = &info->vfs_inode; + unsigned long change_in_reserve = 0; + int ret = 0; + + spin_lock(&hugetlb_lock); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages >= atleast) + goto out; + + /* Because we always call this on shared mappings, none of the + * pages beyond info->prereserved_hpages can have been + * instantiated, so we need to reserve all of them now. */ + change_in_reserve = atleast - info->prereserved_hpages; + + if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { + ret = -ENOMEM; + goto out; + } + + reserved_huge_pages += change_in_reserve; + info->prereserved_hpages = atleast; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock(&hugetlb_lock); + + return ret; +} + +/* hugetlb_truncate_reservation() + * + * This returns pages reserved for the given inode to the general free + * hugepage pool. If the inode has any pages prereserved, but not + * instantiated, beyond offset (atmost << HPAGE_SIZE), then release + * them. + */ +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost) +{ + struct inode *inode = &info->vfs_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long idx; + unsigned long change_in_reserve = 0; + struct page *page; + + spin_lock(&hugetlb_lock); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages <= atmost) + goto out; + + /* Count pages which were reserved, but not instantiated, and + * which we can now release. */ + for (idx = atmost; idx < info->prereserved_hpages; idx++) { + page = radix_tree_lookup(&mapping->page_tree, idx); + if (!page) + /* Pages which are already instantiated can't + * be unreserved (and in fact have already + * been removed from the reserved pool) */ + change_in_reserve++; + } + + BUG_ON(reserved_huge_pages < change_in_reserve); + reserved_huge_pages -= change_in_reserve; + info->prereserved_hpages = atmost; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock(&hugetlb_lock); } static int __init hugetlb_init(void) @@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf) return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, + reserved_huge_pages, HPAGE_SIZE/1024); } @@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, free_huge_pages_node[nid]); } -int is_hugepage_mem_enough(size_t size) -{ - return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; -} - /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { -- cgit v1.2.3-70-g09d2 From 27a85ef1b81300cfff06b4c8037e9914dfb09acc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:56 -0800 Subject: [PATCH] hugepage: Make {alloc,free}_huge_page() local Originally, mm/hugetlb.c just handled the hugepage physical allocation path and its {alloc,free}_huge_page() functions were used from the arch specific hugepage code. These days those functions are only used with mm/hugetlb.c itself. Therefore, this patch makes them static and removes their prototypes from hugetlb.h. This requires a small rearrangement of code in mm/hugetlb.c to avoid a forward declaration. This patch causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ---- mm/hugetlb.c | 25 +++++++++++++------------ 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cafe73eecb0..5d84c368ffe 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -21,8 +21,6 @@ int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); unsigned long hugetlb_total_pages(void); -struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); -void free_huge_page(struct page *); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); @@ -97,8 +95,6 @@ static inline unsigned long hugetlb_total_pages(void) #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ do { } while (0) -#define alloc_huge_page(vma, addr) ({ NULL; }) -#define free_huge_page(p) ({ (void)(p); BUG(); }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 27fad5d9bcf..075877b1cbc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -88,6 +88,17 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, return page; } +static void free_huge_page(struct page *page) +{ + BUG_ON(page_count(page)); + + INIT_LIST_HEAD(&page->lru); + + spin_lock(&hugetlb_lock); + enqueue_huge_page(page); + spin_unlock(&hugetlb_lock); +} + static int alloc_fresh_huge_page(void) { static int nid = 0; @@ -107,18 +118,8 @@ static int alloc_fresh_huge_page(void) return 0; } -void free_huge_page(struct page *page) -{ - BUG_ON(page_count(page)); - - INIT_LIST_HEAD(&page->lru); - - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); -} - -struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr) { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; -- cgit v1.2.3-70-g09d2 From d5d4b0aa4e1430d73050babba999365593bdb9d2 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Wed, 22 Mar 2006 00:09:03 -0800 Subject: [PATCH] optimize follow_hugetlb_page follow_hugetlb_page() walks a range of user virtual address and then fills in list of struct page * into an array that is passed from the argument list. It also gets a reference count via get_page(). For compound page, get_page() actually traverse back to head page via page_private() macro and then adds a reference count to the head page. Since we are doing a virt to pte look up, kernel already has a struct page pointer into the head page. So instead of traverse into the small unit page struct and then follow a link back to the head page, optimize that with incrementing the reference count directly on the head page. The benefit is that we don't take a cache miss on accessing page struct for the corresponding user address and more importantly, not to pollute the cache with a "not very useful" round trip of pointer chasing. This adds a moderate performance gain on an I/O intensive database transaction workload. Signed-off-by: Ken Chen Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 075877b1cbc..06699d871a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -661,10 +661,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i) { - unsigned long vpfn, vaddr = *position; + unsigned long pfn_offset; + unsigned long vaddr = *position; int remainder = *length; - vpfn = vaddr/PAGE_SIZE; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; @@ -692,19 +692,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - if (pages) { - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - get_page(page); - pages[i] = page; - } + pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; + page = pte_page(*pte); +same_page: + get_page(page); + if (pages) + pages[i] = page + pfn_offset; if (vmas) vmas[i] = vma; vaddr += PAGE_SIZE; - ++vpfn; + ++pfn_offset; --remainder; ++i; + if (vaddr < vma->vm_end && remainder && + pfn_offset < HPAGE_SIZE/PAGE_SIZE) { + /* + * We use pfn_offset to avoid touching the pageframes + * of this compound page. + */ + goto same_page; + } } spin_unlock(&mm->page_table_lock); *length = remainder; -- cgit v1.2.3-70-g09d2 From fdb7cc59084ba7eef935e4e40aaaf538ee34c625 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 22 Mar 2006 00:09:10 -0800 Subject: [PATCH] mm: hugetlb alloc_fresh_huge_page bogus node loop fix Fix bogus node loop in hugetlb.c alloc_fresh_huge_page(), which was assuming that nodes are numbered contiguously from 0 to num_online_nodes(). Once the hotplug folks get this far, that will be false. Signed-off-by: Paul Jackson Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 06699d871a8..ebad6bbb350 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -105,7 +105,9 @@ static int alloc_fresh_huge_page(void) struct page *page; page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, HUGETLB_PAGE_ORDER); - nid = (nid + 1) % num_online_nodes(); + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); if (page) { page[1].lru.next = (void *)free_huge_page; /* dtor */ spin_lock(&hugetlb_lock); -- cgit v1.2.3-70-g09d2