From 8f860591ffb29738cf5539b6fbf27f50dcdeb380 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Wed, 22 Mar 2006 00:08:50 -0800 Subject: [PATCH] Enable mprotect on huge pages 2.6.16-rc3 uses hugetlb on-demand paging, but it doesn't support hugetlb mprotect. From: David Gibson Remove a test from the mprotect() path which checks that the mprotect()ed range on a hugepage VMA is hugepage aligned (yes, really, the sense of is_aligned_hugepage_range() is the opposite of what you'd guess :-/). In fact, we don't need this test. If the given addresses match the beginning/end of a hugepage VMA they must already be suitably aligned. If they don't, then mprotect_fixup() will attempt to split the VMA. The very first test in split_vma() will check for a badly aligned address on a hugepage VMA and return -EINVAL if necessary. From: "Chen, Kenneth W" On i386 and x86-64, pte flag _PAGE_PSE collides with _PAGE_PROTNONE. The identity of the hugetlb pte is lost when changing page protection via mprotect. A page fault that occurs later will trigger a bug check in huge_pte_alloc(). The fix is to always make the new pte a hugetlb pte and also to clean up legacy code where _PAGE_PRESENT is forced on in the pre-faulting days. Signed-off-by: Zhang Yanmin Cc: David Gibson Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: William Lee Irwin III Signed-off-by: Ken Chen Signed-off-by: Nishanth Aravamudan Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/hugetlb.h') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 68d82ad6b17..fa83836b63d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -41,6 +41,8 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write); int is_aligned_hugepage_range(unsigned long addr, unsigned long len); int pmd_huge(pmd_t pmd); +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot); #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 @@ -101,6 +103,8 @@ static inline unsigned long hugetlb_total_pages(void) #define free_huge_page(p) ({ (void)(p); BUG(); }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) +#define hugetlb_change_protection(vma, address, end, newprot) + #ifndef HPAGE_MASK #define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */ #define HPAGE_SIZE PAGE_SIZE -- cgit v1.2.3-70-g09d2 From b45b5bd65f668a665db40d093e4e1fe563533608 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:55 -0800 Subject: [PATCH] hugepage: Strict page reservation for hugepage inodes These days, hugepages are demand-allocated at first fault time. There's a somewhat dubious (and racy) heuristic when making a new mmap() to check if there are enough available hugepages to fully satisfy that mapping. A particularly obvious case where the heuristic breaks down is where a process maps its hugepages not as a single chunk, but as a bunch of individually mmap()ed (or shmat()ed) blocks without touching and instantiating the pages in between allocations. In this case the size of each block is compared against the total number of available hugepages. It's thus easy for the process to become overcommitted, because each block mapping will succeed, although the total number of hugepages required by all blocks exceeds the number available.
In particular, this defeats a program which would detect a mapping failure and adjust its hugepage usage downward accordingly. The patch below addresses this problem by strictly reserving a number of physical hugepages for hugepage inodes which have been mapped, but not instantiated. MAP_SHARED mappings are thus "safe" - they will fail on mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still trigger an OOM. (Actually SHARED mappings can technically still OOM, but only if the sysadmin explicitly reduces the hugepage pool between mapping and instantiation.) This patch appears to address the problem at hand - it allows DB2 to start correctly, for instance, which previously suffered the failure described above. This patch causes no regressions on the libhugetlbfs testsuite, and makes a test (designed to catch this problem) pass which previously failed (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 74 ++++++++------------------ include/linux/hugetlb.h | 8 ++- mm/hugetlb.c | 136 ++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 154 insertions(+), 64 deletions(-) (limited to 'include/linux/hugetlb.h') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b3519528994..1a1c2fcb782 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } -/* - * huge_pages_needed tries to determine the number of new huge pages that - * will be required to fully populate this VMA. This will be equal to - * the size of the VMA in huge pages minus the number of huge pages - * (covered by this VMA) that are found in the page cache.
- * - * Result is in bytes to be compatible with is_hugepage_mem_enough() - */ -static unsigned long -huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) -{ - int i; - struct pagevec pvec; - unsigned long start = vma->vm_start; - unsigned long end = vma->vm_end; - unsigned long hugepages = (end - start) >> HPAGE_SHIFT; - pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); - pgoff_t endpg = next + hugepages; - - pagevec_init(&pvec, 0); - while (next < endpg) { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) - break; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - if (page->index > next) - next = page->index; - if (page->index >= endpg) - break; - next++; - hugepages--; - } - huge_pagevec_release(&pvec); - } - return hugepages << HPAGE_SHIFT; -} - static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_dentry->d_inode; - struct address_space *mapping = inode->i_mapping; - unsigned long bytes; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; @@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start < HPAGE_SIZE) return -EINVAL; - bytes = huge_pages_needed(mapping, vma); - if (!is_hugepage_mem_enough(bytes)) - return -ENOMEM; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); mutex_lock(&inode->i_mutex); @@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) goto out; + if (vma->vm_flags & VM_MAYSHARE) + if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0) + goto out; + ret = 0; hugetlb_prefault_arch_hook(vma->vm_mm); if (inode->i_size < len) @@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page) put_page(page); } -static void truncate_hugepages(struct address_space *mapping, loff_t lstart) +static void truncate_hugepages(struct inode *inode, loff_t lstart) { + struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> HPAGE_SHIFT; struct pagevec pvec; pgoff_t next; int i; + hugetlb_truncate_reservation(HUGETLBFS_I(inode), + lstart >> HPAGE_SHIFT); + if (!mapping->nrpages) + return; pagevec_init(&pvec, 0); next = start; while (1) { @@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart) static void hugetlbfs_delete_inode(struct inode *inode) { - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); + truncate_hugepages(inode, 0); clear_inode(inode); } @@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode) inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); + truncate_hugepages(inode, 0); clear_inode(inode); destroy_inode(inode); } @@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); spin_unlock(&mapping->i_mmap_lock); - truncate_hugepages(mapping, offset); + truncate_hugepages(inode, offset); return 0; } @@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } + p->prereserved_hpages = 0; return &p->vfs_inode; } @@ -805,9 +771,6 @@ struct file *hugetlb_zero_setup(size_t size) if (!can_do_hugetlb_shm()) return ERR_PTR(-EPERM); - if 
(!is_hugepage_mem_enough(size)) - return ERR_PTR(-ENOMEM); - if (!user_shm_lock(size, current->user)) return ERR_PTR(-ENOMEM); @@ -831,6 +794,11 @@ struct file *hugetlb_zero_setup(size_t size) if (!inode) goto out_file; + error = -ENOMEM; + if (hugetlb_extend_reservation(HUGETLBFS_I(inode), + size >> HPAGE_SHIFT) != 0) + goto out_inode; + d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; @@ -841,6 +809,8 @@ struct file *hugetlb_zero_setup(size_t size) file->f_mode = FMODE_WRITE | FMODE_READ; return file; +out_inode: + iput(inode); out_file: put_filp(file); out_dentry: diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fa83836b63d..cafe73eecb0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,7 +20,6 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long) int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); -int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); void free_huge_page(struct page *); @@ -89,7 +88,6 @@ static inline unsigned long hugetlb_total_pages(void) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) #define unmap_hugepage_range(vma, start, end) BUG() -#define is_hugepage_mem_enough(size) 0 #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL @@ -132,6 +130,8 @@ struct hugetlbfs_sb_info { struct hugetlbfs_inode_info { struct shared_policy policy; + /* Protected by the (global) hugetlb_lock */ + unsigned long prereserved_hpages; struct inode vfs_inode; }; @@ -148,6 +148,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_zero_setup(size_t); +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast_hpages); +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost_hpages); int hugetlb_get_quota(struct address_space *mapping); void hugetlb_put_quota(struct address_space *mapping); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d5987a87bbe..27fad5d9bcf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,7 +22,7 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages; +static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; @@ -120,17 +120,136 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { + struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; + int use_reserve = 0; + unsigned long idx; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(vma, addr); - if (!page) { - spin_unlock(&hugetlb_lock); - return NULL; + + if (vma->vm_flags & VM_MAYSHARE) { + + /* idx = radix tree index, i.e. 
offset into file in + * HPAGE_SIZE units */ + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + /* The hugetlbfs specific inode info stores the number + * of "guaranteed available" (huge) pages. That is, + * the first 'prereserved_hpages' pages of the inode + * are either already instantiated, or have been + * pre-reserved (by hugetlb_reserve_for_inode()). Here + * we're in the process of instantiating the page, so + * we use this to determine whether to draw from the + * pre-reserved pool or the truly free pool. */ + if (idx < HUGETLBFS_I(inode)->prereserved_hpages) + use_reserve = 1; + } + + if (!use_reserve) { + if (free_huge_pages <= reserved_huge_pages) + goto fail; + } else { + BUG_ON(reserved_huge_pages == 0); + reserved_huge_pages--; } + + page = dequeue_huge_page(vma, addr); + if (!page) + goto fail; + spin_unlock(&hugetlb_lock); set_page_refcounted(page); return page; + + fail: + WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ + spin_unlock(&hugetlb_lock); + return NULL; +} + +/* hugetlb_extend_reservation() + * + * Ensure that at least 'atleast' hugepages are, and will remain, + * available to instantiate the first 'atleast' pages of the given + * inode. If the inode doesn't already have this many pages reserved + * or instantiated, set aside some hugepages in the reserved pool to + * satisfy later faults (or fail now if there aren't enough, rather + * than getting the SIGBUS later). + */ +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast) +{ + struct inode *inode = &info->vfs_inode; + unsigned long change_in_reserve = 0; + int ret = 0; + + spin_lock(&hugetlb_lock); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages >= atleast) + goto out; + + /* Because we always call this on shared mappings, none of the + * pages beyond info->prereserved_hpages can have been + * instantiated, so we need to reserve all of them now. */ + change_in_reserve = atleast - info->prereserved_hpages; + + if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { + ret = -ENOMEM; + goto out; + } + + reserved_huge_pages += change_in_reserve; + info->prereserved_hpages = atleast; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock(&hugetlb_lock); + + return ret; +} + +/* hugetlb_truncate_reservation() + * + * This returns pages reserved for the given inode to the general free + * hugepage pool. If the inode has any pages prereserved, but not + * instantiated, beyond offset (atmost << HPAGE_SIZE), then release + * them. + */ +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost) +{ + struct inode *inode = &info->vfs_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long idx; + unsigned long change_in_reserve = 0; + struct page *page; + + spin_lock(&hugetlb_lock); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages <= atmost) + goto out; + + /* Count pages which were reserved, but not instantiated, and + * which we can now release. 
*/ + for (idx = atmost; idx < info->prereserved_hpages; idx++) { + page = radix_tree_lookup(&mapping->page_tree, idx); + if (!page) + /* Pages which are already instantiated can't + * be unreserved (and in fact have already + * been removed from the reserved pool) */ + change_in_reserve++; + } + + BUG_ON(reserved_huge_pages < change_in_reserve); + reserved_huge_pages -= change_in_reserve; + info->prereserved_hpages = atmost; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock(&hugetlb_lock); } static int __init hugetlb_init(void) @@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf) return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, + reserved_huge_pages, HPAGE_SIZE/1024); } @@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, free_huge_pages_node[nid]); } -int is_hugepage_mem_enough(size_t size) -{ - return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; -} - /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { -- cgit v1.2.3-70-g09d2 From 27a85ef1b81300cfff06b4c8037e9914dfb09acc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:56 -0800 Subject: [PATCH] hugepage: Make {alloc,free}_huge_page() local Originally, mm/hugetlb.c just handled the hugepage physical allocation path and its {alloc,free}_huge_page() functions were used from the arch specific hugepage code. These days those functions are only used within mm/hugetlb.c itself. Therefore, this patch makes them static and removes their prototypes from hugetlb.h. This requires a small rearrangement of code in mm/hugetlb.c to avoid a forward declaration. This patch causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5).
Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ---- mm/hugetlb.c | 25 +++++++++++++------------ 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux/hugetlb.h') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cafe73eecb0..5d84c368ffe 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -21,8 +21,6 @@ int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); unsigned long hugetlb_total_pages(void); -struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); -void free_huge_page(struct page *); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); @@ -97,8 +95,6 @@ static inline unsigned long hugetlb_total_pages(void) #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ do { } while (0) -#define alloc_huge_page(vma, addr) ({ NULL; }) -#define free_huge_page(p) ({ (void)(p); BUG(); }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 27fad5d9bcf..075877b1cbc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -88,6 +88,17 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, return page; } +static void free_huge_page(struct page *page) +{ + BUG_ON(page_count(page)); + + INIT_LIST_HEAD(&page->lru); + + spin_lock(&hugetlb_lock); + enqueue_huge_page(page); + spin_unlock(&hugetlb_lock); +} + static int alloc_fresh_huge_page(void) { static int nid = 0; @@ -107,18 +118,8 @@ static int alloc_fresh_huge_page(void) return 0; } -void free_huge_page(struct page *page) -{ - BUG_ON(page_count(page)); - - INIT_LIST_HEAD(&page->lru); - - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); -} - -struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr) { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; -- cgit v1.2.3-70-g09d2 From 9da61aef0fd5b17dd4bf4baf33db12c470def774 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:57 -0800 Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables() free_pgtables() has special logic to call hugetlb_free_pgd_range() instead of the normal free_pgd_range() on hugepage VMAs. However, the test it uses to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized range at the start of the vma. is_hugepage_only_range() will return true if the given range has any intersection with a hugepage address region, and in this case the given region need not be hugepage aligned. So, for example, this test can return true if called on, say, a 4k VMA immediately preceding a (nicely aligned) hugepage VMA. At present we get away with this because the powerpc version of hugetlb_free_pgd_range() is just a call to free_pgd_range(). On ia64 (the only other arch with a non-trivial is_hugepage_only_range()) we get away with it for a different reason; the hugepage area is not contiguous with the rest of the user address space, and VMAs are not permitted in between, so the test can't return a false positive there. Nonetheless this should be fixed. 
We do that in the patch below by replacing the is_hugepage_only_range() test with an explicit test of the VMA using is_vm_hugetlb_page(). This in turn changes behaviour for platforms where is_hugepage_only_range() returns false always (everything except powerpc and ia64). We address this by ensuring that hugetlb_free_pgd_range() is defined to be identical to free_pgd_range() (instead of a no-op) on everything except ia64. Even so, it will prevent some otherwise possible coalescing of calls down to free_pgd_range(). Since this only happens for hugepage VMAs, removing this small optimization seems unlikely to cause any trouble. This patch causes no regressions on the libhugetlbfs testsuite - ppc64 POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP). Signed-off-by: David Gibson Cc: William Lee Irwin III Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/page.h | 1 + include/asm-powerpc/pgtable.h | 5 ----- include/linux/hugetlb.h | 9 +++++---- mm/memory.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux/hugetlb.h') diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h index 5e6362a786b..732cf308674 100644 --- a/include/asm-ia64/page.h +++ b/include/asm-ia64/page.h @@ -57,6 +57,7 @@ # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA # define ARCH_HAS_HUGEPAGE_ONLY_RANGE +# define ARCH_HAS_HUGETLB_FREE_PGD_RANGE #endif /* CONFIG_HUGETLB_PAGE */ #ifdef __ASSEMBLY__ diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h index e38931379a7..185ee15963a 100644 --- a/include/asm-powerpc/pgtable.h +++ b/include/asm-powerpc/pgtable.h @@ -468,11 +468,6 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -#ifdef CONFIG_HUGETLB_PAGE -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - free_pgd_range(tlb, addr, end, floor, ceiling) -#endif - /* * This gets called at the end of handling a page fault, when * the kernel has put a new PTE into the page table for the process. diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5d84c368ffe..e465fbf1ef5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -43,8 +43,10 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) +#endif + +#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE +#define hugetlb_free_pgd_range free_pgd_range #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE @@ -93,8 +95,7 @@ static inline unsigned long hugetlb_total_pages(void) #define prepare_hugepage_range(addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) +#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/mm/memory.c b/mm/memory.c index 71bc664efed..f6e3be9cbf5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, anon_vma_unlink(vma); unlink_file_vma(vma); - if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? 
next->vm_start: ceiling); } else { @@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_hugepage_only_range(vma->vm_mm, next->vm_start, - HPAGE_SIZE)) { + && !is_vm_hugetlb_page(vma)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); -- cgit v1.2.3-70-g09d2 From 3915bcf38fe0b6d130b4bbde97804f29a0becf32 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:59 -0800 Subject: [PATCH] hugepage: Move hugetlb_free_pgd_range() prototype to hugetlb.h The optional hugepage callback, hugetlb_free_pgd_range(), is presently implemented non-trivially only on ia64 (but I plan to add one for powerpc shortly). ia64 currently has its own prototype for the function in asm-ia64/pgtable.h. However, since the function is called from generic code, it makes sense for its prototype to be in the generic hugetlb.h header file, as the prototypes of the other arch callbacks already are (prepare_hugepage_range(), set_huge_pte_at(), etc.). This patch makes it so. Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/pgtable.h | 3 --- include/linux/hugetlb.h | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux/hugetlb.h') diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index 5890972a69b..c0f8144f234 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -505,9 +505,6 @@ extern struct page *zero_page_memmap_ptr; #define HUGETLB_PGDIR_SHIFT (HPAGE_SHIFT + 2*(PAGE_SHIFT-3)) #define HUGETLB_PGDIR_SIZE (__IA64_UL(1) << HUGETLB_PGDIR_SHIFT) #define HUGETLB_PGDIR_MASK (~(HUGETLB_PGDIR_SIZE-1)) -struct mmu_gather; -void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, - unsigned long end, unsigned long floor, unsigned long ceiling); #endif /* diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e465fbf1ef5..5db25ffdb3e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -47,6 +47,10 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE #define hugetlb_free_pgd_range free_pgd_range +#else +void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, + unsigned long end, unsigned long floor, + unsigned long ceiling); #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE -- cgit v1.2.3-70-g09d2 From 42b88befd6e0dae1a5fe04c03925037fa890e1f3 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:09:01 -0800 Subject: [PATCH] hugepage: is_aligned_hugepage_range() cleanup Quite a long time back, prepare_hugepage_range() replaced is_aligned_hugepage_range() as the callback from mm/mmap.c to arch code to verify if an address range is suitable for a hugepage mapping. is_aligned_hugepage_range() stuck around, but only to implement prepare_hugepage_range() on archs which didn't implement their own. Most archs (everything except ia64 and powerpc) used the same implementation of is_aligned_hugepage_range(). On powerpc, which implements its own prepare_hugepage_range(), the custom version was never used. In addition, "is_aligned_hugepage_range()" was a bad name, because it suggests it returns true iff the given range is a good hugepage range, whereas in fact it returns 0-or-error (so the sense is reversed). This patch cleans up by abolishing is_aligned_hugepage_range().
Instead prepare_hugepage_range() is defined directly. Most archs use the default version, which simply checks the given region is aligned to the size of a hugepage. ia64 and powerpc define custom versions. The ia64 one simply checks that the range is in the correct address space region in addition to being suitably aligned. The powerpc version (just as previously) checks for suitable addresses, and if necessary performs low-level MMU frobbing to set up new areas for use by hugepages. No libhugetlbfs testsuite regressions on ppc64 (POWER5 LPAR). Signed-off-by: David Gibson Signed-off-by: Zhang Yanmin Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/hugetlbpage.c | 12 ------------ arch/ia64/mm/hugetlbpage.c | 5 +++-- arch/powerpc/mm/hugetlbpage.c | 15 --------------- arch/sh/mm/hugetlbpage.c | 12 ------------ arch/sh64/mm/hugetlbpage.c | 12 ------------ arch/sparc64/mm/hugetlbpage.c | 12 ------------ include/asm-ia64/page.h | 1 + include/linux/hugetlb.h | 16 ++++++++++++---- 8 files changed, 16 insertions(+), 69 deletions(-) (limited to 'include/linux/hugetlb.h') diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index d524127c9af..a7d89158541 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -48,18 +48,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #if 0 /* This is just for testing */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 2d13889d0a9..9dbc7dadd16 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -68,9 +68,10 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } /* - * This function checks for proper alignment of input addr and len parameters. + * Don't actually need to do any preparation, but need to make sure + * the address is in the right region. */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) +int prepare_hugepage_range(unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b51bb28c054..7370f9f33e2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -133,21 +133,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return __pte(old); } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - if (! 
(within_hugepage_low_range(addr, len) - || within_hugepage_high_range(addr, len)) ) - return -EINVAL; - return 0; -} - struct slb_flush_info { struct mm_struct *mm; u16 newareas; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 6b7a7688c98..a3568fd5150 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c index ed6a505b3ee..3d89f2a6c78 100644 --- a/arch/sh64/mm/hugetlbpage.c +++ b/arch/sh64/mm/hugetlbpage.c @@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index a7a24869d04..280dc7958a1 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -263,18 +263,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h index 732cf308674..3ab27333dae 100644 --- a/include/asm-ia64/page.h +++ b/include/asm-ia64/page.h @@ -57,6 +57,7 @@ # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA # define ARCH_HAS_HUGEPAGE_ONLY_RANGE +# define ARCH_HAS_PREPARE_HUGEPAGE_RANGE # define ARCH_HAS_HUGETLB_FREE_PGD_RANGE #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5db25ffdb3e..d6f1019625a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -36,7 +36,6 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write); -int is_aligned_hugepage_range(unsigned long addr, unsigned long len); int pmd_huge(pmd_t pmd); void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); @@ -54,8 +53,18 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE -#define prepare_hugepage_range(addr, len) \ - is_aligned_hugepage_range(addr, len) +/* + * If the arch doesn't supply something else, assume that hugepage + * size aligned regions are ok without further preparation. 
+ */ +static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +{ + if (len & ~HPAGE_MASK) + return -EINVAL; + if (addr & ~HPAGE_MASK) + return -EINVAL; + return 0; +} #else int prepare_hugepage_range(unsigned long addr, unsigned long len); #endif @@ -95,7 +104,6 @@ static inline unsigned long hugetlb_total_pages(void) #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL -#define is_aligned_hugepage_range(addr, len) 0 #define prepare_hugepage_range(addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 -- cgit v1.2.3-70-g09d2
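Editor's note: the reservation behaviour introduced by the second patch above (b45b5bd6) is easiest to see from userspace. The sketch below is not part of the patch series; it only illustrates the guarantee described in that commit message: a MAP_SHARED mapping of a hugetlbfs file either reserves all of its huge pages at mmap() time (visible as the new HugePages_Rsvd line in /proc/meminfo) or fails immediately with ENOMEM, instead of the process dying at first fault. The /mnt/huge mount point, file name and 16MB length are assumptions for illustration only.

/* build: cc -o hugemap hugemap.c ; requires a mounted hugetlbfs and free huge pages */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#define LENGTH (16UL * 1024 * 1024) /* must be a multiple of the huge page size */

int main(void)
{
        int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* With the series applied, hugetlb_extend_reservation() runs here for
         * MAP_SHARED, so a shortage of huge pages is reported as ENOMEM now. */
        char *p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                close(fd);
                return 1;
        }

        /* Faulting the pages in later cannot exhaust the pool for this mapping;
         * they were accounted against the reserved count at mmap() time. */
        p[0] = 1;

        munmap(p, LENGTH);
        close(fd);
        unlink("/mnt/huge/example");
        return 0;
}

MAP_PRIVATE hugepage mappings are not covered by this reservation and, as the commit message notes, can still hit an out-of-memory condition at fault time.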