diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 142 |
1 files changed, 128 insertions, 14 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e91b81b6367..c0327380718 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2,7 +2,6 @@ * Generic hugetlb support. * (C) William Irwin, April 2004 */ -#include <linux/gfp.h> #include <linux/list.h> #include <linux/init.h> #include <linux/module.h> @@ -18,6 +17,10 @@ #include <linux/mutex.h> #include <linux/bootmem.h> #include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } +pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} + /* * Return the size of the pages allocated when backing a VMA. In the majority * cases this will be same size as used by the page table entries. @@ -465,11 +474,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; - struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are @@ -477,11 +488,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, */ if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err;; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { @@ -500,7 +511,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, break; } } +err: mpol_cond_put(mpol); + put_mems_allowed(); return page; } @@ -546,7 +559,9 @@ static void free_huge_page(struct page *page) mapping = (struct address_space *) page_private(page); set_page_private(page, 0); + page->mapping = NULL; BUG_ON(page_count(page)); + BUG_ON(page_mapcount(page)); INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); @@ -600,6 +615,8 @@ int PageHuge(struct page *page) return dtor == free_huge_page; } +EXPORT_SYMBOL_GPL(PageHuge); + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -1038,7 +1055,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page(h, vma, addr); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); - return ERR_PTR(-VM_FAULT_OOM); + return ERR_PTR(-VM_FAULT_SIGBUS); } } @@ -1515,10 +1532,9 @@ static struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; -static int __init hugetlb_sysfs_add_hstate(struct hstate *h, - struct kobject *parent, - struct kobject **hstate_kobjs, - struct attribute_group *hstate_attr_group) +static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, + struct kobject **hstate_kobjs, + struct attribute_group *hstate_attr_group) { int retval; int hi = h - hstates; @@ -2088,7 +2104,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, ptep); } } @@ -2125,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); + page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -2136,6 +2153,19 @@ nomem: return -ENOMEM; } +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { + return 1; + } else + return 0; +} + void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -2194,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pte_none(pte)) continue; + /* + * HWPoisoned hugepage is already unmapped and dropped reference + */ + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + continue; + page = pte_page(pte); if (pte_dirty(pte)) set_page_dirty(page); @@ -2203,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { + page_remove_rmap(page); list_del(&page->lru); put_page(page); } @@ -2268,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, return 1; } +/* + * Hugetlb_cow() should be called with page lock of the original hugepage held. + */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) @@ -2282,8 +2322,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_count(old_page) == 1); + avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { + if (PageAnon(old_page)) + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2334,6 +2376,13 @@ retry_avoidcopy: return -PTR_ERR(new_page); } + /* + * When the original hugepage is shared one, it does not have + * anon_vma prepared. + */ + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + copy_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -2345,11 +2394,19 @@ retry_avoidcopy: ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ + mmu_notifier_invalidate_range_start(mm, + address & huge_page_mask(h), + (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); + page_remove_rmap(old_page); + hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; + mmu_notifier_invalidate_range_end(mm, + address & huge_page_mask(h), + (address & huge_page_mask(h)) + huge_page_size(h)); } page_cache_release(new_page); page_cache_release(old_page); @@ -2448,8 +2505,29 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); - } else + page_dup_rmap(page); + } else { lock_page(page); + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + hugepage_add_new_anon_rmap(page, vma, address); + } + } else { + page_dup_rmap(page); + } + + /* + * Since memory error handler replaces pte into hwpoison swap entry + * at the time of error handling, a process which reserved but not have + * the mapping to the error hugepage does not have hwpoison swap entry. + * So we need to block accesses from such a process by checking + * PG_hwpoison bit here. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON; + goto backout_unlocked; } /* @@ -2501,10 +2579,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *page = NULL; struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + ptep = huge_pte_offset(mm, address); + if (ptep) { + entry = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON; + } + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; @@ -2542,6 +2628,17 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } + /* + * hugetlb_cow() requires page locks of pte_page(entry) and + * pagecache_page, so here we need take the former one + * when page != pagecache_page or !pagecache_page. + * Note that locking order is always pagecache_page -> page, + * so no worry about deadlock. + */ + page = pte_page(entry); + if (page != pagecache_page) + lock_page(page); + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) @@ -2559,7 +2656,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = pte_mkyoung(entry); if (huge_ptep_set_access_flags(vma, address, ptep, entry, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, ptep); out_page_table_lock: spin_unlock(&mm->page_table_lock); @@ -2568,6 +2665,7 @@ out_page_table_lock: unlock_page(pagecache_page); put_page(pagecache_page); } + unlock_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); @@ -2779,3 +2877,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_put_quota(inode->i_mapping, (chg - freed)); hugetlb_acct_memory(h, -(chg - freed)); } + +/* + * This function is called from memory failure code. + * Assume the caller holds page lock of the head page. + */ +void __isolate_hwpoisoned_huge_page(struct page *hpage) +{ + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + + spin_lock(&hugetlb_lock); + list_del(&hpage->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + spin_unlock(&hugetlb_lock); +} |