Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--  mm/huge_memory.c  484
1 file changed, 446 insertions, 38 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5f902e20e8c..6001ee6347a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,15 @@
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
+#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
+#include <linux/migrate.h>
+
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
@@ -37,7 +40,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +163,77 @@ static int start_khugepaged(void)
return err;
}
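+/*
+ * Huge zero page: a single, lazily allocated PMD-sized page of zeroes that
+ * read-only anonymous faults can map instead of a private huge page. It is
+ * refcounted; a shrinker frees it once only its own reference is left.
+ */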
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+ unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+ return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+ struct page *zero_page;
+retry:
+ if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+ return ACCESS_ONCE(huge_zero_pfn);
+
+ zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ HPAGE_PMD_ORDER);
+ if (!zero_page) {
+ count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+ return 0;
+ }
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
+ preempt_disable();
+ if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+ preempt_enable();
+ __free_page(zero_page);
+ goto retry;
+ }
+
+ /* We take an additional reference here. It will be put back by the shrinker. */
+ atomic_set(&huge_zero_refcount, 2);
+ preempt_enable();
+ return ACCESS_ONCE(huge_zero_pfn);
+}
+
+static void put_huge_zero_page(void)
+{
+ /*
+ * The counter should never reach zero here; only the shrinker can put
+ * the last reference.
+ */
+ BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
+}
+
+static int shrink_huge_zero_page(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ if (!sc->nr_to_scan)
+ /* we can free the zero page only if the last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
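+ /*
+ * Asked to reclaim: free the zero page only if the shrinker's own
+ * reference is the last one, i.e. no pmd maps it any more.
+ */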
+ if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+ unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+ BUG_ON(zero_pfn == 0);
+ __free_page(__pfn_to_page(zero_pfn));
+ }
+
+ return 0;
+}
+
+static struct shrinker huge_zero_page_shrinker = {
+ .shrink = shrink_huge_zero_page,
+ .seeks = DEFAULT_SEEKS,
+};
+
#ifdef CONFIG_SYSFS
static ssize_t double_flag_show(struct kobject *kobj,
@@ -284,6 +359,20 @@ static ssize_t defrag_store(struct kobject *kobj,
static struct kobj_attribute defrag_attr =
__ATTR(defrag, 0644, defrag_show, defrag_store);
+static ssize_t use_zero_page_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static ssize_t use_zero_page_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static struct kobj_attribute use_zero_page_attr =
+ __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -305,6 +394,7 @@ static struct kobj_attribute debug_cow_attr =
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
+ &use_zero_page_attr.attr,
#ifdef CONFIG_DEBUG_VM
&debug_cow_attr.attr,
#endif
@@ -484,19 +574,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
if (unlikely(!*hugepage_kobj)) {
- printk(KERN_ERR "hugepage: failed kobject create\n");
+ printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
return -ENOMEM;
}
err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
if (err) {
- printk(KERN_ERR "hugepage: failed register hugeage group\n");
+ printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
goto delete_obj;
}
err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
if (err) {
- printk(KERN_ERR "hugepage: failed register hugeage group\n");
+ printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
goto remove_hp_group;
}
@@ -550,6 +640,8 @@ static int __init hugepage_init(void)
goto out;
}
+ register_shrinker(&huge_zero_page_shrinker);
+
/*
* By default disable transparent hugepages on smaller systems,
* where the extra memory used could hurt more than TLB overhead
@@ -599,7 +691,7 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
-static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
pmd = pmd_mkwrite(pmd);
@@ -678,6 +770,22 @@ static inline struct page *alloc_hugepage(int defrag)
}
#endif
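+/*
+ * Install a read-only huge pmd mapping the huge zero page at haddr and
+ * deposit the preallocated page table for a possible later split.
+ * Returns false if the pmd slot is no longer empty.
+ */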
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ unsigned long zero_pfn)
+{
+ pmd_t entry;
+ if (!pmd_none(*pmd))
+ return false;
+ entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(mm, haddr, pmd, entry);
+ pgtable_trans_huge_deposit(mm, pgtable);
+ mm->nr_ptes++;
+ return true;
+}
+
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags)
@@ -691,6 +799,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
+ if (!(flags & FAULT_FLAG_WRITE) &&
+ transparent_hugepage_use_zero_page()) {
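+ /*
+ * Read fault: map the shared huge zero page rather than
+ * allocating a private huge page; COW will allocate one later
+ * if the range is ever written to.
+ */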
+ pgtable_t pgtable;
+ unsigned long zero_pfn;
+ bool set;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ return VM_FAULT_OOM;
+ zero_pfn = get_huge_zero_page();
+ if (unlikely(!zero_pfn)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+ spin_lock(&mm->page_table_lock);
+ set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_pfn);
+ spin_unlock(&mm->page_table_lock);
+ if (!set) {
+ pte_free(mm, pgtable);
+ put_huge_zero_page();
+ }
+ return 0;
+ }
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
vma, haddr, numa_node_id(), 0);
if (unlikely(!page)) {
@@ -717,7 +849,8 @@ out:
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
@@ -755,6 +888,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_free(dst_mm, pgtable);
goto out_unlock;
}
+ /*
+ * mm->page_table_lock is enough to be sure that the huge zero pmd is not
+ * under splitting, since we never split the zero page itself, only the
+ * pmd into a page table.
+ */
+ if (is_huge_zero_pmd(pmd)) {
+ unsigned long zero_pfn;
+ bool set;
+ /*
+ * get_huge_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ zero_pfn = get_huge_zero_page();
+ set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ zero_pfn);
+ BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
+ ret = 0;
+ goto out_unlock;
+ }
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
spin_unlock(&src_mm->page_table_lock);
@@ -806,6 +959,80 @@ unlock:
spin_unlock(&mm->page_table_lock);
}
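+/*
+ * Write fault on the huge zero page when no huge page could be allocated:
+ * split the pmd into a page table, give the faulting 4k address a freshly
+ * zeroed page and map the remaining addresses with the small zero page.
+ */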
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ struct page *page;
+ int i, ret = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!page) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ clear_user_highpage(page, address);
+ __SetPageUptodate(page);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_free_page;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ if (haddr == (address & PAGE_MASK)) {
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, haddr);
+ } else {
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ spin_unlock(&mm->page_table_lock);
+ put_huge_zero_page();
+ inc_mm_counter(mm, MM_ANONPAGES);
+
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ ret |= VM_FAULT_WRITE;
+out:
+ return ret;
+out_free_page:
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ goto out;
+}
+
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -912,19 +1139,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
int ret = 0;
- struct page *page, *new_page;
+ struct page *page = NULL, *new_page;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
VM_BUG_ON(!vma->anon_vma);
+ haddr = address & HPAGE_PMD_MASK;
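+ /*
+ * COW of the huge zero page: there is no page to reuse, so skip
+ * straight to allocating a private huge page.
+ */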
+ if (is_huge_zero_pmd(orig_pmd))
+ goto alloc;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
VM_BUG_ON(!PageCompound(page) || !PageHead(page));
- haddr = address & HPAGE_PMD_MASK;
if (page_mapcount(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
@@ -936,7 +1165,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
get_page(page);
spin_unlock(&mm->page_table_lock);
-
+alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -946,24 +1175,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!new_page)) {
count_vm_event(THP_FAULT_FALLBACK);
- ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
- pmd, orig_pmd, page, haddr);
- if (ret & VM_FAULT_OOM)
- split_huge_page(page);
- put_page(page);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
+ address, pmd, orig_pmd, haddr);
+ } else {
+ ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+ pmd, orig_pmd, page, haddr);
+ if (ret & VM_FAULT_OOM)
+ split_huge_page(page);
+ put_page(page);
+ }
goto out;
}
count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
- split_huge_page(page);
- put_page(page);
+ if (page) {
+ split_huge_page(page);
+ put_page(page);
+ }
ret |= VM_FAULT_OOM;
goto out;
}
- copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ if (is_huge_zero_pmd(orig_pmd))
+ clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+ else
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
mmun_start = haddr;
@@ -971,7 +1210,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
- put_page(page);
+ if (page)
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(new_page);
@@ -979,14 +1219,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_mn;
} else {
pmd_t entry;
- VM_BUG_ON(!PageHead(page));
entry = mk_huge_pmd(new_page, vma);
pmdp_clear_flush(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
- page_remove_rmap(page);
- put_page(page);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ put_huge_zero_page();
+ } else {
+ VM_BUG_ON(!PageHead(page));
+ page_remove_rmap(page);
+ put_page(page);
+ }
ret |= VM_FAULT_WRITE;
}
spin_unlock(&mm->page_table_lock);
@@ -1044,6 +1289,81 @@ out:
return page;
}
+/* NUMA hinting page fault entry point for trans huge pmds */
+int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+ struct page *page;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int target_nid;
+ int current_nid = -1;
+ bool migrated;
+ bool page_locked = false;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp)))
+ goto out_unlock;
+
+ page = pmd_page(pmd);
+ get_page(page);
+ current_nid = page_to_nid(page);
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ target_nid = mpol_misplaced(page, vma, haddr);
+ if (target_nid == -1) {
+ put_page(page);
+ goto clear_pmdnuma;
+ }
+
+ /* Acquire the page lock to serialise THP migrations */
+ spin_unlock(&mm->page_table_lock);
+ lock_page(page);
+ page_locked = true;
+
+ /* Confirm the PMD did not change while the page lock was taken */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ put_page(page);
+ goto out_unlock;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ /* Migrate the THP to the requested node */
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr,
+ page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
+ else {
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ goto out_unlock;
+ }
+ goto clear_pmdnuma;
+ }
+
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
+
+clear_pmdnuma:
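+ /* Not migrating: clear the NUMA hint so this pmd stops refaulting. */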
+ pmd = pmd_mknonnuma(pmd);
+ set_pmd_at(mm, haddr, pmdp, pmd);
+ VM_BUG_ON(pmd_numa(*pmdp));
+ update_mmu_cache_pmd(vma, addr, pmdp);
+ if (page_locked)
+ unlock_page(page);
+
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ if (current_nid != -1)
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
@@ -1055,15 +1375,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
pgtable = pgtable_trans_huge_withdraw(tlb->mm);
orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
- page = pmd_page(orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- page_remove_rmap(page);
- VM_BUG_ON(page_mapcount(page) < 0);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON(!PageHead(page));
- tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
- tlb_remove_page(tlb, page);
+ if (is_huge_zero_pmd(orig_pmd)) {
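+ /*
+ * The huge zero page is not accounted in rmap or the anon
+ * counter; just drop this mapping's reference to it.
+ */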
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ put_huge_zero_page();
+ } else {
+ page = pmd_page(orig_pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON(!PageHead(page));
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ tlb_remove_page(tlb, page);
+ }
pte_free(tlb->mm, pgtable);
ret = 1;
}
@@ -1126,7 +1452,7 @@ out:
}
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot)
+ unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
@@ -1134,7 +1460,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
+ if (!prot_numa) {
+ entry = pmd_modify(entry, newprot);
+ BUG_ON(pmd_write(entry));
+ } else {
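+ /*
+ * NUMA hinting protection: mark private pmds so the next
+ * access takes a hinting fault and may trigger migration.
+ */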
+ struct page *page = pmd_page(*pmd);
+
+ /* only check non-shared pages */
+ if (page_mapcount(page) == 1 &&
+ !pmd_numa(*pmd)) {
+ entry = pmd_mknuma(entry);
+ }
+ }
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);
ret = 1;
@@ -1224,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,
* We can't temporarily set the pmd to null in order
* to split it, the pmd must remain marked huge at all
* times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->mutex to
+ * and it won't wait on the anon_vma->root->rwsem to
* serialize against split_huge_page*.
*/
pmdp_splitting_flush(vma, address, pmd);
@@ -1315,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
+ page_xchg_last_nid(page_tail, page_last_nid(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
@@ -1382,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,
BUG_ON(page_mapcount(page) != 1);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
+ if (pmd_numa(*pmd))
+ entry = pte_mknuma(entry);
pte = pte_offset_map(&_pmd, haddr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -1424,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,
return ret;
}
-/* must be called with anon_vma->root->mutex hold */
+/* must be called with anon_vma->root->rwsem held */
static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
@@ -1477,10 +1817,21 @@ int split_huge_page(struct page *page)
struct anon_vma *anon_vma;
int ret = 1;
+ BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
BUG_ON(!PageAnon(page));
- anon_vma = page_lock_anon_vma(page);
+
+ /*
+ * The caller does not necessarily hold an mmap_sem that would prevent
+ * the anon_vma from disappearing, so we first take a reference to it
+ * and then lock the anon_vma for write. This is similar to
+ * page_lock_anon_vma_read except the write lock is taken to serialise
+ * against parallel split or collapse operations.
+ */
+ anon_vma = page_get_anon_vma(page);
if (!anon_vma)
goto out;
+ anon_vma_lock_write(anon_vma);
+
ret = 0;
if (!PageCompound(page))
goto out_unlock;
@@ -1491,7 +1842,8 @@ int split_huge_page(struct page *page)
BUG_ON(PageCompound(page));
out_unlock:
- page_unlock_anon_vma(anon_vma);
+ anon_vma_unlock(anon_vma);
+ put_anon_vma(anon_vma);
out:
return ret;
}
@@ -1983,7 +2335,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (pmd_trans_huge(*pmd))
goto out;
- anon_vma_lock(vma->anon_vma);
+ anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
ptl = pte_lockptr(mm, pmd);
@@ -2336,19 +2688,65 @@ static int khugepaged(void *none)
return 0;
}
-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
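+/*
+ * Split a huge zero pmd in place: replace it with a page table of read-only
+ * ptes mapping the small zero page, then drop one huge zero page reference.
+ */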
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int i;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ put_huge_zero_page();
+}
+
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd)
{
struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ return;
+ }
+ if (is_huge_zero_pmd(*pmd)) {
+ __split_huge_zero_page_pmd(vma, haddr, pmd);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
split_huge_page(page);
@@ -2356,6 +2754,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
BUG_ON(pmd_trans_huge(*pmd));
}
+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd)
+{
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, address);
+ BUG_ON(vma == NULL);
+ split_huge_page_pmd(vma, address, pmd);
+}
+
static void split_huge_page_address(struct mm_struct *mm,
unsigned long address)
{
@@ -2370,7 +2778,7 @@ static void split_huge_page_address(struct mm_struct *mm,
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd_mm(mm, address, pmd);
}
void __vma_adjust_trans_huge(struct vm_area_struct *vma,