summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrea Arcangeli <aarcange@redhat.com>2011-01-13 15:47:06 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2011-01-13 17:32:45 -0800
commitce83d2174ea9c3d72d5821cf3ebc974e36391bf7 (patch)
tree5e31167f84110551cf00ed3335b2cc3af317b33a
parent0bbbc0b33d141f78a0d9218a54a47f50621220d3 (diff)
thp: allocate memory in khugepaged outside of mmap_sem write mode
This tries to be more friendly to filesystem in userland, with userland backends that allocate memory in the I/O paths and that could deadlock if khugepaged holds the mmap_sem write mode of the userland backend while allocating memory. Memory allocation may wait for writeback I/O completion from the daemon that may be blocked in the mmap_sem read mode if a page fault happens and the daemon wasn't using mlock for the memory required for the I/O submission and completion. Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/huge_memory.c56
1 files changed, 34 insertions, 22 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f6559e7711b..bce6e12140e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1664,9 +1664,9 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
- struct page **hpage)
+ struct page **hpage,
+ struct vm_area_struct *vma)
{
- struct vm_area_struct *vma;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd, _pmd;
@@ -1680,9 +1680,34 @@ static void collapse_huge_page(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
#ifndef CONFIG_NUMA
VM_BUG_ON(!*hpage);
+ new_page = *hpage;
#else
VM_BUG_ON(*hpage);
+ /*
+ * Allocate the page while the vma is still valid and under
+ * the mmap_sem read mode so there is no memory allocation
+ * later when we take the mmap_sem in write mode. This is more
+ * friendly behavior (OTOH it may actually hide bugs) to
+ * filesystems in userland with daemons allocating memory in
+ * the userland I/O paths. Allocating memory with the
+ * mmap_sem in read mode is good idea also to allow greater
+ * scalability.
+ */
+ new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+ if (unlikely(!new_page)) {
+ up_read(&mm->mmap_sem);
+ *hpage = ERR_PTR(-ENOMEM);
+ return;
+ }
#endif
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ up_read(&mm->mmap_sem);
+ put_page(new_page);
+ return;
+ }
+
+ /* after allocating the hugepage upgrade to mmap_sem write mode */
+ up_read(&mm->mmap_sem);
/*
* Prevent all access to pagetables with the exception of
@@ -1720,18 +1745,6 @@ static void collapse_huge_page(struct mm_struct *mm,
if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
goto out;
-#ifndef CONFIG_NUMA
- new_page = *hpage;
-#else
- new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
- if (unlikely(!new_page)) {
- *hpage = ERR_PTR(-ENOMEM);
- goto out;
- }
-#endif
- if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
- goto out_put_page;
-
anon_vma_lock(vma->anon_vma);
pte = pte_offset_map(pmd, address);
@@ -1759,7 +1772,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_unlock(&mm->page_table_lock);
anon_vma_unlock(vma->anon_vma);
mem_cgroup_uncharge_page(new_page);
- goto out_put_page;
+ goto out;
}
/*
@@ -1798,15 +1811,15 @@ static void collapse_huge_page(struct mm_struct *mm,
*hpage = NULL;
#endif
khugepaged_pages_collapsed++;
-out:
+out_up_write:
up_write(&mm->mmap_sem);
return;
-out_put_page:
+out:
#ifdef CONFIG_NUMA
put_page(new_page);
#endif
- goto out;
+ goto out_up_write;
}
static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1865,10 +1878,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret) {
- up_read(&mm->mmap_sem);
- collapse_huge_page(mm, address, hpage);
- }
+ if (ret)
+ /* collapse_huge_page will return with the mmap_sem released */
+ collapse_huge_page(mm, address, hpage, vma);
out:
return ret;
}