diff options
author | Andrea Arcangeli <aarcange@redhat.com> | 2011-01-13 15:47:06 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-01-13 17:32:45 -0800 |
commit | ce83d2174ea9c3d72d5821cf3ebc974e36391bf7 (patch) | |
tree | 5e31167f84110551cf00ed3335b2cc3af317b33a | |
parent | 0bbbc0b33d141f78a0d9218a54a47f50621220d3 (diff) |
thp: allocate memory in khugepaged outside of mmap_sem write mode
This tries to be more friendly to filesystem in userland, with userland
backends that allocate memory in the I/O paths and that could deadlock if
khugepaged holds the mmap_sem write mode of the userland backend while
allocating memory. Memory allocation may wait for writeback I/O
completion from the daemon that may be blocked in the mmap_sem read mode
if a page fault happens and the daemon wasn't using mlock for the memory
required for the I/O submission and completion.
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/huge_memory.c | 56 |
1 files changed, 34 insertions, 22 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f6559e7711b..bce6e12140e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1664,9 +1664,9 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, - struct page **hpage) + struct page **hpage, + struct vm_area_struct *vma) { - struct vm_area_struct *vma; pgd_t *pgd; pud_t *pud; pmd_t *pmd, _pmd; @@ -1680,9 +1680,34 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); #ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); + new_page = *hpage; #else VM_BUG_ON(*hpage); + /* + * Allocate the page while the vma is still valid and under + * the mmap_sem read mode so there is no memory allocation + * later when we take the mmap_sem in write mode. This is more + * friendly behavior (OTOH it may actually hide bugs) to + * filesystems in userland with daemons allocating memory in + * the userland I/O paths. Allocating memory with the + * mmap_sem in read mode is good idea also to allow greater + * scalability. + */ + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + up_read(&mm->mmap_sem); + *hpage = ERR_PTR(-ENOMEM); + return; + } #endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + put_page(new_page); + return; + } + + /* after allocating the hugepage upgrade to mmap_sem write mode */ + up_read(&mm->mmap_sem); /* * Prevent all access to pagetables with the exception of @@ -1720,18 +1745,6 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; -#ifndef CONFIG_NUMA - new_page = *hpage; -#else - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); - if (unlikely(!new_page)) { - *hpage = ERR_PTR(-ENOMEM); - goto out; - } -#endif - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) - goto out_put_page; - anon_vma_lock(vma->anon_vma); pte = pte_offset_map(pmd, address); @@ -1759,7 +1772,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); mem_cgroup_uncharge_page(new_page); - goto out_put_page; + goto out; } /* @@ -1798,15 +1811,15 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = NULL; #endif khugepaged_pages_collapsed++; -out: +out_up_write: up_write(&mm->mmap_sem); return; -out_put_page: +out: #ifdef CONFIG_NUMA put_page(new_page); #endif - goto out; + goto out_up_write; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -1865,10 +1878,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - up_read(&mm->mmap_sem); - collapse_huge_page(mm, address, hpage); - } + if (ret) + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma); out: return ret; } |