From ce83d2174ea9c3d72d5821cf3ebc974e36391bf7 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Thu, 13 Jan 2011 15:47:06 -0800
Subject: thp: allocate memory in khugepaged outside of mmap_sem write mode

This tries to be more friendly to filesystems in userland, with userland
backends that allocate memory in the I/O paths and that could deadlock if
khugepaged holds the mmap_sem write mode of the userland backend while
allocating memory. Memory allocation may wait for writeback I/O
completion from the daemon that may be blocked in the mmap_sem read mode
if a page fault happens and the daemon wasn't using mlock for the memory
required for the I/O submission and completion.

Signed-off-by: Andrea Arcangeli
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 56 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f6559e7711b..bce6e12140e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1664,9 +1664,9 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 
 static void collapse_huge_page(struct mm_struct *mm,
 			       unsigned long address,
-			       struct page **hpage)
+			       struct page **hpage,
+			       struct vm_area_struct *vma)
 {
-	struct vm_area_struct *vma;
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd, _pmd;
@@ -1680,9 +1680,34 @@ static void collapse_huge_page(struct mm_struct *mm,
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 #ifndef CONFIG_NUMA
 	VM_BUG_ON(!*hpage);
+	new_page = *hpage;
 #else
 	VM_BUG_ON(*hpage);
+	/*
+	 * Allocate the page while the vma is still valid and under
+	 * the mmap_sem read mode so there is no memory allocation
+	 * later when we take the mmap_sem in write mode. This is more
+	 * friendly behavior (OTOH it may actually hide bugs) to
+	 * filesystems in userland with daemons allocating memory in
+	 * the userland I/O paths. Allocating memory with the
+	 * mmap_sem in read mode is good idea also to allow greater
+	 * scalability.
+	 */
+	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+	if (unlikely(!new_page)) {
+		up_read(&mm->mmap_sem);
+		*hpage = ERR_PTR(-ENOMEM);
+		return;
+	}
 #endif
+	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+		up_read(&mm->mmap_sem);
+		put_page(new_page);
+		return;
+	}
+
+	/* after allocating the hugepage upgrade to mmap_sem write mode */
+	up_read(&mm->mmap_sem);
 
 	/*
 	 * Prevent all access to pagetables with the exception of
@@ -1720,18 +1745,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
 		goto out;
 
-#ifndef CONFIG_NUMA
-	new_page = *hpage;
-#else
-	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
-	if (unlikely(!new_page)) {
-		*hpage = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-#endif
-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
-		goto out_put_page;
-
 	anon_vma_lock(vma->anon_vma);
 
 	pte = pte_offset_map(pmd, address);
@@ -1759,7 +1772,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 		spin_unlock(&mm->page_table_lock);
 		anon_vma_unlock(vma->anon_vma);
 		mem_cgroup_uncharge_page(new_page);
-		goto out_put_page;
+		goto out;
 	}
 
 	/*
@@ -1798,15 +1811,15 @@ static void collapse_huge_page(struct mm_struct *mm,
 	*hpage = NULL;
 #endif
 	khugepaged_pages_collapsed++;
-out:
+out_up_write:
 	up_write(&mm->mmap_sem);
 	return;
 
-out_put_page:
+out:
 #ifdef CONFIG_NUMA
 	put_page(new_page);
 #endif
-	goto out;
+	goto out_up_write;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1865,10 +1878,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	ret = 1;
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
-	if (ret) {
-		up_read(&mm->mmap_sem);
-		collapse_huge_page(mm, address, hpage);
-	}
+	if (ret)
+		/* collapse_huge_page will return with the mmap_sem released */
+		collapse_huge_page(mm, address, hpage, vma);
 out:
 	return ret;
 }
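
For readers unfamiliar with the ordering the patch adopts, below is a minimal
userspace sketch of the same pattern: allocate while the lock is held only for
read, then drop it and take it for write, revalidating the state before
modifying anything. It is an illustration only, not kernel code; POSIX rwlocks
stand in for mmap_sem, and the helpers revalidate_state() and collapse() are
hypothetical.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Hypothetical stand-in for the vma/pgd/pud/pmd re-checks the kernel
 * performs after re-acquiring mmap_sem in write mode. */
static int revalidate_state(void)
{
	return 1;	/* pretend the mapping is still collapsible */
}

/* Called with map_lock held for read; returns with it released,
 * mirroring how collapse_huge_page() now consumes mmap_sem. */
static int collapse(void)
{
	void *new_page;

	/* Allocate while only the read lock is held, so writers are not
	 * stuck behind a potentially slow allocation. */
	new_page = malloc(2 * 1024 * 1024);
	if (!new_page) {
		pthread_rwlock_unlock(&map_lock);
		return -1;
	}

	/* "Upgrade" by dropping the read lock and taking the write lock;
	 * the state may have changed in between, so revalidate it before
	 * touching anything. */
	pthread_rwlock_unlock(&map_lock);
	pthread_rwlock_wrlock(&map_lock);

	if (!revalidate_state()) {
		pthread_rwlock_unlock(&map_lock);
		free(new_page);
		return 0;
	}

	/* ... modify the mapping under the write lock ... */

	pthread_rwlock_unlock(&map_lock);
	free(new_page);		/* the real code would install the page instead */
	return 1;
}

int main(void)
{
	pthread_rwlock_rdlock(&map_lock);	/* scan phase holds the read lock */
	printf("collapse() returned %d\n", collapse());
	return 0;
}

In the patch itself, the re-checks after up_read()/down_write() are the
existing vma and pmd lookups at the top of collapse_huge_page(); only the
hugepage allocation and the memcg charge move ahead of the write lock, which
is why those error paths now drop mmap_sem themselves and the exit labels are
reshuffled into out/out_up_write.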