From d10e63f29488b0f312a443f9507ea9b6fd3c9090 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Oct 2012 14:16:31 +0200 Subject: mm: numa: Create basic numa page hinting infrastructure Note: This patch started as "mm/mpol: Create special PROT_NONE infrastructure" and preserves the basic idea but steals *very* heavily from "autonuma: numa hinting page faults entry points" for the actual fault handlers without the migration parts. The end result is barely recognisable as either patch so all Signed-off and Reviewed-bys are dropped. If Peter, Ingo and Andrea are ok with this version, I will re-add the signed-offs-by to reflect the history. In order to facilitate a lazy -- fault driven -- migration of pages, create a special transient PAGE_NUMA variant, we can then use the 'spurious' protection faults to drive our migrations from. The meaning of PAGE_NUMA depends on the architecture but on x86 it is effectively PROT_NONE. Actual PROT_NONE mappings will not generate these NUMA faults for the reason that the page fault code checks the permission on the VMA (and will throw a segmentation fault on actual PROT_NONE mappings), before it ever calls handle_mm_fault. [dhillf@gmail.com: Fix typo] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/linux/huge_mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/huge_mm.h') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b31cb7da034..a1d26a98c65 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -159,6 +159,10 @@ static inline struct page *compound_trans_head(struct page *page) } return page; } + +extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, + pmd_t pmd, pmd_t *pmdp); + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -195,6 +199,12 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, { return 0; } + +static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, + pmd_t pmd, pmd_t *pmdp) +{ +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ -- cgit v1.2.3-70-g09d2 From 4daae3b4b9e49b7e0935499a352f1c59d90287d2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 11:33:45 +0000 Subject: mm: mempolicy: Use _PAGE_NUMA to migrate pages Note: Based on "mm/mpol: Use special PROT_NONE to migrate pages" but sufficiently different that the signed-off-bys were dropped Combine our previous _PAGE_NUMA, mpol_misplaced and migrate_misplaced_page() pieces into an effective migrate on fault scheme. Note that (on x86) we rely on PROT_NONE pages being !present and avoid the TLB flush from try_to_unmap(TTU_MIGRATION). This greatly improves the page-migration performance. Based-on-work-by: Peter Zijlstra Signed-off-by: Mel Gorman --- include/linux/huge_mm.h | 9 +++++---- mm/huge_memory.c | 31 ++++++++++++++++++++++++++++--- mm/memory.c | 32 +++++++++++++++++++++++++++----- 3 files changed, 60 insertions(+), 12 deletions(-) (limited to 'include/linux/huge_mm.h') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a1d26a98c65..dabb5108d6c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -160,8 +160,8 @@ static inline struct page *compound_trans_head(struct page *page) return page; } -extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, - pmd_t pmd, pmd_t *pmdp); +extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) @@ -200,9 +200,10 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, return 0; } -static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, - pmd_t pmd, pmd_t *pmdp) +static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) { + return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f5f37630c54..5723b551c02 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1019,17 +1020,39 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, - pmd_t pmd, pmd_t *pmdp) +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) { - struct page *page; + struct page *page = NULL; unsigned long haddr = addr & HPAGE_PMD_MASK; + int target_nid; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); + get_page(page); + spin_unlock(&mm->page_table_lock); + + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) + goto clear_pmdnuma; + + /* + * Due to lacking code to migrate thp pages, we'll split + * (which preserves the special PROT_NONE) and re-take the + * fault on the normal pages. + */ + split_huge_page(page); + put_page(page); + return 0; + +clear_pmdnuma: + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); @@ -1037,6 +1060,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, out_unlock: spin_unlock(&mm->page_table_lock); + if (page) + put_page(page); return 0; } diff --git a/mm/memory.c b/mm/memory.c index e30616f2cc3..d52542680e1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -3451,8 +3452,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) { - struct page *page; + struct page *page = NULL; spinlock_t *ptl; + int current_nid, target_nid; /* * The "pte" at this point cannot be used safely without @@ -3465,8 +3467,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, */ ptl = pte_lockptr(mm, pmd); spin_lock(ptl); - if (unlikely(!pte_same(*ptep, pte))) - goto out_unlock; + if (unlikely(!pte_same(*ptep, pte))) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + pte = pte_mknonnuma(pte); set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); @@ -3477,8 +3482,25 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } -out_unlock: + get_page(page); + current_nid = page_to_nid(page); + target_nid = mpol_misplaced(page, vma, addr); pte_unmap_unlock(ptep, ptl); + if (target_nid == -1) { + /* + * Account for the fault against the current node if it not + * being replaced regardless of where the page is located. + */ + current_nid = numa_node_id(); + put_page(page); + goto out; + } + + /* Migrate to the requested node */ + if (migrate_misplaced_page(page, target_nid)) + current_nid = target_nid; + +out: return 0; } @@ -3655,7 +3677,7 @@ retry: barrier(); if (pmd_trans_huge(orig_pmd)) { if (pmd_numa(*pmd)) - return do_huge_pmd_numa_page(mm, address, + return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { -- cgit v1.2.3-70-g09d2 From 4b10e7d562c90d0a72f324832c26653947a07381 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Oct 2012 14:16:32 +0200 Subject: mm: mempolicy: Implement change_prot_numa() in terms of change_protection() This patch converts change_prot_numa() to use change_protection(). As pte_numa and friends check the PTE bits directly it is necessary for change_protection() to use pmd_mknuma(). Hence the required modifications to change_protection() are a little clumsy but the end result is that most of the numa page table helpers are just one or two instructions. Signed-off-by: Mel Gorman --- include/linux/huge_mm.h | 3 +- include/linux/mm.h | 4 +- mm/huge_memory.c | 14 ++++- mm/mempolicy.c | 137 +++++------------------------------------------- mm/mprotect.c | 72 +++++++++++++++++++------ 5 files changed, 85 insertions(+), 145 deletions(-) (limited to 'include/linux/huge_mm.h') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index dabb5108d6c..027ad04ef3a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -27,7 +27,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot); + unsigned long addr, pgprot_t newprot, + int prot_numa); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/include/linux/mm.h b/include/linux/mm.h index 471185e29ba..d04c2f0aab3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1080,7 +1080,7 @@ extern unsigned long do_mremap(unsigned long addr, unsigned long flags, unsigned long new_addr); extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable); + int dirty_accountable, int prot_numa); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); @@ -1552,7 +1552,7 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) #endif #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE -void change_prot_numa(struct vm_area_struct *vma, +unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5723b551c02..d79f7a55bf6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1147,7 +1147,7 @@ out: } int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; int ret = 0; @@ -1155,7 +1155,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); - entry = pmd_modify(entry, newprot); + if (!prot_numa) + entry = pmd_modify(entry, newprot); + else { + struct page *page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) == 1 && + !pmd_numa(*pmd)) { + entry = pmd_mknuma(entry); + } + } set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); ret = 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 51d3ebd8561..75d4600a5e9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -568,134 +568,23 @@ static inline int check_pgd_range(struct vm_area_struct *vma, #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE /* - * Here we search for not shared page mappings (mapcount == 1) and we - * set up the pmd/pte_numa on those mappings so the very next access - * will fire a NUMA hinting page fault. + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core. */ -static int -change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, *_pte; - struct page *page; - unsigned long _address, end; - spinlock_t *ptl; - int ret = 0; - - VM_BUG_ON(address & ~PAGE_MASK); - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto out; - - if (pmd_trans_huge_lock(pmd, vma) == 1) { - int page_nid; - ret = HPAGE_PMD_NR; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - - if (pmd_numa(*pmd)) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - page = pmd_page(*pmd); - - /* only check non-shared pages */ - if (page_mapcount(page) != 1) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - page_nid = page_to_nid(page); - - if (pmd_numa(*pmd)) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); - ret += HPAGE_PMD_NR; - /* defer TLB flush to lower the overhead */ - spin_unlock(&mm->page_table_lock); - goto out; - } - - if (pmd_trans_unstable(pmd)) - goto out; - VM_BUG_ON(!pmd_present(*pmd)); - - end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _address < end; - _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; - if (!pte_present(pteval)) - continue; - if (pte_numa(pteval)) - continue; - page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) - continue; - /* only check non-shared pages */ - if (page_mapcount(page) != 1) - continue; - - set_pte_at(mm, _address, _pte, pte_mknuma(pteval)); - - /* defer TLB flush to lower the overhead */ - ret++; - } - pte_unmap_unlock(pte, ptl); - - if (ret && !pmd_numa(*pmd)) { - spin_lock(&mm->page_table_lock); - set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); - spin_unlock(&mm->page_table_lock); - /* defer TLB flush to lower the overhead */ - } - -out: - return ret; -} - -/* Assumes mmap_sem is held */ -void -change_prot_numa(struct vm_area_struct *vma, - unsigned long address, unsigned long end) +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { - struct mm_struct *mm = vma->vm_mm; - int progress = 0; - - while (address < end) { - VM_BUG_ON(address < vma->vm_start || - address + PAGE_SIZE > vma->vm_end); + int nr_updated; + BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); - progress += change_prot_numa_range(mm, vma, address); - address = (address + PMD_SIZE) & PMD_MASK; - } + nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); - /* - * Flush the TLB for the mm to start the NUMA hinting - * page faults after we finish scanning this vma part - * if there were any PTE updates - */ - if (progress) { - mmu_notifier_invalidate_range_start(vma->vm_mm, address, end); - flush_tlb_range(vma, address, end); - mmu_notifier_invalidate_range_end(vma->vm_mm, address, end); - } + return nr_updated; } #else static unsigned long change_prot_numa(struct vm_area_struct *vma, diff --git a/mm/mprotect.c b/mm/mprotect.c index 7c3628a8b48..7ef6ae964e8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -35,10 +35,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { + struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; @@ -49,19 +50,39 @@ static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; + bool updated = false; ptent = ptep_modify_prot_start(mm, addr, pte); - ptent = pte_modify(ptent, newprot); + if (!prot_numa) { + ptent = pte_modify(ptent, newprot); + updated = true; + } else { + struct page *page; + + page = vm_normal_page(vma, addr, oldpte); + if (page) { + /* only check non-shared pages */ + if (!pte_numa(oldpte) && + page_mapcount(page) == 1) { + ptent = pte_mknuma(ptent); + updated = true; + } + } + } /* * Avoid taking write faults for pages we know to be * dirty. */ - if (dirty_accountable && pte_dirty(ptent)) + if (dirty_accountable && pte_dirty(ptent)) { ptent = pte_mkwrite(ptent); + updated = true; + } + + if (updated) + pages++; ptep_modify_prot_commit(mm, addr, pte, ptent); - pages++; } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -83,9 +104,25 @@ static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, return pages; } +#ifdef CONFIG_NUMA_BALANCING +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); +} +#else +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -97,7 +134,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma->vm_mm, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -105,8 +142,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * } if (pmd_none_or_clear_bad(pmd)) continue; - pages += change_pte_range(vma->vm_mm, pmd, addr, next, newprot, - dirty_accountable); + pages += change_pte_range(vma, pmd, addr, next, newprot, + dirty_accountable, prot_numa); + + if (prot_numa) + change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); return pages; @@ -114,7 +154,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -126,7 +166,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t * if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable); + dirty_accountable, prot_numa); } while (pud++, addr = next, addr != end); return pages; @@ -134,7 +174,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t * static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -150,7 +190,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; pages += change_pud_range(vma, pgd, addr, next, newprot, - dirty_accountable); + dirty_accountable, prot_numa); } while (pgd++, addr = next, addr != end); /* Only flush the TLB if we actually modified any entries: */ @@ -162,7 +202,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; unsigned long pages; @@ -171,7 +211,7 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else - pages = change_protection_range(vma, start, end, newprot, dirty_accountable); + pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); mmu_notifier_invalidate_range_end(mm, start, end); return pages; @@ -249,7 +289,7 @@ success: dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); -- cgit v1.2.3-70-g09d2 From 4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Dec 2012 19:56:50 +0000 Subject: mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable rmap_walk_anon() and try_to_unmap_anon() appears to be too careful about locking the anon vma: while it needs protection against anon vma list modifications, it does not need exclusive access to the list itself. Transforming this exclusive lock to a read-locked rwsem removes a global lock from the hot path of page-migration intense threaded workloads which can cause pathological performance like this: 96.43% process 0 [kernel.kallsyms] [k] perf_trace_sched_switch | --- perf_trace_sched_switch __schedule schedule schedule_preempt_disabled __mutex_lock_common.isra.6 __mutex_lock_slowpath mutex_lock | |--50.61%-- rmap_walk | move_to_new_page | migrate_pages | migrate_misplaced_page | __do_numa_page.isra.69 | handle_pte_fault | handle_mm_fault | __do_page_fault | do_page_fault | page_fault | __memset_sse2 | | | --100.00%-- worker_thread | | | --100.00%-- start_thread | --49.39%-- page_lock_anon_vma try_to_unmap_anon try_to_unmap migrate_pages migrate_misplaced_page __do_numa_page.isra.69 handle_pte_fault handle_mm_fault __do_page_fault do_page_fault page_fault __memset_sse2 | --100.00%-- worker_thread start_thread With this change applied the profile is now nicely flat and there's no anon-vma related scheduling/blocking. Rename anon_vma_[un]lock() => anon_vma_[un]lock_write(), to make it clearer that it's an exclusive write-lock in that case - suggested by Rik van Riel. Suggested-by: Linus Torvalds Cc: Peter Zijlstra Cc: Paul Turner Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Rik van Riel Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- include/linux/huge_mm.h | 2 +- include/linux/rmap.h | 17 ++++++++++++++--- mm/huge_memory.c | 6 +++--- mm/ksm.c | 6 +++--- mm/memory-failure.c | 4 ++-- mm/migrate.c | 2 +- mm/mmap.c | 2 +- mm/mremap.c | 2 +- mm/rmap.c | 48 ++++++++++++++++++++++++------------------------ 9 files changed, 50 insertions(+), 39 deletions(-) (limited to 'include/linux/huge_mm.h') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 027ad04ef3a..0d1208c0bdc 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -102,7 +102,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #define wait_split_huge_page(__anon_vma, __pmd) \ do { \ pmd_t *____pmd = (__pmd); \ - anon_vma_lock(__anon_vma); \ + anon_vma_lock_write(__anon_vma); \ anon_vma_unlock(__anon_vma); \ BUG_ON(pmd_trans_splitting(*____pmd) || \ pmd_trans_huge(*____pmd)); \ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index f3f41d242e2..c20635c527a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -118,7 +118,7 @@ static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) up_write(&anon_vma->root->rwsem); } -static inline void anon_vma_lock(struct anon_vma *anon_vma) +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { down_write(&anon_vma->root->rwsem); } @@ -128,6 +128,17 @@ static inline void anon_vma_unlock(struct anon_vma *anon_vma) up_write(&anon_vma->root->rwsem); } +static inline void anon_vma_lock_read(struct anon_vma *anon_vma) +{ + down_read(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) +{ + up_read(&anon_vma->root->rwsem); +} + + /* * anon_vma helper functions. */ @@ -220,8 +231,8 @@ int try_to_munlock(struct page *); /* * Called by memory-failure.c to kill processes. */ -struct anon_vma *page_lock_anon_vma(struct page *page); -void page_unlock_anon_vma(struct anon_vma *anon_vma); +struct anon_vma *page_lock_anon_vma_read(struct page *page); +void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index acd37fe55eb..a24c9cb9c83 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1549,7 +1549,7 @@ int split_huge_page(struct page *page) int ret = 1; BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) goto out; ret = 0; @@ -1562,7 +1562,7 @@ int split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); out: return ret; } @@ -2074,7 +2074,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; - anon_vma_lock(vma->anon_vma); + anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); diff --git a/mm/ksm.c b/mm/ksm.c index ae539f0b8aa..7fa37de1ee0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1634,7 +1634,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1688,7 +1688,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1741,7 +1741,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ddb68a169e4..f2cd830f66c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma(page); + av = page_lock_anon_vma_read(page); if (av == NULL) /* Not actually mapped anymore */ return; @@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma(av); + page_unlock_anon_vma_read(av); } /* diff --git a/mm/migrate.c b/mm/migrate.c index f24e9cc49cc..6e46485f014 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -754,7 +754,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ if (PageAnon(page)) { /* - * Only page_lock_anon_vma() understands the subtleties of + * Only page_lock_anon_vma_read() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. */ anon_vma = page_get_anon_vma(page); diff --git a/mm/mmap.c b/mm/mmap.c index 88408632da6..68a16b40c20 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -602,7 +602,7 @@ again: remove_next = 1 + (end > next->vm_end); if (anon_vma) { VM_BUG_ON(adjust_next && next->anon_vma && anon_vma != next->anon_vma); - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) anon_vma_interval_tree_pre_update_vma(next); diff --git a/mm/mremap.c b/mm/mremap.c index 1b61c2d3307..3dabd170753 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } if (vma->anon_vma) { anon_vma = vma->anon_vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); } } diff --git a/mm/rmap.c b/mm/rmap.c index 6e3ee3b8279..b0f612df7b9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* - * Synchronize against page_lock_anon_vma() such that + * Synchronize against page_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by - * mutex_trylock() from page_lock_anon_vma(). This orders: + * down_read_trylock() from page_lock_anon_vma_read(). This orders: * - * page_lock_anon_vma() VS put_anon_vma() - * mutex_trylock() atomic_dec_and_test() + * page_lock_anon_vma_read() VS put_anon_vma() + * down_read_trylock() atomic_dec_and_test() * LOCK MB - * atomic_read() mutex_is_locked() + * atomic_read() rwsem_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ if (rwsem_is_locked(&anon_vma->root->rwsem)) { - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_unlock(anon_vma); } @@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * allocate a new one. * * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma() + * optimistically looked up an anon_vma in page_lock_anon_vma_read() * and that may actually touch the spinlock even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). @@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) allocated = anon_vma; } - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { @@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); anon_vma_unlock(anon_vma); @@ -442,7 +442,7 @@ out: * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with page_get_anon_vma() and then block on the mutex. */ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma_read(struct page *page) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = ACCESS_ONCE(anon_vma->root); - if (down_write_trylock(&root_anon_vma->rwsem)) { + if (down_read_trylock(&root_anon_vma->rwsem)) { /* * If the page is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!page_mapped(page)) { - up_write(&root_anon_vma->rwsem); + up_read(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; @@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); - anon_vma_lock(anon_vma); + anon_vma_lock_read(anon_vma); if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because - * we'll deadlock on the anon_vma_lock() recursion. + * we'll deadlock on the anon_vma_lock_write() recursion. */ - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } @@ -504,9 +504,9 @@ out: return anon_vma; } -void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma_read(struct anon_vma *anon_vma) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } /* @@ -732,7 +732,7 @@ static int page_referenced_anon(struct page *page, struct anon_vma_chain *avc; int referenced = 0; - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) return referenced; @@ -754,7 +754,7 @@ static int page_referenced_anon(struct page *page, break; } - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); return referenced; } @@ -1474,7 +1474,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) return ret; @@ -1501,7 +1501,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) break; } - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); return ret; } @@ -1696,7 +1696,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, int ret = SWAP_AGAIN; /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma() + * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_sem. Users without mmap_sem are required to * take a reference count to prevent the anon_vma disappearing @@ -1704,7 +1704,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma = page_anon_vma(page); if (!anon_vma) return ret; - anon_vma_lock(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1712,7 +1712,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (ret != SWAP_AGAIN) break; } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); return ret; } -- cgit v1.2.3-70-g09d2