From 107437febd495a50e2cd09c81bbaa84d30e57b07 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 29 Apr 2014 15:36:15 -0400 Subject: mm/numa: Remove BUG_ON() in __handle_mm_fault() Changing PTEs and PMDs to pte_numa & pmd_numa is done with the mmap_sem held for reading, which means a pmd can be instantiated and turned into a numa one while __handle_mm_fault() is examining the value of old_pmd. If that happens, __handle_mm_fault() should just return and let the page fault retry, instead of throwing an oops. This is handled by the test for pmd_trans_huge(*pmd) below. Signed-off-by: Rik van Riel Reviewed-by: Naoya Horiguchi Reported-by: Sunil Pandey Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Mel Gorman Cc: linux-mm@kvack.org Cc: lwoodman@redhat.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20140429153615.2d72098e@annuminas.surriel.com Signed-off-by: Ingo Molnar --- mm/memory.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index d0f0bef3be4..9c2dc659f6f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3900,9 +3900,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } } - /* THP should already have been handled */ - BUG_ON(pmd_numa(*pmd)); - /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could -- cgit v1.2.3-70-g09d2 From c46a7c817e662a820373bb76b88d0ad67d6abe5d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:06:30 -0700 Subject: x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels _PAGE_NUMA is currently an alias of _PROT_PROTNONE to trap NUMA hinting faults on x86. Care is taken such that _PAGE_NUMA is used only in situations where the VMA flags distinguish between NUMA hinting faults and prot_none faults. This decision was x86-specific and conceptually it is difficult requiring special casing to distinguish between PROTNONE and NUMA ptes based on context. Fundamentally, we only need the _PAGE_NUMA bit to tell the difference between an entry that is really unmapped and a page that is protected for NUMA hinting faults as if the PTE is not present then a fault will be trapped. Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the offset. This patch shrinks the maximum possible swap size and uses the bit to uniquely distinguish between NUMA hinting ptes and swap ptes. Signed-off-by: Mel Gorman Cc: David Vrabel Cc: Ingo Molnar Cc: Peter Anvin Cc: Fengguang Wu Cc: Linus Torvalds Cc: Steven Noonan Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Srikar Dronamraju Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/pgtable.h | 6 ++++ arch/x86/include/asm/pgtable.h | 15 +++++--- arch/x86/include/asm/pgtable_64.h | 8 +++++ arch/x86/include/asm/pgtable_types.h | 66 +++++++++++++++++++----------------- arch/x86/mm/pageattr-test.c | 2 +- include/asm-generic/pgtable.h | 8 +++-- include/linux/swapops.h | 2 +- mm/memory.c | 17 ++++------ 8 files changed, 75 insertions(+), 49 deletions(-) (limited to 'mm/memory.c') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 3ebb188c3ff..d98c1ecc326 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -44,6 +44,12 @@ static inline int pte_present(pte_t pte) return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA); } +#define pte_present_nonuma pte_present_nonuma +static inline int pte_present_nonuma(pte_t pte) +{ + return pte_val(pte) & (_PAGE_PRESENT); +} + #define pte_numa pte_numa static inline int pte_numa(pte_t pte) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b459ddf27d6..66276c1d23b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - return pte_flags(pte) & _PAGE_SPECIAL; + return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) == + (_PAGE_PRESENT|_PAGE_SPECIAL); } static inline unsigned long pte_pfn(pte_t pte) @@ -452,6 +453,12 @@ static inline int pte_present(pte_t a) _PAGE_NUMA); } +#define pte_present_nonuma pte_present_nonuma +static inline int pte_present_nonuma(pte_t a) +{ + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); +} + #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { @@ -860,19 +867,19 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - VM_BUG_ON(pte_present(pte)); + VM_BUG_ON(pte_present_nonuma(pte)); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present(pte)); + VM_BUG_ON(pte_present_nonuma(pte)); return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present(pte)); + VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e22c1dbf7fe..6d6ecd09883 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -145,8 +145,16 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#ifdef CONFIG_NUMA_BALANCING +/* Automatic NUMA balancing needs to be distinguishable from swap entries */ +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) +#else #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#endif #else +#ifdef CONFIG_NUMA_BALANCING +#error Incompatible format for automatic NUMA balancing +#endif #define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) #endif diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index eb3d4494513..f216963760e 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -16,15 +16,26 @@ #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ -#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ -#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ -#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ +#define _PAGE_BIT_SOFTW1 9 /* available for programmer */ +#define _PAGE_BIT_SOFTW2 10 /* " */ +#define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ -#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 -#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 -#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ +#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ +#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */ +#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +/* + * Swap offsets on configurations that allow automatic NUMA balancing use the + * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from + * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the + * maximum possible swap space from 16TB to 8TB. + */ +#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) + /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -40,7 +51,7 @@ #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) -#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) @@ -61,14 +72,27 @@ * they do not conflict with each other. */ -#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN - #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +/* + * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page + * that is not present. The hinting fault gathers numa placement statistics + * (see pte_numa()). The bit is always zero when the PTE is not present. + * + * The bit picked must be always zero when the pmd is present and not + * present, so that we don't lose information when we set it while + * atomically clearing the present bit. + */ +#ifdef CONFIG_NUMA_BALANCING +#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) +#else +#define _PAGE_NUMA (_AT(pteval_t, 0)) +#endif + /* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict @@ -94,26 +118,6 @@ #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -/* - * _PAGE_NUMA indicates that this page will trigger a numa hinting - * minor page fault to gather numa placement statistics (see - * pte_numa()). The bit picked (8) is within the range between - * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't - * require changes to the swp entry format because that bit is always - * zero when the pte is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - * - * Because we shared the same bit (8) with _PAGE_PROTNONE this can be - * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE - * couldn't reach, like handle_mm_fault() (see access_error in - * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for - * handle_mm_fault() to be invoked). - */ -#define _PAGE_NUMA _PAGE_PROTNONE - #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ @@ -122,8 +126,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) + _PAGE_SOFT_DIRTY | _PAGE_NUMA) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_WB (0) diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 461bc828902..6629f397b46 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -35,7 +35,7 @@ enum { static int pte_testbit(pte_t pte) { - return pte_flags(pte) & _PAGE_UNUSED1; + return pte_flags(pte) & _PAGE_SOFTW1; } struct split_state { diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a8015a7a55b..53b2acc3821 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -233,6 +233,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) # define pte_accessible(mm, pte) ((void)(pte), 1) #endif +#ifndef pte_present_nonuma +#define pte_present_nonuma(pte) pte_present(pte) +#endif + #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -670,7 +674,7 @@ static inline int pmd_trans_unstable(pmd_t *pmd) static inline int pte_numa(pte_t pte) { return (pte_flags(pte) & - (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; + (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; } #endif @@ -678,7 +682,7 @@ static inline int pte_numa(pte_t pte) static inline int pmd_numa(pmd_t pmd) { return (pmd_flags(pmd) & - (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; + (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; } #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c0f75261a72..6adfb7bfbf4 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); + return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte); } #endif diff --git a/mm/memory.c b/mm/memory.c index e302ae1dcce..0897830011f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -756,7 +756,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte))) + if (likely(!pte_special(pte) || pte_numa(pte))) goto check_pfn; if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; @@ -782,14 +782,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } - if (is_zero_pfn(pfn)) - return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } + if (is_zero_pfn(pfn)) + return NULL; + /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. @@ -1722,13 +1723,9 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); /* - * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault - * would be called on PROT_NONE ranges. We must never invoke - * handle_mm_fault on PROT_NONE ranges or the NUMA hinting - * page faults would unprotect the PROT_NONE ranges if - * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd - * bitflag. So to avoid that, don't set FOLL_NUMA if - * FOLL_FORCE is set. + * If FOLL_FORCE is set then do not force a full fault as the hinting + * fault information is unrelated to the reference behaviour of a task + * using the address space */ if (!(gup_flags & FOLL_FORCE)) gup_flags |= FOLL_NUMA; -- cgit v1.2.3-70-g09d2 From 4bbd4c776a63a063546552de42f6a535395f6d9e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:10 -0700 Subject: mm: move get_user_pages()-related code to separate file mm/memory.c is overloaded: over 4k lines. get_user_pages() code is pretty much self-contained let's move it to separate file. No other changes made. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 2 +- mm/gup.c | 649 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/internal.h | 5 + mm/memory.c | 641 --------------------------------------------------------- 4 files changed, 655 insertions(+), 642 deletions(-) create mode 100644 mm/gup.c (limited to 'mm/memory.c') diff --git a/mm/Makefile b/mm/Makefile index 0173940407f..4064f3ec145 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/gup.c b/mm/gup.c new file mode 100644 index 00000000000..ea88b65f264 --- /dev/null +++ b/mm/gup.c @@ -0,0 +1,649 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + goto out; + } + + page = NULL; + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + goto no_page_table; + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + if (flags & FOLL_GET) + goto out; + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto no_page_table; + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. + */ + if (PageHead(page)) + get_page(page); + else { + page = NULL; + goto out; + } + } + goto out; + } + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto no_page_table; + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(vma, address, pmd); + goto split_fallthrough; + } + ptl = pmd_lock(mm, pmd); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(ptl); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(vma, address, + pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + goto out; + } + } else + spin_unlock(ptl); + /* fall through */ + } +split_fallthrough: + if (unlikely(pmd_bad(*pmd))) + goto no_page_table; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + + pte = *ptep; + if (!pte_present(pte)) { + swp_entry_t entry; + /* + * KSM's break_ksm() relies upon recognizing a ksm page + * even while it is being migrated, so for that case we + * need migration_entry_wait(). + */ + if (likely(!(flags & FOLL_MIGRATION))) + goto no_page; + if (pte_none(pte) || pte_file(pte)) + goto no_page; + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto no_page; + pte_unmap_unlock(ptep, ptl); + migration_entry_wait(mm, pmd, address); + goto split_fallthrough; + } + if ((flags & FOLL_NUMA) && pte_numa(pte)) + goto no_page; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) { + if ((flags & FOLL_DUMP) || + !is_zero_pfn(pte_pfn(pte))) + goto bad_page; + page = pte_page(pte); + } + + if (flags & FOLL_GET) + get_page_foll(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + /* + * pte_mkyoung() would be more correct here, but atomic care + * is needed to avoid losing the dirty bit: it is easier to use + * mark_page_accessed(). + */ + mark_page_accessed(page); + } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here, and migration is + * blocked by the pte's page reference, and we + * know the page is still mapped, we don't even + * need to check for file-cache page truncation. + */ + mlock_vma_page(page); + unlock_page(page); + } + } +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return page; + +bad_page: + pte_unmap_unlock(ptep, ptl); + return ERR_PTR(-EFAULT); + +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return page; + +no_page_table: + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. + */ + if ((flags & FOLL_DUMP) && + (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); + return page; +} + +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return stack_guard_page_start(vma, addr) || + stack_guard_page_end(vma, addr+PAGE_SIZE); +} + +/** + * __get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @nonblocking: whether waiting for disk IO or mmap_sem contention + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @nonblocking != NULL, __get_user_pages will not wait for disk IO + * or mmap_sem contention, and if waiting is needed to pin all pages, + * *@nonblocking will be set to 0. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags. + */ +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) +{ + long i; + unsigned long vm_flags; + unsigned int page_mask; + + if (!nr_pages) + return 0; + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + + /* + * If FOLL_FORCE is set then do not force a full fault as the hinting + * fault information is unrelated to the reference behaviour of a task + * using the address space + */ + if (!(gup_flags & FOLL_FORCE)) + gup_flags |= FOLL_NUMA; + + i = 0; + + do { + struct vm_area_struct *vma; + + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(mm, start)) { + unsigned long pg = start & PAGE_MASK; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* user gate pages are read-only */ + if (gup_flags & FOLL_WRITE) + goto efault; + if (pg > TASK_SIZE) + pgd = pgd_offset_k(pg); + else + pgd = pgd_offset_gate(mm, pg); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, pg); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, pg); + if (pmd_none(*pmd)) + goto efault; + VM_BUG_ON(pmd_trans_huge(*pmd)); + pte = pte_offset_map(pmd, pg); + if (pte_none(*pte)) { + pte_unmap(pte); + goto efault; + } + vma = get_gate_vma(mm); + if (pages) { + struct page *page; + + page = vm_normal_page(vma, start, *pte); + if (!page) { + if (!(gup_flags & FOLL_DUMP) && + is_zero_pfn(pte_pfn(*pte))) + page = pte_page(*pte); + else { + pte_unmap(pte); + goto efault; + } + } + pages[i] = page; + get_page(page); + } + pte_unmap(pte); + page_mask = 0; + goto next_page; + } + + if (!vma) + goto efault; + vm_flags = vma->vm_flags; + if (vm_flags & (VM_IO | VM_PFNMAP)) + goto efault; + + if (gup_flags & FOLL_WRITE) { + if (!(vm_flags & VM_WRITE)) { + if (!(gup_flags & FOLL_FORCE)) + goto efault; + /* + * We used to let the write,force case do COW + * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so + * ptrace could set a breakpoint in a read-only + * mapping of an executable, without corrupting + * the file (yet only when that file had been + * opened for writing!). Anon pages in shared + * mappings are surprising: now just reject it. + */ + if (!is_cow_mapping(vm_flags)) { + WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + goto efault; + } + } + } else { + if (!(vm_flags & VM_READ)) { + if (!(gup_flags & FOLL_FORCE)) + goto efault; + /* + * Is there actually any vma we can reach here + * which does not have VM_MAYREAD set? + */ + if (!(vm_flags & VM_MAYREAD)) + goto efault; + } + } + + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &nr_pages, i, gup_flags); + continue; + } + + do { + struct page *page; + unsigned int foll_flags = gup_flags; + unsigned int page_increm; + + /* + * If we have a pending SIGKILL, don't keep faulting + * pages and potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) + return i ? i : -ERESTARTSYS; + + cond_resched(); + while (!(page = follow_page_mask(vma, start, + foll_flags, &page_mask))) { + int ret; + unsigned int fault_flags = 0; + + /* For mlock, just skip the stack guard page. */ + if (foll_flags & FOLL_MLOCK) { + if (stack_guard_page(vma, start)) + goto next_page; + } + if (foll_flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (foll_flags & FOLL_NOWAIT) + fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); + + ret = handle_mm_fault(mm, vma, start, + fault_flags); + + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return i ? i : -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (i) + return i; + else if (gup_flags & FOLL_HWPOISON) + return -EHWPOISON; + else + return -EFAULT; + } + if (ret & VM_FAULT_SIGBUS) + goto efault; + BUG(); + } + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + return i; + } + + /* + * The VM_FAULT_WRITE bit tells us that + * do_wp_page has broken COW when necessary, + * even if maybe_mkwrite decided not to set + * pte_write. We can thus safely do subsequent + * page lookups as if they were reads. But only + * do so when looping for pte_write is futile: + * in some cases userspace may also be wanting + * to write to the gotten user page, which a + * read fault here might prevent (a readonly + * page might get reCOWed by userspace write). + */ + if ((ret & VM_FAULT_WRITE) && + !(vma->vm_flags & VM_WRITE)) + foll_flags &= ~FOLL_WRITE; + + cond_resched(); + } + if (IS_ERR(page)) + return i ? i : PTR_ERR(page); + if (pages) { + pages[i] = page; + + flush_anon_page(vma, page, start); + flush_dcache_page(page); + page_mask = 0; + } +next_page: + if (vmas) { + vmas[i] = vma; + page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; + } while (nr_pages && start < vma->vm_end); + } while (nr_pages); + return i; +efault: + return i ? : -EFAULT; +} +EXPORT_SYMBOL(__get_user_pages); + +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * handle_mm_fault() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mm_sem held for read. + */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + vm_flags_t vm_flags; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + if (!(vm_flags & vma->vm_flags)) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* + * get_user_pages() - pin user pages in memory + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force access even when user mapping is currently + * protected (but never forces write access to shared mapping). + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, int write, + int force, struct page **pages, struct vm_area_struct **vmas) +{ + int flags = FOLL_TOUCH; + + if (pages) + flags |= FOLL_GET; + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); +} +EXPORT_SYMBOL(get_user_pages); + +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by page_cache_release() or put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ diff --git a/mm/internal.h b/mm/internal.h index 07b67361a40..6ee580d69dd 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -169,6 +169,11 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +static inline bool is_cow_mapping(vm_flags_t flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); diff --git a/mm/memory.c b/mm/memory.c index 0897830011f..7049d394fa0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -698,11 +698,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } -static inline bool is_cow_mapping(vm_flags_t flags) -{ - return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; -} - /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * @@ -1458,642 +1453,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). - */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - goto out; - } - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto no_page_table; - - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - goto no_page_table; - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - goto out; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - goto out; - } - if (unlikely(pud_bad(*pud))) - goto no_page_table; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto no_page_table; - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. The caller should handle a NULL - * return when trying to follow tail pages. - */ - if (PageHead(page)) - get_page(page); - else { - page = NULL; - goto out; - } - } - goto out; - } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) - goto no_page_table; - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - goto split_fallthrough; - } - ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - goto out; - } - } else - spin_unlock(ptl); - /* fall through */ - } -split_fallthrough: - if (unlikely(pmd_bad(*pmd))) - goto no_page_table; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - - pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). - */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte) || pte_file(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto split_fallthrough; - } - if ((flags & FOLL_NUMA) && pte_numa(pte)) - goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - - page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { - if ((flags & FOLL_DUMP) || - !is_zero_pfn(pte_pfn(pte))) - goto bad_page; - page = pte_page(pte); - } - - if (flags & FOLL_GET) - get_page_foll(page); - if (flags & FOLL_TOUCH) { - if ((flags & FOLL_WRITE) && - !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); - /* - * pte_mkyoung() would be more correct here, but atomic care - * is needed to avoid losing the dirty bit: it is easier to use - * mark_page_accessed(). - */ - mark_page_accessed(page); - } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * The preliminary mapping check is mainly to avoid the - * pointless overhead of lock_page on the ZERO_PAGE - * which might bounce very badly if there is contention. - * - * If the page is already locked, we don't need to - * handle it now - vmscan will handle it later if and - * when it attempts to reclaim the page. - */ - if (page->mapping && trylock_page(page)) { - lru_add_drain(); /* push cached pages to LRU */ - /* - * Because we lock page here, and migration is - * blocked by the pte's page reference, and we - * know the page is still mapped, we don't even - * need to check for file-cache page truncation. - */ - mlock_vma_page(page); - unlock_page(page); - } - } -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return page; - -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - -no_page: - pte_unmap_unlock(ptep, ptl); - if (!pte_none(pte)) - return page; - -no_page_table: - /* - * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate unnecessary pages or - * page tables. Return error instead of NULL to skip handle_mm_fault, - * then get_dump_page() will return NULL to leave a hole in the dump. - * But we can only make this optimization where a hole would surely - * be zero-filled if handle_mm_fault() actually did handle it. - */ - if ((flags & FOLL_DUMP) && - (!vma->vm_ops || !vma->vm_ops->fault)) - return ERR_PTR(-EFAULT); - return page; -} - -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return stack_guard_page_start(vma, addr) || - stack_guard_page_end(vma, addr+PAGE_SIZE); -} - -/** - * __get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * @nonblocking: whether waiting for disk IO or mmap_sem contention - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * __get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * __get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If - * the page is written to, set_page_dirty (or set_page_dirty_lock, as - * appropriate) must be called after the page is finished with, and - * before put_page is called. - * - * If @nonblocking != NULL, __get_user_pages will not wait for disk IO - * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. - * - * In most cases, get_user_pages or get_user_pages_fast should be used - * instead of __get_user_pages. __get_user_pages should be used only if - * you need some special @gup_flags. - */ -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) -{ - long i; - unsigned long vm_flags; - unsigned int page_mask; - - if (!nr_pages) - return 0; - - VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); - - /* - * If FOLL_FORCE is set then do not force a full fault as the hinting - * fault information is unrelated to the reference behaviour of a task - * using the address space - */ - if (!(gup_flags & FOLL_FORCE)) - gup_flags |= FOLL_NUMA; - - i = 0; - - do { - struct vm_area_struct *vma; - - vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(mm, start)) { - unsigned long pg = start & PAGE_MASK; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* user gate pages are read-only */ - if (gup_flags & FOLL_WRITE) - goto efault; - if (pg > TASK_SIZE) - pgd = pgd_offset_k(pg); - else - pgd = pgd_offset_gate(mm, pg); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, pg); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, pg); - if (pmd_none(*pmd)) - goto efault; - VM_BUG_ON(pmd_trans_huge(*pmd)); - pte = pte_offset_map(pmd, pg); - if (pte_none(*pte)) { - pte_unmap(pte); - goto efault; - } - vma = get_gate_vma(mm); - if (pages) { - struct page *page; - - page = vm_normal_page(vma, start, *pte); - if (!page) { - if (!(gup_flags & FOLL_DUMP) && - is_zero_pfn(pte_pfn(*pte))) - page = pte_page(*pte); - else { - pte_unmap(pte); - goto efault; - } - } - pages[i] = page; - get_page(page); - } - pte_unmap(pte); - page_mask = 0; - goto next_page; - } - - if (!vma) - goto efault; - vm_flags = vma->vm_flags; - if (vm_flags & (VM_IO | VM_PFNMAP)) - goto efault; - - if (gup_flags & FOLL_WRITE) { - if (!(vm_flags & VM_WRITE)) { - if (!(gup_flags & FOLL_FORCE)) - goto efault; - /* - * We used to let the write,force case do COW - * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so - * ptrace could set a breakpoint in a read-only - * mapping of an executable, without corrupting - * the file (yet only when that file had been - * opened for writing!). Anon pages in shared - * mappings are surprising: now just reject it. - */ - if (!is_cow_mapping(vm_flags)) { - WARN_ON_ONCE(vm_flags & VM_MAYWRITE); - goto efault; - } - } - } else { - if (!(vm_flags & VM_READ)) { - if (!(gup_flags & FOLL_FORCE)) - goto efault; - /* - * Is there actually any vma we can reach here - * which does not have VM_MAYREAD set? - */ - if (!(vm_flags & VM_MAYREAD)) - goto efault; - } - } - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, gup_flags); - continue; - } - - do { - struct page *page; - unsigned int foll_flags = gup_flags; - unsigned int page_increm; - - /* - * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory. - */ - if (unlikely(fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; - - cond_resched(); - while (!(page = follow_page_mask(vma, start, - foll_flags, &page_mask))) { - int ret; - unsigned int fault_flags = 0; - - /* For mlock, just skip the stack guard page. */ - if (foll_flags & FOLL_MLOCK) { - if (stack_guard_page(vma, start)) - goto next_page; - } - if (foll_flags & FOLL_WRITE) - fault_flags |= FAULT_FLAG_WRITE; - if (nonblocking) - fault_flags |= FAULT_FLAG_ALLOW_RETRY; - if (foll_flags & FOLL_NOWAIT) - fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); - - ret = handle_mm_fault(mm, vma, start, - fault_flags); - - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return i ? i : -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | - VM_FAULT_HWPOISON_LARGE)) { - if (i) - return i; - else if (gup_flags & FOLL_HWPOISON) - return -EHWPOISON; - else - return -EFAULT; - } - if (ret & VM_FAULT_SIGBUS) - goto efault; - BUG(); - } - - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - - if (ret & VM_FAULT_RETRY) { - if (nonblocking) - *nonblocking = 0; - return i; - } - - /* - * The VM_FAULT_WRITE bit tells us that - * do_wp_page has broken COW when necessary, - * even if maybe_mkwrite decided not to set - * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. But only - * do so when looping for pte_write is futile: - * in some cases userspace may also be wanting - * to write to the gotten user page, which a - * read fault here might prevent (a readonly - * page might get reCOWed by userspace write). - */ - if ((ret & VM_FAULT_WRITE) && - !(vma->vm_flags & VM_WRITE)) - foll_flags &= ~FOLL_WRITE; - - cond_resched(); - } - if (IS_ERR(page)) - return i ? i : PTR_ERR(page); - if (pages) { - pages[i] = page; - - flush_anon_page(vma, page, start); - flush_dcache_page(page); - page_mask = 0; - } -next_page: - if (vmas) { - vmas[i] = vma; - page_mask = 0; - } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); - if (page_increm > nr_pages) - page_increm = nr_pages; - i += page_increm; - start += page_increm * PAGE_SIZE; - nr_pages -= page_increm; - } while (nr_pages && start < vma->vm_end); - } while (nr_pages); - return i; -efault: - return i ? : -EFAULT; -} -EXPORT_SYMBOL(__get_user_pages); - -/* - * fixup_user_fault() - manually resolve a user page fault - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @address: user address - * @fault_flags:flags to pass down to handle_mm_fault() - * - * This is meant to be called in the specific scenario where for locking reasons - * we try to access user memory in atomic context (within a pagefault_disable() - * section), this returns -EFAULT, and we want to resolve the user fault before - * trying again. - * - * Typically this is meant to be used by the futex code. - * - * The main difference with get_user_pages() is that this function will - * unconditionally call handle_mm_fault() which will in turn perform all the - * necessary SW fixup of the dirty and young bits in the PTE, while - * handle_mm_fault() only guarantees to update these in the struct page. - * - * This is important for some architectures where those bits also gate the - * access permission to the page because they are maintained in software. On - * such architectures, gup() will not be enough to make a subsequent access - * succeed. - * - * This should be called with the mm_sem held for read. - */ -int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags) -{ - struct vm_area_struct *vma; - vm_flags_t vm_flags; - int ret; - - vma = find_extend_vma(mm, address); - if (!vma || address < vma->vm_start) - return -EFAULT; - - vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; - if (!(vm_flags & vma->vm_flags)) - return -EFAULT; - - ret = handle_mm_fault(mm, vma, address, fault_flags); - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return -EHWPOISON; - if (ret & VM_FAULT_SIGBUS) - return -EFAULT; - BUG(); - } - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - return 0; -} - -/* - * get_user_pages() - pin user pages in memory - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to by the caller - * @force: whether to force access even when user mapping is currently - * protected (but never forces write access to shared mapping). - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. - * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. - * - * See also get_user_pages_fast, for performance critical applications. - */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, int write, - int force, struct page **pages, struct vm_area_struct **vmas) -{ - int flags = FOLL_TOUCH; - - if (pages) - flags |= FOLL_GET; - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); -} -EXPORT_SYMBOL(get_user_pages); - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by page_cache_release() or put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_sem, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ - struct vm_area_struct *vma; - struct page *page; - - if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) - return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; -} -#endif /* CONFIG_ELF_CORE */ - pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { -- cgit v1.2.3-70-g09d2 From 850e9c69ca75f32aa9361a0edec6cad388a231b0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:10:45 -0700 Subject: mm: fix typo in comment in do_fault_around() Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 7049d394fa0..e7ccbac25b7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2832,7 +2832,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, /* * max_pgoff is either end of page table or end of vma - * or fault_around_pages() from pgoff, depending what is neast. + * or fault_around_pages() from pgoff, depending what is nearest. */ max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; -- cgit v1.2.3-70-g09d2 From a9b0f8618d46ba027243b8ecb5c2468a7112d235 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:10:54 -0700 Subject: mm: nominate faultaround area in bytes rather than page order There is evidencs that the faultaround feature is less relevant on architectures with page size bigger then 4k. Which makes sense since page fault overhead per byte of mapped area should be less there. Let's rework the feature to specify faultaround area in bytes instead of page order. It's 64 kilobytes for now. The patch effectively disables faultaround on architectures with page size >= 64k (like ppc64). It's possible that some other size of faultaround area is relevant for a platform. We can expose `fault_around_bytes' variable to arch-specific code once such platforms will be found. Signed-off-by: Kirill A. Shutemov Cc: Rusty Russell Cc: Hugh Dickins Cc: Madhavan Srinivasan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 62 +++++++++++++++++++++++-------------------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index e7ccbac25b7..62a08a7badc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2758,63 +2758,47 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } -#define FAULT_AROUND_ORDER 4 +static unsigned long fault_around_bytes = 65536; + +static inline unsigned long fault_around_pages(void) +{ + return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK; +} -#ifdef CONFIG_DEBUG_FS -static unsigned int fault_around_order = FAULT_AROUND_ORDER; -static int fault_around_order_get(void *data, u64 *val) +#ifdef CONFIG_DEBUG_FS +static int fault_around_bytes_get(void *data, u64 *val) { - *val = fault_around_order; + *val = fault_around_bytes; return 0; } -static int fault_around_order_set(void *data, u64 val) +static int fault_around_bytes_set(void *data, u64 val) { - BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); - if (1UL << val > PTRS_PER_PTE) + if (val / PAGE_SIZE > PTRS_PER_PTE) return -EINVAL; - fault_around_order = val; + fault_around_bytes = val; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, - fault_around_order_get, fault_around_order_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, + fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { void *ret; - ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, - &fault_around_order_fops); + ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, + &fault_around_bytes_fops); if (!ret) - pr_warn("Failed to create fault_around_order in debugfs"); + pr_warn("Failed to create fault_around_bytes in debugfs"); return 0; } late_initcall(fault_around_debugfs); - -static inline unsigned long fault_around_pages(void) -{ - return 1UL << fault_around_order; -} - -static inline unsigned long fault_around_mask(void) -{ - return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); -} -#else -static inline unsigned long fault_around_pages(void) -{ - unsigned long nr_pages; - - nr_pages = 1UL << FAULT_AROUND_ORDER; - BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); - return nr_pages; -} - -static inline unsigned long fault_around_mask(void) -{ - return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); -} #endif static void do_fault_around(struct vm_area_struct *vma, unsigned long address, @@ -2871,7 +2855,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages) { + if (vma->vm_ops->map_pages && fault_around_pages() > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) -- cgit v1.2.3-70-g09d2 From 1fdb412bd825998efbced3a16f6ce7e0329728cf Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:10:55 -0700 Subject: mm: document do_fault_around() feature Some clarification on how faultaround works. [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 62a08a7badc..d67fd9fcf1f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2760,6 +2760,10 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, static unsigned long fault_around_bytes = 65536; +/* + * fault_around_pages() and fault_around_mask() round down fault_around_bytes + * to nearest page order. It's what do_fault_around() expects to see. + */ static inline unsigned long fault_around_pages(void) { return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE; @@ -2801,6 +2805,29 @@ static int __init fault_around_debugfs(void) late_initcall(fault_around_debugfs); #endif +/* + * do_fault_around() tries to map few pages around the fault address. The hope + * is that the pages will be needed soon and this will lower the number of + * faults to handle. + * + * It uses vm_ops->map_pages() to map the pages, which skips the page if it's + * not ready to be mapped: not up-to-date, locked, etc. + * + * This function is called with the page table lock taken. In the split ptlock + * case the page table lock only protects only those entries which belong to + * the page table corresponding to the fault address. + * + * This function doesn't cross the VMA boundaries, in order to call map_pages() + * only once. + * + * fault_around_pages() defines how many pages we'll try to map. + * do_fault_around() expects it to return a power of two less than or equal to + * PTRS_PER_PTE. + * + * The virtual address of the area that we map is naturally aligned to the + * fault_around_pages() value (and therefore to page order). This way it's + * easier to guarantee that we don't cross page table boundaries. + */ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pgoff_t pgoff, unsigned int flags) { -- cgit v1.2.3-70-g09d2