From d2bf6be8ab63aa84e6149aac934649aadf3828b1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:31:39 -0700 Subject: mm: clean up get_user_pages_fast() documentation Move more documentation for get_user_pages_fast into the new kerneldoc comment. Add some comments for get_user_pages as well. Also, move get_user_pages_fast declaration up to get_user_pages. It wasn't there initially because it was once a static inline function. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Nick Piggin Cc: Andy Grover Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 4126dd16778..891bad0613f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i; } +/** + * get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @len: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. This will result in the page being COWed even + * in MAP_SHARED mappings. You do not want this. + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If len is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) -- cgit v1.2.3-70-g09d2 From f8ad0f499fad5cdbcaaa2d97542b2db869b5a770 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:33 -0700 Subject: mm: introduce follow_pte() A generic readonly page table lookup helper to map an address space and an address from it to a pte. Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 891bad0613f..7135d6b2599 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3103,6 +3103,43 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ +static int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out; + + /* We cannot handle huge page PFN maps. Luckily they don't exist. */ + if (pmd_huge(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + if (!ptep) + goto out; + if (!pte_present(*ptep)) + goto unlock; + *ptepp = ptep; + return 0; +unlock: + pte_unmap_unlock(ptep, *ptlp); +out: + return -EINVAL; +} + #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, -- cgit v1.2.3-70-g09d2 From 03668a4debf4f50de55c34b6e66dae63e1c73716 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:34 -0700 Subject: mm: use generic follow_pte() in follow_phys() Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 36 +++++------------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 7135d6b2599..24ff20480bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3145,50 +3145,24 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; + int ret = -EINVAL; pte_t *ptep, pte; spinlock_t *ptl; - resource_size_t phys_addr = 0; - struct mm_struct *mm = vma->vm_mm; - int ret = -EINVAL; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto out; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto out; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - - /* We cannot handle huge page PFN maps. Luckily they don't exist. */ - if (pmd_huge(*pmd)) + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!ptep) - goto out; - pte = *ptep; - if (!pte_present(pte)) - goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - phys_addr = pte_pfn(pte); - phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ *prot = pgprot_val(pte_pgprot(pte)); - *phys = phys_addr; - ret = 0; + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + ret = 0; unlock: pte_unmap_unlock(ptep, ptl); out: -- cgit v1.2.3-70-g09d2 From 3b6748e2dd69906af3835db4dc9d1c8a3ee4c68c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:35 -0700 Subject: mm: introduce follow_pfn() Analoguous to follow_phys(), add a helper that looks up the PFN at a user virtual address in an IO mapping or a raw PFN mapping. Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'mm/memory.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b548e7cfbd..c80dbd4a62c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -792,6 +792,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 24ff20480bb..d5d1653d60a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3140,6 +3140,35 @@ out: return -EINVAL; } +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(follow_pfn); + #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, -- cgit v1.2.3-70-g09d2 From 30c9f3a9fae79517bca595826a19c6855fbb6d32 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 10 Apr 2009 08:43:11 -0700 Subject: Remove internal use of 'write_access' in mm/memory.c The fault handling routines really want more fine-grained flags than a single "was it a write fault" boolean - the callers will want to set flags like "you can return a retry error" etc. And that's actually how the VM works internally, but right now the top-level fault handling functions in mm/memory.c all pass just the 'write_access' boolean around. This switches them over to pass around the FAULT_FLAG_xyzzy 'flags' variable instead. The 'write_access' calling convention still exists for the exported 'handle_mm_fault()' function, but that is next. Signed-off-by: Linus Torvalds --- mm/memory.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index d5d1653d60a..e6a9700359d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2496,7 +2496,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) */ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; struct page *page; @@ -2572,9 +2572,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); - if (write_access && reuse_swap_page(page)) { + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - write_access = 0; + flags &= ~FAULT_FLAG_WRITE; } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); @@ -2587,7 +2587,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, try_to_free_swap(page); unlock_page(page); - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; @@ -2616,7 +2616,7 @@ out_page: */ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) + unsigned int flags) { struct page *page; spinlock_t *ptl; @@ -2776,7 +2776,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * due to the bad i386 page protection. But it's valid * for other architectures too. * - * Note that if write_access is true, we either now have + * Note that if FAULT_FLAG_WRITE is set, we either now have * an exclusive copy of the page, or this is a shared mapping, * so we can make it writable and dirty to avoid having to * handle that later. @@ -2847,11 +2847,10 @@ unwritable_page: static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { pgoff_t pgoff = (((address & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); pte_unmap(page_table); return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); @@ -2868,12 +2867,12 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { - unsigned int flags = FAULT_FLAG_NONLINEAR | - (write_access ? FAULT_FLAG_WRITE : 0); pgoff_t pgoff; + flags |= FAULT_FLAG_NONLINEAR; + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return 0; @@ -2904,7 +2903,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, int write_access) + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -2915,30 +2914,30 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (vma->vm_ops) { if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); } return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); + pte, pmd, flags); } if (pte_file(entry)) return do_nonlinear_fault(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); return do_swap_page(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); } ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) goto unlock; - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, pmd, ptl, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { + if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { update_mmu_cache(vma, address, entry); } else { /* @@ -2947,7 +2946,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, * This still avoids useless tlb flushes for .text page faults * with threads. */ - if (write_access) + if (flags & FAULT_FLAG_WRITE) flush_tlb_page(vma, address); } unlock: @@ -2965,13 +2964,14 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud; pmd_t *pmd; pte_t *pte; + unsigned int flags = write_access ? FAULT_FLAG_WRITE : 0; __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); if (unlikely(is_vm_hugetlb_page(vma))) - return hugetlb_fault(mm, vma, address, write_access); + return hugetlb_fault(mm, vma, address, flags); pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); @@ -2984,7 +2984,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) return VM_FAULT_OOM; - return handle_pte_fault(mm, vma, address, pte, pmd, write_access); + return handle_pte_fault(mm, vma, address, pte, pmd, flags); } #ifndef __PAGETABLE_PUD_FOLDED -- cgit v1.2.3-70-g09d2 From d06063cc221fdefcab86589e79ddfdb7c0e14b63 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 10 Apr 2009 09:01:23 -0700 Subject: Move FAULT_FLAG_xyz into handle_mm_fault() callers This allows the callers to now pass down the full set of FAULT_FLAG_xyz flags to handle_mm_fault(). All callers have been (mechanically) converted to the new calling convention, there's almost certainly room for architectures to clean up their code and then add FAULT_FLAG_RETRY when that support is added. Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/avr32/mm/fault.c | 2 +- arch/cris/mm/fault.c | 2 +- arch/frv/mm/fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m32r/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/mn10300/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/powerpc/platforms/cell/spu_fault.c | 2 +- arch/s390/lib/uaccess_pt.c | 2 +- arch/s390/mm/fault.c | 2 +- arch/sh/mm/fault_32.c | 2 +- arch/sh/mm/tlbflush_64.c | 2 +- arch/sparc/mm/fault_32.c | 4 ++-- arch/sparc/mm/fault_64.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- arch/xtensa/mm/fault.c | 2 +- include/linux/mm.h | 4 ++-- mm/memory.c | 8 ++++---- 25 files changed, 30 insertions(+), 30 deletions(-) (limited to 'mm/memory.c') diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 4829f96585b..00a31deaa96 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -146,7 +146,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If for any reason at all we couldn't handle the fault, make sure we exit gracefully rather than endlessly redo the fault. */ - fault = handle_mm_fault(mm, vma, address, cause > 0); + fault = handle_mm_fault(mm, vma, address, cause > 0 ? FAULT_FLAG_WRITE : 0); up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 0455557a289..6fdcbb70982 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -208,7 +208,7 @@ good_area: * than endlessly redo the fault. */ survive: - fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, fsr & (1 << 11)); + fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, (fsr & (1 << 11)) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 62d4abbaa65..b61d86d3deb 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -133,7 +133,7 @@ good_area: * fault. */ survive: - fault = handle_mm_fault(mm, vma, address, writeaccess); + fault = handle_mm_fault(mm, vma, address, writeaccess ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index c4c76db90f9..f925115e325 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -163,7 +163,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * the fault. */ - fault = handle_mm_fault(mm, vma, address, writeaccess & 1); + fault = handle_mm_fault(mm, vma, address, (writeaccess & 1) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 05093d41d98..30f5d100a81 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -163,7 +163,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, ear0, write); + fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 23088bed111..19261a99e62 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -154,7 +154,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re * sure we exit gracefully rather than endlessly redo the * fault. */ - fault = handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0); + fault = handle_mm_fault(mm, vma, address, (mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { /* * We ran out of memory, or some other thing happened diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 4a71df4c1b3..7274b47f4c2 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -196,7 +196,7 @@ survive: */ addr = (address & PAGE_MASK); set_thread_fault_code(error_code); - fault = handle_mm_fault(mm, vma, addr, write); + fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index f493f03231d..d0e35cf99fc 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -155,7 +155,7 @@ good_area: */ survive: - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); #ifdef DEBUG printk("handle_mm_fault returns %d\n",fault); #endif diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 5e67cd1fab4..956607a63f4 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -232,7 +232,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, is_write); + fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 55767ad9f00..6751ce9ede9 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -102,7 +102,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 33cf25025da..a62e1e138bc 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -258,7 +258,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 92c7fa4ecc3..bfb6dd6ab38 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -202,7 +202,7 @@ good_area: * fault. */ - fault = handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) != 0); + fault = handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { /* * We hit a shared mapping outside of the file, or some diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5beffc8f481..830bef0a113 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -302,7 +302,7 @@ good_area: * the fault. */ survive: - ret = handle_mm_fault(mm, vma, address, is_write); + ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(ret & VM_FAULT_ERROR)) { if (ret & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/powerpc/platforms/cell/spu_fault.c b/arch/powerpc/platforms/cell/spu_fault.c index 95d8dadf2d8..d06ba87f1a1 100644 --- a/arch/powerpc/platforms/cell/spu_fault.c +++ b/arch/powerpc/platforms/cell/spu_fault.c @@ -70,7 +70,7 @@ int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea, } ret = 0; - *flt = handle_mm_fault(mm, vma, ea, is_write); + *flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index b0b84c35b0a..cb5d59eab0e 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -66,7 +66,7 @@ static int __handle_fault(struct mm_struct *mm, unsigned long address, } survive: - fault = handle_mm_fault(mm, vma, address, write_access); + fault = handle_mm_fault(mm, vma, address, write_access ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 220a152c836..74eb26bf197 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -352,7 +352,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { up_read(&mm->mmap_sem); diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c index 2c50f80fc33..cc8ddbdf3d7 100644 --- a/arch/sh/mm/fault_32.c +++ b/arch/sh/mm/fault_32.c @@ -133,7 +133,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, writeaccess); + fault = handle_mm_fault(mm, vma, address, writeaccess ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c index 7876997ba19..fcbb6e135ce 100644 --- a/arch/sh/mm/tlbflush_64.c +++ b/arch/sh/mm/tlbflush_64.c @@ -187,7 +187,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, writeaccess); + fault = handle_mm_fault(mm, vma, address, writeaccess ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 12e447fc854..a5e30c642ee 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -241,7 +241,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; @@ -484,7 +484,7 @@ good_area: if(!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - switch (handle_mm_fault(mm, vma, address, write)) { + switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) { case VM_FAULT_SIGBUS: case VM_FAULT_OOM: goto do_sigbus; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 4ab8993b086..e5620b27c8b 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -398,7 +398,7 @@ good_area: goto bad_area; } - fault = handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE)); + fault = handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 7384d8accfe..637c6505dc0 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -65,7 +65,7 @@ good_area: do { int fault; - fault = handle_mm_fault(mm, vma, address, is_write); + fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { goto out_of_memory; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c403526d5d1..78a5fff857b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1113,7 +1113,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index bdd860d93f7..bc0733359a8 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -106,7 +106,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, is_write); + fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/include/linux/mm.h b/include/linux/mm.h index cf260d848eb..d006e93d5c9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -810,11 +810,11 @@ extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); #ifdef CONFIG_MMU extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access); + unsigned long address, unsigned int flags); #else static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, - int write_access) + unsigned int flags) { /* should never happen if there's no MMU */ BUG(); diff --git a/mm/memory.c b/mm/memory.c index e6a9700359d..98bcb90d595 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1310,8 +1310,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; - ret = handle_mm_fault(mm, vma, start, - foll_flags & FOLL_WRITE); + + /* FOLL_WRITE matches FAULT_FLAG_WRITE! */ + ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; @@ -2958,13 +2959,12 @@ unlock: * By the time we get here, we already hold the mm semaphore */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned int flags = write_access ? FAULT_FLAG_WRITE : 0; __set_current_state(TASK_RUNNING); -- cgit v1.2.3-70-g09d2 From d26ed650d9947a786bbda8de9cd914dbeebc1a68 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 23 Jun 2009 13:52:49 +0100 Subject: mm: don't rely on flags coincidence Indeed FOLL_WRITE matches FAULT_FLAG_WRITE, matches GUP_FLAGS_WRITE, and it's tempting to devise a set of Grand Unified Paging flags; but not today. So until then, let's rely upon the compiler to spot the coincidence, "rather than have that subtle dependency and a comment for it" - as you remarked in another context yesterday. Signed-off-by: Hugh Dickins Acked-by: Wu Fengguang Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 98bcb90d595..50da9511aa7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1311,8 +1311,10 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, while (!(page = follow_page(vma, start, foll_flags))) { int ret; - /* FOLL_WRITE matches FAULT_FLAG_WRITE! */ - ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); + ret = handle_mm_fault(mm, vma, start, + (foll_flags & FOLL_WRITE) ? + FAULT_FLAG_WRITE : 0); + if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; -- cgit v1.2.3-70-g09d2 From a5c9b696ec109bb54d547fdb437a7a0c2d514670 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 23 Jun 2009 12:36:58 -0700 Subject: mm: pass mm to grab_swap_token If a kthread happens to use get_user_pages() on an mm (as KSM does), there's a chance that it will end up trying to read in a swap page, then oops in grab_swap_token() because the kthread has no mm: GUP passes down the right mm, so grab_swap_token() ought to be using it. We have not identified a stronger case than KSM's daemon (not yet in mainline), but the issue must have come up before, since RHEL has included a fix for this for years (though a different fix, they just back out of grab_swap_token if current->mm is unset: which is what we first proposed, but using the right mm here seems more correct). Reported-by: Izik Eidus Signed-off-by: Johannes Weiner Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 12 ++++++------ mm/memory.c | 2 +- mm/thrash.c | 32 +++++++++++++++----------------- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'mm/memory.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index c88b36665f7..7c15334f3ff 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -298,8 +298,8 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; /* linux/mm/thrash.c */ -extern struct mm_struct * swap_token_mm; -extern void grab_swap_token(void); +extern struct mm_struct *swap_token_mm; +extern void grab_swap_token(struct mm_struct *); extern void __put_swap_token(struct mm_struct *); static inline int has_swap_token(struct mm_struct *mm) @@ -419,10 +419,10 @@ static inline swp_entry_t get_swap_page(void) } /* linux/mm/thrash.c */ -#define put_swap_token(x) do { } while(0) -#define grab_swap_token() do { } while(0) -#define has_swap_token(x) 0 -#define disable_swap_token() do { } while(0) +#define put_swap_token(mm) do { } while (0) +#define grab_swap_token(mm) do { } while (0) +#define has_swap_token(mm) 0 +#define disable_swap_token() do { } while (0) static inline void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) diff --git a/mm/memory.c b/mm/memory.c index 50da9511aa7..f46ac18ba23 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2519,7 +2519,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - grab_swap_token(); /* Contend for token _before_ read-in */ + grab_swap_token(mm); /* Contend for token _before_ read-in */ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { diff --git a/mm/thrash.c b/mm/thrash.c index c4c5205a9c3..2372d4ed5dd 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; static unsigned int global_faults; -void grab_swap_token(void) +void grab_swap_token(struct mm_struct *mm) { int current_interval; global_faults++; - current_interval = global_faults - current->mm->faultstamp; + current_interval = global_faults - mm->faultstamp; if (!spin_trylock(&swap_token_lock)) return; /* First come first served */ if (swap_token_mm == NULL) { - current->mm->token_priority = current->mm->token_priority + 2; - swap_token_mm = current->mm; + mm->token_priority = mm->token_priority + 2; + swap_token_mm = mm; goto out; } - if (current->mm != swap_token_mm) { - if (current_interval < current->mm->last_interval) - current->mm->token_priority++; + if (mm != swap_token_mm) { + if (current_interval < mm->last_interval) + mm->token_priority++; else { - if (likely(current->mm->token_priority > 0)) - current->mm->token_priority--; + if (likely(mm->token_priority > 0)) + mm->token_priority--; } /* Check if we deserve the token */ - if (current->mm->token_priority > - swap_token_mm->token_priority) { - current->mm->token_priority += 2; - swap_token_mm = current->mm; + if (mm->token_priority > swap_token_mm->token_priority) { + mm->token_priority += 2; + swap_token_mm = mm; } } else { /* Token holder came in again! */ - current->mm->token_priority += 2; + mm->token_priority += 2; } out: - current->mm->faultstamp = global_faults; - current->mm->last_interval = current_interval; + mm->faultstamp = global_faults; + mm->last_interval = current_interval; spin_unlock(&swap_token_lock); -return; } /* Called on process exit. */ -- cgit v1.2.3-70-g09d2