summaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c215
1 files changed, 191 insertions, 24 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 109e9866237..9cf3f341a28 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/init.h>
+#include <linux/writeback.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -466,7 +467,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
if (is_cow_mapping(vm_flags)) {
ptep_set_wrprotect(src_mm, addr, src_pte);
- pte = *src_pte;
+ pte = pte_wrprotect(pte);
}
/*
@@ -505,6 +506,7 @@ again:
src_pte = pte_offset_map_nested(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ arch_enter_lazy_mmu_mode();
do {
/*
@@ -526,6 +528,7 @@ again:
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap_nested(src_pte - 1);
add_mm_rss(dst_mm, rss[0], rss[1]);
@@ -627,6 +630,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
int anon_rss = 0;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
if (pte_none(ptent)) {
@@ -689,10 +693,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
continue;
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
- pte_clear_full(mm, addr, pte, tlb->fullmm);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
add_mm_rss(mm, file_rss, anon_rss);
+ arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
return addr;
@@ -1108,6 +1113,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
+ arch_enter_lazy_mmu_mode();
do {
struct page *page = ZERO_PAGE(addr);
pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
@@ -1117,6 +1123,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, zero_pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
return 0;
}
@@ -1226,7 +1233,12 @@ out:
return retval;
}
-/*
+/**
+ * vm_insert_page - insert single page into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @page: source kernel page
+ *
* This allows drivers to insert individual pages they've allocated
* into a user vma.
*
@@ -1269,11 +1281,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
+ arch_enter_lazy_mmu_mode();
do {
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
return 0;
}
@@ -1318,7 +1332,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
-/* Note: this is only safe if the mm semaphore is held when called. */
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: physical address of kernel memory
+ * @size: size of map area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
@@ -1458,14 +1481,29 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *old_page, *new_page;
pte_t entry;
- int reuse, ret = VM_FAULT_MINOR;
+ int reuse = 0, ret = VM_FAULT_MINOR;
+ struct page *dirty_page = NULL;
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page)
goto gotten;
- if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
- (VM_SHARED|VM_WRITE))) {
+ /*
+ * Take out anonymous pages first, anonymous shared vmas are
+ * not dirty accountable.
+ */
+ if (PageAnon(old_page)) {
+ if (!TestSetPageLocked(old_page)) {
+ reuse = can_share_swap_page(old_page);
+ unlock_page(old_page);
+ }
+ } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))) {
+ /*
+ * Only catch write-faults on shared writable pages,
+ * read-only shared pages can get COWed by
+ * get_user_pages(.write=1, .force=1).
+ */
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
/*
* Notify the address space that the page is about to
@@ -1494,13 +1532,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_same(*page_table, orig_pte))
goto unlock;
}
-
+ dirty_page = old_page;
+ get_page(dirty_page);
reuse = 1;
- } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
- reuse = can_share_swap_page(old_page);
- unlock_page(old_page);
- } else {
- reuse = 0;
}
if (reuse) {
@@ -1551,7 +1585,14 @@ gotten:
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
lazy_mmu_prot_update(entry);
- ptep_establish(vma, address, page_table, entry);
+ /*
+ * Clear the pte entry and flush it first, before updating the
+ * pte with the new entry. This will avoid a race condition
+ * seen in the presence of one thread doing SMC and another
+ * thread doing COW.
+ */
+ ptep_clear_flush(vma, address, page_table);
+ set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lru_cache_add_active(new_page);
page_add_new_anon_rmap(new_page, vma, address);
@@ -1566,6 +1607,10 @@ gotten:
page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
+ if (dirty_page) {
+ set_page_dirty_balance(dirty_page);
+ put_page(dirty_page);
+ }
return ret;
oom:
if (old_page)
@@ -1785,9 +1830,10 @@ void unmap_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL(unmap_mapping_range);
-/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
+/**
+ * vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
*
* NOTE! We have to be ready to update the memory sharing
* between the file and the memory map for a potential last
@@ -1856,11 +1902,16 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
}
EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
-/*
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @addr: address to start
+ * @vma: user vma this addresses belong to
+ *
* Primitive swap readahead code. We simply read an aligned block of
* (1 << page_cluster) entries in the swap area. This method is chosen
* because it doesn't cost us any seek time. We also make sure to queue
- * the 'original' request together with the readahead ones...
+ * the 'original' request together with the readahead ones...
*
* This has been extended to use the NUMA policies from the mm triggering
* the readahead.
@@ -2098,6 +2149,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned int sequence = 0;
int ret = VM_FAULT_MINOR;
int anon = 0;
+ struct page *dirty_page = NULL;
pte_unmap(page_table);
BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2192,6 +2244,10 @@ retry:
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(new_page);
+ if (write_access) {
+ dirty_page = new_page;
+ get_page(dirty_page);
+ }
}
} else {
/* One of our sibling threads was faster, back out. */
@@ -2204,6 +2260,10 @@ retry:
lazy_mmu_prot_update(entry);
unlock:
pte_unmap_unlock(page_table, ptl);
+ if (dirty_page) {
+ set_page_dirty_balance(dirty_page);
+ put_page(dirty_page);
+ }
return ret;
oom:
page_cache_release(new_page);
@@ -2211,6 +2271,54 @@ oom:
}
/*
+ * do_no_pfn() tries to create a new page mapping for a page without
+ * a struct_page backing it
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * It is expected that the ->nopfn handler always returns the same pfn
+ * for a given virtual mapping.
+ *
+ * Mark this `noinline' to prevent it from bloating the main pagefault code.
+ */
+static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
+{
+ spinlock_t *ptl;
+ pte_t entry;
+ unsigned long pfn;
+ int ret = VM_FAULT_MINOR;
+
+ pte_unmap(page_table);
+ BUG_ON(!(vma->vm_flags & VM_PFNMAP));
+ BUG_ON(is_cow_mapping(vma->vm_flags));
+
+ pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+ if (pfn == NOPFN_OOM)
+ return VM_FAULT_OOM;
+ if (pfn == NOPFN_SIGBUS)
+ return VM_FAULT_SIGBUS;
+
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+ /* Only go through if we didn't race with anybody else... */
+ if (pte_none(*page_table)) {
+ entry = pfn_pte(pfn, vma->vm_page_prot);
+ if (write_access)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ set_pte_at(mm, address, page_table, entry);
+ }
+ pte_unmap_unlock(page_table, ptl);
+ return ret;
+}
+
+/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
@@ -2272,11 +2380,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
old_entry = entry = *pte;
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (!vma->vm_ops || !vma->vm_ops->nopage)
- return do_anonymous_page(mm, vma, address,
- pte, pmd, write_access);
- return do_no_page(mm, vma, address,
- pte, pmd, write_access);
+ if (vma->vm_ops) {
+ if (vma->vm_ops->nopage)
+ return do_no_page(mm, vma, address,
+ pte, pmd,
+ write_access);
+ if (unlikely(vma->vm_ops->nopfn))
+ return do_no_pfn(mm, vma, address, pte,
+ pmd, write_access);
+ }
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, write_access);
}
if (pte_file(entry))
return do_file_page(mm, vma, address,
@@ -2505,3 +2619,56 @@ int in_gate_area_no_task(unsigned long addr)
}
#endif /* __HAVE_ARCH_GATE_AREA */
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ struct page *page;
+ void *old_buf = buf;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ down_read(&mm->mmap_sem);
+ /* ignore errors, just check how much was sucessfully transfered */
+ while (len) {
+ int bytes, ret, offset;
+ void *maddr;
+
+ ret = get_user_pages(tsk, mm, addr, 1,
+ write, 1, &page, &vma);
+ if (ret <= 0)
+ break;
+
+ bytes = len;
+ offset = addr & (PAGE_SIZE-1);
+ if (bytes > PAGE_SIZE-offset)
+ bytes = PAGE_SIZE-offset;
+
+ maddr = kmap(page);
+ if (write) {
+ copy_to_user_page(vma, page, addr,
+ maddr + offset, buf, bytes);
+ set_page_dirty_lock(page);
+ } else {
+ copy_from_user_page(vma, page, addr,
+ buf, maddr + offset, bytes);
+ }
+ kunmap(page);
+ page_cache_release(page);
+ len -= bytes;
+ buf += bytes;
+ addr += bytes;
+ }
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+
+ return buf - old_buf;
+}