summaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c221
1 files changed, 192 insertions, 29 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 247b5c312b9..160f5b503ea 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -47,7 +47,9 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/module.h>
+#include <linux/delayacct.h>
#include <linux/init.h>
+#include <linux/writeback.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -126,7 +128,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
pmd_clear(pmd);
pte_lock_deinit(page);
pte_free_tlb(tlb, page);
- dec_page_state(nr_page_table_pages);
+ dec_zone_page_state(page, NR_PAGETABLE);
tlb->mm->nr_ptes--;
}
@@ -311,7 +313,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
pte_free(new);
} else {
mm->nr_ptes++;
- inc_page_state(nr_page_table_pages);
+ inc_zone_page_state(new, NR_PAGETABLE);
pmd_populate(mm, pmd, new);
}
spin_unlock(&mm->page_table_lock);
@@ -503,7 +505,7 @@ again:
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
- spin_lock(src_ptl);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
do {
/*
@@ -1225,7 +1227,12 @@ out:
return retval;
}
-/*
+/**
+ * vm_insert_page - insert single page into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @page: source kernel page
+ *
* This allows drivers to insert individual pages they've allocated
* into a user vma.
*
@@ -1317,7 +1324,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
-/* Note: this is only safe if the mm semaphore is held when called. */
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: physical address of kernel memory
+ * @size: size of map area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
@@ -1457,14 +1473,29 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *old_page, *new_page;
pte_t entry;
- int reuse, ret = VM_FAULT_MINOR;
+ int reuse = 0, ret = VM_FAULT_MINOR;
+ struct page *dirty_page = NULL;
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page)
goto gotten;
- if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
- (VM_SHARED|VM_WRITE))) {
+ /*
+ * Take out anonymous pages first, anonymous shared vmas are
+ * not dirty accountable.
+ */
+ if (PageAnon(old_page)) {
+ if (!TestSetPageLocked(old_page)) {
+ reuse = can_share_swap_page(old_page);
+ unlock_page(old_page);
+ }
+ } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))) {
+ /*
+ * Only catch write-faults on shared writable pages,
+ * read-only shared pages can get COWed by
+ * get_user_pages(.write=1, .force=1).
+ */
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
/*
* Notify the address space that the page is about to
@@ -1493,13 +1524,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_same(*page_table, orig_pte))
goto unlock;
}
-
+ dirty_page = old_page;
+ get_page(dirty_page);
reuse = 1;
- } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
- reuse = can_share_swap_page(old_page);
- unlock_page(old_page);
- } else {
- reuse = 0;
}
if (reuse) {
@@ -1549,9 +1576,16 @@ gotten:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- ptep_establish(vma, address, page_table, entry);
- update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
+ /*
+ * Clear the pte entry and flush it first, before updating the
+ * pte with the new entry. This will avoid a race condition
+ * seen in the presence of one thread doing SMC and another
+ * thread doing COW.
+ */
+ ptep_clear_flush(vma, address, page_table);
+ set_pte_at(mm, address, page_table, entry);
+ update_mmu_cache(vma, address, entry);
lru_cache_add_active(new_page);
page_add_new_anon_rmap(new_page, vma, address);
@@ -1565,6 +1599,10 @@ gotten:
page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
+ if (dirty_page) {
+ set_page_dirty_balance(dirty_page);
+ put_page(dirty_page);
+ }
return ret;
oom:
if (old_page)
@@ -1784,9 +1822,10 @@ void unmap_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL(unmap_mapping_range);
-/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
+/**
+ * vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
*
* NOTE! We have to be ready to update the memory sharing
* between the file and the memory map for a potential last
@@ -1853,13 +1892,18 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
return 0;
}
-EXPORT_SYMBOL(vmtruncate_range);
+EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
-/*
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @addr: address to start
+ * @vma: user vma this addresses belong to
+ *
* Primitive swap readahead code. We simply read an aligned block of
* (1 << page_cluster) entries in the swap area. This method is chosen
* because it doesn't cost us any seek time. We also make sure to queue
- * the 'original' request together with the readahead ones...
+ * the 'original' request together with the readahead ones...
*
* This has been extended to use the NUMA policies from the mm triggering
* the readahead.
@@ -1934,6 +1978,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
migration_entry_wait(mm, pmd, address);
goto out;
}
+ delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
@@ -1946,15 +1991,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
}
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
- inc_page_state(pgmajfault);
+ count_vm_event(PGMAJFAULT);
grab_swap_token();
}
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
mark_page_accessed(page);
lock_page(page);
@@ -2094,6 +2141,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned int sequence = 0;
int ret = VM_FAULT_MINOR;
int anon = 0;
+ struct page *dirty_page = NULL;
pte_unmap(page_table);
BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2188,6 +2236,10 @@ retry:
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(new_page);
+ if (write_access) {
+ dirty_page = new_page;
+ get_page(dirty_page);
+ }
}
} else {
/* One of our sibling threads was faster, back out. */
@@ -2200,6 +2252,10 @@ retry:
lazy_mmu_prot_update(entry);
unlock:
pte_unmap_unlock(page_table, ptl);
+ if (dirty_page) {
+ set_page_dirty_balance(dirty_page);
+ put_page(dirty_page);
+ }
return ret;
oom:
page_cache_release(new_page);
@@ -2207,6 +2263,54 @@ oom:
}
/*
+ * do_no_pfn() tries to create a new page mapping for a page without
+ * a struct_page backing it
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * It is expected that the ->nopfn handler always returns the same pfn
+ * for a given virtual mapping.
+ *
+ * Mark this `noinline' to prevent it from bloating the main pagefault code.
+ */
+static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
+{
+ spinlock_t *ptl;
+ pte_t entry;
+ unsigned long pfn;
+ int ret = VM_FAULT_MINOR;
+
+ pte_unmap(page_table);
+ BUG_ON(!(vma->vm_flags & VM_PFNMAP));
+ BUG_ON(is_cow_mapping(vma->vm_flags));
+
+ pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+ if (pfn == NOPFN_OOM)
+ return VM_FAULT_OOM;
+ if (pfn == NOPFN_SIGBUS)
+ return VM_FAULT_SIGBUS;
+
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+ /* Only go through if we didn't race with anybody else... */
+ if (pte_none(*page_table)) {
+ entry = pfn_pte(pfn, vma->vm_page_prot);
+ if (write_access)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ set_pte_at(mm, address, page_table, entry);
+ }
+ pte_unmap_unlock(page_table, ptl);
+ return ret;
+}
+
+/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
@@ -2268,11 +2372,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
old_entry = entry = *pte;
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (!vma->vm_ops || !vma->vm_ops->nopage)
- return do_anonymous_page(mm, vma, address,
- pte, pmd, write_access);
- return do_no_page(mm, vma, address,
- pte, pmd, write_access);
+ if (vma->vm_ops) {
+ if (vma->vm_ops->nopage)
+ return do_no_page(mm, vma, address,
+ pte, pmd,
+ write_access);
+ if (unlikely(vma->vm_ops->nopfn))
+ return do_no_pfn(mm, vma, address, pte,
+ pmd, write_access);
+ }
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, write_access);
}
if (pte_file(entry))
return do_file_page(mm, vma, address,
@@ -2324,7 +2434,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
__set_current_state(TASK_RUNNING);
- inc_page_state(pgfault);
+ count_vm_event(PGFAULT);
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, write_access);
@@ -2501,3 +2611,56 @@ int in_gate_area_no_task(unsigned long addr)
}
#endif /* __HAVE_ARCH_GATE_AREA */
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ struct page *page;
+ void *old_buf = buf;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ down_read(&mm->mmap_sem);
+ /* ignore errors, just check how much was sucessfully transfered */
+ while (len) {
+ int bytes, ret, offset;
+ void *maddr;
+
+ ret = get_user_pages(tsk, mm, addr, 1,
+ write, 1, &page, &vma);
+ if (ret <= 0)
+ break;
+
+ bytes = len;
+ offset = addr & (PAGE_SIZE-1);
+ if (bytes > PAGE_SIZE-offset)
+ bytes = PAGE_SIZE-offset;
+
+ maddr = kmap(page);
+ if (write) {
+ copy_to_user_page(vma, page, addr,
+ maddr + offset, buf, bytes);
+ set_page_dirty_lock(page);
+ } else {
+ copy_from_user_page(vma, page, addr,
+ buf, maddr + offset, bytes);
+ }
+ kunmap(page);
+ page_cache_release(page);
+ len -= bytes;
+ buf += bytes;
+ addr += bytes;
+ }
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+
+ return buf - old_buf;
+}