summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2010-02-02 14:38:15 +0900
committerTejun Heo <tj@kernel.org>2010-02-02 14:38:15 +0900
commitab386128f20c44c458a90039ab1bdc265ac474c9 (patch)
tree2ad188744922b1bb951fd10ff50dc04c83acce22 /mm
parentdbfc196a3cc1a2514ad0737a82f764de23bd65e6 (diff)
parentab658321f32770b903a4426e2a6fae0392757755 (diff)
Merge branch 'master' into percpu
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c100
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/maccess.c11
-rw-r--r--mm/memcontrol.c11
-rw-r--r--mm/nommu.c119
-rw-r--r--mm/page_alloc.c9
-rw-r--r--mm/percpu.c4
-rw-r--r--mm/truncate.c30
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c3
11 files changed, 192 insertions, 103 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0eb6c..e3736923220 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
static struct page *__read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *,struct page*),
- void *data)
+ void *data,
+ gfp_t gfp)
{
struct page *page;
int err;
repeat:
page = find_get_page(mapping, index);
if (!page) {
- page = page_cache_alloc_cold(mapping);
+ page = __page_cache_alloc(gfp | __GFP_COLD);
if (!page)
return ERR_PTR(-ENOMEM);
err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1661,31 +1662,18 @@ repeat:
return page;
}
-/**
- * read_cache_page_async - read into page cache, fill it if needed
- * @mapping: the page's address_space
- * @index: the page index
- * @filler: function to perform the read
- * @data: destination for read data
- *
- * Same as read_cache_page, but don't wait for page to become unlocked
- * after submitting it to the filler.
- *
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page but don't wait for it to become unlocked.
- *
- * If the page does not get brought uptodate, return -EIO.
- */
-struct page *read_cache_page_async(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *,struct page*),
- void *data)
+ void *data,
+ gfp_t gfp)
+
{
struct page *page;
int err;
retry:
- page = __read_cache_page(mapping, index, filler, data);
+ page = __read_cache_page(mapping, index, filler, data, gfp);
if (IS_ERR(page))
return page;
if (PageUptodate(page))
@@ -1710,8 +1698,67 @@ out:
mark_page_accessed(page);
return page;
}
+
+/**
+ * read_cache_page_async - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: destination for read data
+ *
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page but don't wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_async(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *,struct page*),
+ void *data)
+{
+ return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+}
EXPORT_SYMBOL(read_cache_page_async);
+static struct page *wait_on_page_read(struct page *page)
+{
+ if (!IS_ERR(page)) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
+/**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This is the same as "read_mapping_page(mapping, index, NULL)", but with
+ * any new page allocations done using the specified allocation flags. Note
+ * that the Radix tree operations will still use GFP_KERNEL, so you can't
+ * expect to do this atomically or anything like that - but you can pass in
+ * other page requirements.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_gfp(struct address_space *mapping,
+ pgoff_t index,
+ gfp_t gfp)
+{
+ filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+
+ return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
+}
+EXPORT_SYMBOL(read_cache_page_gfp);
+
/**
* read_cache_page - read into page cache, fill it if needed
* @mapping: the page's address_space
@@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping,
int (*filler)(void *,struct page*),
void *data)
{
- struct page *page;
-
- page = read_cache_page_async(mapping, index, filler, data);
- if (IS_ERR(page))
- goto out;
- wait_on_page_locked(page);
- if (!PageUptodate(page)) {
- page_cache_release(page);
- page = ERR_PTR(-EIO);
- }
- out:
- return page;
+ return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
}
EXPORT_SYMBOL(read_cache_page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65f38c21820..e91b81b6367 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page,
{
int i;
- if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+ if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, sz);
return;
}
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25..4e348dbaecd 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@
* Safely read from address @src to the buffer at @dst. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-long probe_kernel_read(void *dst, void *src, size_t size)
+
+long __weak probe_kernel_read(void *dst, void *src, size_t size)
+ __attribute__((alias("__probe_kernel_read")));
+
+long __probe_kernel_read(void *dst, void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-long notrace __weak probe_kernel_write(void *dst, void *src, size_t size)
+long __weak probe_kernel_write(void *dst, void *src, size_t size)
+ __attribute__((alias("__probe_kernel_write")));
+
+long __probe_kernel_write(void *dst, void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 488b644e0e8..954032b80be 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2586,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
if (free_all)
goto try_to_free;
move_account:
- while (mem->res.usage > 0) {
+ do {
ret = -EBUSY;
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
goto out;
@@ -2614,8 +2614,8 @@ move_account:
if (ret == -ENOMEM)
goto try_to_free;
cond_resched();
- }
- ret = 0;
+ /* "ret" should also be checked to ensure all lists are empty. */
+ } while (mem->res.usage > 0 || ret);
out:
css_put(&mem->css);
return ret;
@@ -2648,10 +2648,7 @@ try_to_free:
}
lru_add_drain();
/* try move_account...there may be some *locked* pages. */
- if (mem->res.usage)
- goto move_account;
- ret = 0;
- goto out;
+ goto move_account;
}
int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
diff --git a/mm/nommu.c b/mm/nommu.c
index 6f9248f89bd..48a2ecfaf05 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Ok, looks good - let it rip.
*/
+ flush_icache_range(mm->brk, brk);
return mm->brk = brk;
}
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to)
static void __put_nommu_region(struct vm_region *region)
__releases(nommu_region_sem)
{
- kenter("%p{%d}", region, atomic_read(&region->vm_usage));
+ kenter("%p{%d}", region, region->vm_usage);
BUG_ON(!nommu_region_tree.rb_node);
- if (atomic_dec_and_test(&region->vm_usage)) {
+ if (--region->vm_usage == 0) {
if (region->vm_top > region->vm_start)
delete_nommu_region(region);
up_write(&nommu_region_sem);
@@ -1204,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file,
if (!vma)
goto error_getting_vma;
- atomic_set(&region->vm_usage, 1);
+ region->vm_usage = 1;
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
@@ -1271,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file,
}
/* we've found a region we can share */
- atomic_inc(&pregion->vm_usage);
+ pregion->vm_usage++;
vma->vm_region = pregion;
start = pregion->vm_start;
start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
@@ -1288,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file,
vma->vm_region = NULL;
vma->vm_start = 0;
vma->vm_end = 0;
- atomic_dec(&pregion->vm_usage);
+ pregion->vm_usage--;
pregion = NULL;
goto error_just_free;
}
@@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file,
share:
add_vma_to_mm(current->mm, vma);
- up_write(&nommu_region_sem);
+ /* we flush the region from the icache only when the first executable
+ * mapping of it is made */
+ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
+ flush_icache_range(region->vm_start, region->vm_end);
+ region->vm_icache_flushed = true;
+ }
- if (prot & PROT_EXEC)
- flush_icache_range(result, result + len);
+ up_write(&nommu_region_sem);
kleave(" = %lx", result);
return result;
@@ -1436,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
kenter("");
- /* we're only permitted to split anonymous regions that have a single
- * owner */
- if (vma->vm_file ||
- atomic_read(&vma->vm_region->vm_usage) != 1)
+ /* we're only permitted to split anonymous regions (these should have
+ * only a single usage on the region) */
+ if (vma->vm_file)
return -ENOMEM;
if (mm->map_count >= sysctl_max_map_count)
@@ -1513,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm,
/* cut the backing region down to size */
region = vma->vm_region;
- BUG_ON(atomic_read(&region->vm_usage) != 1);
+ BUG_ON(region->vm_usage != 1);
down_write(&nommu_region_sem);
delete_nommu_region(region);
@@ -1757,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL(unmap_mapping_range);
/*
- * ask for an unmapped area at which to create a mapping on a file
- */
-unsigned long get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
- unsigned long, unsigned long);
-
- get_area = current->mm->get_unmapped_area;
- if (file && file->f_op && file->f_op->get_unmapped_area)
- get_area = file->f_op->get_unmapped_area;
-
- if (!get_area)
- return -ENOSYS;
-
- return get_area(file, addr, len, pgoff, flags);
-}
-EXPORT_SYMBOL(get_unmapped_area);
-
-/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 0 means there is enough memory for the allocation to
* succeed and -ENOMEM implies there is not.
@@ -1916,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
/* only read or write mappings where it is permitted */
if (write && vma->vm_flags & VM_MAYWRITE)
- len -= copy_to_user((void *) addr, buf, len);
+ copy_to_user_page(vma, NULL, addr,
+ (void *) addr, buf, len);
else if (!write && vma->vm_flags & VM_MAYREAD)
- len -= copy_from_user(buf, (void *) addr, len);
+ copy_from_user_page(vma, NULL, addr,
+ buf, (void *) addr, len);
else
len = 0;
} else {
@@ -1929,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
mmput(mm);
return len;
}
+
+/**
+ * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
+ * @inode: The inode to check
+ * @size: The current filesize of the inode
+ * @newsize: The proposed filesize of the inode
+ *
+ * Check the shared mappings on an inode on behalf of a shrinking truncate to
+ * make sure that that any outstanding VMAs aren't broken and then shrink the
+ * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
+ * automatically grant mappings that are too large.
+ */
+int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
+ size_t newsize)
+{
+ struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
+ struct vm_region *region;
+ pgoff_t low, high;
+ size_t r_size, r_top;
+
+ low = newsize >> PAGE_SHIFT;
+ high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ down_write(&nommu_region_sem);
+
+ /* search for VMAs that fall within the dead zone */
+ vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+ low, high) {
+ /* found one - only interested if it's shared out of the page
+ * cache */
+ if (vma->vm_flags & VM_SHARED) {
+ up_write(&nommu_region_sem);
+ return -ETXTBSY; /* not quite true, but near enough */
+ }
+ }
+
+ /* reduce any regions that overlap the dead zone - if in existence,
+ * these will be pointed to by VMAs that don't overlap the dead zone
+ *
+ * we don't check for any regions that start beyond the EOF as there
+ * shouldn't be any
+ */
+ vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+ 0, ULONG_MAX) {
+ if (!(vma->vm_flags & VM_SHARED))
+ continue;
+
+ region = vma->vm_region;
+ r_size = region->vm_top - region->vm_start;
+ r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
+
+ if (r_top > newsize) {
+ region->vm_top -= r_top - newsize;
+ if (region->vm_end > region->vm_top)
+ region->vm_end = region->vm_top;
+ }
+ }
+
+ up_write(&nommu_region_sem);
+ return 0;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6849e870de5..9a7aaae07ab 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -556,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
- __free_one_page(page, zone, 0, migratetype);
- trace_mm_page_pcpu_drain(page, 0, migratetype);
+ /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+ __free_one_page(page, zone, 0, page_private(page));
+ trace_mm_page_pcpu_drain(page, 0, page_private(page));
} while (--count && --batch_free && !list_empty(list));
}
spin_unlock(&zone->lock);
@@ -1219,10 +1220,10 @@ again:
}
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order, migratetype);
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -3937,7 +3938,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
}
/* Merge backward if suitable */
- if (start_pfn < early_node_map[i].end_pfn &&
+ if (start_pfn < early_node_map[i].start_pfn &&
end_pfn >= early_node_map[i].start_pfn) {
early_node_map[i].start_pfn = start_pfn;
return;
diff --git a/mm/percpu.c b/mm/percpu.c
index 626e43c9949..b336638d20e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1269,7 +1269,7 @@ static void pcpu_reclaim(struct work_struct *work)
*/
void free_percpu(void *ptr)
{
- void *addr = __pcpu_ptr_to_addr(ptr);
+ void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
@@ -1277,6 +1277,8 @@ void free_percpu(void *ptr)
if (!ptr)
return;
+ addr = __pcpu_ptr_to_addr(ptr);
+
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee2268..e87e3724482 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
*/
void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
{
- if (new < old) {
- struct address_space *mapping = inode->i_mapping;
-
- /*
- * unmap_mapping_range is called twice, first simply for
- * efficiency so that truncate_inode_pages does fewer
- * single-page unmaps. However after this first call, and
- * before truncate_inode_pages finishes, it is possible for
- * private pages to be COWed, which remain after
- * truncate_inode_pages finishes, hence the second
- * unmap_mapping_range call must be made for correctness.
- */
- unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, new);
- unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
- }
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * unmap_mapping_range is called twice, first simply for
+ * efficiency so that truncate_inode_pages does fewer
+ * single-page unmaps. However after this first call, and
+ * before truncate_inode_pages finishes, it is possible for
+ * private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second
+ * unmap_mapping_range call must be made for correctness.
+ */
+ unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(mapping, new);
+ unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);
diff --git a/mm/util.c b/mm/util.c
index 7c35ad95f92..834db7be240 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -220,7 +220,7 @@ char *strndup_user(const char __user *s, long n)
}
EXPORT_SYMBOL(strndup_user);
-#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 37e69295f25..d55d905463e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -555,10 +555,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
}
rcu_read_unlock();
- if (nr) {
- BUG_ON(nr > atomic_read(&vmap_lazy_nr));
+ if (nr)
atomic_sub(nr, &vmap_lazy_nr);
- }
if (nr || force_flush)
flush_tlb_kernel_range(*start, *end);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 885207a6b6b..c26986c85ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
if (!populated_zone(zone))
continue;
+ if (zone_is_all_unreclaimable(zone))
+ continue;
+
if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
0, 0))
return 1;