summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorDmitry Torokhov <dtor@insightbb.com>2006-09-19 01:56:44 -0400
committerDmitry Torokhov <dtor@insightbb.com>2006-09-19 01:56:44 -0400
commit0612ec48762bf8712db1925b2e67246d2237ebab (patch)
tree01b0d69c9c9915015c0f23ad4263646dd5413e99 /mm
parent4263cf0fac28122c8381b6f4f9441a43cd93c81f (diff)
parent47a5c6fa0e204a2b63309c648bb2fde36836c826 (diff)
Merge rsync://rsync.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'mm')
-rw-r--r--mm/bootmem.c4
-rw-r--r--mm/fadvise.c15
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/memory.c8
-rw-r--r--mm/memory_hotplug.c44
-rw-r--r--mm/mempolicy.c10
-rw-r--r--mm/mempool.c9
-rw-r--r--mm/mmap.c17
-rw-r--r--mm/mmzone.c6
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/slab.c78
-rw-r--r--mm/swap.c20
-rw-r--r--mm/swapfile.c3
-rw-r--r--mm/truncate.c11
-rw-r--r--mm/vmalloc.c7
-rw-r--r--mm/vmstat.c152
16 files changed, 277 insertions, 111 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d213feded10..50353e0dac1 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -29,9 +29,7 @@ unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
-EXPORT_SYMBOL(max_pfn); /* This is exported so
- * dma_get_required_mask(), which uses
- * it, can be an inline function */
+EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */
static LIST_HEAD(bdata_list);
#ifdef CONFIG_CRASH_DUMP
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0a03357a1f8..168c78a121b 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -23,18 +23,6 @@
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
- *
- * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
- * offsets `offset' and `offset+len' inclusive. Any pages which are currently
- * under writeout are skipped, whether or not they are dirty.
- *
- * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
- * offsets `offset' and `offset+len'.
- *
- * By combining these two operations the application may do several things:
- *
- * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
- *
*/
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
@@ -85,7 +73,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
file->f_ra.ra_pages = bdi->ra_pages * 2;
break;
case POSIX_FADV_WILLNEED:
- case POSIX_FADV_NOREUSE:
if (!mapping->a_ops->readpage) {
ret = -EINVAL;
break;
@@ -106,6 +93,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
if (ret > 0)
ret = 0;
break;
+ case POSIX_FADV_NOREUSE:
+ break;
case POSIX_FADV_DONTNEED:
if (!bdi_write_congested(mapping->backing_dev_info))
filemap_flush(mapping);
diff --git a/mm/filemap.c b/mm/filemap.c
index d087fc3d328..b9a60c43b61 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -849,8 +849,6 @@ static void shrink_readahead_size_eio(struct file *filp,
return;
ra->ra_pages /= 4;
- printk(KERN_WARNING "Reducing readahead size to %luK\n",
- ra->ra_pages << (PAGE_CACHE_SHIFT - 10));
}
/**
diff --git a/mm/memory.c b/mm/memory.c
index c1e14c9e67e..109e9866237 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -47,6 +47,7 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/module.h>
+#include <linux/delayacct.h>
#include <linux/init.h>
#include <asm/pgalloc.h>
@@ -1549,9 +1550,9 @@ gotten:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ lazy_mmu_prot_update(entry);
ptep_establish(vma, address, page_table, entry);
update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
lru_cache_add_active(new_page);
page_add_new_anon_rmap(new_page, vma, address);
@@ -1853,7 +1854,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
return 0;
}
-EXPORT_SYMBOL(vmtruncate_range);
+EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
/*
* Primitive swap readahead code. We simply read an aligned block of
@@ -1934,6 +1935,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
migration_entry_wait(mm, pmd, address);
goto out;
}
+ delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
@@ -1946,6 +1948,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
}
@@ -1955,6 +1958,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
grab_swap_token();
}
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
mark_page_accessed(page);
lock_page(page);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 01c9fb97c61..c37319542b7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -52,6 +52,9 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
int nr_pages = PAGES_PER_SECTION;
int ret;
+ if (pfn_valid(phys_start_pfn))
+ return -EEXIST;
+
ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
if (ret < 0)
@@ -76,15 +79,22 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
{
unsigned long i;
int err = 0;
+ int start_sec, end_sec;
+ /* during initialize mem_map, align hot-added range to section */
+ start_sec = pfn_to_section_nr(phys_start_pfn);
+ end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
- for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
- err = __add_section(zone, phys_start_pfn + i);
+ for (i = start_sec; i <= end_sec; i++) {
+ err = __add_section(zone, i << PFN_SECTION_SHIFT);
- /* We want to keep adding the rest of the
- * sections if the first ones already exist
+ /*
+ * EEXIST is finally dealed with by ioresource collision
+ * check. see add_memory() => register_memory_resource()
+ * Warning will be printed if there is collision.
*/
if (err && (err != -EEXIST))
break;
+ err = 0;
}
return err;
@@ -156,7 +166,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
res.flags = IORESOURCE_MEM; /* we just need system ram */
section_end = res.end;
- while (find_next_system_ram(&res) >= 0) {
+ while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
nr_pages = (unsigned long)
((res.end + 1 - res.start) >> PAGE_SHIFT);
@@ -213,10 +223,9 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
}
/* add this memory to iomem resource */
-static void register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(u64 start, u64 size)
{
struct resource *res;
-
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
BUG_ON(!res);
@@ -228,7 +237,18 @@ static void register_memory_resource(u64 start, u64 size)
printk("System RAM resource %llx - %llx cannot be added\n",
(unsigned long long)res->start, (unsigned long long)res->end);
kfree(res);
+ res = NULL;
}
+ return res;
+}
+
+static void release_memory_resource(struct resource *res)
+{
+ if (!res)
+ return;
+ release_resource(res);
+ kfree(res);
+ return;
}
@@ -237,8 +257,13 @@ int add_memory(int nid, u64 start, u64 size)
{
pg_data_t *pgdat = NULL;
int new_pgdat = 0;
+ struct resource *res;
int ret;
+ res = register_memory_resource(start, size);
+ if (!res)
+ return -EEXIST;
+
if (!node_online(nid)) {
pgdat = hotadd_new_pgdat(nid, start);
if (!pgdat)
@@ -268,14 +293,13 @@ int add_memory(int nid, u64 start, u64 size)
BUG_ON(ret);
}
- /* register this memory as resource */
- register_memory_resource(start, size);
-
return ret;
error:
/* rollback pgdat allocation and others */
if (new_pgdat)
rollback_node_hotadd(nid, pgdat);
+ if (res)
+ release_memory_resource(res);
return ret;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e07e27e846a..a9963ceddd6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1176,7 +1176,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
if (vma) {
unsigned long off;
- off = vma->vm_pgoff;
+ /*
+ * for small pages, there is no difference between
+ * shift and PAGE_SHIFT, so the bit-shift is safe.
+ * for huge pages, since vm_pgoff is in units of small
+ * pages, we need to shift off the always 0 bits to get
+ * a useful offset.
+ */
+ BUG_ON(shift < PAGE_SHIFT);
+ off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
off += (addr - vma->vm_start) >> shift;
return offset_il_node(pol, vma, off);
} else
diff --git a/mm/mempool.c b/mm/mempool.c
index fe6e05289cc..ccd8cb8cd41 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -238,8 +238,13 @@ repeat_alloc:
init_wait(&wait);
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
smp_mb();
- if (!pool->curr_nr)
- io_schedule();
+ if (!pool->curr_nr) {
+ /*
+ * FIXME: this should be io_schedule(). The timeout is there
+ * as a workaround for some DM problems in 2.6.18.
+ */
+ io_schedule_timeout(5*HZ);
+ }
finish_wait(&pool->wait, &wait);
goto repeat_alloc;
diff --git a/mm/mmap.c b/mm/mmap.c
index c1868ecdbc5..e66a0b524af 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -30,6 +30,10 @@
#include <asm/cacheflush.h>
#include <asm/tlb.h>
+#ifndef arch_mmap_check
+#define arch_mmap_check(addr, len, flags) (0)
+#endif
+
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
@@ -913,6 +917,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
if (!len)
return -EINVAL;
+ error = arch_mmap_check(addr, len, flags);
+ if (error)
+ return error;
+
/* Careful about overflows.. */
len = PAGE_ALIGN(len);
if (!len || len > TASK_SIZE)
@@ -1859,6 +1867,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
unsigned long flags;
struct rb_node ** rb_link, * rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
+ int error;
len = PAGE_ALIGN(len);
if (!len)
@@ -1867,6 +1876,12 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if ((addr + len) > TASK_SIZE || (addr + len) < addr)
return -EINVAL;
+ flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+
+ error = arch_mmap_check(addr, len, flags);
+ if (error)
+ return error;
+
/*
* mlock MCL_FUTURE?
*/
@@ -1907,8 +1922,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if (security_vm_enough_memory(len >> PAGE_SHIFT))
return -ENOMEM;
- flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-
/* Can we just expand an old private anonymous mapping? */
if (vma_merge(mm, prev, addr, addr + len, flags,
NULL, NULL, pgoff, NULL))
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 0959ee1a479..febea1c9816 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,7 +14,7 @@ struct pglist_data *first_online_pgdat(void)
return NODE_DATA(first_online_node);
}
-EXPORT_SYMBOL(first_online_pgdat);
+EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
@@ -24,7 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
return NULL;
return NODE_DATA(nid);
}
-EXPORT_SYMBOL(next_online_pgdat);
+EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */
/*
@@ -45,5 +45,5 @@ struct zone *next_zone(struct zone *zone)
}
return zone;
}
-EXPORT_SYMBOL(next_zone);
+EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */
diff --git a/mm/nommu.c b/mm/nommu.c
index 5151c44a825..c576df71e3b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1070,6 +1070,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
return 0;
}
+EXPORT_SYMBOL(remap_pfn_range);
void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
@@ -1090,6 +1091,7 @@ void unmap_mapping_range(struct address_space *mapping,
int even_cows)
{
}
+EXPORT_SYMBOL(unmap_mapping_range);
/*
* Check that a process has enough memory to allocate a new virtual
diff --git a/mm/slab.c b/mm/slab.c
index 85c2e03098a..21ba0603570 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -674,6 +674,37 @@ static struct kmem_cache cache_cache = {
#endif
};
+#ifdef CONFIG_LOCKDEP
+
+/*
+ * Slab sometimes uses the kmalloc slabs to store the slab headers
+ * for other slabs "off slab".
+ * The locking for this is tricky in that it nests within the locks
+ * of all other slabs in a few places; to deal with this special
+ * locking we put on-slab caches into a separate lock-class.
+ */
+static struct lock_class_key on_slab_key;
+
+static inline void init_lock_keys(struct cache_sizes *s)
+{
+ int q;
+
+ for (q = 0; q < MAX_NUMNODES; q++) {
+ if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep))
+ continue;
+ lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock,
+ &on_slab_key);
+ }
+}
+
+#else
+static inline void init_lock_keys(struct cache_sizes *s)
+{
+}
+#endif
+
+
+
/* Guard access to the cache-chain. */
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
@@ -1021,8 +1052,7 @@ static void drain_alien_cache(struct kmem_cache *cachep,
}
}
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp,
- int nesting)
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
struct slab *slabp = virt_to_slab(objp);
int nodeid = slabp->nodeid;
@@ -1040,7 +1070,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp,
STATS_INC_NODEFREES(cachep);
if (l3->alien && l3->alien[nodeid]) {
alien = l3->alien[nodeid];
- spin_lock_nested(&alien->lock, nesting);
+ spin_lock(&alien->lock);
if (unlikely(alien->avail == alien->limit)) {
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, alien, nodeid);
@@ -1069,15 +1099,14 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
{
}
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp,
- int nesting)
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
return 0;
}
#endif
-static int __devinit cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit cpuup_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -1393,6 +1422,7 @@ void __init kmem_cache_init(void)
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
NULL, NULL);
}
+ init_lock_keys(sizes);
sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
sizes->cs_size,
@@ -1760,8 +1790,6 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
}
#endif
-static void __cache_free(struct kmem_cache *cachep, void *objp, int nesting);
-
/**
* slab_destroy - destroy and release all objects in a slab
* @cachep: cache pointer being destroyed
@@ -1785,17 +1813,8 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
call_rcu(&slab_rcu->head, kmem_rcu_free);
} else {
kmem_freepages(cachep, addr);
- if (OFF_SLAB(cachep)) {
- unsigned long flags;
-
- /*
- * lockdep: we may nest inside an already held
- * ac->lock, so pass in a nesting flag:
- */
- local_irq_save(flags);
- __cache_free(cachep->slabp_cache, slabp, 1);
- local_irq_restore(flags);
- }
+ if (OFF_SLAB(cachep))
+ kmem_cache_free(cachep->slabp_cache, slabp);
}
}
@@ -3100,16 +3119,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
if (slabp->inuse == 0) {
if (l3->free_objects > l3->free_limit) {
l3->free_objects -= cachep->num;
- /*
- * It is safe to drop the lock. The slab is
- * no longer linked to the cache. cachep
- * cannot disappear - we are using it and
- * all destruction of caches must be
- * serialized properly by the user.
- */
- spin_unlock(&l3->list_lock);
slab_destroy(cachep, slabp);
- spin_lock(&l3->list_lock);
} else {
list_add(&slabp->list, &l3->slabs_free);
}
@@ -3135,7 +3145,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
#endif
check_irq_off();
l3 = cachep->nodelists[node];
- spin_lock_nested(&l3->list_lock, SINGLE_DEPTH_NESTING);
+ spin_lock(&l3->list_lock);
if (l3->shared) {
struct array_cache *shared_array = l3->shared;
int max = shared_array->limit - shared_array->avail;
@@ -3178,14 +3188,14 @@ free_done:
* Release an obj back to its cache. If the obj has a constructed state, it must
* be in this state _before_ it is released. Called with disabled ints.
*/
-static void __cache_free(struct kmem_cache *cachep, void *objp, int nesting)
+static inline void __cache_free(struct kmem_cache *cachep, void *objp)
{
struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
- if (cache_free_alien(cachep, objp, nesting))
+ if (cache_free_alien(cachep, objp))
return;
if (likely(ac->avail < ac->limit)) {
@@ -3214,7 +3224,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
EXPORT_SYMBOL(kmem_cache_alloc);
/**
- * kmem_cache_alloc - Allocate an object. The memory is set to zero.
+ * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
* @cache: The cache to allocate from.
* @flags: See kmalloc().
*
@@ -3424,7 +3434,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
BUG_ON(virt_to_cache(objp) != cachep);
local_irq_save(flags);
- __cache_free(cachep, objp, 0);
+ __cache_free(cachep, objp);
local_irq_restore(flags);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -3449,7 +3459,7 @@ void kfree(const void *objp)
kfree_debugcheck(objp);
c = virt_to_cache(objp);
debug_check_no_locks_freed(objp, obj_size(c));
- __cache_free(c, (void *)objp, 0);
+ __cache_free(c, (void *)objp);
local_irq_restore(flags);
}
EXPORT_SYMBOL(kfree);
diff --git a/mm/swap.c b/mm/swap.c
index 8fd095c4ae5..687686a61f7 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -54,6 +54,26 @@ void put_page(struct page *page)
}
EXPORT_SYMBOL(put_page);
+/**
+ * put_pages_list(): release a list of pages
+ *
+ * Release a list of pages which are strung together on page.lru. Currently
+ * used by read_cache_pages() and related error recovery code.
+ *
+ * @pages: list of pages threaded on page->lru
+ */
+void put_pages_list(struct list_head *pages)
+{
+ while (!list_empty(pages)) {
+ struct page *victim;
+
+ victim = list_entry(pages->prev, struct page, lru);
+ list_del(&victim->lru);
+ page_cache_release(victim);
+ }
+}
+EXPORT_SYMBOL(put_pages_list);
+
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e70d6c6d6fe..f1f5ec78378 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -442,11 +442,12 @@ int swap_type_of(dev_t device)
if (!(swap_info[i].flags & SWP_WRITEOK))
continue;
+
if (!device) {
spin_unlock(&swap_lock);
return i;
}
- inode = swap_info->swap_file->f_dentry->d_inode;
+ inode = swap_info[i].swap_file->f_dentry->d_inode;
if (S_ISBLK(inode->i_mode) &&
device == MKDEV(imajor(inode), iminor(inode))) {
spin_unlock(&swap_lock);
diff --git a/mm/truncate.c b/mm/truncate.c
index cf1b015df4a..c6ab55ec688 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -68,10 +68,10 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
return 0;
write_lock_irq(&mapping->tree_lock);
- if (PageDirty(page)) {
- write_unlock_irq(&mapping->tree_lock);
- return 0;
- }
+ if (PageDirty(page))
+ goto failed;
+ if (page_count(page) != 2) /* caller's ref + pagecache ref */
+ goto failed;
BUG_ON(PagePrivate(page));
__remove_from_page_cache(page);
@@ -79,6 +79,9 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
ClearPageUptodate(page);
page_cache_release(page); /* pagecache ref */
return 1;
+failed:
+ write_unlock_irq(&mapping->tree_lock);
+ return 0;
}
/**
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7b450798b45..266162d2ba2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -340,7 +340,7 @@ void __vunmap(void *addr, int deallocate_pages)
__free_page(area->pages[i]);
}
- if (area->nr_pages > PAGE_SIZE/sizeof(struct page *))
+ if (area->flags & VM_VPAGES)
vfree(area->pages);
else
kfree(area->pages);
@@ -427,9 +427,10 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
- if (array_size > PAGE_SIZE)
+ if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
- else
+ area->flags |= VM_VPAGES;
+ } else
pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
area->pages = pages;
if (!area->pages) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 73b83d67bab..c1b5f4106b3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/cpu.h>
void __get_zone_counts(unsigned long *active, unsigned long *inactive,
unsigned long *free, struct pglist_data *pgdat)
@@ -81,6 +82,7 @@ void all_vm_events(unsigned long *ret)
{
sum_vm_events(ret, &cpu_online_map);
}
+EXPORT_SYMBOL_GPL(all_vm_events);
#ifdef CONFIG_HOTPLUG
/*
@@ -113,17 +115,72 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
-#define STAT_THRESHOLD 32
+static int calculate_threshold(struct zone *zone)
+{
+ int threshold;
+ int mem; /* memory in 128 MB units */
+
+ /*
+ * The threshold scales with the number of processors and the amount
+ * of memory per zone. More memory means that we can defer updates for
+ * longer, more processors could lead to more contention.
+ * fls() is used to have a cheap way of logarithmic scaling.
+ *
+ * Some sample thresholds:
+ *
+ * Threshold Processors (fls) Zonesize fls(mem+1)
+ * ------------------------------------------------------------------
+ * 8 1 1 0.9-1 GB 4
+ * 16 2 2 0.9-1 GB 4
+ * 20 2 2 1-2 GB 5
+ * 24 2 2 2-4 GB 6
+ * 28 2 2 4-8 GB 7
+ * 32 2 2 8-16 GB 8
+ * 4 2 2 <128M 1
+ * 30 4 3 2-4 GB 5
+ * 48 4 3 8-16 GB 8
+ * 32 8 4 1-2 GB 4
+ * 32 8 4 0.9-1GB 4
+ * 10 16 5 <128M 1
+ * 40 16 5 900M 4
+ * 70 64 7 2-4 GB 5
+ * 84 64 7 4-8 GB 6
+ * 108 512 9 4-8 GB 6
+ * 125 1024 10 8-16 GB 8
+ * 125 1024 10 16-32 GB 9
+ */
+
+ mem = zone->present_pages >> (27 - PAGE_SHIFT);
+
+ threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
+
+ /*
+ * Maximum threshold is 125
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
/*
- * Determine pointer to currently valid differential byte given a zone and
- * the item number.
- *
- * Preemption must be off
+ * Refresh the thresholds for each zone.
*/
-static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
+static void refresh_zone_stat_thresholds(void)
{
- return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
+ struct zone *zone;
+ int cpu;
+ int threshold;
+
+ for_each_zone(zone) {
+
+ if (!zone->present_pages)
+ continue;
+
+ threshold = calculate_threshold(zone);
+
+ for_each_online_cpu(cpu)
+ zone_pcp(zone, cpu)->stat_threshold = threshold;
+ }
}
/*
@@ -132,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- s8 *p;
+ struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ s8 *p = pcp->vm_stat_diff + item;
long x;
- p = diff_pointer(zone, item);
x = delta + *p;
- if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
+ if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
zone_page_state_add(x, zone, item);
x = 0;
}
-
*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -171,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state);
* No overflow check is necessary and therefore the differential can be
* incremented or decremented in place which may allow the compilers to
* generate better code.
- *
* The increment or decrement is known and therefore one boundary check can
* be omitted.
*
+ * NOTE: These functions are very performance sensitive. Change only
+ * with care.
+ *
* Some processors have inc/dec instructions that are atomic vs an interrupt.
* However, the code must first determine the differential location in a zone
* based on the processor number and then inc/dec the counter. There is no
@@ -184,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- s8 *p = diff_pointer(zone, item);
+ struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ s8 *p = pcp->vm_stat_diff + item;
(*p)++;
- if (unlikely(*p > STAT_THRESHOLD)) {
- zone_page_state_add(*p, zone, item);
- *p = 0;
+ if (unlikely(*p > pcp->stat_threshold)) {
+ int overstep = pcp->stat_threshold / 2;
+
+ zone_page_state_add(*p + overstep, zone, item);
+ *p = -overstep;
}
}
@@ -203,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
struct zone *zone = page_zone(page);
- s8 *p = diff_pointer(zone, item);
+ struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ s8 *p = pcp->vm_stat_diff + item;
(*p)--;
- if (unlikely(*p < -STAT_THRESHOLD)) {
- zone_page_state_add(*p, zone, item);
- *p = 0;
+ if (unlikely(*p < - pcp->stat_threshold)) {
+ int overstep = pcp->stat_threshold / 2;
+
+ zone_page_state_add(*p - overstep, zone, item);
+ *p = overstep;
}
}
EXPORT_SYMBOL(__dec_zone_page_state);
@@ -238,19 +302,9 @@ EXPORT_SYMBOL(inc_zone_page_state);
void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
unsigned long flags;
- struct zone *zone;
- s8 *p;
- zone = page_zone(page);
local_irq_save(flags);
- p = diff_pointer(zone, item);
-
- (*p)--;
-
- if (unlikely(*p < -STAT_THRESHOLD)) {
- zone_page_state_add(*p, zone, item);
- *p = 0;
- }
+ __dec_zone_page_state(page, item);
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
@@ -524,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
pageset->pcp[j].high,
pageset->pcp[j].batch);
}
+#ifdef CONFIG_SMP
+ seq_printf(m, "\n vm stats threshold: %d",
+ pageset->stat_threshold);
+#endif
}
seq_printf(m,
"\n all_unreclaimable: %u"
@@ -612,3 +670,35 @@ struct seq_operations vmstat_op = {
#endif /* CONFIG_PROC_FS */
+#ifdef CONFIG_SMP
+/*
+ * Use the cpu notifier to insure that the thresholds are recalculated
+ * when necessary.
+ */
+static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ refresh_zone_stat_thresholds();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata vmstat_notifier =
+ { &vmstat_cpuup_callback, NULL, 0 };
+
+int __init setup_vmstat(void)
+{
+ refresh_zone_stat_thresholds();
+ register_cpu_notifier(&vmstat_notifier);
+ return 0;
+}
+module_init(setup_vmstat)
+#endif