diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/bootmem.c | 4 | ||||
-rw-r--r-- | mm/fadvise.c | 15 | ||||
-rw-r--r-- | mm/filemap.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 10 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 44 | ||||
-rw-r--r-- | mm/mempolicy.c | 10 | ||||
-rw-r--r-- | mm/mempool.c | 9 | ||||
-rw-r--r-- | mm/mmap.c | 17 | ||||
-rw-r--r-- | mm/mmzone.c | 6 | ||||
-rw-r--r-- | mm/mremap.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 8 | ||||
-rw-r--r-- | mm/page_alloc.c | 22 | ||||
-rw-r--r-- | mm/slab.c | 53 | ||||
-rw-r--r-- | mm/swap.c | 20 | ||||
-rw-r--r-- | mm/swap_state.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 3 | ||||
-rw-r--r-- | mm/truncate.c | 11 | ||||
-rw-r--r-- | mm/vmalloc.c | 9 | ||||
-rw-r--r-- | mm/vmscan.c | 27 | ||||
-rw-r--r-- | mm/vmstat.c | 152 |
21 files changed, 328 insertions, 100 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c index d213feded10..50353e0dac1 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -29,9 +29,7 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -EXPORT_SYMBOL(max_pfn); /* This is exported so - * dma_get_required_mask(), which uses - * it, can be an inline function */ +EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */ static LIST_HEAD(bdata_list); #ifdef CONFIG_CRASH_DUMP diff --git a/mm/fadvise.c b/mm/fadvise.c index 0a03357a1f8..168c78a121b 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -23,18 +23,6 @@ /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. - * - * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file - * offsets `offset' and `offset+len' inclusive. Any pages which are currently - * under writeout are skipped, whether or not they are dirty. - * - * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file - * offsets `offset' and `offset+len'. - * - * By combining these two operations the application may do several things: - * - * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. - * */ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { @@ -85,7 +73,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) file->f_ra.ra_pages = bdi->ra_pages * 2; break; case POSIX_FADV_WILLNEED: - case POSIX_FADV_NOREUSE: if (!mapping->a_ops->readpage) { ret = -EINVAL; break; @@ -106,6 +93,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) if (ret > 0) ret = 0; break; + case POSIX_FADV_NOREUSE: + break; case POSIX_FADV_DONTNEED: if (!bdi_write_congested(mapping->backing_dev_info)) filemap_flush(mapping); diff --git a/mm/filemap.c b/mm/filemap.c index d087fc3d328..b9a60c43b61 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -849,8 +849,6 @@ static void shrink_readahead_size_eio(struct file *filp, return; ra->ra_pages /= 4; - printk(KERN_WARNING "Reducing readahead size to %luK\n", - ra->ra_pages << (PAGE_CACHE_SHIFT - 10)); } /** diff --git a/mm/memory.c b/mm/memory.c index 7e2a4b1580e..109e9866237 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -47,6 +47,7 @@ #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/module.h> +#include <linux/delayacct.h> #include <linux/init.h> #include <asm/pgalloc.h> @@ -503,7 +504,7 @@ again: return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); - spin_lock(src_ptl); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); do { /* @@ -1549,9 +1550,9 @@ gotten: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + lazy_mmu_prot_update(entry); ptep_establish(vma, address, page_table, entry); update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); @@ -1853,7 +1854,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return 0; } -EXPORT_SYMBOL(vmtruncate_range); +EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ /* * Primitive swap readahead code. We simply read an aligned block of @@ -1934,6 +1935,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, migration_entry_wait(mm, pmd, address); goto out; } + delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1946,6 +1948,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; } @@ -1955,6 +1958,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, grab_swap_token(); } + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); mark_page_accessed(page); lock_page(page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 01c9fb97c61..c37319542b7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -52,6 +52,9 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) int nr_pages = PAGES_PER_SECTION; int ret; + if (pfn_valid(phys_start_pfn)) + return -EEXIST; + ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); if (ret < 0) @@ -76,15 +79,22 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, { unsigned long i; int err = 0; + int start_sec, end_sec; + /* during initialize mem_map, align hot-added range to section */ + start_sec = pfn_to_section_nr(phys_start_pfn); + end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); - for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { - err = __add_section(zone, phys_start_pfn + i); + for (i = start_sec; i <= end_sec; i++) { + err = __add_section(zone, i << PFN_SECTION_SHIFT); - /* We want to keep adding the rest of the - * sections if the first ones already exist + /* + * EEXIST is finally dealed with by ioresource collision + * check. see add_memory() => register_memory_resource() + * Warning will be printed if there is collision. */ if (err && (err != -EEXIST)) break; + err = 0; } return err; @@ -156,7 +166,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) res.flags = IORESOURCE_MEM; /* we just need system ram */ section_end = res.end; - while (find_next_system_ram(&res) >= 0) { + while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); nr_pages = (unsigned long) ((res.end + 1 - res.start) >> PAGE_SHIFT); @@ -213,10 +223,9 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } /* add this memory to iomem resource */ -static void register_memory_resource(u64 start, u64 size) +static struct resource *register_memory_resource(u64 start, u64 size) { struct resource *res; - res = kzalloc(sizeof(struct resource), GFP_KERNEL); BUG_ON(!res); @@ -228,7 +237,18 @@ static void register_memory_resource(u64 start, u64 size) printk("System RAM resource %llx - %llx cannot be added\n", (unsigned long long)res->start, (unsigned long long)res->end); kfree(res); + res = NULL; } + return res; +} + +static void release_memory_resource(struct resource *res) +{ + if (!res) + return; + release_resource(res); + kfree(res); + return; } @@ -237,8 +257,13 @@ int add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; int new_pgdat = 0; + struct resource *res; int ret; + res = register_memory_resource(start, size); + if (!res) + return -EEXIST; + if (!node_online(nid)) { pgdat = hotadd_new_pgdat(nid, start); if (!pgdat) @@ -268,14 +293,13 @@ int add_memory(int nid, u64 start, u64 size) BUG_ON(ret); } - /* register this memory as resource */ - register_memory_resource(start, size); - return ret; error: /* rollback pgdat allocation and others */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); + if (res) + release_memory_resource(res); return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e07e27e846a..a9963ceddd6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1176,7 +1176,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol, if (vma) { unsigned long off; - off = vma->vm_pgoff; + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. + */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; return offset_il_node(pol, vma, off); } else diff --git a/mm/mempool.c b/mm/mempool.c index fe6e05289cc..ccd8cb8cd41 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -238,8 +238,13 @@ repeat_alloc: init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); smp_mb(); - if (!pool->curr_nr) - io_schedule(); + if (!pool->curr_nr) { + /* + * FIXME: this should be io_schedule(). The timeout is there + * as a workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5*HZ); + } finish_wait(&pool->wait, &wait); goto repeat_alloc; diff --git a/mm/mmap.c b/mm/mmap.c index c1868ecdbc5..e66a0b524af 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -30,6 +30,10 @@ #include <asm/cacheflush.h> #include <asm/tlb.h> +#ifndef arch_mmap_check +#define arch_mmap_check(addr, len, flags) (0) +#endif + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -913,6 +917,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, if (!len) return -EINVAL; + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len || len > TASK_SIZE) @@ -1859,6 +1867,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) unsigned long flags; struct rb_node ** rb_link, * rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; + int error; len = PAGE_ALIGN(len); if (!len) @@ -1867,6 +1876,12 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if ((addr + len) > TASK_SIZE || (addr + len) < addr) return -EINVAL; + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + /* * mlock MCL_FUTURE? */ @@ -1907,8 +1922,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (security_vm_enough_memory(len >> PAGE_SHIFT)) return -ENOMEM; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - /* Can we just expand an old private anonymous mapping? */ if (vma_merge(mm, prev, addr, addr + len, flags, NULL, NULL, pgoff, NULL)) diff --git a/mm/mmzone.c b/mm/mmzone.c index 0959ee1a479..febea1c9816 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -14,7 +14,7 @@ struct pglist_data *first_online_pgdat(void) return NODE_DATA(first_online_node); } -EXPORT_SYMBOL(first_online_pgdat); +EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) { @@ -24,7 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) return NULL; return NODE_DATA(nid); } -EXPORT_SYMBOL(next_online_pgdat); +EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */ /* @@ -45,5 +45,5 @@ struct zone *next_zone(struct zone *zone) } return zone; } -EXPORT_SYMBOL(next_zone); +EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */ diff --git a/mm/mremap.c b/mm/mremap.c index 1903bdf65e4..7c15cf3373a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -97,7 +97,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_pte = pte_offset_map_nested(new_pmd, new_addr); new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) - spin_lock(new_ptl); + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { diff --git a/mm/nommu.c b/mm/nommu.c index 5151c44a825..c576df71e3b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1070,6 +1070,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; return 0; } +EXPORT_SYMBOL(remap_pfn_range); void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -1090,6 +1091,7 @@ void unmap_mapping_range(struct address_space *mapping, int even_cows) { } +EXPORT_SYMBOL(unmap_mapping_range); /* * Check that a process has enough memory to allocate a new virtual diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d46ed0f1dc0..b9af136e5cf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -225,7 +225,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that * we select a process with CAP_SYS_RAW_IO set). */ -static void __oom_kill_task(task_t *p, const char *message) +static void __oom_kill_task(struct task_struct *p, const char *message) { if (p->pid == 1) { WARN_ON(1); @@ -255,10 +255,10 @@ static void __oom_kill_task(task_t *p, const char *message) force_sig(SIGKILL, p); } -static int oom_kill_task(task_t *p, const char *message) +static int oom_kill_task(struct task_struct *p, const char *message) { struct mm_struct *mm; - task_t * g, * q; + struct task_struct *g, *q; mm = p->mm; @@ -316,7 +316,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, */ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { - task_t *p; + struct task_struct *p; unsigned long points = 0; if (printk_ratelimit()) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3e792a583f3..54a4f5375bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2005,6 +2005,10 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; +#ifdef CONFIG_NUMA + zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) + / 100; +#endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); @@ -2298,6 +2302,24 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, return 0; } +#ifdef CONFIG_NUMA +int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int rc; + + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (rc) + return rc; + + for_each_zone(zone) + zone->min_unmapped_ratio = (zone->present_pages * + sysctl_min_unmapped_ratio) / 100; + return 0; +} +#endif + /* * lowmem_reserve_ratio_sysctl_handler - just a wrapper around * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() diff --git a/mm/slab.c b/mm/slab.c index 3936af34454..21ba0603570 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -674,6 +674,37 @@ static struct kmem_cache cache_cache = { #endif }; +#ifdef CONFIG_LOCKDEP + +/* + * Slab sometimes uses the kmalloc slabs to store the slab headers + * for other slabs "off slab". + * The locking for this is tricky in that it nests within the locks + * of all other slabs in a few places; to deal with this special + * locking we put on-slab caches into a separate lock-class. + */ +static struct lock_class_key on_slab_key; + +static inline void init_lock_keys(struct cache_sizes *s) +{ + int q; + + for (q = 0; q < MAX_NUMNODES; q++) { + if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep)) + continue; + lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock, + &on_slab_key); + } +} + +#else +static inline void init_lock_keys(struct cache_sizes *s) +{ +} +#endif + + + /* Guard access to the cache-chain. */ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; @@ -1075,7 +1106,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) #endif -static int __devinit cpuup_callback(struct notifier_block *nfb, +static int __cpuinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1272,6 +1303,11 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, local_irq_disable(); memcpy(ptr, list, sizeof(struct kmem_list3)); + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ + spin_lock_init(&ptr->list_lock); + MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->nodelists[nodeid] = ptr; local_irq_enable(); @@ -1386,6 +1422,7 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_FLAGS|SLAB_PANIC, NULL, NULL); } + init_lock_keys(sizes); sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, @@ -1398,7 +1435,7 @@ void __init kmem_cache_init(void) } /* 4) Replace the bootstrap head arrays */ { - void *ptr; + struct array_cache *ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); @@ -1406,6 +1443,11 @@ void __init kmem_cache_init(void) BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); memcpy(ptr, cpu_cache_get(&cache_cache), sizeof(struct arraycache_init)); + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ + spin_lock_init(&ptr->lock); + cache_cache.array[smp_processor_id()] = ptr; local_irq_enable(); @@ -1416,6 +1458,11 @@ void __init kmem_cache_init(void) != &initarray_generic.cache); memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), sizeof(struct arraycache_init)); + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ + spin_lock_init(&ptr->lock); + malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = ptr; local_irq_enable(); @@ -3177,7 +3224,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** - * kmem_cache_alloc - Allocate an object. The memory is set to zero. + * kmem_cache_zalloc - Allocate an object. The memory is set to zero. * @cache: The cache to allocate from. * @flags: See kmalloc(). * diff --git a/mm/swap.c b/mm/swap.c index 8fd095c4ae5..687686a61f7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -54,6 +54,26 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/** + * put_pages_list(): release a list of pages + * + * Release a list of pages which are strung together on page.lru. Currently + * used by read_cache_pages() and related error recovery code. + * + * @pages: list of pages threaded on page->lru + */ +void put_pages_list(struct list_head *pages) +{ + while (!list_empty(pages)) { + struct page *victim; + + victim = list_entry(pages->prev, struct page, lru); + list_del(&victim->lru); + page_cache_release(victim); + } +} +EXPORT_SYMBOL(put_pages_list); + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the diff --git a/mm/swap_state.c b/mm/swap_state.c index fccbd9bba77..5f7cf2a4cb5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -38,7 +38,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = RW_LOCK_UNLOCKED, + .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), .a_ops = &swap_aops, .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, diff --git a/mm/swapfile.c b/mm/swapfile.c index e70d6c6d6fe..f1f5ec78378 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -442,11 +442,12 @@ int swap_type_of(dev_t device) if (!(swap_info[i].flags & SWP_WRITEOK)) continue; + if (!device) { spin_unlock(&swap_lock); return i; } - inode = swap_info->swap_file->f_dentry->d_inode; + inode = swap_info[i].swap_file->f_dentry->d_inode; if (S_ISBLK(inode->i_mode) && device == MKDEV(imajor(inode), iminor(inode))) { spin_unlock(&swap_lock); diff --git a/mm/truncate.c b/mm/truncate.c index cf1b015df4a..c6ab55ec688 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -68,10 +68,10 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return 0; write_lock_irq(&mapping->tree_lock); - if (PageDirty(page)) { - write_unlock_irq(&mapping->tree_lock); - return 0; - } + if (PageDirty(page)) + goto failed; + if (page_count(page) != 2) /* caller's ref + pagecache ref */ + goto failed; BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); @@ -79,6 +79,9 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; +failed: + write_unlock_irq(&mapping->tree_lock); + return 0; } /** diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35f8553f893..266162d2ba2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -330,6 +330,8 @@ void __vunmap(void *addr, int deallocate_pages) return; } + debug_check_no_locks_freed(addr, area->size); + if (deallocate_pages) { int i; @@ -338,7 +340,7 @@ void __vunmap(void *addr, int deallocate_pages) __free_page(area->pages[i]); } - if (area->nr_pages > PAGE_SIZE/sizeof(struct page *)) + if (area->flags & VM_VPAGES) vfree(area->pages); else kfree(area->pages); @@ -425,9 +427,10 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ - if (array_size > PAGE_SIZE) + if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); - else + area->flags |= VM_VPAGES; + } else pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); area->pages = pages; if (!area->pages) { diff --git a/mm/vmscan.c b/mm/vmscan.c index ff2ebe9458a..5d4c4d02254 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1503,10 +1503,6 @@ module_init(kswapd_init) * * If non-zero call zone_reclaim when the number of free pages falls below * the watermarks. - * - * In the future we may add flags to the mode. However, the page allocator - * should only have to check that zone_reclaim_mode != 0 before calling - * zone_reclaim(). */ int zone_reclaim_mode __read_mostly; @@ -1524,6 +1520,12 @@ int zone_reclaim_mode __read_mostly; #define ZONE_RECLAIM_PRIORITY 4 /* + * Percentage of pages in a zone that must be unmapped for zone_reclaim to + * occur. + */ +int sysctl_min_unmapped_ratio = 1; + +/* * Try to free up some pages from this zone through reclaim. */ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) @@ -1590,18 +1592,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int node_id; /* - * Do not reclaim if there are not enough reclaimable pages in this - * zone that would satify this allocations. + * Zone reclaim reclaims unmapped file backed pages. * - * All unmapped pagecache pages are reclaimable. - * - * Both counters may be temporarily off a bit so we use - * SWAP_CLUSTER_MAX as the boundary. It may also be good to - * leave a few frequently used unmapped pagecache pages around. + * A small portion of unmapped file backed pages is needed for + * file I/O otherwise pages read by file I/O will be immediately + * thrown out if the zone is overallocated. So we do not reclaim + * if less than a specified percentage of the zone is used by + * unmapped file backed pages. */ if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX) - return 0; + zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) + return 0; /* * Avoid concurrent zone reclaims, do not reclaim in a zone that does diff --git a/mm/vmstat.c b/mm/vmstat.c index 73b83d67bab..c1b5f4106b3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -12,6 +12,7 @@ #include <linux/config.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/cpu.h> void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) @@ -81,6 +82,7 @@ void all_vm_events(unsigned long *ret) { sum_vm_events(ret, &cpu_online_map); } +EXPORT_SYMBOL_GPL(all_vm_events); #ifdef CONFIG_HOTPLUG /* @@ -113,17 +115,72 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -#define STAT_THRESHOLD 32 +static int calculate_threshold(struct zone *zone) +{ + int threshold; + int mem; /* memory in 128 MB units */ + + /* + * The threshold scales with the number of processors and the amount + * of memory per zone. More memory means that we can defer updates for + * longer, more processors could lead to more contention. + * fls() is used to have a cheap way of logarithmic scaling. + * + * Some sample thresholds: + * + * Threshold Processors (fls) Zonesize fls(mem+1) + * ------------------------------------------------------------------ + * 8 1 1 0.9-1 GB 4 + * 16 2 2 0.9-1 GB 4 + * 20 2 2 1-2 GB 5 + * 24 2 2 2-4 GB 6 + * 28 2 2 4-8 GB 7 + * 32 2 2 8-16 GB 8 + * 4 2 2 <128M 1 + * 30 4 3 2-4 GB 5 + * 48 4 3 8-16 GB 8 + * 32 8 4 1-2 GB 4 + * 32 8 4 0.9-1GB 4 + * 10 16 5 <128M 1 + * 40 16 5 900M 4 + * 70 64 7 2-4 GB 5 + * 84 64 7 4-8 GB 6 + * 108 512 9 4-8 GB 6 + * 125 1024 10 8-16 GB 8 + * 125 1024 10 16-32 GB 9 + */ + + mem = zone->present_pages >> (27 - PAGE_SHIFT); + + threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} /* - * Determine pointer to currently valid differential byte given a zone and - * the item number. - * - * Preemption must be off + * Refresh the thresholds for each zone. */ -static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) +static void refresh_zone_stat_thresholds(void) { - return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; + struct zone *zone; + int cpu; + int threshold; + + for_each_zone(zone) { + + if (!zone->present_pages) + continue; + + threshold = calculate_threshold(zone); + + for_each_online_cpu(cpu) + zone_pcp(zone, cpu)->stat_threshold = threshold; + } } /* @@ -132,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - s8 *p; + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; long x; - p = diff_pointer(zone, item); x = delta + *p; - if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { + if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { zone_page_state_add(x, zone, item); x = 0; } - *p = x; } EXPORT_SYMBOL(__mod_zone_page_state); @@ -171,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state); * No overflow check is necessary and therefore the differential can be * incremented or decremented in place which may allow the compilers to * generate better code. - * * The increment or decrement is known and therefore one boundary check can * be omitted. * + * NOTE: These functions are very performance sensitive. Change only + * with care. + * * Some processors have inc/dec instructions that are atomic vs an interrupt. * However, the code must first determine the differential location in a zone * based on the processor number and then inc/dec the counter. There is no @@ -184,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state); */ static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - s8 *p = diff_pointer(zone, item); + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; (*p)++; - if (unlikely(*p > STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; + if (unlikely(*p > pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p + overstep, zone, item); + *p = -overstep; } } @@ -203,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { struct zone *zone = page_zone(page); - s8 *p = diff_pointer(zone, item); + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; (*p)--; - if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; + if (unlikely(*p < - pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p - overstep, zone, item); + *p = overstep; } } EXPORT_SYMBOL(__dec_zone_page_state); @@ -238,19 +302,9 @@ EXPORT_SYMBOL(inc_zone_page_state); void dec_zone_page_state(struct page *page, enum zone_stat_item item) { unsigned long flags; - struct zone *zone; - s8 *p; - zone = page_zone(page); local_irq_save(flags); - p = diff_pointer(zone, item); - - (*p)--; - - if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; - } + __dec_zone_page_state(page, item); local_irq_restore(flags); } EXPORT_SYMBOL(dec_zone_page_state); @@ -524,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) pageset->pcp[j].high, pageset->pcp[j].batch); } +#ifdef CONFIG_SMP + seq_printf(m, "\n vm stats threshold: %d", + pageset->stat_threshold); +#endif } seq_printf(m, "\n all_unreclaimable: %u" @@ -612,3 +670,35 @@ struct seq_operations vmstat_op = { #endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_SMP +/* + * Use the cpu notifier to insure that the thresholds are recalculated + * when necessary. + */ +static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DEAD: + refresh_zone_stat_thresholds(); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata vmstat_notifier = + { &vmstat_cpuup_callback, NULL, 0 }; + +int __init setup_vmstat(void) +{ + refresh_zone_stat_thresholds(); + register_cpu_notifier(&vmstat_notifier); + return 0; +} +module_init(setup_vmstat) +#endif |