diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 12 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/backing-dev.c | 18 | ||||
-rw-r--r-- | mm/filemap.c | 2 | ||||
-rw-r--r-- | mm/filemap_xip.c | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 2 | ||||
-rw-r--r-- | mm/kmemleak.c | 5 | ||||
-rw-r--r-- | mm/ksm.c | 10 | ||||
-rw-r--r-- | mm/memcontrol.c | 127 | ||||
-rw-r--r-- | mm/memory-failure.c | 59 | ||||
-rw-r--r-- | mm/memory.c | 14 | ||||
-rw-r--r-- | mm/mempolicy.c | 13 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 44 | ||||
-rw-r--r-- | mm/page-writeback.c | 33 | ||||
-rw-r--r-- | mm/page_alloc.c | 3 | ||||
-rw-r--r-- | mm/percpu.c | 112 | ||||
-rw-r--r-- | mm/rmap.c | 4 | ||||
-rw-r--r-- | mm/shmem.c | 9 | ||||
-rw-r--r-- | mm/swapfile.c | 12 | ||||
-rw-r--r-- | mm/vmalloc.c | 50 | ||||
-rw-r--r-- | mm/vmscan.c | 22 |
22 files changed, 321 insertions, 238 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 24776072959..fd3386242cf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -67,7 +67,7 @@ config DISCONTIGMEM config SPARSEMEM def_bool y - depends on SPARSEMEM_MANUAL + depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL config FLATMEM def_bool y @@ -129,7 +129,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC64 || SUPERH || S390) + depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) comment "Memory hotplug is currently incompatible with Software Suspend" depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 @@ -224,7 +224,9 @@ config KSM the many instances by a single resident page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. - See Documentation/vm/ksm.txt for more information. + See Documentation/vm/ksm.txt for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" @@ -244,10 +246,12 @@ config DEFAULT_MMAP_MIN_ADDR This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. +config ARCH_SUPPORTS_MEMORY_FAILURE + bool config MEMORY_FAILURE depends on MMU - depends on X86_MCE + depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" help Enables code to recover from some memory failures on systems diff --git a/mm/Makefile b/mm/Makefile index 515fd793c17..ebf849042ed 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,14 +5,14 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o + vmalloc.o pagewalk.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o mmu_context.o \ - pagewalk.o $(mmu-y) + $(mmu-y) obj-y += init-mm.o obj-$(CONFIG_BOUNCE) += bounce.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3d3accb1f80..1065b715ef6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -92,7 +92,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" - "WriteBack threads:%8lu\n" + "WritebackThreads: %8lu\n" "b_dirty: %8lu\n" "b_io: %8lu\n" "b_more_io: %8lu\n" @@ -610,6 +610,21 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) kthread_stop(wb->task); } +/* + * This bdi is going away now, make sure that no super_blocks point to it + */ +static void bdi_prune_sb(struct backing_dev_info *bdi) +{ + struct super_block *sb; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_bdi == bdi) + sb->s_bdi = NULL; + } + spin_unlock(&sb_lock); +} + void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { @@ -682,6 +697,7 @@ void bdi_destroy(struct backing_dev_info *bdi) spin_unlock(&inode_lock); } + bdi_prune_sb(bdi); bdi_unregister(bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) diff --git a/mm/filemap.c b/mm/filemap.c index 6c84e598b4a..ef169f37156 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1611,7 +1611,7 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, }; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 427dfe3ce78..1888b2d71bb 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -296,7 +296,7 @@ out: } } -static struct vm_operations_struct xip_file_vm_ops = { +static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6f048fcc749..5d7601b0287 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1721,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -struct vm_operations_struct hugetlb_vm_ops = { +const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 4ea4510e299..8bf765c4f58 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -833,12 +833,15 @@ static void early_alloc(struct early_log *log) */ rcu_read_lock(); object = create_object((unsigned long)log->ptr, log->size, - log->min_count, GFP_KERNEL); + log->min_count, GFP_ATOMIC); + if (!object) + goto out; spin_lock_irqsave(&object->lock, flags); for (i = 0; i < log->trace_len; i++) object->trace[i] = log->trace[i]; object->trace_len = log->trace_len; spin_unlock_irqrestore(&object->lock, flags); +out: rcu_read_unlock(); } @@ -184,11 +184,6 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock); sizeof(struct __struct), __alignof__(struct __struct),\ (__flags), NULL) -static void __init ksm_init_max_kernel_pages(void) -{ - ksm_max_kernel_pages = nr_free_buffer_pages() / 4; -} - static int __init ksm_slab_init(void) { rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); @@ -1673,7 +1668,7 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; - ksm_init_max_kernel_pages(); + ksm_max_kernel_pages = totalram_pages / 4; err = ksm_slab_init(); if (err) @@ -1697,6 +1692,9 @@ static int __init ksm_init(void) kthread_stop(ksm_thread); goto out_free2; } +#else + ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ + #endif /* CONFIG_SYSFS */ return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2b98a6875c..f99f5991d6b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -313,7 +313,8 @@ soft_limit_tree_from_page(struct page *page) static void __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -322,7 +323,9 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, if (mz->on_tree) return; - mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res); + mz->usage_in_excess = new_usage_in_excess; + if (!mz->usage_in_excess) + return; while (*p) { parent = *p; mz_node = rb_entry(parent, struct mem_cgroup_per_zone, @@ -353,16 +356,6 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, } static void -mem_cgroup_insert_exceeded(struct mem_cgroup *mem, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - spin_lock(&mctz->lock); - __mem_cgroup_insert_exceeded(mem, mz, mctz); - spin_unlock(&mctz->lock); -} - -static void mem_cgroup_remove_exceeded(struct mem_cgroup *mem, struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) @@ -392,34 +385,36 @@ static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) { - unsigned long long prev_usage_in_excess, new_usage_in_excess; - bool updated_tree = false; + unsigned long long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - - mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page)); + int nid = page_to_nid(page); + int zid = page_zonenum(page); mctz = soft_limit_tree_from_page(page); /* - * We do updates in lazy mode, mem's are removed - * lazily from the per-zone, per-node rb tree + * Necessary to update all ancestors when hierarchy is used. + * because their event counter is not touched. */ - prev_usage_in_excess = mz->usage_in_excess; - - new_usage_in_excess = res_counter_soft_limit_excess(&mem->res); - if (prev_usage_in_excess) { - mem_cgroup_remove_exceeded(mem, mz, mctz); - updated_tree = true; - } - if (!new_usage_in_excess) - goto done; - mem_cgroup_insert_exceeded(mem, mz, mctz); - -done: - if (updated_tree) { - spin_lock(&mctz->lock); - mz->usage_in_excess = new_usage_in_excess; - spin_unlock(&mctz->lock); + for (; mem; mem = parent_mem_cgroup(mem)) { + mz = mem_cgroup_zoneinfo(mem, nid, zid); + excess = res_counter_soft_limit_excess(&mem->res); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + spin_lock(&mctz->lock); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(mem, mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. + */ + __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); + spin_unlock(&mctz->lock); + } } } @@ -447,9 +442,10 @@ static struct mem_cgroup_per_zone * __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) { struct rb_node *rightmost = NULL; - struct mem_cgroup_per_zone *mz = NULL; + struct mem_cgroup_per_zone *mz; retry: + mz = NULL; rightmost = rb_last(&mctz->rb_root); if (!rightmost) goto done; /* Nothing to reclaim from */ @@ -1270,9 +1266,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom, struct page *page) { - struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit; + struct mem_cgroup *mem, *mem_over_limit; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct res_counter *fail_res, *soft_fail_res = NULL; + struct res_counter *fail_res; if (unlikely(test_thread_flag(TIF_MEMDIE))) { /* Don't account this! */ @@ -1304,17 +1300,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (mem_cgroup_is_root(mem)) goto done; - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, - &soft_fail_res); + ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); if (likely(!ret)) { if (!do_swap_account) break; ret = res_counter_charge(&mem->memsw, PAGE_SIZE, - &fail_res, NULL); + &fail_res); if (likely(!ret)) break; /* mem+swap counter fails */ - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); + res_counter_uncharge(&mem->res, PAGE_SIZE); flags |= MEM_CGROUP_RECLAIM_NOSWAP; mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); @@ -1353,16 +1348,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, } } /* - * Insert just the ancestor, we should trickle down to the correct - * cgroup for reclaim, since the other nodes will be below their - * soft limit + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. */ - if (soft_fail_res) { - mem_over_soft_limit = - mem_cgroup_from_res_counter(soft_fail_res, res); - if (mem_cgroup_soft_limit_check(mem_over_soft_limit)) - mem_cgroup_update_tree(mem_over_soft_limit, page); - } + if (mem_cgroup_soft_limit_check(mem)) + mem_cgroup_update_tree(mem, page); done: return 0; nomem: @@ -1437,10 +1427,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); + res_counter_uncharge(&mem->res, PAGE_SIZE); if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE, - NULL); + res_counter_uncharge(&mem->memsw, PAGE_SIZE); } css_put(&mem->css); return; @@ -1519,7 +1508,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, goto out; if (!mem_cgroup_is_root(from)) - res_counter_uncharge(&from->res, PAGE_SIZE, NULL); + res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); page = pc->page; @@ -1539,7 +1528,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, } if (do_swap_account && !mem_cgroup_is_root(from)) - res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); + res_counter_uncharge(&from->memsw, PAGE_SIZE); css_put(&from->css); css_get(&to->css); @@ -1610,9 +1599,9 @@ uncharge: css_put(&parent->css); /* uncharge if move fails */ if (!mem_cgroup_is_root(parent)) { - res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); + res_counter_uncharge(&parent->res, PAGE_SIZE); if (do_swap_account) - res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); + res_counter_uncharge(&parent->memsw, PAGE_SIZE); } return ret; } @@ -1803,8 +1792,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * calling css_tryget */ if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE, - NULL); + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); mem_cgroup_put(memcg); } @@ -1831,9 +1819,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) if (!mem) return; if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); + res_counter_uncharge(&mem->res, PAGE_SIZE); if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); + res_counter_uncharge(&mem->memsw, PAGE_SIZE); } css_put(&mem->css); } @@ -1848,7 +1836,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) struct page_cgroup *pc; struct mem_cgroup *mem = NULL; struct mem_cgroup_per_zone *mz; - bool soft_limit_excess = false; if (mem_cgroup_disabled()) return NULL; @@ -1888,10 +1875,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) } if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); + res_counter_uncharge(&mem->res, PAGE_SIZE); if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); + res_counter_uncharge(&mem->memsw, PAGE_SIZE); } if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) mem_cgroup_swap_statistics(mem, true); @@ -1908,7 +1895,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mz = page_cgroup_zoneinfo(pc); unlock_page_cgroup(pc); - if (soft_limit_excess && mem_cgroup_soft_limit_check(mem)) + if (mem_cgroup_soft_limit_check(mem)) mem_cgroup_update_tree(mem, page); /* at swapout, this memcg will be accessed to record to swap */ if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) @@ -1986,7 +1973,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * This memcg can be obsolete one. We avoid calling css_tryget */ if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); mem_cgroup_put(memcg); } @@ -2233,6 +2220,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, unsigned long reclaimed; int loop = 0; struct mem_cgroup_tree_per_zone *mctz; + unsigned long long excess; if (order > 0) return 0; @@ -2284,9 +2272,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; } while (1); } - mz->usage_in_excess = - res_counter_soft_limit_excess(&mz->mem->res); __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); + excess = res_counter_soft_limit_excess(&mz->mem->res); /* * One school of thought says that we should not add * back the node to the tree if reclaim returns 0. @@ -2295,8 +2282,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, * memory to reclaim from. Consider this as a longer * term TODO. */ - if (mz->usage_in_excess) - __mem_cgroup_insert_exceeded(mz->mem, mz, mctz); + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); spin_unlock(&mctz->lock); css_put(&mz->mem->css); loop++; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 729d4b15b64..dacc6418387 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -35,6 +35,7 @@ #include <linux/mm.h> #include <linux/page-flags.h> #include <linux/sched.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/pagemap.h> #include <linux/swap.h> @@ -370,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) int ret = FAILED; struct address_space *mapping; - if (!isolate_lru_page(p)) - page_cache_release(p); - /* * For anonymous pages we're done the only reference left * should be the one m_f() holds. @@ -498,30 +496,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { - int ret = FAILED; - ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!isolate_lru_page(p)) { - page_cache_release(p); - ret = DELAYED; - } - - return ret; + return DELAYED; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { - int ret = FAILED; - - if (!isolate_lru_page(p)) { - page_cache_release(p); - ret = RECOVERED; - } delete_from_swap_cache(p); - return ret; + + return RECOVERED; } /* @@ -611,8 +597,6 @@ static struct page_state { { 0, 0, "unknown page state", me_unknown }, }; -#undef lru - static void action_result(unsigned long pfn, char *msg, int result) { struct page *page = NULL; @@ -629,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p, unsigned long pfn, int ref) { int result; + int count; result = ps->action(p, pfn); action_result(pfn, ps->msg, result); - if (page_count(p) != 1 + ref) + + count = page_count(p) - 1 - ref; + if (count != 0) printk(KERN_ERR "MCE %#lx: %s page still referenced by %d users\n", - pfn, ps->msg, page_count(p) - 1); + pfn, ps->msg, count); /* Could do more checks here if page looks ok */ /* @@ -661,12 +648,9 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, int i; int kill = 1; - if (PageReserved(p) || PageCompound(p) || PageSlab(p)) + if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) return; - if (!PageLRU(p)) - lru_add_drain_all(); - /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. @@ -738,6 +722,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, int __memory_failure(unsigned long pfn, int trapno, int ref) { + unsigned long lru_flag; struct page_state *ps; struct page *p; int res; @@ -775,6 +760,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) } /* + * We ignore non-LRU pages for good reasons. + * - PG_locked is only well defined for LRU pages and a few others + * - to avoid races with __set_page_locked() + * - to avoid races with __SetPageSlab*() (and more non-atomic ops) + * The check (unnecessarily) ignores LRU pages being isolated and + * walked by the page reclaim code, however that's not a big loss. + */ + if (!PageLRU(p)) + lru_add_drain_all(); + lru_flag = p->flags & lru; + if (isolate_lru_page(p)) { + action_result(pfn, "non LRU", IGNORED); + put_page(p); + return -EBUSY; + } + page_cache_release(p); + + /* * Lock the page and wait for writeback to finish. * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. @@ -790,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) /* * Torn down by someone else? */ - if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { action_result(pfn, "already truncated LRU", IGNORED); res = 0; goto out; @@ -798,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) res = -EBUSY; for (ps = error_states;; ps++) { - if ((p->flags & ps->mask) == ps->res) { + if (((p->flags | lru_flag)& ps->mask) == ps->res) { res = page_action(ps, p, pfn, ref); break; } diff --git a/mm/memory.c b/mm/memory.c index 7e91b5f9f69..6ab19dd4a19 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -641,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { + pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; @@ -654,6 +655,8 @@ again: src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + orig_src_pte = src_pte; + orig_dst_pte = dst_pte; arch_enter_lazy_mmu_mode(); do { @@ -677,9 +680,9 @@ again: arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); - pte_unmap_nested(src_pte - 1); + pte_unmap_nested(orig_src_pte); add_mm_rss(dst_mm, rss[0], rss[1]); - pte_unmap_unlock(dst_pte - 1, dst_ptl); + pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (addr != end) goto again; @@ -1820,10 +1823,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, token = pmd_pgtable(*pmd); do { - err = fn(pte, token, addr, data); + err = fn(pte++, token, addr, data); if (err) break; - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -2539,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } else if (PageHWPoison(page)) { ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - goto out; + goto out_release; } lock_page(page); @@ -2611,6 +2614,7 @@ out_nomap: pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); +out_release: page_cache_release(page); return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7dd9d9f8069..4545d594424 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1024,7 +1024,7 @@ static long do_mbind(unsigned long start, unsigned long len, err = migrate_prep(); if (err) - return err; + goto mpol_out; } { NODEMASK_SCRATCH(scratch); @@ -1039,10 +1039,9 @@ static long do_mbind(unsigned long start, unsigned long len, err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } - if (err) { - mpol_put(new); - return err; - } + if (err) + goto mpol_out; + vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); @@ -1058,9 +1057,11 @@ static long do_mbind(unsigned long start, unsigned long len, if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; - } + } else + putback_lru_pages(&pagelist); up_write(&mm->mmap_sem); + mpol_out: mpol_put(new); return err; } diff --git a/mm/mmap.c b/mm/mmap.c index 21d4029a07b..73f5e4b6401 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2282,7 +2282,7 @@ static void special_mapping_close(struct vm_area_struct *vma) { } -static struct vm_operations_struct special_mapping_vmops = { +static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, }; diff --git a/mm/nommu.c b/mm/nommu.c index 56a446f0597..5189b5aed8c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -79,7 +79,7 @@ static struct kmem_cache *vm_region_jar; struct rb_root nommu_region_tree = RB_ROOT; DECLARE_RWSEM(nommu_region_sem); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { }; /* @@ -826,7 +826,7 @@ static int validate_mmap_request(struct file *file, int ret; /* do the simple checks first */ - if (flags & MAP_FIXED || addr) { + if (flags & MAP_FIXED) { printk(KERN_DEBUG "%d: Can't do fixed-address/overlay mmap of RAM\n", current->pid); @@ -1034,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1051,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) */ static int do_mmap_private(struct vm_area_struct *vma, struct vm_region *region, - unsigned long len) + unsigned long len, + unsigned long capabilities) { struct page *pages; unsigned long total, point, n, rlen; @@ -1062,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma, * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if (vma->vm_file) { + if (capabilities & BDI_CAP_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't return success if we're not sharing */ BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1181,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); - if (!(flags & MAP_FIXED)) - addr = round_hint_to_min(addr); - /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, @@ -1193,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file, return ret; } + /* we ignore the address hint */ + addr = 0; + /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); @@ -1306,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file, * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping */ - if (file && file->f_op->get_unmapped_area) { + if (capabilities & BDI_CAP_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR((void *) addr)) { @@ -1330,15 +1331,17 @@ unsigned long do_mmap_pgoff(struct file *file, } vma->vm_region = region; - add_nommu_region(region); - /* set up the mapping */ + /* set up the mapping + * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + */ if (file && vma->vm_flags & VM_SHARED) ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, region, len); + ret = do_mmap_private(vma, region, len, capabilities); if (ret < 0) - goto error_put_region; + goto error_just_free; + add_nommu_region(region); /* okay... we have a mapping; now we have to register it */ result = vma->vm_start; @@ -1356,19 +1359,6 @@ share: kleave(" = %lx", result); return result; -error_put_region: - __put_nommu_region(region); - if (vma) { - if (vma->vm_file) { - fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } - kmem_cache_free(vm_area_cachep, vma); - } - kleave(" = %d [pr]", ret); - return ret; - error_just_free: up_write(&nommu_region_sem); error: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d99664e8607..2c5d79236ea 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -44,18 +44,21 @@ static long ratelimit_pages = 32; /* * When balance_dirty_pages decides that the caller needs to perform some * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * It should be somewhat larger than dirtied pages to ensure that reasonably * large amounts of I/O are submitted. */ -static inline long sync_writeback_pages(void) +static inline long sync_writeback_pages(unsigned long dirtied) { - return ratelimit_pages + ratelimit_pages / 2; + if (dirtied < ratelimit_pages) + dirtied = ratelimit_pages; + + return dirtied + dirtied / 2; } /* The following parameters are exported via /proc/sys/vm */ /* - * Start background writeback (via pdflush) at this percentage + * Start background writeback (via writeback threads) at this percentage */ int dirty_background_ratio = 10; @@ -474,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. - * If we're over `background_thresh' then pdflush is woken to perform some - * writeout. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. */ -static void balance_dirty_pages(struct address_space *mapping) +static void balance_dirty_pages(struct address_space *mapping, + unsigned long write_chunk) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; @@ -485,7 +489,6 @@ static void balance_dirty_pages(struct address_space *mapping) unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long pages_written = 0; - unsigned long write_chunk = sync_writeback_pages(); unsigned long pause = 1; struct backing_dev_info *bdi = mapping->backing_dev_info; @@ -563,7 +566,8 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - schedule_timeout_interruptible(pause); + __set_current_state(TASK_INTERRUPTIBLE); + io_schedule_timeout(pause); /* * Increase the delay for each loop, up to our previous @@ -579,7 +583,7 @@ static void balance_dirty_pages(struct address_space *mapping) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) - return; /* pdflush is already working this queue */ + return; /* * In laptop mode, we wait until hitting the higher threshold before @@ -590,10 +594,10 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS)) + (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) > background_thresh))) - bdi_start_writeback(bdi, nr_writeback); + bdi_start_writeback(bdi, NULL, 0); } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -640,9 +644,10 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { + ratelimit = sync_writeback_pages(*p); *p = 0; preempt_enable(); - balance_dirty_pages(mapping); + balance_dirty_pages(mapping, ratelimit); return; } preempt_enable(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bf720550b44..cdcedf66161 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2183,7 +2183,7 @@ void show_free_areas(void) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu" - " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" + " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), @@ -2196,7 +2196,6 @@ void show_free_areas(void) global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), - nr_blockdev_pages(), global_page_state(NR_FREE_PAGES), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), diff --git a/mm/percpu.c b/mm/percpu.c index 43d8cacfdaa..d90797160c2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -153,7 +153,10 @@ static int pcpu_reserved_chunk_limit; * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. + * allocations are done using GFP_KERNEL with pcpu_lock released. In + * general, percpu memory can't be allocated with irq off but + * irqsave/restore are still used in alloc path so that it can be used + * from early init path - sched_init() specifically. * * Free path accesses and alters only the index data structures, so it * can be safely called from atomic context. When memory needs to be @@ -366,7 +369,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * RETURNS: * 0 if noop, 1 if successfully extended, -errno on failure. */ -static int pcpu_extend_area_map(struct pcpu_chunk *chunk) +static int pcpu_extend_area_map(struct pcpu_chunk *chunk, unsigned long *flags) { int new_alloc; int *new; @@ -376,7 +379,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) if (chunk->map_alloc >= chunk->map_used + 2) return 0; - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, *flags); new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < chunk->map_used + 2) @@ -384,7 +387,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); if (!new) { - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, *flags); return -ENOMEM; } @@ -393,7 +396,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) * could have happened inbetween, so map_used couldn't have * grown. */ - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, *flags); BUG_ON(new_alloc < chunk->map_used + 2); size = chunk->map_alloc * sizeof(chunk->map[0]); @@ -1043,8 +1046,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) */ static void *pcpu_alloc(size_t size, size_t align, bool reserved) { + static int warn_limit = 10; struct pcpu_chunk *chunk; + const char *err; int slot, off; + unsigned long flags; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " @@ -1053,17 +1059,20 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) } mutex_lock(&pcpu_alloc_mutex); - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || - pcpu_extend_area_map(chunk) < 0) + pcpu_extend_area_map(chunk, &flags) < 0) { + err = "failed to extend area map of reserved chunk"; goto fail_unlock; + } off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; + err = "alloc from reserved chunk failed"; goto fail_unlock; } @@ -1074,12 +1083,13 @@ restart: if (size > chunk->contig_hint) continue; - switch (pcpu_extend_area_map(chunk)) { + switch (pcpu_extend_area_map(chunk, &flags)) { case 0: break; case 1: goto restart; /* pcpu_lock dropped, restart */ default: + err = "failed to extend area map"; goto fail_unlock; } @@ -1090,23 +1100,26 @@ restart: } /* hmmm... no space left, create a new chunk */ - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); chunk = alloc_pcpu_chunk(); - if (!chunk) + if (!chunk) { + err = "failed to allocate new chunk"; goto fail_unlock_mutex; + } - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); goto restart; area_found: - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); /* populate, map and clear the area */ if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); pcpu_free_area(chunk, off); + err = "failed to populate"; goto fail_unlock; } @@ -1116,9 +1129,16 @@ area_found: return __addr_to_pcpu_ptr(chunk->base_addr + off); fail_unlock: - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); fail_unlock_mutex: mutex_unlock(&pcpu_alloc_mutex); + if (warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " + "%s\n", size, align, err); + dump_stack(); + if (!--warn_limit) + pr_info("PERCPU: limit reached, disable warning\n"); + } return NULL; } @@ -1347,6 +1367,10 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info( struct pcpu_alloc_info *ai; unsigned int *cpu_map; + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_map)); + /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest @@ -1574,6 +1598,7 @@ static void pcpu_dump_alloc_info(const char *lvl, int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { + static char cpus_buf[4096] __initdata; static int smap[2], dmap[2]; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; @@ -1585,17 +1610,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, int *unit_map; int group, unit, i; + cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); + +#define PCPU_SETUP_BUG_ON(cond) do { \ + if (unlikely(cond)) { \ + pr_emerg("PERCPU: failed to initialize, %s", #cond); \ + pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ + pcpu_dump_alloc_info(KERN_EMERG, ai); \ + BUG(); \ + } \ +} while (0) + /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(ai->nr_groups <= 0); - BUG_ON(!ai->static_size); - BUG_ON(!base_addr); - BUG_ON(ai->unit_size < size_sum); - BUG_ON(ai->unit_size & ~PAGE_MASK); - BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); - - pcpu_dump_alloc_info(KERN_DEBUG, ai); + PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); + PCPU_SETUP_BUG_ON(!ai->static_size); + PCPU_SETUP_BUG_ON(!base_addr); + PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); + PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); /* process group information and build config tables accordingly */ group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); @@ -1604,7 +1638,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); for (cpu = 0; cpu < nr_cpu_ids; cpu++) - unit_map[cpu] = NR_CPUS; + unit_map[cpu] = UINT_MAX; pcpu_first_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { @@ -1618,8 +1652,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (cpu == NR_CPUS) continue; - BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); - BUG_ON(unit_map[cpu] != NR_CPUS); + PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); + PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); + PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; @@ -1632,7 +1667,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_units = unit; for_each_possible_cpu(cpu) - BUG_ON(unit_map[cpu] == NR_CPUS); + PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); + + /* we're done parsing the input, undefine BUG macro and dump config */ +#undef PCPU_SETUP_BUG_ON + pcpu_dump_alloc_info(KERN_INFO, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; @@ -1782,7 +1821,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, areas_size; + size_t size_sum, areas_size, max_distance; int group, i, rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, @@ -1832,8 +1871,25 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, } /* base address is now known, determine group base offsets */ - for (group = 0; group < ai->nr_groups; group++) + max_distance = 0; + for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; + max_distance = max_t(size_t, max_distance, + ai->groups[group].base_offset); + } + max_distance += ai->unit_size; + + /* warn if maximum distance is further than 75% of vmalloc space */ + if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { + pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " + "space 0x%lx\n", + max_distance, VMALLOC_END - VMALLOC_START); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + /* and fail if we have fallback */ + rc = -EINVAL; + goto out_free; +#endif + } pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, diff --git a/mm/rmap.c b/mm/rmap.c index 28aafe2b530..dd43373a483 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -242,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) } /* - * At what user virtual address is page expected in vma? checking that the - * page matches the vma: currently only used on anon pages, by unuse_vma; + * At what user virtual address is page expected in vma? + * checking that the page matches the vma. */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { diff --git a/mm/shmem.c b/mm/shmem.c index 98631c26c20..356dd99566e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -218,7 +218,7 @@ static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; -static struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ @@ -1046,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * sync from ever calling shmem_writepage; but a stacking filesystem * may use the ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for pdflush or sync. However, in those cases, we do still - * want to check if there's a redundant swappage to be discarded. + * and not for the writeback threads or sync. However, in those cases, + * we do still want to check if there's a redundant swappage to be + * discarded. */ if (wbc->for_reclaim) swap = get_swap_page(); @@ -2497,7 +2498,7 @@ static const struct super_operations shmem_ops = { .put_super = shmem_put_super, }; -static struct vm_operations_struct shmem_vm_ops = { +static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, diff --git a/mm/swapfile.c b/mm/swapfile.c index 4de7f02f820..a1bc6b9af9a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1974,12 +1974,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } - if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { - p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (random32() % p->highest_bit); + if (p->bdev) { + if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + p->cluster_next = 1 + (random32() % p->highest_bit); + } + if (discard_swap(p) == 0) + p->flags |= SWP_DISCARDABLE; } - if (discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; mutex_lock(&swapon_mutex); spin_lock(&swap_lock); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 69511e66323..0f551a4a44c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> @@ -25,10 +26,10 @@ #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> -#include <linux/highmem.h> #include <asm/atomic.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> +#include <asm/shmparam.h> /*** Page table manipulation functions ***/ @@ -1156,12 +1157,11 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, } static struct vm_struct *__get_vm_area_node(unsigned long size, - unsigned long flags, unsigned long start, unsigned long end, - int node, gfp_t gfp_mask, void *caller) + unsigned long align, unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, void *caller) { static struct vmap_area *va; struct vm_struct *area; - unsigned long align = 1; BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { @@ -1201,7 +1201,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(__get_vm_area); @@ -1210,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, caller); } @@ -1225,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, void *caller) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, caller); } struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, - gfp_mask, __builtin_return_address(0)); + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + node, gfp_mask, __builtin_return_address(0)); } static struct vm_struct *find_vm_area(const void *addr) @@ -1403,7 +1403,8 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, int node, void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) @@ -1417,7 +1418,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, + pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { @@ -1476,6 +1477,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size + * @align: desired alignment * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1485,8 +1487,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) { struct vm_struct *area; void *addr; @@ -1496,8 +1499,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, - node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, + VMALLOC_END, node, gfp_mask, caller); if (!area) return NULL; @@ -1516,7 +1519,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_node(size, gfp_mask, prot, -1, + return __vmalloc_node(size, 1, gfp_mask, prot, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); @@ -1532,7 +1535,7 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -1549,7 +1552,8 @@ void *vmalloc_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + ret = __vmalloc_node(size, SHMLBA, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL, -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); @@ -1572,7 +1576,7 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -1595,7 +1599,7 @@ EXPORT_SYMBOL(vmalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, -1, __builtin_return_address(0)); } @@ -1616,7 +1620,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -1633,7 +1637,7 @@ void *vmalloc_32_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1219ceb8a9b..777af57fd8c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -544,6 +544,16 @@ redo: */ lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); + /* + * When racing with an mlock clearing (page is + * unlocked), make sure that if the other thread does + * not observe our setting of PG_lru and fails + * isolation, we see PG_mlocked cleared below and move + * the page back to the evictable list. + * + * The other side is TestClearPageMlocked(). + */ + smp_mb(); } /* @@ -1088,7 +1098,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, int lumpy_reclaim = 0; while (unlikely(too_many_isolated(zone, file, sc))) { - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) @@ -1356,7 +1366,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * IO, plus JVM can create lots of anon VM_EXEC pages, * so we ignore them here. */ - if ((vm_flags & VM_EXEC) && !PageAnon(page)) { + if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { list_add(&page->lru, &l_active); continue; } @@ -1709,10 +1719,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed |