From aff622495c9a0b56148192e53bdec539f5e147f2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 21 Mar 2012 16:33:52 -0700 Subject: vmscan: only defer compaction for failed order and higher Currently a failed order-9 (transparent hugepage) compaction can lead to memory compaction being temporarily disabled for a memory zone. Even if we only need compaction for an order 2 allocation, eg. for jumbo frames networking. The fix is relatively straightforward: keep track of the highest order at which compaction is succeeding, and only defer compaction for orders at which compaction is failing. Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a13ded1938f..572b93ea475 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (!order) return NULL; - if (compaction_deferred(preferred_zone)) { + if (compaction_deferred(preferred_zone, order)) { *deferred_compaction = true; return NULL; } @@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (page) { preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; + if (order >= preferred_zone->compact_order_failed) + preferred_zone->compact_order_failed = order + 1; count_vm_event(COMPACTSUCCESS); return page; } @@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * defer if the failure was a sync compaction failure. */ if (sync_migration) - defer_compaction(preferred_zone); + defer_compaction(preferred_zone, order); cond_resched(); } -- cgit v1.2.3-70-g09d2 From 08ab9b10d43aca091fdff58b69fc1ec89c5b8a83 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 21 Mar 2012 16:34:04 -0700 Subject: mm, oom: force oom kill on sysrq+f The oom killer chooses not to kill a thread if: - an eligible thread has already been oom killed and has yet to exit, and - an eligible thread is exiting but has yet to free all its memory and is not the thread attempting to currently allocate memory. SysRq+F manually invokes the global oom killer to kill a memory-hogging task. This is normally done as a last resort to free memory when no progress is being made or to test the oom killer itself. For both uses, we always want to kill a thread and never defer. This patch causes SysRq+F to always kill an eligible thread and can be used to force a kill even if another oom killed thread has failed to exit. Signed-off-by: David Rientjes Acked-by: KOSAKI Motohiro Acked-by: Pekka Enberg Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/tty/sysrq.c | 2 +- include/linux/oom.h | 2 +- mm/oom_kill.c | 17 ++++++++++------- mm/page_alloc.c | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index ecb8e2203ac..136e86faa1e 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -346,7 +346,7 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(struct work_struct *ignored) { - out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL); + out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL, true); } static DECLARE_WORK(moom_work, moom_callback); diff --git a/include/linux/oom.h b/include/linux/oom.h index 552fba9c7d5..3d7647536b0 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -49,7 +49,7 @@ extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *mask); + int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 517299c808c..f23f3345464 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -310,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, */ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long totalpages, struct mem_cgroup *memcg, - const nodemask_t *nodemask) + const nodemask_t *nodemask, bool force_kill) { struct task_struct *g, *p; struct task_struct *chosen = NULL; @@ -336,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (test_tsk_thread_flag(p, TIF_MEMDIE)) { if (unlikely(frozen(p))) __thaw_task(p); - return ERR_PTR(-1UL); + if (!force_kill) + return ERR_PTR(-1UL); } if (!p->mm) continue; @@ -354,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (p == current) { chosen = p; *ppoints = 1000; - } else { + } else if (!force_kill) { /* * If this task is not being ptraced on exit, * then wait for it to finish before killing @@ -572,7 +573,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; read_lock(&tasklist_lock); - p = select_bad_process(&points, limit, memcg, NULL); + p = select_bad_process(&points, limit, memcg, NULL, false); if (p && PTR_ERR(p) != -1UL) oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, "Memory cgroup out of memory"); @@ -687,6 +688,7 @@ static void clear_system_oom(void) * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 * @nodemask: nodemask passed to page allocator + * @force_kill: true if a task must be killed, even if others are exiting * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) @@ -694,7 +696,7 @@ static void clear_system_oom(void) * don't have to be perfect here, we just have to be good. */ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *nodemask) + int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; struct task_struct *p; @@ -738,7 +740,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, goto out; } - p = select_bad_process(&points, totalpages, NULL, mpol_mask); + p = select_bad_process(&points, totalpages, NULL, mpol_mask, + force_kill); /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { dump_header(NULL, gfp_mask, order, NULL, mpol_mask); @@ -770,7 +773,7 @@ out: void pagefault_out_of_memory(void) { if (try_set_system_oom()) { - out_of_memory(NULL, 0, 0, NULL); + out_of_memory(NULL, 0, 0, NULL, false); clear_system_oom(); } if (!test_thread_flag(TIF_MEMDIE)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 572b93ea475..98552cf1da8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(zonelist, gfp_mask, order, nodemask); + out_of_memory(zonelist, gfp_mask, order, nodemask, false); out: clear_zonelist_oom(zonelist, gfp_mask); -- cgit v1.2.3-70-g09d2 From f0cb3c76ae1ced85f9034480b1b24cd96530ec78 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:34:06 -0700 Subject: mm: drain percpu lru add/rotate page-vectors on cpu hot-unplug This cpu hotplug hook was accidentally removed in commit 00a62ce91e55 ("mm: fix Committed_AS underflow on large NR_CPUS environment") The visible effect of this accident: some pages are borrowed in per-cpu page-vectors. Truncate can deal with it, but these pages cannot be reused while this cpu is offline. So this is like a temporary memory leak. Signed-off-by: Konstantin Khlebnikov Cc: Dave Hansen Cc: KOSAKI Motohiro Cc: Eric B Munson Cc: Mel Gorman Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/page_alloc.c | 1 + mm/swap.c | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 64a7dba6784..b86b5c20617 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -223,6 +223,7 @@ extern void lru_add_page_tail(struct zone* zone, extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); +extern void lru_add_drain_cpu(int cpu); extern int lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_page(struct page *page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 98552cf1da8..673596ad9c8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4825,6 +4825,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, int cpu = (unsigned long)hcpu; if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + lru_add_drain_cpu(cpu); drain_pages(cpu); /* diff --git a/mm/swap.c b/mm/swap.c index 14380e9fbe3..5c13f133897 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg) * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ -static void drain_cpu_pagevecs(int cpu) +void lru_add_drain_cpu(int cpu) { struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); struct pagevec *pvec; @@ -553,7 +553,7 @@ void deactivate_page(struct page *page) void lru_add_drain(void) { - drain_cpu_pagevecs(get_cpu()); + lru_add_drain_cpu(get_cpu()); put_cpu(); } -- cgit v1.2.3-70-g09d2 From cc9a6c8776615f9c194ccf0b63a0aa5628235545 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:11 -0700 Subject: cpuset: mm: reduce large amounts of memory barrier related damage v3 Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating at large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86. The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were 3.3.0-rc3 3.3.0-rc3 rc3-vanilla nobarrier-v2r1 Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%) Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%) Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%) Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%) Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%) Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%) Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%) Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%) Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%) Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%) Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%) Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%) Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%) Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%) Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%) MMTests Statistics: duration Sys Time Running Test (seconds) 135.68 132.17 User+Sys Time Running Test (seconds) 164.2 160.13 Total Elapsed Time (seconds) 123.46 120.87 The overall improvement is small but the System CPU time is much improved and roughly in correlation to what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. Signed-off-by: Mel Gorman Cc: Miao Xie Cc: David Rientjes Cc: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 47 ++++++++++++++++++++--------------------------- include/linux/init_task.h | 8 ++++++++ include/linux/sched.h | 2 +- kernel/cpuset.c | 43 ++++++++----------------------------------- kernel/fork.c | 1 + mm/filemap.c | 11 +++++++---- mm/hugetlb.c | 15 +++++++++++---- mm/mempolicy.c | 28 +++++++++++++++++++++------- mm/page_alloc.c | 33 +++++++++++++++++++++++---------- mm/slab.c | 13 ++++++++----- mm/slub.c | 40 +++++++++++++++++++++++++--------------- mm/vmscan.c | 2 -- 12 files changed, 133 insertions(+), 110 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index e9eaec52265..7a7e5fd2a27 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); /* - * reading current mems_allowed and mempolicy in the fastpath must protected - * by get_mems_allowed() + * get_mems_allowed is required when making decisions involving mems_allowed + * such as during page allocation. mems_allowed can be updated in parallel + * and depending on the new value an operation can fail potentially causing + * process failure. A retry loop with get_mems_allowed and put_mems_allowed + * prevents these artificial failures. */ -static inline void get_mems_allowed(void) +static inline unsigned int get_mems_allowed(void) { - current->mems_allowed_change_disable++; - - /* - * ensure that reading mems_allowed and mempolicy happens after the - * update of ->mems_allowed_change_disable. - * - * the write-side task finds ->mems_allowed_change_disable is not 0, - * and knows the read-side task is reading mems_allowed or mempolicy, - * so it will clear old bits lazily. - */ - smp_mb(); + return read_seqcount_begin(¤t->mems_allowed_seq); } -static inline void put_mems_allowed(void) +/* + * If this returns false, the operation that took place after get_mems_allowed + * may have failed. It is up to the caller to retry the operation if + * appropriate. + */ +static inline bool put_mems_allowed(unsigned int seq) { - /* - * ensure that reading mems_allowed and mempolicy before reducing - * mems_allowed_change_disable. - * - * the write-side task will know that the read-side task is still - * reading mems_allowed or mempolicy, don't clears old bits in the - * nodemask. - */ - smp_mb(); - --ACCESS_ONCE(current->mems_allowed_change_disable); + return !read_seqcount_retry(¤t->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { task_lock(current); + write_seqcount_begin(¤t->mems_allowed_seq); current->mems_allowed = nodemask; + write_seqcount_end(¤t->mems_allowed_seq); task_unlock(current); } @@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } -static inline void get_mems_allowed(void) +static inline unsigned int get_mems_allowed(void) { + return 0; } -static inline void put_mems_allowed(void) +static inline bool put_mems_allowed(unsigned int seq) { + return true; } #endif /* !CONFIG_CPUSETS */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f994d51f70f..e4baff5f7ff 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -29,6 +29,13 @@ extern struct fs_struct init_fs; #define INIT_GROUP_RWSEM(sig) #endif +#ifdef CONFIG_CPUSETS +#define INIT_CPUSET_SEQ \ + .mems_allowed_seq = SEQCNT_ZERO, +#else +#define INIT_CPUSET_SEQ +#endif + #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ @@ -192,6 +199,7 @@ extern struct cred init_cred; INIT_FTRACE_GRAPH \ INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ + INIT_CPUSET_SEQ \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index e074e1e54f8..0c147a4260a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1514,7 +1514,7 @@ struct task_struct { #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ - int mems_allowed_change_disable; + seqcount_t mems_allowed_seq; /* Seqence no to catch updates */ int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5d575836dba..1010cc61931 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; -repeat: /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. @@ -983,45 +982,19 @@ repeat: */ need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* - * ensure checking ->mems_allowed_change_disable after setting all new - * allowed nodes. - * - * the read-side task can see an nodemask with new allowed nodes and - * old allowed nodes. and if it allocates page when cpuset clears newly - * disallowed ones continuous, it can see the new allowed bits. - * - * And if setting all new allowed nodes is after the checking, setting - * all new allowed nodes and clearing newly disallowed ones will be done - * continuous, and the read-side task may find no node to alloc page. - */ - smp_mb(); + if (need_loop) + write_seqcount_begin(&tsk->mems_allowed_seq); - /* - * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. - */ - while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { - task_unlock(tsk); - if (!task_curr(tsk)) - yield(); - goto repeat; - } - - /* - * ensure checking ->mems_allowed_change_disable before clearing all new - * disallowed nodes. - * - * if clearing newly disallowed bits before the checking, the read-side - * task may find no node to alloc page. - */ - smp_mb(); + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + + if (need_loop) + write_seqcount_end(&tsk->mems_allowed_seq); + task_unlock(tsk); } diff --git a/kernel/fork.c b/kernel/fork.c index a9e99f3c18e..9cc227d5410 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1237,6 +1237,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; + seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; diff --git a/mm/filemap.c b/mm/filemap.c index f3230604006..843042045dc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -499,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp) struct page *page; if (cpuset_do_page_mem_spread()) { - get_mems_allowed(); - n = cpuset_mem_spread_node(); - page = alloc_pages_exact_node(n, gfp, 0); - put_mems_allowed(); + unsigned int cpuset_mems_cookie; + do { + cpuset_mems_cookie = get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + return page; } return alloc_pages(gfp, 0); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 62f9fada4d6..b1c31487733 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -454,14 +454,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - struct page *page = NULL; + struct page *page; struct mempolicy *mpol; nodemask_t *nodemask; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + unsigned int cpuset_mems_cookie; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask, &mpol, &nodemask); /* @@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, } } } -err: + mpol_cond_put(mpol); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; + +err: + mpol_cond_put(mpol); + return NULL; } static void update_and_free_page(struct hstate *h, struct page *page) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 71e1a523e20..cfb6c867875 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1850,18 +1850,24 @@ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node) { - struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct mempolicy *pol; struct zonelist *zl; struct page *page; + unsigned int cpuset_mems_cookie; + +retry_cpuset: + pol = get_vma_policy(current, vma, addr); + cpuset_mems_cookie = get_mems_allowed(); - get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } zl = policy_zonelist(gfp, pol, node); @@ -1872,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; } /* @@ -1880,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, */ page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; } @@ -1907,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; struct page *page; + unsigned int cpuset_mems_cookie; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* * No reference counting needed for current->mempolicy * nor system default_policy @@ -1922,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - put_mems_allowed(); + + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(alloc_pages_current); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 673596ad9c8..40de6854b98 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2380,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; - struct page *page; + struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); + unsigned int cpuset_mems_cookie; gfp_mask &= gfp_allowed_mask; @@ -2400,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); - if (!preferred_zone) { - put_mems_allowed(); - return NULL; - } + if (!preferred_zone) + goto out; /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -2418,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); + +out: + /* + * When updating a task's mems_allowed, it is possible to race with + * parallel threads in such a way that an allocation can fail while + * the mask is being updated. If a page allocation is about to fail, + * check if the cpuset changed during allocation and if so, retry. + */ + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2634,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) bool skip_free_areas_node(unsigned int flags, int nid) { bool ret = false; + unsigned int cpuset_mems_cookie; if (!(flags & SHOW_MEM_FILTER_NODES)) goto out; - get_mems_allowed(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - put_mems_allowed(); + do { + cpuset_mems_cookie = get_mems_allowed(); + ret = !node_isset(nid, cpuset_current_mems_allowed); + } while (!put_mems_allowed(cpuset_mems_cookie)); out: return ret; } diff --git a/mm/slab.c b/mm/slab.c index f0bd7857ab3..29c8716eb7a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_mem_id(); - get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); - put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; int nid; + unsigned int cpuset_mems_cookie; if (flags & __GFP_THISNODE) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + retry: /* * Look through allowed nodes for objects available @@ -3372,7 +3373,9 @@ retry: } } } - put_mems_allowed(); + + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + goto retry_cpuset; return obj; } diff --git a/mm/slub.c b/mm/slub.c index 4907563ef7f..f4a6229848f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *object; + unsigned int cpuset_mems_cookie; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - struct kmem_cache_node *n; - - n = get_node(s, zone_to_nid(zone)); - - if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c); - if (object) { - put_mems_allowed(); - return object; + do { + cpuset_mems_cookie = get_mems_allowed(); + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(zone)); + + if (n && cpuset_zone_allowed_hardwall(zone, flags) && + n->nr_partial > s->min_partial) { + object = get_partial_node(s, n, c); + if (object) { + /* + * Return the object even if + * put_mems_allowed indicated that + * the cpuset mems_allowed was + * updated in parallel. It's a + * harmless race between the alloc + * and the cpuset update. + */ + put_mems_allowed(cpuset_mems_cookie); + return object; + } } } - } - put_mems_allowed(); + } while (!put_mems_allowed(cpuset_mems_cookie)); #endif return NULL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 440af1d899b..55d86c9506f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2343,7 +2343,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long writeback_threshold; bool aborted_reclaim; - get_mems_allowed(); delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2407,7 +2406,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, out: delayacct_freepages_end(); - put_mems_allowed(); if (sc->nr_reclaimed) return sc->nr_reclaimed; -- cgit v1.2.3-70-g09d2 From 8d13bddd11c10db40e2c81b4b224c11126691fc0 Mon Sep 17 00:00:00 2001 From: Kautuk Consul Date: Wed, 21 Mar 2012 16:34:15 -0700 Subject: page_alloc.c: remove add_from_early_node_map() add_from_early_node_map() is unused. Signed-off-by: Kautuk Consul Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- mm/page_alloc.c | 12 ------------ 2 files changed, 14 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index ce2b2a3b287..b1c8318e32b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1295,8 +1295,6 @@ extern void get_pfn_range_for_nid(unsigned int nid, extern unsigned long find_min_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); -int add_from_early_node_map(struct range *range, int az, - int nr_range, int nid); extern void sparse_memory_present_with_active_regions(int nid); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40de6854b98..70bbd0f9c38 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3940,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) } } -int __init add_from_early_node_map(struct range *range, int az, - int nr_range, int nid) -{ - unsigned long start_pfn, end_pfn; - int i; - - /* need to go over early_node_map to find out good range for node */ - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) - nr_range = add_range(range, az, nr_range, start_pfn, end_pfn); - return nr_range; -} - /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.2.3-70-g09d2 From b224ef856b1a5b949daff5937a9e187fe622b8f5 Mon Sep 17 00:00:00 2001 From: Kautuk Consul Date: Wed, 21 Mar 2012 16:34:15 -0700 Subject: page_alloc: remove unused find_zone_movable_pfns_for_nodes() argument find_zone_movable_pfns_for_nodes() does not use its argument. Signed-off-by: Kautuk Consul Cc: David Rientjes Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 70bbd0f9c38..caea788628e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4524,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void) * memory. When they don't, some nodes will have more kernelcore than * others */ -static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) +static void __init find_zone_movable_pfns_for_nodes(void) { int i, nid; unsigned long usable_startpfn; @@ -4716,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); - find_zone_movable_pfns_for_nodes(zone_movable_pfn); + find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ printk("Zone PFN ranges:\n"); -- cgit v1.2.3-70-g09d2 From 29fd66d289f2981e11c550f8b411a6d3d38be0cf Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 28 Mar 2012 14:42:41 -0700 Subject: mm, coredump: fail allocations when coredumping instead of oom killing The size of coredump files is limited by RLIMIT_CORE, however, allocating large amounts of memory results in three negative consequences: - the coredumping process may be chosen for oom kill and quickly deplete all memory reserves in oom conditions preventing further progress from being made or tasks from exiting, - the coredumping process may cause other processes to be oom killed without fault of their own as the result of a SIGSEGV, for example, in the coredumping process, or - the coredumping process may result in a livelock while writing to the dump file if it needs memory to allocate while other threads are in the exit path waiting on the coredumper to complete. This is fixed by implying __GFP_NORETRY in the page allocator for coredumping processes when reclaim has failed so the allocations fail and the process continues to exit. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index caea788628e..c313afcc8e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2308,6 +2308,10 @@ rebalance: if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { if (oom_killer_disabled) goto nopage; + /* Coredumps can quickly deplete all memory reserves */ + if ((current->flags & PF_DUMPCORE) && + !(gfp_mask & __GFP_NOFAIL)) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, -- cgit v1.2.3-70-g09d2 From 74046494ea68676d29ef6501a4bd950f08112a2c Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:45 -0700 Subject: mm: only IPI CPUs to drain local pages if they exist Calculate a cpumask of CPUs with per-cpu pages in any zone and only send an IPI requesting CPUs to drain these pages to the buddy allocator if they actually have pages when asked to flush. This patch saves 85%+ of IPIs asking to drain per-cpu pages in case of severe memory pressure that leads to OOM since in these cases multiple, possibly concurrent, allocation requests end up in the direct reclaim code path so when the per-cpu pages end up reclaimed on first allocation failure for most of the proceeding allocation attempts until the memory pressure is off (possibly via the OOM killer) there are no per-cpu pages on most CPUs (and there can easily be hundreds of them). This also has the side effect of shortening the average latency of direct reclaim by 1 or more order of magnitude since waiting for all the CPUs to ACK the IPI takes a long time. Tested by running "hackbench 400" on a 8 CPU x86 VM and observing the difference between the number of direct reclaim attempts that end up in drain_all_pages() and those were more then 1/2 of the online CPU had any per-cpu page in them, using the vmstat counters introduced in the next patch in the series and using proc/interrupts. In the test sceanrio, this was seen to save around 3600 global IPIs after trigerring an OOM on a concurrent workload: $ cat /proc/vmstat | tail -n 2 pcp_global_drain 0 pcp_global_ipi_saved 0 $ cat /proc/interrupts | grep CAL CAL: 1 2 1 2 2 2 2 2 Function call interrupts $ hackbench 400 [OOM messages snipped] $ cat /proc/vmstat | tail -n 2 pcp_global_drain 3647 pcp_global_ipi_saved 3642 $ cat /proc/interrupts | grep CAL CAL: 6 13 6 3 3 3 1 2 7 Function call interrupts Please note that if the global drain is removed from the direct reclaim path as a patch from Mel Gorman currently suggests this should be replaced with an on_each_cpu_cond invocation. Signed-off-by: Gilad Ben-Yossef Acked-by: Mel Gorman Cc: KOSAKI Motohiro Acked-by: Christoph Lameter Acked-by: Peter Zijlstra Cc: Pekka Enberg Cc: Rik van Riel Cc: Andi Kleen Acked-by: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c313afcc8e5..a712fb9e04c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg) } /* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. + * + * Note that this code is protected against sending an IPI to an offline + * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: + * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but + * nothing keeps CPUs from showing up after we populated the cpumask and + * before the call to on_each_cpu_mask(). */ void drain_all_pages(void) { - on_each_cpu(drain_local_pages, NULL, 1); + int cpu; + struct per_cpu_pageset *pcp; + struct zone *zone; + + /* + * Allocate in the BSS so we wont require allocation in + * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y + */ + static cpumask_t cpus_with_pcps; + + /* + * We don't care about racing with CPU hotplug event + * as offline notification will cause the notified + * cpu to drain that CPU pcps and on_each_cpu_mask + * disables preemption as part of its processing + */ + for_each_online_cpu(cpu) { + bool has_pcps = false; + for_each_populated_zone(zone) { + pcp = per_cpu_ptr(zone->pageset, cpu); + if (pcp->pcp.count) { + has_pcps = true; + break; + } + } + if (has_pcps) + cpumask_set_cpu(cpu, &cpus_with_pcps); + else + cpumask_clear_cpu(cpu, &cpus_with_pcps); + } + on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); } #ifdef CONFIG_HIBERNATION -- cgit v1.2.3-70-g09d2