From 71dd0b8ae83120db9ade1380fae7a49cc6b3f465 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 May 2012 15:06:16 -0700 Subject: mm/memory_failure: let the compiler add the function name These things tend to get out of sync with time so let the compiler automatically enter the current function name using __func__. No functional change. Signed-off-by: Borislav Petkov Acked-by: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c99ad4e6b88..ab1e7145e29 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1388,16 +1388,16 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) */ if (!get_page_unless_zero(compound_head(p))) { if (PageHuge(p)) { - pr_info("get_any_page: %#lx free huge page\n", pfn); + pr_info("%s: %#lx free huge page\n", __func__, pfn); ret = dequeue_hwpoisoned_huge_page(compound_head(p)); } else if (is_free_buddy_page(p)) { - pr_info("get_any_page: %#lx free buddy page\n", pfn); + pr_info("%s: %#lx free buddy page\n", __func__, pfn); /* Set hwpoison bit while page is still isolated */ SetPageHWPoison(p); ret = 0; } else { - pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", - pfn, p->flags); + pr_info("%s: %#lx: unknown zero refcount page type %lx\n", + __func__, pfn, p->flags); ret = -EIO; } } else { -- cgit v1.2.3-70-g09d2 From 89c522c78a6fbc0e24b71e65b65df8b247d5aebd Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Tue, 29 May 2012 15:06:16 -0700 Subject: mm/mempolicy.c: use enum value MPOL_REBIND_ONCE in mpol_rebind_policy() We have enum definition in mempolicy.h: MPOL_REBIND_ONCE. It should replace the magic number 0 for step comparison in function mpol_rebind_policy. Signed-off-by: Wang Sheng-Hui Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 88f9422b92e..a388888b6fc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && step == 0 && + if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; -- cgit v1.2.3-70-g09d2 From f2135a4a57a798ca8014a138afb626b96d90c842 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Tue, 29 May 2012 15:06:17 -0700 Subject: mm/hugetlb.c: use long vars instead of int in region_count() The arguments f & t and fields from & to of struct file_region are defined as long. So use long instead of int to type the temp vars. Signed-off-by: Wang Sheng-Hui Acked-by: David Rientjes Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4e28416c47f..41a647dfb73 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t) /* Locate each segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { - int seg_from; - int seg_to; + long seg_from; + long seg_to; if (rg->to <= f) continue; -- cgit v1.2.3-70-g09d2 From aa2e878efa7949c8502c9760f92835222714f090 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 29 May 2012 15:06:17 -0700 Subject: mm, thp: remove unnecessary ret variable The "ret" variable is unnecessary in __do_huge_pmd_anonymous_page(), so remove it. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0e5306eeb5..8ab2d24faae 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -636,7 +636,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, unsigned long haddr, pmd_t *pmd, struct page *page) { - int ret = 0; pgtable_t pgtable; VM_BUG_ON(!PageCompound(page)); @@ -675,7 +674,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); } - return ret; + return 0; } static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) -- cgit v1.2.3-70-g09d2 From edad9d2c337d43278a9d5aeb0ed531c2e838f8a6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 29 May 2012 15:06:17 -0700 Subject: mm, thp: allow fallback when pte_alloc_one() fails for huge pmd The transparent hugepages feature is careful to not invoke the oom killer when a hugepage cannot be allocated. pte_alloc_one() failing in __do_huge_pmd_anonymous_page(), however, currently results in VM_FAULT_OOM which invokes the pagefault oom killer to kill a memory-hogging task. This is unnecessary since it's possible to drop the reference to the hugepage and fallback to allocating a small page. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8ab2d24faae..d7d7165156c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -640,11 +640,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON(!PageCompound(page)); pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) { - mem_cgroup_uncharge_page(page); - put_page(page); + if (unlikely(!pgtable)) return VM_FAULT_OOM; - } clear_huge_page(page, haddr, HPAGE_PMD_NR); __SetPageUptodate(page); @@ -723,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); goto out; } + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, + page))) { + mem_cgroup_uncharge_page(page); + put_page(page); + goto out; + } - return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); + return 0; } out: /* -- cgit v1.2.3-70-g09d2 From e709ffd6169ccd259eb5874e853303e91e94e829 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 29 May 2012 15:06:18 -0700 Subject: mm: remove swap token code The swap token code no longer fits in with the current VM model. It does not play well with cgroups or the better NUMA placement code in development, since we have only one swap token globally. It also has the potential to mess with scalability of the system, by increasing the number of non-reclaimable pages on the active and inactive anon LRU lists. Last but not least, the swap token code has been broken for a year without complaints, as reported by Konstantin Khlebnikov. This suggests we no longer have much use for it. The days of sub-1G memory systems with heavy use of swap are over. If we ever need thrashing reducing code in the future, we will have to implement something that does scale. Signed-off-by: Rik van Riel Cc: Konstantin Khlebnikov Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Hugh Dickins Acked-by: Bob Picco Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 11 --- include/linux/swap.h | 35 ---------- include/trace/events/vmscan.h | 82 ---------------------- kernel/fork.c | 9 --- mm/Makefile | 2 +- mm/memcontrol.c | 1 - mm/memory.c | 2 +- mm/rmap.c | 6 -- mm/thrash.c | 155 ------------------------------------------ mm/vmscan.c | 6 -- 10 files changed, 2 insertions(+), 307 deletions(-) delete mode 100644 mm/thrash.c (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 26574c72612..dad95bdd06d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -345,17 +345,6 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; - /* Swap token stuff */ - /* - * Last value of global fault stamp as seen by this process. - * In other words, this value gives an indication of how long - * it has been since this task got the token. - * Look at mm/thrash.c - */ - unsigned int faultstamp; - unsigned int token_priority; - unsigned int last_interval; - unsigned long flags; /* Must use atomic bitops to access the bits */ struct core_state *core_state; /* coredumping support */ diff --git a/include/linux/swap.h b/include/linux/swap.h index b1fd5c7925f..bc3073ce95c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -355,23 +355,6 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; -/* linux/mm/thrash.c */ -extern struct mm_struct *swap_token_mm; -extern void grab_swap_token(struct mm_struct *); -extern void __put_swap_token(struct mm_struct *); -extern void disable_swap_token(struct mem_cgroup *memcg); - -static inline int has_swap_token(struct mm_struct *mm) -{ - return (mm == swap_token_mm); -} - -static inline void put_swap_token(struct mm_struct *mm) -{ - if (has_swap_token(mm)) - __put_swap_token(mm); -} - #ifdef CONFIG_CGROUP_MEM_RES_CTLR extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); @@ -476,24 +459,6 @@ static inline swp_entry_t get_swap_page(void) return entry; } -/* linux/mm/thrash.c */ -static inline void put_swap_token(struct mm_struct *mm) -{ -} - -static inline void grab_swap_token(struct mm_struct *mm) -{ -} - -static inline int has_swap_token(struct mm_struct *mm) -{ - return 0; -} - -static inline void disable_swap_token(struct mem_cgroup *memcg) -{ -} - static inline void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) { diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index f64560e204b..572195459d5 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -395,88 +395,6 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, show_reclaim_flags(__entry->reclaim_flags)) ); -TRACE_EVENT(replace_swap_token, - TP_PROTO(struct mm_struct *old_mm, - struct mm_struct *new_mm), - - TP_ARGS(old_mm, new_mm), - - TP_STRUCT__entry( - __field(struct mm_struct*, old_mm) - __field(unsigned int, old_prio) - __field(struct mm_struct*, new_mm) - __field(unsigned int, new_prio) - ), - - TP_fast_assign( - __entry->old_mm = old_mm; - __entry->old_prio = old_mm ? old_mm->token_priority : 0; - __entry->new_mm = new_mm; - __entry->new_prio = new_mm->token_priority; - ), - - TP_printk("old_token_mm=%p old_prio=%u new_token_mm=%p new_prio=%u", - __entry->old_mm, __entry->old_prio, - __entry->new_mm, __entry->new_prio) -); - -DECLARE_EVENT_CLASS(put_swap_token_template, - TP_PROTO(struct mm_struct *swap_token_mm), - - TP_ARGS(swap_token_mm), - - TP_STRUCT__entry( - __field(struct mm_struct*, swap_token_mm) - ), - - TP_fast_assign( - __entry->swap_token_mm = swap_token_mm; - ), - - TP_printk("token_mm=%p", __entry->swap_token_mm) -); - -DEFINE_EVENT(put_swap_token_template, put_swap_token, - TP_PROTO(struct mm_struct *swap_token_mm), - TP_ARGS(swap_token_mm) -); - -DEFINE_EVENT_CONDITION(put_swap_token_template, disable_swap_token, - TP_PROTO(struct mm_struct *swap_token_mm), - TP_ARGS(swap_token_mm), - TP_CONDITION(swap_token_mm != NULL) -); - -TRACE_EVENT_CONDITION(update_swap_token_priority, - TP_PROTO(struct mm_struct *mm, - unsigned int old_prio, - struct mm_struct *swap_token_mm), - - TP_ARGS(mm, old_prio, swap_token_mm), - - TP_CONDITION(mm->token_priority != old_prio), - - TP_STRUCT__entry( - __field(struct mm_struct*, mm) - __field(unsigned int, old_prio) - __field(unsigned int, new_prio) - __field(struct mm_struct*, swap_token_mm) - __field(unsigned int, swap_token_prio) - ), - - TP_fast_assign( - __entry->mm = mm; - __entry->old_prio = old_prio; - __entry->new_prio = mm->token_priority; - __entry->swap_token_mm = swap_token_mm; - __entry->swap_token_prio = swap_token_mm ? swap_token_mm->token_priority : 0; - ), - - TP_printk("mm=%p old_prio=%u new_prio=%u swap_token_mm=%p token_prio=%u", - __entry->mm, __entry->old_prio, __entry->new_prio, - __entry->swap_token_mm, __entry->swap_token_prio) -); - #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/kernel/fork.c b/kernel/fork.c index 47b4e4f379f..5b13eea2e75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -614,7 +614,6 @@ void mmput(struct mm_struct *mm) list_del(&mm->mmlist); spin_unlock(&mmlist_lock); } - put_swap_token(mm); if (mm->binfmt) module_put(mm->binfmt->module); mmdrop(mm); @@ -831,10 +830,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif @@ -913,10 +908,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) goto fail_nomem; good_mm: - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - tsk->mm = mm; tsk->active_mm = mm; return 0; diff --git a/mm/Makefile b/mm/Makefile index 8aada89efbb..ccecbf9818f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -25,7 +25,7 @@ endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f342778a0c0..92675fe8a2e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5598,7 +5598,6 @@ static void mem_cgroup_move_task(struct cgroup *cont, if (mm) { if (mc.to) mem_cgroup_move_charge(mm); - put_swap_token(mm); mmput(mm); } if (mc.to) diff --git a/mm/memory.c b/mm/memory.c index e40f6759ba9..2bf9e110437 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2908,7 +2908,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - grab_swap_token(mm); /* Contend for token _before_ read-in */ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { @@ -2938,6 +2937,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } locked = lock_page_or_retry(page, mm, flags); + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; diff --git a/mm/rmap.c b/mm/rmap.c index 5b5ad584ffb..0f3b7cda2a2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - /* Pretend the page is referenced if the task has the - swap token and is in the middle of a page fault. */ - if (mm != current->mm && has_swap_token(mm) && - rwsem_is_locked(&mm->mmap_sem)) - referenced++; - (*mapcount)--; if (referenced) diff --git a/mm/thrash.c b/mm/thrash.c deleted file mode 100644 index 57ad495dbd5..00000000000 --- a/mm/thrash.c +++ /dev/null @@ -1,155 +0,0 @@ -/* - * mm/thrash.c - * - * Copyright (C) 2004, Red Hat, Inc. - * Copyright (C) 2004, Rik van Riel - * Released under the GPL, see the file COPYING for details. - * - * Simple token based thrashing protection, using the algorithm - * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html - * - * Sep 2006, Ashwin Chaugule - * Improved algorithm to pass token: - * Each task has a priority which is incremented if it contended - * for the token in an interval less than its previous attempt. - * If the token is acquired, that task's priority is boosted to prevent - * the token from bouncing around too often and to let the task make - * some progress in its execution. - */ - -#include -#include -#include -#include -#include - -#include - -#define TOKEN_AGING_INTERVAL (0xFF) - -static DEFINE_SPINLOCK(swap_token_lock); -struct mm_struct *swap_token_mm; -static struct mem_cgroup *swap_token_memcg; - -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) -{ - struct mem_cgroup *memcg; - - memcg = try_get_mem_cgroup_from_mm(mm); - if (memcg) - css_put(mem_cgroup_css(memcg)); - - return memcg; -} -#else -static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) -{ - return NULL; -} -#endif - -void grab_swap_token(struct mm_struct *mm) -{ - int current_interval; - unsigned int old_prio = mm->token_priority; - static unsigned int global_faults; - static unsigned int last_aging; - - global_faults++; - - current_interval = global_faults - mm->faultstamp; - - if (!spin_trylock(&swap_token_lock)) - return; - - /* First come first served */ - if (!swap_token_mm) - goto replace_token; - - /* - * Usually, we don't need priority aging because long interval faults - * makes priority decrease quickly. But there is one exception. If the - * token owner task is sleeping, it never make long interval faults. - * Thus, we need a priority aging mechanism instead. The requirements - * of priority aging are - * 1) An aging interval is reasonable enough long. Too short aging - * interval makes quick swap token lost and decrease performance. - * 2) The swap token owner task have to get priority aging even if - * it's under sleep. - */ - if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { - swap_token_mm->token_priority /= 2; - last_aging = global_faults; - } - - if (mm == swap_token_mm) { - mm->token_priority += 2; - goto update_priority; - } - - if (current_interval < mm->last_interval) - mm->token_priority++; - else { - if (likely(mm->token_priority > 0)) - mm->token_priority--; - } - - /* Check if we deserve the token */ - if (mm->token_priority > swap_token_mm->token_priority) - goto replace_token; - -update_priority: - trace_update_swap_token_priority(mm, old_prio, swap_token_mm); - -out: - mm->faultstamp = global_faults; - mm->last_interval = current_interval; - spin_unlock(&swap_token_lock); - return; - -replace_token: - mm->token_priority += 2; - trace_replace_swap_token(swap_token_mm, mm); - swap_token_mm = mm; - swap_token_memcg = swap_token_memcg_from_mm(mm); - last_aging = global_faults; - goto out; -} - -/* Called on process exit. */ -void __put_swap_token(struct mm_struct *mm) -{ - spin_lock(&swap_token_lock); - if (likely(mm == swap_token_mm)) { - trace_put_swap_token(swap_token_mm); - swap_token_mm = NULL; - swap_token_memcg = NULL; - } - spin_unlock(&swap_token_lock); -} - -static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b) -{ - if (!a) - return true; - if (!b) - return true; - if (a == b) - return true; - return false; -} - -void disable_swap_token(struct mem_cgroup *memcg) -{ - /* memcg reclaim don't disable unrelated mm token. */ - if (match_memcg(memcg, swap_token_memcg)) { - spin_lock(&swap_token_lock); - if (match_memcg(memcg, swap_token_memcg)) { - trace_disable_swap_token(swap_token_mm); - swap_token_mm = NULL; - swap_token_memcg = NULL; - } - spin_unlock(&swap_token_lock); - } -} diff --git a/mm/vmscan.c b/mm/vmscan.c index 33dc256033b..ca46080bb07 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2352,8 +2352,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; - if (!priority) - disable_swap_token(sc->target_mem_cgroup); aborted_reclaim = shrink_zones(priority, zonelist, sc); /* @@ -2704,10 +2702,6 @@ loop_again: unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; - /* The swap token gets in the way of swapout... */ - if (!priority) - disable_swap_token(NULL); - all_zones_ok = 1; balanced = 0; -- cgit v1.2.3-70-g09d2 From c53919adc045bf803252e912f23028a68525753d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 29 May 2012 15:06:19 -0700 Subject: mm: vmscan: remove lumpy reclaim This series removes lumpy reclaim and some stalling logic that was unintentionally being used by memory compaction. The end result is that stalling on dirty pages during page reclaim now depends on wait_iff_congested(). Four kernels were compared 3.3.0 vanilla 3.4.0-rc2 vanilla 3.4.0-rc2 lumpyremove-v2 is patch one from this series 3.4.0-rc2 nosync-v2r3 is the full series Removing lumpy reclaim saves almost 900 bytes of text whereas the full series removes 1200 bytes. text data bss dec hex filename 6740375 1927944 2260992 10929311 a6c49f vmlinux-3.4.0-rc2-vanilla 6739479 1927944 2260992 10928415 a6c11f vmlinux-3.4.0-rc2-lumpyremove-v2 6739159 1927944 2260992 10928095 a6bfdf vmlinux-3.4.0-rc2-nosync-v2 There are behaviour changes in the series and so tests were run with monitoring of ftrace events. This disrupts results so the performance results are distorted but the new behaviour should be clearer. fs-mark running in a threaded configuration showed little of interest as it did not push reclaim aggressively FS-Mark Multi Threaded 3.3.0-vanilla rc2-vanilla lumpyremove-v2r3 nosync-v2r3 Files/s min 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) Files/s mean 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) Files/s stddev 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Files/s max 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) Overhead min 508667.00 ( 0.00%) 521350.00 (-2.49%) 544292.00 (-7.00%) 547168.00 (-7.57%) Overhead mean 551185.00 ( 0.00%) 652690.73 (-18.42%) 991208.40 (-79.83%) 570130.53 (-3.44%) Overhead stddev 18200.69 ( 0.00%) 331958.29 (-1723.88%) 1579579.43 (-8578.68%) 9576.81 (47.38%) Overhead max 576775.00 ( 0.00%) 1846634.00 (-220.17%) 6901055.00 (-1096.49%) 585675.00 (-1.54%) MMTests Statistics: duration Sys Time Running Test (seconds) 309.90 300.95 307.33 298.95 User+Sys Time Running Test (seconds) 319.32 309.67 315.69 307.51 Total Elapsed Time (seconds) 1187.85 1193.09 1191.98 1193.73 MMTests Statistics: vmstat Page Ins 80532 82212 81420 79480 Page Outs 111434984 111456240 111437376 111582628 Swap Ins 0 0 0 0 Swap Outs 0 0 0 0 Direct pages scanned 44881 27889 27453 34843 Kswapd pages scanned 25841428 25860774 25861233 25843212 Kswapd pages reclaimed 25841393 25860741 25861199 25843179 Direct pages reclaimed 44881 27889 27453 34843 Kswapd efficiency 99% 99% 99% 99% Kswapd velocity 21754.791 21675.460 21696.029 21649.127 Direct efficiency 100% 100% 100% 100% Direct velocity 37.783 23.375 23.031 29.188 Percentage direct scans 0% 0% 0% 0% ftrace showed that there was no stalling on writeback or pages submitted for IO from reclaim context. postmark was similar and while it was more interesting, it also did not push reclaim heavily. POSTMARK 3.3.0-vanilla rc2-vanilla lumpyremove-v2r3 nosync-v2r3 Transactions per second: 16.00 ( 0.00%) 20.00 (25.00%) 18.00 (12.50%) 17.00 ( 6.25%) Data megabytes read per second: 18.80 ( 0.00%) 24.27 (29.10%) 22.26 (18.40%) 20.54 ( 9.26%) Data megabytes written per second: 35.83 ( 0.00%) 46.25 (29.08%) 42.42 (18.39%) 39.14 ( 9.24%) Files created alone per second: 28.00 ( 0.00%) 38.00 (35.71%) 34.00 (21.43%) 30.00 ( 7.14%) Files create/transact per second: 8.00 ( 0.00%) 10.00 (25.00%) 9.00 (12.50%) 8.00 ( 0.00%) Files deleted alone per second: 556.00 ( 0.00%) 1224.00 (120.14%) 3062.00 (450.72%) 6124.00 (1001.44%) Files delete/transact per second: 8.00 ( 0.00%) 10.00 (25.00%) 9.00 (12.50%) 8.00 ( 0.00%) MMTests Statistics: duration Sys Time Running Test (seconds) 113.34 107.99 109.73 108.72 User+Sys Time Running Test (seconds) 145.51 139.81 143.32 143.55 Total Elapsed Time (seconds) 1159.16 899.23 980.17 1062.27 MMTests Statistics: vmstat Page Ins 13710192 13729032 13727944 13760136 Page Outs 43071140 42987228 42733684 42931624 Swap Ins 0 0 0 0 Swap Outs 0 0 0 0 Direct pages scanned 0 0 0 0 Kswapd pages scanned 9941613 9937443 9939085 9929154 Kswapd pages reclaimed 9940926 9936751 9938397 9928465 Direct pages reclaimed 0 0 0 0 Kswapd efficiency 99% 99% 99% 99% Kswapd velocity 8576.567 11051.058 10140.164 9347.109 Direct efficiency 100% 100% 100% 100% Direct velocity 0.000 0.000 0.000 0.000 It looks like here that the full series regresses performance but as ftrace showed no usage of wait_iff_congested() or sync reclaim I am assuming it's a disruption due to monitoring. Other data such as memory usage, page IO, swap IO all looked similar. Running a benchmark with a plain DD showed nothing very interesting. The full series stalled in wait_iff_congested() slightly less but stall times on vanilla kernels were marginal. Running a benchmark that hammered on file-backed mappings showed stalls due to congestion but not in sync writebacks MICRO 3.3.0-vanilla rc2-vanilla lumpyremove-v2r3 nosync-v2r3 MMTests Statistics: duration Sys Time Running Test (seconds) 308.13 294.50 298.75 299.53 User+Sys Time Running Test (seconds) 330.45 316.28 318.93 320.79 Total Elapsed Time (seconds) 1814.90 1833.88 1821.14 1832.91 MMTests Statistics: vmstat Page Ins 108712 120708 97224 110344 Page Outs 155514576 156017404 155813676 156193256 Swap Ins 0 0 0 0 Swap Outs 0 0 0 0 Direct pages scanned 2599253 1550480 2512822 2414760 Kswapd pages scanned 69742364 71150694 68839041 69692533 Kswapd pages reclaimed 34824488 34773341 34796602 34799396 Direct pages reclaimed 53693 94750 61792 75205 Kswapd efficiency 49% 48% 50% 49% Kswapd velocity 38427.662 38797.901 37799.972 38022.889 Direct efficiency 2% 6% 2% 3% Direct velocity 1432.174 845.464 1379.807 1317.446 Percentage direct scans 3% 2% 3% 3% Page writes by reclaim 0 0 0 0 Page writes file 0 0 0 0 Page writes anon 0 0 0 0 Page reclaim immediate 0 0 0 1218 Page rescued immediate 0 0 0 0 Slabs scanned 15360 16384 13312 16384 Direct inode steals 0 0 0 0 Kswapd inode steals 4340 4327 1630 4323 FTrace Reclaim Statistics: congestion_wait Direct number congest waited 0 0 0 0 Direct time congest waited 0ms 0ms 0ms 0ms Direct full congest waited 0 0 0 0 Direct number conditional waited 900 870 754 789 Direct time conditional waited 0ms 0ms 0ms 20ms Direct full conditional waited 0 0 0 0 KSwapd number congest waited 2106 2308 2116 1915 KSwapd time congest waited 139924ms 157832ms 125652ms 132516ms KSwapd full congest waited 1346 1530 1202 1278 KSwapd number conditional waited 12922 16320 10943 14670 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 Reclaim statistics are not radically changed. The stall times in kswapd are massive but it is clear that it is due to calls to congestion_wait() and that is almost certainly the call in balance_pgdat(). Otherwise stalls due to dirty pages are non-existant. I ran a benchmark that stressed high-order allocation. This is very artifical load but was used in the past to evaluate lumpy reclaim and compaction. Generally I look at allocation success rates and latency figures. STRESS-HIGHALLOC 3.3.0-vanilla rc2-vanilla lumpyremove-v2r3 nosync-v2r3 Pass 1 81.00 ( 0.00%) 28.00 (-53.00%) 24.00 (-57.00%) 28.00 (-53.00%) Pass 2 82.00 ( 0.00%) 39.00 (-43.00%) 38.00 (-44.00%) 43.00 (-39.00%) while Rested 88.00 ( 0.00%) 87.00 (-1.00%) 88.00 ( 0.00%) 88.00 ( 0.00%) MMTests Statistics: duration Sys Time Running Test (seconds) 740.93 681.42 685.14 684.87 User+Sys Time Running Test (seconds) 2922.65 3269.52 3281.35 3279.44 Total Elapsed Time (seconds) 1161.73 1152.49 1159.55 1161.44 MMTests Statistics: vmstat Page Ins 4486020 2807256 2855944 2876244 Page Outs 7261600 7973688 7975320 7986120 Swap Ins 31694 0 0 0 Swap Outs 98179 0 0 0 Direct pages scanned 53494 57731 34406 113015 Kswapd pages scanned 6271173 1287481 1278174 1219095 Kswapd pages reclaimed 2029240 1281025 1260708 1201583 Direct pages reclaimed 1468 14564 16649 92456 Kswapd efficiency 32% 99% 98% 98% Kswapd velocity 5398.133 1117.130 1102.302 1049.641 Direct efficiency 2% 25% 48% 81% Direct velocity 46.047 50.092 29.672 97.306 Percentage direct scans 0% 4% 2% 8% Page writes by reclaim 1616049 0 0 0 Page writes file 1517870 0 0 0 Page writes anon 98179 0 0 0 Page reclaim immediate 103778 27339 9796 17831 Page rescued immediate 0 0 0 0 Slabs scanned 1096704 986112 980992 998400 Direct inode steals 223 215040 216736 247881 Kswapd inode steals 175331 61548 68444 63066 Kswapd skipped wait 21991 0 1 0 THP fault alloc 1 135 125 134 THP collapse alloc 393 311 228 236 THP splits 25 13 7 8 THP fault fallback 0 0 0 0 THP collapse fail 3 5 7 7 Compaction stalls 865 1270 1422 1518 Compaction success 370 401 353 383 Compaction failures 495 869 1069 1135 Compaction pages moved 870155 3828868 4036106 4423626 Compaction move failure 26429 23865 29742 27514 Success rates are completely hosed for 3.4-rc2 which is almost certainly due to commit fe2c2a106663 ("vmscan: reclaim at order 0 when compaction is enabled"). I expected this would happen for kswapd and impair allocation success rates (https://lkml.org/lkml/2012/1/25/166) but I did not anticipate this much a difference: 80% less scanning, 37% less reclaim by kswapd In comparison, reclaim/compaction is not aggressive and gives up easily which is the intended behaviour. hugetlbfs uses __GFP_REPEAT and would be much more aggressive about reclaim/compaction than THP allocations are. The stress test above is allocating like neither THP or hugetlbfs but is much closer to THP. Mainline is now impaired in terms of high order allocation under heavy load although I do not know to what degree as I did not test with __GFP_REPEAT. Keep this in mind for bugs related to hugepage pool resizing, THP allocation and high order atomic allocation failures from network devices. In terms of congestion throttling, I see the following for this test FTrace Reclaim Statistics: congestion_wait Direct number congest waited 3 0 0 0 Direct time congest waited 0ms 0ms 0ms 0ms Direct full congest waited 0 0 0 0 Direct number conditional waited 957 512 1081 1075 Direct time conditional waited 0ms 0ms 0ms 0ms Direct full conditional waited 0 0 0 0 KSwapd number congest waited 36 4 3 5 KSwapd time congest waited 3148ms 400ms 300ms 500ms KSwapd full congest waited 30 4 3 5 KSwapd number conditional waited 88514 197 332 542 KSwapd time conditional waited 4980ms 0ms 0ms 0ms KSwapd full conditional waited 49 0 0 0 The "conditional waited" times are the most interesting as this is directly impacted by the number of dirty pages encountered during scan. As lumpy reclaim is no longer scanning contiguous ranges, it is finding fewer dirty pages. This brings wait times from about 5 seconds to 0. kswapd itself is still calling congestion_wait() so it'll still stall but it's a lot less. In terms of the type of IO we were doing, I see this FTrace Reclaim Statistics: mm_vmscan_writepage Direct writes anon sync 0 0 0 0 Direct writes anon async 0 0 0 0 Direct writes file sync 0 0 0 0 Direct writes file async 0 0 0 0 Direct writes mixed sync 0 0 0 0 Direct writes mixed async 0 0 0 0 KSwapd writes anon sync 0 0 0 0 KSwapd writes anon async 91682 0 0 0 KSwapd writes file sync 0 0 0 0 KSwapd writes file async 822629 0 0 0 KSwapd writes mixed sync 0 0 0 0 KSwapd writes mixed async 0 0 0 0 In 3.2, kswapd was doing a bunch of async writes of pages but reclaim/compaction was never reaching a point where it was doing sync IO. This does not guarantee that reclaim/compaction was not calling wait_on_page_writeback() but I would consider it unlikely. It indicates that merging patches 2 and 3 to stop reclaim/compaction calling wait_on_page_writeback() should be safe. This patch: Lumpy reclaim had a purpose but in the mind of some, it was to kick the system so hard it trashed. For others the purpose was to complicate vmscan.c. Over time it was giving softer shoes and a nicer attitude but memory compaction needs to step up and replace it so this patch sends lumpy reclaim to the farm. The tracepoint format changes for isolating LRU pages with this patch applied. Furthermore reclaim/compaction can no longer queue dirty pages in pageout() if the underlying BDI is congested. Lumpy reclaim used this logic and reclaim/compaction was using it in error. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: KOSAKI Motohiro Cc: Konstantin Khlebnikov Cc: Hugh Dickins Cc: Ying Han Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 26 ++------ mm/vmscan.c | 144 +++++------------------------------------- 2 files changed, 19 insertions(+), 151 deletions(-) (limited to 'mm') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 572195459d5..bdaf32f8a87 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -263,22 +263,16 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, unsigned long nr_requested, unsigned long nr_scanned, unsigned long nr_taken, - unsigned long nr_lumpy_taken, - unsigned long nr_lumpy_dirty, - unsigned long nr_lumpy_failed, isolate_mode_t isolate_mode, int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file), + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file), TP_STRUCT__entry( __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) __field(unsigned long, nr_taken) - __field(unsigned long, nr_lumpy_taken) - __field(unsigned long, nr_lumpy_dirty) - __field(unsigned long, nr_lumpy_failed) __field(isolate_mode_t, isolate_mode) __field(int, file) ), @@ -288,22 +282,16 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; __entry->nr_taken = nr_taken; - __entry->nr_lumpy_taken = nr_lumpy_taken; - __entry->nr_lumpy_dirty = nr_lumpy_dirty; - __entry->nr_lumpy_failed = nr_lumpy_failed; __entry->isolate_mode = isolate_mode; __entry->file = file; ), - TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu file=%d", + TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d", __entry->isolate_mode, __entry->order, __entry->nr_requested, __entry->nr_scanned, __entry->nr_taken, - __entry->nr_lumpy_taken, - __entry->nr_lumpy_dirty, - __entry->nr_lumpy_failed, __entry->file) ); @@ -313,13 +301,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, unsigned long nr_requested, unsigned long nr_scanned, unsigned long nr_taken, - unsigned long nr_lumpy_taken, - unsigned long nr_lumpy_dirty, - unsigned long nr_lumpy_failed, isolate_mode_t isolate_mode, int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file) + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) ); @@ -329,13 +314,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, unsigned long nr_requested, unsigned long nr_scanned, unsigned long nr_taken, - unsigned long nr_lumpy_taken, - unsigned long nr_lumpy_dirty, - unsigned long nr_lumpy_failed, isolate_mode_t isolate_mode, int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file) + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) ); diff --git a/mm/vmscan.c b/mm/vmscan.c index ca46080bb07..546d02ce90e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -58,9 +58,6 @@ * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages * RECLAIM_MODE_ASYNC: Do not block * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback - * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference - * page from the LRU and reclaim all pages within a - * naturally aligned range * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of * order-0 pages and then compact the zone */ @@ -68,7 +65,6 @@ typedef unsigned __bitwise__ reclaim_mode_t; #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) -#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) struct scan_control { @@ -367,27 +363,17 @@ out: static void set_reclaim_mode(int priority, struct scan_control *sc, bool sync) { + /* Sync reclaim used only for compaction */ reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; /* - * Initially assume we are entering either lumpy reclaim or - * reclaim/compaction.Depending on the order, we will either set the - * sync mode or just reclaim order-0 pages later. - */ - if (COMPACTION_BUILD) - sc->reclaim_mode = RECLAIM_MODE_COMPACTION; - else - sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; - - /* - * Avoid using lumpy reclaim or reclaim/compaction if possible by - * restricting when its set to either costly allocations or when + * Restrict reclaim/compaction to costly allocations or when * under memory pressure */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->reclaim_mode |= syncmode; - else if (sc->order && priority < DEF_PRIORITY - 2) - sc->reclaim_mode |= syncmode; + if (COMPACTION_BUILD && sc->order && + (sc->order > PAGE_ALLOC_COSTLY_ORDER || + priority < DEF_PRIORITY - 2)) + sc->reclaim_mode = RECLAIM_MODE_COMPACTION | syncmode; else sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } @@ -416,10 +402,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi, return 1; if (bdi == current->backing_dev_info) return 1; - - /* lumpy reclaim for hugepage often need a lot of write */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - return 1; return 0; } @@ -710,10 +692,6 @@ static enum page_references page_check_references(struct page *page, referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); referenced_page = TestClearPageReferenced(page); - /* Lumpy reclaim - ignore references */ - if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) - return PAGEREF_RECLAIM; - /* * Mlock lost the isolation race with us. Let try_to_unmap() * move the page to the unevictable list. @@ -824,7 +802,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, wait_on_page_writeback(page); else { unlock_page(page); - goto keep_lumpy; + goto keep_reclaim_mode; } } @@ -908,7 +886,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; case PAGE_SUCCESS: if (PageWriteback(page)) - goto keep_lumpy; + goto keep_reclaim_mode; if (PageDirty(page)) goto keep; @@ -1008,7 +986,7 @@ keep_locked: unlock_page(page); keep: reset_reclaim_mode(sc); -keep_lumpy: +keep_reclaim_mode: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } @@ -1064,11 +1042,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) if (!all_lru_mode && !!page_is_file_cache(page) != file) return ret; - /* - * When this function is being called for lumpy reclaim, we - * initially look into all LRU pages, active, inactive and - * unevictable; only give shrink_page_list evictable pages. - */ + /* Do not give back unevictable pages for compaction */ if (PageUnevictable(page)) return ret; @@ -1153,9 +1127,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct lruvec *lruvec; struct list_head *src; unsigned long nr_taken = 0; - unsigned long nr_lumpy_taken = 0; - unsigned long nr_lumpy_dirty = 0; - unsigned long nr_lumpy_failed = 0; unsigned long scan; int lru = LRU_BASE; @@ -1168,10 +1139,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { struct page *page; - unsigned long pfn; - unsigned long end_pfn; - unsigned long page_pfn; - int zone_id; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); @@ -1193,84 +1160,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, default: BUG(); } - - if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) - continue; - - /* - * Attempt to take all pages in the order aligned region - * surrounding the tag page. Only take those pages of - * the same active state as that tag page. We may safely - * round the target page pfn down to the requested order - * as the mem_map is guaranteed valid out to MAX_ORDER, - * where that page is in a different zone we will detect - * it from its zone id and abort this block scan. - */ - zone_id = page_zone_id(page); - page_pfn = page_to_pfn(page); - pfn = page_pfn & ~((1 << sc->order) - 1); - end_pfn = pfn + (1 << sc->order); - for (; pfn < end_pfn; pfn++) { - struct page *cursor_page; - - /* The target page is in the block, ignore it. */ - if (unlikely(pfn == page_pfn)) - continue; - - /* Avoid holes within the zone. */ - if (unlikely(!pfn_valid_within(pfn))) - break; - - cursor_page = pfn_to_page(pfn); - - /* Check that we have not crossed a zone boundary. */ - if (unlikely(page_zone_id(cursor_page) != zone_id)) - break; - - /* - * If we don't have enough swap space, reclaiming of - * anon page which don't already have a swap slot is - * pointless. - */ - if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && - !PageSwapCache(cursor_page)) - break; - - if (__isolate_lru_page(cursor_page, mode, file) == 0) { - unsigned int isolated_pages; - - mem_cgroup_lru_del(cursor_page); - list_move(&cursor_page->lru, dst); - isolated_pages = hpage_nr_pages(cursor_page); - nr_taken += isolated_pages; - nr_lumpy_taken += isolated_pages; - if (PageDirty(cursor_page)) - nr_lumpy_dirty += isolated_pages; - scan++; - pfn += isolated_pages - 1; - } else { - /* - * Check if the page is freed already. - * - * We can't use page_count() as that - * requires compound_head and we don't - * have a pin on the page here. If a - * page is tail, we may or may not - * have isolated the head, so assume - * it's not free, it'd be tricky to - * track the head status without a - * page pin. - */ - if (!PageTail(cursor_page) && - !atomic_read(&cursor_page->_count)) - continue; - break; - } - } - - /* If we break out of the loop above, lumpy reclaim failed */ - if (pfn < end_pfn) - nr_lumpy_failed++; } *nr_scanned = scan; @@ -1278,7 +1167,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, nr_taken, - nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, mode, file); return nr_taken; } @@ -1466,13 +1354,13 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, int priority, struct scan_control *sc) { - int lumpy_stall_priority; + int stall_priority; /* kswapd should not stall on sync IO */ if (current_is_kswapd()) return false; - /* Only stall on lumpy reclaim */ + /* Only stall for memory compaction */ if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) return false; @@ -1487,11 +1375,11 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, * priority to be much higher before stalling. */ if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - lumpy_stall_priority = DEF_PRIORITY; + stall_priority = DEF_PRIORITY; else - lumpy_stall_priority = DEF_PRIORITY / 3; + stall_priority = DEF_PRIORITY / 3; - return priority <= lumpy_stall_priority; + return priority <= stall_priority; } /* @@ -1523,8 +1411,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, } set_reclaim_mode(priority, sc, false); - if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) - isolate_mode |= ISOLATE_ACTIVE; lru_add_drain(); -- cgit v1.2.3-70-g09d2 From 41ac1999c3e3563e1810b14878a869c79c9368bb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 29 May 2012 15:06:19 -0700 Subject: mm: vmscan: do not stall on writeback during memory compaction This patch stops reclaim/compaction entering sync reclaim as this was only intended for lumpy reclaim and an oversight. Page migration has its own logic for stalling on writeback pages if necessary and memory compaction is already using it. Waiting on page writeback is bad for a number of reasons but the primary one is that waiting on writeback to a slow device like USB can take a considerable length of time. Page reclaim instead uses wait_iff_congested() to throttle if too many dirty pages are being scanned. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: KOSAKI Motohiro Cc: Konstantin Khlebnikov Cc: Hugh Dickins Cc: Ying Han Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 12 +++--- mm/vmscan.c | 85 ++++--------------------------------------- 2 files changed, 14 insertions(+), 83 deletions(-) (limited to 'mm') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index bdaf32f8a87..82f693395ac 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -13,7 +13,7 @@ #define RECLAIM_WB_ANON 0x0001u #define RECLAIM_WB_FILE 0x0002u #define RECLAIM_WB_MIXED 0x0010u -#define RECLAIM_WB_SYNC 0x0004u +#define RECLAIM_WB_SYNC 0x0004u /* Unused, all reclaim async */ #define RECLAIM_WB_ASYNC 0x0008u #define show_reclaim_flags(flags) \ @@ -27,13 +27,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (RECLAIM_WB_ASYNC) \ ) -#define trace_shrink_flags(file, sync) ( \ - (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \ - (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ +#define trace_shrink_flags(file, sync) \ + ( \ + (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ + (RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, diff --git a/mm/vmscan.c b/mm/vmscan.c index 546d02ce90e..e27f27d4cc1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -56,15 +56,11 @@ /* * reclaim_mode determines how the inactive list is shrunk * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages - * RECLAIM_MODE_ASYNC: Do not block - * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of * order-0 pages and then compact the zone */ typedef unsigned __bitwise__ reclaim_mode_t; #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) -#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) -#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) struct scan_control { @@ -360,12 +356,8 @@ out: return ret; } -static void set_reclaim_mode(int priority, struct scan_control *sc, - bool sync) +static void set_reclaim_mode(int priority, struct scan_control *sc) { - /* Sync reclaim used only for compaction */ - reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; - /* * Restrict reclaim/compaction to costly allocations or when * under memory pressure @@ -373,14 +365,14 @@ static void set_reclaim_mode(int priority, struct scan_control *sc, if (COMPACTION_BUILD && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || priority < DEF_PRIORITY - 2)) - sc->reclaim_mode = RECLAIM_MODE_COMPACTION | syncmode; + sc->reclaim_mode = RECLAIM_MODE_COMPACTION; else - sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; + sc->reclaim_mode = RECLAIM_MODE_SINGLE; } static void reset_reclaim_mode(struct scan_control *sc) { - sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; + sc->reclaim_mode = RECLAIM_MODE_SINGLE; } static inline int is_page_cache_freeable(struct page *page) @@ -791,19 +783,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageWriteback(page)) { nr_writeback++; - /* - * Synchronous reclaim cannot queue pages for - * writeback due to the possibility of stack overflow - * but if it encounters a page under writeback, wait - * for the IO to complete. - */ - if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && - may_enter_fs) - wait_on_page_writeback(page); - else { - unlock_page(page); - goto keep_reclaim_mode; - } + unlock_page(page); + goto keep; } references = page_check_references(page, mz, sc); @@ -886,7 +867,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; case PAGE_SUCCESS: if (PageWriteback(page)) - goto keep_reclaim_mode; + goto keep; if (PageDirty(page)) goto keep; @@ -985,8 +966,6 @@ activate_locked: keep_locked: unlock_page(page); keep: - reset_reclaim_mode(sc); -keep_reclaim_mode: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } @@ -1341,47 +1320,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz, preempt_enable(); } -/* - * Returns true if a direct reclaim should wait on pages under writeback. - * - * If we are direct reclaiming for contiguous pages and we do not reclaim - * everything in the list, try again and wait for writeback IO to complete. - * This will stall high-order allocations noticeably. Only do that when really - * need to free the pages under high memory pressure. - */ -static inline bool should_reclaim_stall(unsigned long nr_taken, - unsigned long nr_freed, - int priority, - struct scan_control *sc) -{ - int stall_priority; - - /* kswapd should not stall on sync IO */ - if (current_is_kswapd()) - return false; - - /* Only stall for memory compaction */ - if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) - return false; - - /* If we have reclaimed everything on the isolated list, no stall */ - if (nr_freed == nr_taken) - return false; - - /* - * For high-order allocations, there are two stall thresholds. - * High-cost allocations stall immediately where as lower - * order allocations such as stacks require the scanning - * priority to be much higher before stalling. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - stall_priority = DEF_PRIORITY; - else - stall_priority = DEF_PRIORITY / 3; - - return priority <= stall_priority; -} - /* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages @@ -1410,7 +1348,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, return SWAP_CLUSTER_MAX; } - set_reclaim_mode(priority, sc, false); + set_reclaim_mode(priority, sc); lru_add_drain(); @@ -1442,13 +1380,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); - /* Check if we should syncronously wait for writeback */ - if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, mz, sc, - priority, &nr_dirty, &nr_writeback); - } - spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[0] += nr_anon; -- cgit v1.2.3-70-g09d2 From 23b9da55c5b0feb484bd5e8615f4eb1ce4169453 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 29 May 2012 15:06:20 -0700 Subject: mm: vmscan: remove reclaim_mode_t There is little motiviation for reclaim_mode_t once RECLAIM_MODE_[A]SYNC and lumpy reclaim have been removed. This patch gets rid of reclaim_mode_t as well and improves the documentation about what reclaim/compaction is and when it is triggered. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: KOSAKI Motohiro Cc: Konstantin Khlebnikov Cc: Hugh Dickins Cc: Ying Han Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 4 +-- mm/vmscan.c | 72 +++++++++++++------------------------------ 2 files changed, 24 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 82f693395ac..bab3b87e406 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,12 +25,12 @@ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" -#define trace_reclaim_flags(page, sync) ( \ +#define trace_reclaim_flags(page) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ (RECLAIM_WB_ASYNC) \ ) -#define trace_shrink_flags(file, sync) \ +#define trace_shrink_flags(file) \ ( \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ (RECLAIM_WB_ASYNC) \ diff --git a/mm/vmscan.c b/mm/vmscan.c index e27f27d4cc1..68e5819d0f1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -53,16 +53,6 @@ #define CREATE_TRACE_POINTS #include -/* - * reclaim_mode determines how the inactive list is shrunk - * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages - * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of - * order-0 pages and then compact the zone - */ -typedef unsigned __bitwise__ reclaim_mode_t; -#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) -#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) - struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -88,12 +78,6 @@ struct scan_control { int order; - /* - * Intend to reclaim enough continuous memory rather than reclaim - * enough amount of memory. i.e, mode for high order allocation. - */ - reclaim_mode_t reclaim_mode; - /* * The memory cgroup that hit its limit and as a result is the * primary target of this reclaim invocation. @@ -356,25 +340,6 @@ out: return ret; } -static void set_reclaim_mode(int priority, struct scan_control *sc) -{ - /* - * Restrict reclaim/compaction to costly allocations or when - * under memory pressure - */ - if (COMPACTION_BUILD && sc->order && - (sc->order > PAGE_ALLOC_COSTLY_ORDER || - priority < DEF_PRIORITY - 2)) - sc->reclaim_mode = RECLAIM_MODE_COMPACTION; - else - sc->reclaim_mode = RECLAIM_MODE_SINGLE; -} - -static void reset_reclaim_mode(struct scan_control *sc) -{ - sc->reclaim_mode = RECLAIM_MODE_SINGLE; -} - static inline int is_page_cache_freeable(struct page *page) { /* @@ -497,8 +462,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } - trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sc->reclaim_mode)); + trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -953,7 +917,6 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); - reset_reclaim_mode(sc); continue; activate_locked: @@ -1348,8 +1311,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, return SWAP_CLUSTER_MAX; } - set_reclaim_mode(priority, sc); - lru_add_drain(); if (!sc->may_unmap) @@ -1433,7 +1394,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, zone_idx(zone), nr_scanned, nr_reclaimed, priority, - trace_shrink_flags(file, sc->reclaim_mode)); + trace_shrink_flags(file)); return nr_reclaimed; } @@ -1512,8 +1473,6 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - reset_reclaim_mode(sc); - if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) @@ -1826,23 +1785,35 @@ out: } } +/* Use reclaim/compaction for costly allocs or under memory pressure */ +static bool in_reclaim_compaction(int priority, struct scan_control *sc) +{ + if (COMPACTION_BUILD && sc->order && + (sc->order > PAGE_ALLOC_COSTLY_ORDER || + priority < DEF_PRIORITY - 2)) + return true; + + return false; +} + /* - * Reclaim/compaction depends on a number of pages being freed. To avoid - * disruption to the system, a small number of order-0 pages continue to be - * rotated and reclaimed in the normal fashion. However, by the time we get - * back to the allocator and call try_to_compact_zone(), we ensure that - * there are enough free pages for it to be likely successful + * Reclaim/compaction is used for high-order allocation requests. It reclaims + * order-0 pages before compacting the zone. should_continue_reclaim() returns + * true if more pages should be reclaimed such that when the page allocator + * calls try_to_compact_zone() that it will have enough free pages to succeed. + * It will give up earlier than that if there is difficulty reclaiming pages. */ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, unsigned long nr_reclaimed, unsigned long nr_scanned, + int priority, struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; /* If not in reclaim/compaction mode, stop */ - if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) + if (!in_reclaim_compaction(priority, sc)) return false; /* Consider stopping depending on scan and reclaim activity */ @@ -1944,7 +1915,8 @@ restart: /* reclaim/compaction might need reclaim to continue */ if (should_continue_reclaim(mz, nr_reclaimed, - sc->nr_scanned - nr_scanned, sc)) + sc->nr_scanned - nr_scanned, + priority, sc)) goto restart; throttle_vm_writeout(sc->gfp_mask); -- cgit v1.2.3-70-g09d2 From 4d67d860531ad5378dedfad7661c540f3365013d Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Tue, 29 May 2012 15:06:21 -0700 Subject: mm: use kcalloc() instead of kzalloc() to allocate array The advantage of kcalloc is, that will prevent integer overflows which could result from the multiplication of number of elements and size and it is also a bit nicer to read. The semantic patch that makes this change is available in https://lkml.org/lkml/2011/11/25/107 Signed-off-by: Thomas Meyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 94dff883b44..c28b0b9e5cc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2375,8 +2375,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); - vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); + vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); + vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); if (!vas || !vms) goto err_free2; -- cgit v1.2.3-70-g09d2 From 841e31e5cc6219d62054788faa289b6ed682d068 Mon Sep 17 00:00:00 2001 From: Rajman Mekaco Date: Tue, 29 May 2012 15:06:21 -0700 Subject: mm/mmap.c: find_vma(): remove unnecessary if(mm) check The "if (mm)" check is not required in find_vma, as the kernel code calls find_vma only when it is absolutely sure that the mm_struct arg to it is non-NULL. Remove the if(mm) check and adding the a WARN_ONCE(!mm) for now. This will serve the purpose of mandating that the execution context(user-mode/kernel-mode) be known before find_vma is called. Also fixed 2 checkpatch.pl errors in the declaration of the rb_node and vma_tmp local variables. I was browsing through the internet and read a discussion at https://lkml.org/lkml/2012/3/27/342 which discusses removal of the validation check within find_vma. Since no-one responded, I decided to send this patch with Andrew's suggestions. [akpm@linux-foundation.org: add remove-me comment] Signed-off-by: Rajman Mekaco Cc: Kautuk Consul Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index e8dcfc7de86..4a9c2a391e2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1639,33 +1639,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = NULL; - if (mm) { - /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) */ - vma = mm->mmap_cache; - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node * rb_node; - - rb_node = mm->mm_rb.rb_node; - vma = NULL; - - while (rb_node) { - struct vm_area_struct * vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; + if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */ + return NULL; + + /* Check the cache first. */ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + struct rb_node *rb_node; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct *vma_tmp; + + vma_tmp = rb_entry(rb_node, + struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; } + if (vma) + mm->mmap_cache = vma; } return vma; } -- cgit v1.2.3-70-g09d2 From bde8bd8a1d5242589ddcaef8e017b48b207c4729 Mon Sep 17 00:00:00 2001 From: Sasikantha babu Date: Tue, 29 May 2012 15:06:22 -0700 Subject: mm/vmstat.c: remove debug fs entries on failure of file creation and made extfrag_debug_root dentry local Remove debug fs files and directory on failure. Since no one is using "extfrag_debug_root" dentry outside of extfrag_debug_init(), make it local to the function. Signed-off-by: Sasikantha babu Acked-by: David Rientjes Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 0dad31dc161..1bbbbd9776a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1223,7 +1223,6 @@ module_init(setup_vmstat) #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) #include -static struct dentry *extfrag_debug_root; /* * Return an index indicating how much of the available free memory is @@ -1361,19 +1360,24 @@ static const struct file_operations extfrag_file_ops = { static int __init extfrag_debug_init(void) { + struct dentry *extfrag_debug_root; + extfrag_debug_root = debugfs_create_dir("extfrag", NULL); if (!extfrag_debug_root) return -ENOMEM; if (!debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, &unusable_file_ops)) - return -ENOMEM; + goto fail; if (!debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, &extfrag_file_ops)) - return -ENOMEM; + goto fail; return 0; +fail: + debugfs_remove_recursive(extfrag_debug_root); + return -ENOMEM; } module_init(extfrag_debug_init); -- cgit v1.2.3-70-g09d2 From 1f1d06c34f7675026326cd9f39ff91e4555cf355 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 29 May 2012 15:06:23 -0700 Subject: thp, memcg: split hugepage for memcg oom on cow On COW, a new hugepage is allocated and charged to the memcg. If the system is oom or the charge to the memcg fails, however, the fault handler will return VM_FAULT_OOM which results in an oom kill. Instead, it's possible to fallback to splitting the hugepage so that the COW results only in an order-0 page being allocated and charged to the memcg which has a higher liklihood to succeed. This is expensive because the hugepage must be split in the page fault handler, but it is much better than unnecessarily oom killing a process. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 3 +++ mm/memory.c | 18 +++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d7d7165156c..edfeb8cb23d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -952,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); + if (ret & VM_FAULT_OOM) + split_huge_page(page); put_page(page); goto out; } @@ -959,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { put_page(new_page); + split_huge_page(page); put_page(page); ret |= VM_FAULT_OOM; goto out; diff --git a/mm/memory.c b/mm/memory.c index 2bf9e110437..1b7dc662bf9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3486,6 +3486,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); +retry: pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) @@ -3499,13 +3500,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd, flags); } else { pmd_t orig_pmd = *pmd; + int ret; + barrier(); if (pmd_trans_huge(orig_pmd)) { if (flags & FAULT_FLAG_WRITE && !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) - return do_huge_pmd_wp_page(mm, vma, address, - pmd, orig_pmd); + !pmd_trans_splitting(orig_pmd)) { + ret = do_huge_pmd_wp_page(mm, vma, address, pmd, + orig_pmd); + /* + * If COW results in an oom, the huge pmd will + * have been split, so retry the fault on the + * pte for a smaller charge. + */ + if (unlikely(ret & VM_FAULT_OOM)) + goto retry; + return ret; + } return 0; } } -- cgit v1.2.3-70-g09d2 From 4a5b18cc1971046d9ca3a29fdcafbe5648629585 Mon Sep 17 00:00:00 2001 From: Larry Woodman Date: Tue, 29 May 2012 15:06:24 -0700 Subject: mm: do_migrate_pages() calls migrate_to_node() even if task is already on a correct node While running an application that moves tasks from one cpuset to another I noticed that it takes much longer and moves many more pages than expected. The reason for this is do_migrate_pages() does its best to preserve the relative node differential from the first node of the cpuset because the application may have been written with that in mind. If memory was interleaved on the nodes of the source cpuset by an application do_migrate_pages() will try its best to maintain that interleaving on the nodes of the destination cpuset. This means copying the memory from all source nodes to the destination nodes even if the source and destination nodes overlap. This is a problem for userspace NUMA placement tools. The amount of time spent doing extra memory moves cancels out some of the NUMA performance improvements. Furthermore, if the number of source and destination nodes are to maintain the previous interleaving layout anyway. This patch changes do_migrate_pages() to only preserve the relative layout inside the program if the number of NUMA nodes in the source and destination mask are the same. If the number is different, we do a much more efficient migration by not touching memory that is in an allowed node. This preserves the old behaviour for programs that want it, while allowing a userspace NUMA placement tool to use the new, faster migration. This improves performance in our tests by up to a factor of 7. Without this change migrating tasks from a cpuset containing nodes 0-7 to a cpuset containing nodes 3-4, we migrate from ALL the nodes even if they are in the both the source and destination nodesets: Migrating 7 to 4 Migrating 6 to 3 Migrating 5 to 4 Migrating 4 to 3 Migrating 1 to 4 Migrating 3 to 4 Migrating 0 to 3 Migrating 2 to 3 With this change we only migrate from nodes that are not in the destination nodesets: Migrating 7 to 4 Migrating 6 to 3 Migrating 5 to 4 Migrating 2 to 3 Migrating 1 to 4 Migrating 0 to 3 Yet if we move from a cpuset containing nodes 2,3,4 to a cpuset containing 3,4,5 we still do move everything so that we preserve the desired NUMA offsets: Migrating 4 to 5 Migrating 3 to 4 Migrating 2 to 3 As far as performance is concerned this simple patch improves the time it takes to move 14, 20 and 26 large tasks from a cpuset containing nodes 0-7 to a cpuset containing nodes 1 & 3 by up to a factor of 7. Here are the timings with and without the patch: BEFORE PATCH -- Move times: 59, 140, 651 seconds ============ Moving 14 tasks from nodes (0-7) to nodes (1,3) numad(8780) do_migrate_pages (mm=0xffff88081d414400 from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x7 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x6 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x5 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x4 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x2 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x1 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 14 tasks...) PID 8890 moved to node(s) 1,3 in 59.2 seconds Moving 20 tasks from nodes (0-7) to nodes (1,4-5) numad(8780) do_migrate_pages (mm=0xffff88081d88c700 from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x7 dest=0x4 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x6 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x3 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x2 dest=0x5 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x1 dest=0x4 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 20 tasks...) PID 8962 moved to node(s) 1,4-5 in 139.88 seconds Moving 26 tasks from nodes (0-7) to nodes (1-3,5) numad(8780) do_migrate_pages (mm=0xffff88081d5bc740 from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x7 dest=0x5 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x6 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x5 dest=0x2 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x3 dest=0x5 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x2 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x1 dest=0x2 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x0 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x4 dest=0x1 flags=0x4) (Above moves repeated for each of the 26 tasks...) PID 9058 moved to node(s) 1-3,5 in 651.45 seconds AFTER PATCH -- Move times: 42, 56, 93 seconds =========== Moving 14 tasks from nodes (0-7) to nodes (5,7) numad(33209) do_migrate_pages (mm=0xffff88101d5ff140 from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x6 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x4 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x3 dest=0x7 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x2 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x1 dest=0x7 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x0 dest=0x5 flags=0x4) (Above moves repeated for each of the 14 tasks...) PID 33221 moved to node(s) 5,7 in 41.67 seconds Moving 20 tasks from nodes (0-7) to nodes (1,3,5) numad(33209) do_migrate_pages (mm=0xffff88101d6c37c0 from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x7 dest=0x3 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x6 dest=0x1 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x4 dest=0x3 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x2 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 20 tasks...) PID 33289 moved to node(s) 1,3,5 in 56.3 seconds Moving 26 tasks from nodes (0-7) to nodes (1,3,5,7) numad(33209) do_migrate_pages (mm=0xffff88101d924400 from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x6 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x4 dest=0x1 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x2 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 26 tasks...) PID 33372 moved to node(s) 1,3,5,7 in 92.67 seconds [akpm@linux-foundation.org: clean up comment layout] Signed-off-by: Larry Woodman Cc: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Cc: Mel Gorman Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a388888b6fc..d3c5de47ff6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1005,6 +1005,26 @@ int do_migrate_pages(struct mm_struct *mm, int dest = 0; for_each_node_mask(s, tmp) { + + /* + * do_migrate_pages() tries to maintain the relative + * node relationship of the pages established between + * threads and memory areas. + * + * However if the number of source nodes is not equal to + * the number of destination nodes we can not preserve + * this node relative relationship. In that case, skip + * copying memory from a node that is in the destination + * mask. + * + * Example: [2,3,4] -> [3,4,5] moves everything. + * [0-7] - > [3,4,5] moves only 0,1,2,6,7. + */ + + if ((nodes_weight(*from_nodes) != nodes_weight(*to_nodes)) && + (node_isset(s, *to_nodes))) + continue; + d = node_remap(s, *from_nodes, *to_nodes); if (s == d) continue; -- cgit v1.2.3-70-g09d2 From 0ce72d4f7333248efbef1f3309770c7edb1b2625 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 May 2012 15:06:24 -0700 Subject: mm: do_migrate_pages(): rename arguments s/from_nodes/from and s/to_nodes/to/. The "_nodes" is redundant - it duplicates the argument's type. Done in a fit of irritation over 80-col issues :( Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Larry Woodman Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 9 ++++----- mm/mempolicy.c | 18 +++++++++--------- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 7c727a90d70..4aa42732e47 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -225,8 +225,8 @@ static inline void check_highest_zone(enum zone_type k) policy_zone = k; } -int do_migrate_pages(struct mm_struct *mm, - const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, int flags); #ifdef CONFIG_TMPFS @@ -354,9 +354,8 @@ static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk, return false; } -static inline int do_migrate_pages(struct mm_struct *mm, - const nodemask_t *from_nodes, - const nodemask_t *to_nodes, int flags) +static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, int flags) { return 0; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d3c5de47ff6..f15c1b24ca1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -950,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * * Returns the number of page that could not be moved. */ -int do_migrate_pages(struct mm_struct *mm, - const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, int flags) { int busy = 0; int err; @@ -963,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm, down_read(&mm->mmap_sem); - err = migrate_vmas(mm, from_nodes, to_nodes, flags); + err = migrate_vmas(mm, from, to, flags); if (err) goto out; @@ -998,7 +998,7 @@ int do_migrate_pages(struct mm_struct *mm, * moved to an empty node, then there is nothing left worth migrating. */ - tmp = *from_nodes; + tmp = *from; while (!nodes_empty(tmp)) { int s,d; int source = -1; @@ -1021,11 +1021,11 @@ int do_migrate_pages(struct mm_struct *mm, * [0-7] - > [3,4,5] moves only 0,1,2,6,7. */ - if ((nodes_weight(*from_nodes) != nodes_weight(*to_nodes)) && - (node_isset(s, *to_nodes))) + if ((nodes_weight(*from) != nodes_weight(*to)) && + (node_isset(s, *to))) continue; - d = node_remap(s, *from_nodes, *to_nodes); + d = node_remap(s, *from, *to); if (s == d) continue; @@ -1085,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, { } -int do_migrate_pages(struct mm_struct *mm, - const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, int flags) { return -ENOSYS; } -- cgit v1.2.3-70-g09d2 From 91c63734f6908425903aed69c04035592f18d398 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:24 -0700 Subject: kernel: cgroup: push rcu read locking from css_is_ancestor() to callsite Library functions should not grab locks when the callsites can do it, even if the lock nests like the rcu read-side lock does. Push the rcu_read_lock() from css_is_ancestor() to its single user, mem_cgroup_same_or_subtree() in preparation for another user that may already hold the rcu read-side lock. Signed-off-by: Johannes Weiner Cc: Konstantin Khlebnikov Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Li Zefan Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 20 ++++++++++---------- mm/memcontrol.c | 14 +++++++++----- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a0c6af34d50..0f3527d6184 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5132,7 +5132,7 @@ EXPORT_SYMBOL_GPL(css_depth); * @root: the css supporsed to be an ancestor of the child. * * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). + * this function reads css->id, the caller must hold rcu_read_lock(). * But, considering usual usage, the csses should be valid objects after test. * Assuming that the caller will do some action to the child if this returns * returns true, the caller must take "child";s reference count. @@ -5144,18 +5144,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, { struct css_id *child_id; struct css_id *root_id; - bool ret = true; - rcu_read_lock(); child_id = rcu_dereference(child->id); + if (!child_id) + return false; root_id = rcu_dereference(root->id); - if (!child_id - || !root_id - || (child_id->depth < root_id->depth) - || (child_id->stack[root_id->depth] != root_id->id)) - ret = false; - rcu_read_unlock(); - return ret; + if (!root_id) + return false; + if (child_id->depth < root_id->depth) + return false; + if (child_id->stack[root_id->depth] != root_id->id) + return false; + return true; } void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 92675fe8a2e..faad98e6d17 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1152,12 +1152,16 @@ struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, struct mem_cgroup *memcg) { - if (root_memcg != memcg) { - return (root_memcg->use_hierarchy && - css_is_ancestor(&memcg->css, &root_memcg->css)); - } + bool ret; - return true; + if (root_memcg == memcg) + return true; + if (!root_memcg->use_hierarchy) + return false; + rcu_read_lock(); + ret = css_is_ancestor(&memcg->css, &root_memcg->css); + rcu_read_unlock(); + return ret; } int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) -- cgit v1.2.3-70-g09d2 From c3ac9a8ade65ccbfd145fbff895ae8d8d62d09b0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:25 -0700 Subject: mm: memcg: count pte references from every member of the reclaimed hierarchy The rmap walker checking page table references has historically ignored references from VMAs that were not part of the memcg that was being reclaimed during memcg hard limit reclaim. When transitioning global reclaim to memcg hierarchy reclaim, I missed that bit and now references from outside a memcg are ignored even during global reclaim. Reverting back to traditional behaviour - count all references during global reclaim and only mind references of the memcg being reclaimed during limit reclaim would be one option. However, the more generic idea is to ignore references exactly then when they are outside the hierarchy that is currently under reclaim; because only then will their reclamation be of any use to help the pressure situation. It makes no sense to ignore references from a sibling memcg and then evict a page that will be immediately refaulted by that sibling which contributes to the same usage of the common ancestor under reclaim. The solution: make the rmap walker ignore references from VMAs that are not part of the hierarchy that is being reclaimed. Flat limit reclaim will stay the same, hierarchical limit reclaim will mind the references only to pages that the hierarchy owns. Global reclaim, since it reclaims from all memcgs, will be fixed to regard all references. [akpm@linux-foundation.org: name the args in the declaration] Signed-off-by: Johannes Weiner Reported-by: Konstantin Khlebnikov Acked-by: Konstantin Khlebnikov Cc: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Li Zefan Cc: Li Zefan Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ++++++- mm/memcontrol.c | 16 +++++++++++----- mm/vmscan.c | 6 ++++-- 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f94efd2f6c2..18ea0b7baf3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -79,6 +79,8 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page); extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order); +bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, + struct mem_cgroup *memcg); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); @@ -92,10 +94,13 @@ static inline int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) { struct mem_cgroup *memcg; + int match; + rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner)); + match = __mem_cgroup_same_or_subtree(cgroup, memcg); rcu_read_unlock(); - return cgroup == memcg; + return match; } extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index faad98e6d17..4f71219cc53 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1149,17 +1149,23 @@ struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, * Checks whether given mem is same or in the root_mem_cgroup's * hierarchy subtree */ -static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) +bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, + struct mem_cgroup *memcg) { - bool ret; - if (root_memcg == memcg) return true; if (!root_memcg->use_hierarchy) return false; + return css_is_ancestor(&memcg->css, &root_memcg->css); +} + +static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, + struct mem_cgroup *memcg) +{ + bool ret; + rcu_read_lock(); - ret = css_is_ancestor(&memcg->css, &root_memcg->css); + ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); rcu_read_unlock(); return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 68e5819d0f1..8fffc65a84d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -645,7 +645,8 @@ static enum page_references page_check_references(struct page *page, int referenced_ptes, referenced_page; unsigned long vm_flags; - referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); + referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, + &vm_flags); referenced_page = TestClearPageReferenced(page); /* @@ -1513,7 +1514,8 @@ static void shrink_active_list(unsigned long nr_to_scan, } } - if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { + if (page_referenced(page, 0, sc->target_mem_cgroup, + &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and -- cgit v1.2.3-70-g09d2 From 096a7cf44712ab531101bb4689f75f7fcd9b9f18 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 29 May 2012 15:06:25 -0700 Subject: mm: rename is_mlocked_vma() to mlocked_vma_newpage() Andrew pointed out that the is_mlocked_vma() is misnamed. A function with name like that would expect bool return and no side-effects. Since it is called on the fault path for new page, rename it in this patch. Signed-off-by: Ying Han Reviewed-by: Rik van Riel Acked-by: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim [akpm@linux-foundation.org: s/mlock_vma_newpage/mlock_vma_newpage/, per Minchan] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 +++-- mm/vmscan.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index aee4761cf9a..8b0fc8da802 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -164,7 +164,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) * to determine if it's being mapped into a LOCKED vma. * If so, mark page as mlocked. */ -static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) +static inline int mlocked_vma_newpage(struct vm_area_struct *vma, + struct page *page) { VM_BUG_ON(PageLRU(page)); @@ -222,7 +223,7 @@ extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); #endif #else /* !CONFIG_MMU */ -static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) +static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) { return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 8fffc65a84d..44f04364a30 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3321,7 +3321,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) if (mapping_unevictable(page_mapping(page))) return 0; - if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) + if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) return 0; return 1; -- cgit v1.2.3-70-g09d2 From 6f60b69d8cabbf7c0daf879cae4d09ac0776f4b4 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 29 May 2012 15:06:26 -0700 Subject: mm, thp: drop page_table_lock to uncharge memcg pages mm->page_table_lock is hotly contested for page fault tests and isn't necessary to do mem_cgroup_uncharge_page() in do_huge_pmd_wp_page(). Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Acked-by: Johannes Weiner Reviewed-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index edfeb8cb23d..d0def42c121 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -973,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { + spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(new_page); put_page(new_page); + goto out; } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); -- cgit v1.2.3-70-g09d2 From 5febcbe99d4766cc383909c447e002e63d8b4592 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Tue, 29 May 2012 15:06:27 -0700 Subject: Cross Memory Attach: make it Kconfigurable Add a Kconfig option to allow people who don't want cross memory attach to not have it included in their build. Signed-off-by: Chris Yeoh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 10 ++++++++++ mm/Makefile | 7 +++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 39220026c79..b2176374b98 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -349,6 +349,16 @@ choice benefit. endchoice +config CROSS_MEMORY_ATTACH + bool "Cross Memory Support" + depends on MMU + default y + help + Enabling this option adds the system calls process_vm_readv and + process_vm_writev which allow a process with the correct privileges + to directly read from or write to to another process's address space. + See the man page for more details. + # # UP and nommu archs use km based percpu allocator # diff --git a/mm/Makefile b/mm/Makefile index ccecbf9818f..a156285ce88 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,8 +5,11 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o \ - process_vm_access.o + vmalloc.o pagewalk.o pgtable-generic.o + +ifdef CONFIG_CROSS_MEMORY_ATTACH +mmu-$(CONFIG_MMU) += process_vm_access.o +endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ -- cgit v1.2.3-70-g09d2 From a62e2f4f508863da8e0c2f2b42f5252a87330297 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 29 May 2012 15:06:30 -0700 Subject: mm: print physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel. For example: -Zone PFN ranges: +Zone ranges: - DMA32 0x00000010 -> 0x00100000 + DMA32 [mem 0x00010000-0xffffffff] - Normal 0x00100000 -> 0x01080000 + Normal [mem 0x100000000-0x107fffffff] Signed-off-by: Bjorn Helgaas Cc: Yinghai Lu Cc: Konrad Rzeszutek Wilk Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 14 ++++++++------ mm/page_alloc.c | 19 +++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc898cb4fe8..0d7e3ec8e0f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->end = start + size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) { - printk("System RAM resource %llx - %llx cannot be added\n", - (unsigned long long)res->start, (unsigned long long)res->end); + printk("System RAM resource %pR cannot be added\n", res); kfree(res); res = NULL; } @@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) online_pages_range); if (ret) { mutex_unlock(&zonelists_mutex); - printk(KERN_DEBUG "online_pages %lx at %lx failed\n", - nr_pages, pfn); + printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", + (unsigned long long) pfn << PAGE_SHIFT, + (((unsigned long long) pfn + nr_pages) + << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); unlock_memory_hotplug(); return ret; @@ -977,8 +978,9 @@ repeat: return 0; failed_removal: - printk(KERN_INFO "memory offlining %lx to %lx failed\n", - start_pfn, end_pfn); + printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bab8e3bc420..8d71ad8ffa4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4815,7 +4815,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ - printk("Zone PFN ranges:\n"); + printk("Zone ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; @@ -4824,22 +4824,25 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) arch_zone_highest_possible_pfn[i]) printk(KERN_CONT "empty\n"); else - printk(KERN_CONT "%0#10lx -> %0#10lx\n", - arch_zone_lowest_possible_pfn[i], - arch_zone_highest_possible_pfn[i]); + printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", + arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, + (arch_zone_highest_possible_pfn[i] + << PAGE_SHIFT) - 1); } /* Print out the PFNs ZONE_MOVABLE begins at in each node */ - printk("Movable zone start PFN for each node\n"); + printk("Movable zone start for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i]) - printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); + printk(" Node %d: %#010lx\n", i, + zone_movable_pfn[i] << PAGE_SHIFT); } /* Print out the early_node_map[] */ - printk("Early memory PFN ranges\n"); + printk("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); + printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, + start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); /* Initialise every node */ mminit_verify_pageflags_layout(); -- cgit v1.2.3-70-g09d2 From 955c1cd7401565671b064e499115344ec8067dfd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 May 2012 15:06:31 -0700 Subject: mm/page_alloc.c: remove pageblock_default_order() This has always been broken: one version takes an unsigned int and the other version takes no arguments. This bug was hidden because one version of set_pageblock_order() was a macro which doesn't evaluate its argument. Simplify it all and remove pageblock_default_order() altogether. Reported-by: rajman mekaco Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Tejun Heo Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d71ad8ffa4..84f2c599d5d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4300,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -/* Return a sensible default order for the pageblock size. */ -static inline int pageblock_default_order(void) -{ - if (HPAGE_SHIFT > PAGE_SHIFT) - return HUGETLB_PAGE_ORDER; - - return MAX_ORDER-1; -} - /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -static inline void __init set_pageblock_order(unsigned int order) +static inline void __init set_pageblock_order(void) { + unsigned int order; + /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; + if (HPAGE_SHIFT > PAGE_SHIFT) + order = HUGETLB_PAGE_ORDER; + else + order = MAX_ORDER - 1; + /* * Assume the largest contiguous order of interest is a huge page. - * This value may be variable depending on boot parameters on IA64 + * This value may be variable depending on boot parameters on IA64 and + * powerpc. */ pageblock_order = order; } @@ -4326,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order) /* * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() - * and pageblock_default_order() are unused as pageblock_order is set - * at compile-time. See include/linux/pageblock-flags.h for the values of - * pageblock_order based on the kernel config + * is unused as pageblock_order is set at compile-time. See + * include/linux/pageblock-flags.h for the values of pageblock_order based on + * the kernel config */ -static inline int pageblock_default_order(unsigned int order) +static inline void set_pageblock_order(void) { - return MAX_ORDER-1; } -#define set_pageblock_order(x) do {} while (0) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ @@ -4422,7 +4419,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, if (!size) continue; - set_pageblock_order(pageblock_default_order()); + set_pageblock_order(); setup_usemap(pgdat, zone, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); -- cgit v1.2.3-70-g09d2 From 6dccdcbe2c3ebe152847ac8507e7bded4e3f4546 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 29 May 2012 15:06:32 -0700 Subject: mm: bootmem: fix checking the bitmap when finally freeing bootmem When bootmem releases an unaligned chunk of memory at the beginning of a node to the page allocator, it iterates from that unaligned PFN but checks an aligned word of the page bitmap. The checked bits do not correspond to the PFNs and, as a result, reserved pages can be freed. Properly shift the bitmap word so that the lowest bit corresponds to the starting PFN before entering the freeing loop. This bug has been around since commit 41546c17418f ("bootmem: clean up free_all_bootmem_core") (2.6.27) without known reports. Signed-off-by: Gavin Shan Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: David S. Miller Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 0131170c9d5..67872fca97d 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -203,6 +203,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) } else { unsigned long off = 0; + vec >>= start & (BITS_PER_LONG - 1); while (vec && off < BITS_PER_LONG) { if (vec & 1) { page = pfn_to_page(start + off); -- cgit v1.2.3-70-g09d2 From 549381e19cc7874bb15f0e4760c1004d4fe28d8a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:32 -0700 Subject: mm: bootmem: remove redundant offset check when finally freeing bootmem When bootmem releases an unaligned BITS_PER_LONG pages chunk of memory to the page allocator, it checks the bitmap if there are still unreserved pages in the chunk (set bits), but also if the offset in the chunk indicates BITS_PER_LONG loop iterations already. But since the consulted bitmap is only a one-word-excerpt of the full per-node bitmap, there can not be more than BITS_PER_LONG bits set in it. The additional offset check is unnecessary. Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: David S. Miller Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 67872fca97d..053ac3f1cfe 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -204,7 +204,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long off = 0; vec >>= start & (BITS_PER_LONG - 1); - while (vec && off < BITS_PER_LONG) { + while (vec) { if (vec & 1) { page = pfn_to_page(start + off); __free_pages_bootmem(page, 0); -- cgit v1.2.3-70-g09d2 From c6785b6bf1b2a4b47238b24ee56f61e27c3af682 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:33 -0700 Subject: mm: bootmem: rename alloc_bootmem_core to alloc_bootmem_bdata Callsites need to provide a bootmem_data_t *, make the naming more descriptive. Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: David S. Miller Cc: Yinghai Lu Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 053ac3f1cfe..ceed0df3944 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -468,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata, return ALIGN(base + off, align) - base; } -static void * __init alloc_bootmem_core(struct bootmem_data *bdata, +static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { @@ -589,7 +589,7 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); if (p_bdata) - return alloc_bootmem_core(p_bdata, size, align, + return alloc_bootmem_bdata(p_bdata, size, align, goal, limit); } #endif @@ -615,7 +615,7 @@ restart: if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) break; - region = alloc_bootmem_core(bdata, size, align, goal, limit); + region = alloc_bootmem_bdata(bdata, size, align, goal, limit); if (region) return region; } @@ -695,7 +695,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, if (ptr) return ptr; - ptr = alloc_bootmem_core(bdata, size, align, goal, limit); + ptr = alloc_bootmem_bdata(bdata, size, align, goal, limit); if (ptr) return ptr; @@ -744,7 +744,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long new_goal; new_goal = MAX_DMA32_PFN << PAGE_SHIFT; - ptr = alloc_bootmem_core(pgdat->bdata, size, align, + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, new_goal, 0); if (ptr) return ptr; @@ -773,7 +773,7 @@ void * __init alloc_bootmem_section(unsigned long size, goal = pfn << PAGE_SHIFT; bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; - return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); + return alloc_bootmem_bdata(bdata, size, SMP_CACHE_BYTES, goal, 0); } #endif @@ -789,7 +789,7 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (ptr) return ptr; - ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; -- cgit v1.2.3-70-g09d2 From c12ab504aa6076d1f1d37ee32431608ed11a1c3b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:33 -0700 Subject: mm: bootmem: split out goal-to-node mapping from goal dropping Matching the desired goal to the right node is one thing, dropping the goal when it can not be satisfied is another. Split this into separate functions so that subsequent patches can use the node-finding but drop and handle the goal fallback on their own terms. Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: David S. Miller Cc: Yinghai Lu Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index ceed0df3944..bafeb2c171c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -596,7 +596,7 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, return NULL; } -static void * __init ___alloc_bootmem_nopanic(unsigned long size, +static void * __init alloc_bootmem_core(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -604,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, bootmem_data_t *bdata; void *region; -restart: region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); if (region) return region; @@ -620,6 +619,20 @@ restart: return region; } + return NULL; +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + +restart: + ptr = alloc_bootmem_core(size, align, goal, limit); + if (ptr) + return ptr; if (goal) { goal = 0; goto restart; -- cgit v1.2.3-70-g09d2 From ab3818432294a19ad793a0965d89867b4ce6255b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:34 -0700 Subject: mm: bootmem: allocate in order node+goal, goal, node, anywhere Match the nobootmem version of __alloc_bootmem_node. Try to satisfy both the node and the goal, then just the goal, then just the node, then allocate anywhere before panicking. Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: David S. Miller Cc: Yinghai Lu Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index bafeb2c171c..b5babdfb9ef 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -704,6 +704,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, { void *ptr; +again: ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); if (ptr) return ptr; @@ -712,7 +713,18 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, if (ptr) return ptr; - return ___alloc_bootmem(size, align, goal, limit); + ptr = alloc_bootmem_core(size, align, goal, limit); + if (ptr) + return ptr; + + if (goal) { + goal = 0; + goto again; + } + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; } /** -- cgit v1.2.3-70-g09d2 From 421456edd27cf512b8f0025245a0f3572bd69b00 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:34 -0700 Subject: mm: bootmem: unify allocation policy of (non-)panicking node allocations While the panicking node-specific allocation function tries to satisfy node+goal, goal, node, anywhere, the non-panicking function still does node+goal, goal, anywhere. Make it simpler: define the panicking version in terms of the non-panicking one, like the node-agnostic interface, so they always behave the same way apart from how to deal with allocation failure. Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: David S. Miller Cc: Yinghai Lu Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index b5babdfb9ef..d9185c30f3c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, +static void * __init ___alloc_bootmem_node_nopanic(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { @@ -722,6 +722,29 @@ again: goto again; } + return NULL; +} + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node_nopanic(pgdat->bdata, size, + align, goal, 0); +} + +void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, + unsigned long align, unsigned long goal, + unsigned long limit) +{ + void *ptr; + + ptr = ___alloc_bootmem_node_nopanic(bdata, size, align, goal, 0); + if (ptr) + return ptr; + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; @@ -802,25 +825,6 @@ void * __init alloc_bootmem_section(unsigned long size, } #endif -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); - if (ptr) - return ptr; - - ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, 0); - if (ptr) - return ptr; - - return __alloc_bootmem_nopanic(size, align, goal); -} - #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif -- cgit v1.2.3-70-g09d2 From 2c478eae96501163c5c5d5f682bba4d34a7ea1d4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:35 -0700 Subject: mm: nobootmem: panic on node-specific allocation failure __alloc_bootmem_node and __alloc_bootmem_low_node documentation claims the functions panic on allocation failure. Do it. Signed-off-by: Johannes Weiner Acked-by: Yinghai Lu Acked-by: Tejun Heo Acked-by: David S. Miller Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 1983fb1c702..cca76207e61 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -305,11 +305,17 @@ again: ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, -1ULL); - if (!ptr && goal) { + if (ptr) + return ptr; + + if (goal) { goal = 0; goto again; } - return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -407,6 +413,12 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, if (ptr) return ptr; - return __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); + if (ptr) + return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; } -- cgit v1.2.3-70-g09d2 From ba539868331874da02017e8dda8ed5b86af6d21a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:35 -0700 Subject: mm: nobootmem: unify allocation policy of (non-)panicking node allocations While the panicking node-specific allocation function tries to satisfy node+goal, goal, node, anywhere, the non-panicking function still does node+goal, goal, anywhere. Make it simpler: define the panicking version in terms of the non-panicking one, like the node-agnostic interface, so they always behave the same way apart from how to deal with allocation failure. Signed-off-by: Johannes Weiner Acked-by: Yinghai Lu Acked-by: Tejun Heo Acked-by: David S. Miller Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 106 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 54 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index cca76207e61..9f4048149f6 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -274,6 +274,57 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } +static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + +again: + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, limit); + if (ptr) + return ptr; + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, limit); + if (ptr) + return ptr; + + if (goal) { + goal = 0; + goto again; + } + + return NULL; +} + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); +} + +void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal, + unsigned long limit) +{ + void *ptr; + + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit); + if (ptr) + return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + /** * __alloc_bootmem_node - allocate boot memory from a specific node * @pgdat: node to allocate from @@ -292,30 +343,10 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -again: - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); - if (ptr) - return ptr; - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, -1ULL); - if (ptr) - return ptr; - - if (goal) { - goal = 0; - goto again; - } - - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; + return ___alloc_bootmem_node(pgdat, size, align, goal, 0); } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -346,22 +377,6 @@ void * __init alloc_bootmem_section(unsigned long size, } #endif -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); - if (ptr) - return ptr; - - return __alloc_bootmem_nopanic(size, align, goal); -} - #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif @@ -403,22 +418,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; + return ___alloc_bootmem_node(pgdat, size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); } -- cgit v1.2.3-70-g09d2 From e9079911e6c8fb7882de142de76f3ee49b54e000 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:36 -0700 Subject: mm: bootmem: pass pgdat instead of pgdat->bdata down the stack Pass down the node descriptor instead of the more specific bootmem node descriptor down the call stack, like nobootmem does, when there is no good reason for the two to be different. Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: David S. Miller Cc: Yinghai Lu Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index d9185c30f3c..9d0f26664b3 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -698,18 +698,19 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -static void * __init ___alloc_bootmem_node_nopanic(bootmem_data_t *bdata, +static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { void *ptr; again: - ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); + ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, + align, goal, limit); if (ptr) return ptr; - ptr = alloc_bootmem_bdata(bdata, size, align, goal, limit); + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); if (ptr) return ptr; @@ -731,17 +732,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node_nopanic(pgdat->bdata, size, - align, goal, 0); + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } -void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, +void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { void *ptr; - ptr = ___alloc_bootmem_node_nopanic(bdata, size, align, goal, 0); + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); if (ptr) return ptr; @@ -771,7 +771,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); + return ___alloc_bootmem_node(pgdat, size, align, goal, 0); } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -869,6 +869,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node(pgdat->bdata, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); + return ___alloc_bootmem_node(pgdat, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); } -- cgit v1.2.3-70-g09d2 From 238305bb4d418c95977162ba13c11880685fc731 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:36 -0700 Subject: mm: remove sparsemem allocation details from the bootmem allocator alloc_bootmem_section() derives allocation area constraints from the specified sparsemem section. This is a bit specific for a generic memory allocator like bootmem, though, so move it over to sparsemem. As __alloc_bootmem_node_nopanic() already retries failed allocations with relaxed area constraints, the fallback code in sparsemem.c can be removed and the code becomes a bit more compact overall. [akpm@linux-foundation.org: fix build] Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: David S. Miller Cc: Yinghai Lu Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 3 --- mm/bootmem.c | 22 ---------------------- mm/nobootmem.c | 22 ---------------------- mm/sparse.c | 25 ++++++++++++------------- 4 files changed, 12 insertions(+), 60 deletions(-) (limited to 'mm') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 1a0cd270bb7..324fe08ea3b 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -135,9 +135,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, int flags); -extern void *alloc_bootmem_section(unsigned long size, - unsigned long section_nr); - #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP extern void *alloc_remap(int nid, unsigned long size); #else diff --git a/mm/bootmem.c b/mm/bootmem.c index 9d0f26664b3..d1c7a79d6f3 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -803,28 +803,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, } -#ifdef CONFIG_SPARSEMEM -/** - * alloc_bootmem_section - allocate boot memory from a specific section - * @size: size of the request in bytes - * @section_nr: sparse map section to allocate from - * - * Return NULL on failure. - */ -void * __init alloc_bootmem_section(unsigned long size, - unsigned long section_nr) -{ - bootmem_data_t *bdata; - unsigned long pfn, goal; - - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; - - return alloc_bootmem_bdata(bdata, size, SMP_CACHE_BYTES, goal, 0); -} -#endif - #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 9f4048149f6..d23415c001b 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -355,28 +355,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, return __alloc_bootmem_node(pgdat, size, align, goal); } -#ifdef CONFIG_SPARSEMEM -/** - * alloc_bootmem_section - allocate boot memory from a specific section - * @size: size of the request in bytes - * @section_nr: sparse map section to allocate from - * - * Return NULL on failure. - */ -void * __init alloc_bootmem_section(unsigned long size, - unsigned long section_nr) -{ - unsigned long pfn, goal, limit; - - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; - - return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, - SMP_CACHE_BYTES, goal, limit); -} -#endif - #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif diff --git a/mm/sparse.c b/mm/sparse.c index a8bc7d364de..6a4bf9160e8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -273,10 +273,10 @@ static unsigned long *__kmalloc_section_usemap(void) #ifdef CONFIG_MEMORY_HOTREMOVE static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - unsigned long count) + unsigned long size) { - unsigned long section_nr; - + pg_data_t *host_pgdat; + unsigned long goal; /* * A page may contain usemaps for other sections preventing the * page being freed and making a section unremovable while @@ -287,8 +287,10 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, * from the same section as the pgdat where possible to avoid * this problem. */ - section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); - return alloc_bootmem_section(usemap_size() * count, section_nr); + goal = __pa(pgdat) & PAGE_SECTION_MASK; + host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); + return __alloc_bootmem_node_nopanic(host_pgdat, size, + SMP_CACHE_BYTES, goal); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -332,9 +334,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) #else static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - unsigned long count) + unsigned long size) { - return NULL; + return alloc_bootmem_node_nopanic(pgdat, size); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -352,13 +354,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, int size = usemap_size(); usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), - usemap_count); + size * usemap_count); if (!usemap) { - usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); - if (!usemap) { - printk(KERN_WARNING "%s: allocation failed\n", __func__); - return; - } + printk(KERN_WARNING "%s: allocation failed\n", __func__); + return; } for (pnum = pnum_begin; pnum < pnum_end; pnum++) { -- cgit v1.2.3-70-g09d2 From 5ceb9ce6fe9462a298bb2cd5c9f1ca6cb80a0199 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 29 May 2012 15:06:37 -0700 Subject: mm: compaction: handle incorrect MIGRATE_UNMOVABLE type pageblocks When MIGRATE_UNMOVABLE pages are freed from MIGRATE_UNMOVABLE type pageblock (and some MIGRATE_MOVABLE pages are left in it) waiting until an allocation takes ownership of the block may take too long. The type of the pageblock remains unchanged so the pageblock cannot be used as a migration target during compaction. Fix it by: * Adding enum compact_mode (COMPACT_ASYNC_[MOVABLE,UNMOVABLE], and COMPACT_SYNC) and then converting sync field in struct compact_control to use it. * Adding nr_pageblocks_skipped field to struct compact_control and tracking how many destination pageblocks were of MIGRATE_UNMOVABLE type. If COMPACT_ASYNC_MOVABLE mode compaction ran fully in try_to_compact_pages() (COMPACT_COMPLETE) it implies that there is not a suitable page for allocation. In this case then check how if there were enough MIGRATE_UNMOVABLE pageblocks to try a second pass in COMPACT_ASYNC_UNMOVABLE mode. * Scanning the MIGRATE_UNMOVABLE pageblocks (during COMPACT_SYNC and COMPACT_ASYNC_UNMOVABLE compaction modes) and building a count based on finding PageBuddy pages, page_count(page) == 0 or PageLRU pages. If all pages within the MIGRATE_UNMOVABLE pageblock are in one of those three sets change the whole pageblock type to MIGRATE_MOVABLE. My particular test case (on a ARM EXYNOS4 device with 512 MiB, which means 131072 standard 4KiB pages in 'Normal' zone) is to: - allocate 120000 pages for kernel's usage - free every second page (60000 pages) of memory just allocated - allocate and use 60000 pages from user space - free remaining 60000 pages of kernel memory (now we have fragmented memory occupied mostly by user space pages) - try to allocate 100 order-9 (2048 KiB) pages for kernel's usage The results: - with compaction disabled I get 11 successful allocations - with compaction enabled - 14 successful allocations - with this patch I'm able to get all 100 successful allocations NOTE: If we can make kswapd aware of order-0 request during compaction, we can enhance kswapd with changing mode to COMPACT_ASYNC_FULL (COMPACT_ASYNC_MOVABLE + COMPACT_ASYNC_UNMOVABLE). Please see the following thread: http://marc.info/?l=linux-mm&m=133552069417068&w=2 [minchan@kernel.org: minor cleanups] Cc: Mel Gorman Cc: Minchan Kim Cc: Rik van Riel Cc: Marek Szyprowski Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 19 ++++++ mm/compaction.c | 142 +++++++++++++++++++++++++++++++++++++-------- mm/internal.h | 9 ++- mm/page_alloc.c | 8 +-- 4 files changed, 150 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 51a90b7f2d6..e988037abd2 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -1,6 +1,8 @@ #ifndef _LINUX_COMPACTION_H #define _LINUX_COMPACTION_H +#include + /* Return values for compact_zone() and try_to_compact_pages() */ /* compaction didn't start as it was not possible or direct reclaim was more suitable */ #define COMPACT_SKIPPED 0 @@ -11,6 +13,23 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 +/* + * compaction supports three modes + * + * COMPACT_ASYNC_MOVABLE uses asynchronous migration and only scans + * MIGRATE_MOVABLE pageblocks as migration sources and targets. + * COMPACT_ASYNC_UNMOVABLE uses asynchronous migration and only scans + * MIGRATE_MOVABLE pageblocks as migration sources. + * MIGRATE_UNMOVABLE pageblocks are scanned as potential migration + * targets and convers them to MIGRATE_MOVABLE if possible + * COMPACT_SYNC uses synchronous migration and scans all pageblocks + */ +enum compact_mode { + COMPACT_ASYNC_MOVABLE, + COMPACT_ASYNC_UNMOVABLE, + COMPACT_SYNC, +}; + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, diff --git a/mm/compaction.c b/mm/compaction.c index da7d35ea510..840ee288e29 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -235,7 +235,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (!cc->sync) + if (cc->mode != COMPACT_SYNC) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -303,7 +303,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * satisfies the allocation */ pageblock_nr = low_pfn >> pageblock_order; - if (!cc->sync && last_pageblock_nr != pageblock_nr && + if (cc->mode != COMPACT_SYNC && + last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { low_pfn += pageblock_nr_pages; low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; @@ -324,7 +325,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } - if (!cc->sync) + if (cc->mode != COMPACT_SYNC) mode |= ISOLATE_ASYNC_MIGRATE; /* Try isolate the page */ @@ -357,27 +358,90 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION +/* + * Returns true if MIGRATE_UNMOVABLE pageblock was successfully + * converted to MIGRATE_MOVABLE type, false otherwise. + */ +static bool rescue_unmovable_pageblock(struct page *page) +{ + unsigned long pfn, start_pfn, end_pfn; + struct page *start_page, *end_page; + + pfn = page_to_pfn(page); + start_pfn = pfn & ~(pageblock_nr_pages - 1); + end_pfn = start_pfn + pageblock_nr_pages; + + start_page = pfn_to_page(start_pfn); + end_page = pfn_to_page(end_pfn); + + /* Do not deal with pageblocks that overlap zones */ + if (page_zone(start_page) != page_zone(end_page)) + return false; + + for (page = start_page, pfn = start_pfn; page < end_page; pfn++, + page++) { + if (!pfn_valid_within(pfn)) + continue; + + if (PageBuddy(page)) { + int order = page_order(page); + + pfn += (1 << order) - 1; + page += (1 << order) - 1; + + continue; + } else if (page_count(page) == 0 || PageLRU(page)) + continue; + + return false; + } + + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE); + return true; +} -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) +enum smt_result { + GOOD_AS_MIGRATION_TARGET, + FAIL_UNMOVABLE_TARGET, + FAIL_BAD_TARGET, +}; + +/* + * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block + * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page + * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise. + */ +static enum smt_result suitable_migration_target(struct page *page, + struct compact_control *cc) { int migratetype = get_pageblock_migratetype(page); /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return false; + return FAIL_BAD_TARGET; /* If the page is a large free page, then allow migration */ if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; + return GOOD_AS_MIGRATION_TARGET; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(migratetype)) - return true; + if (cc->mode != COMPACT_ASYNC_UNMOVABLE && + migrate_async_suitable(migratetype)) + return GOOD_AS_MIGRATION_TARGET; + + if (cc->mode == COMPACT_ASYNC_MOVABLE && + migratetype == MIGRATE_UNMOVABLE) + return FAIL_UNMOVABLE_TARGET; + + if (cc->mode != COMPACT_ASYNC_MOVABLE && + migratetype == MIGRATE_UNMOVABLE && + rescue_unmovable_pageblock(page)) + return GOOD_AS_MIGRATION_TARGET; /* Otherwise skip the block */ - return false; + return FAIL_BAD_TARGET; } /* @@ -410,6 +474,13 @@ static void isolate_freepages(struct zone *zone, zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + /* + * isolate_freepages() may be called more than once during + * compact_zone_order() run and we want only the most recent + * count. + */ + cc->nr_pageblocks_skipped = 0; + /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate @@ -418,6 +489,7 @@ static void isolate_freepages(struct zone *zone, for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; + enum smt_result ret; if (!pfn_valid(pfn)) continue; @@ -434,9 +506,12 @@ static void isolate_freepages(struct zone *zone, continue; /* Check the block is suitable for migration */ - if (!suitable_migration_target(page)) + ret = suitable_migration_target(page, cc); + if (ret != GOOD_AS_MIGRATION_TARGET) { + if (ret == FAIL_UNMOVABLE_TARGET) + cc->nr_pageblocks_skipped++; continue; - + } /* * Found a block suitable for isolating free pages from. Now * we disabled interrupts, double check things are ok and @@ -445,12 +520,14 @@ static void isolate_freepages(struct zone *zone, */ isolated = 0; spin_lock_irqsave(&zone->lock, flags); - if (suitable_migration_target(page)) { + ret = suitable_migration_target(page, cc); + if (ret == GOOD_AS_MIGRATION_TARGET) { end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); isolated = isolate_freepages_block(pfn, end_pfn, freelist, false); nr_freepages += isolated; - } + } else if (ret == FAIL_UNMOVABLE_TARGET) + cc->nr_pageblocks_skipped++; spin_unlock_irqrestore(&zone->lock, flags); /* @@ -682,8 +759,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, false, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); + (unsigned long)&cc->freepages, false, + (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT + : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -712,7 +790,8 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync) + enum compact_mode mode, + unsigned long *nr_pageblocks_skipped) { struct compact_control cc = { .nr_freepages = 0, @@ -720,12 +799,17 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .sync = sync, + .mode = mode, }; + unsigned long rc; + INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - return compact_zone(zone, &cc); + rc = compact_zone(zone, &cc); + *nr_pageblocks_skipped = cc.nr_pageblocks_skipped; + + return rc; } int sysctl_extfrag_threshold = 500; @@ -750,6 +834,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; + unsigned long nr_pageblocks_skipped; + enum compact_mode mode; /* * Check whether it is worth even starting compaction. The order check is @@ -766,12 +852,22 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync); + mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE; +retry: + status = compact_zone_order(zone, order, gfp_mask, mode, + &nr_pageblocks_skipped); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; + + if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) { + if (nr_pageblocks_skipped) { + mode = COMPACT_ASYNC_UNMOVABLE; + goto retry; + } + } } return rc; @@ -805,7 +901,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (ok && cc->order > zone->compact_order_failed) zone->compact_order_failed = cc->order + 1; /* Currently async compaction is never deferred. */ - else if (!ok && cc->sync) + else if (!ok && cc->mode == COMPACT_SYNC) defer_compaction(zone, cc->order); } @@ -820,7 +916,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .sync = false, + .mode = COMPACT_ASYNC_MOVABLE, }; return __compact_pgdat(pgdat, &cc); @@ -830,7 +926,7 @@ static int compact_node(int nid) { struct compact_control cc = { .order = -1, - .sync = true, + .mode = COMPACT_SYNC, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index 8b0fc8da802..4194ab9dc19 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -94,6 +94,9 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ +extern void set_pageblock_migratetype(struct page *page, int migratetype); +extern int move_freepages_block(struct zone *zone, struct page *page, + int migratetype); extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE @@ -101,6 +104,7 @@ extern bool is_free_buddy_page(struct page *page); #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA +#include /* * in mm/compaction.c @@ -119,11 +123,14 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ + enum compact_mode mode; /* Compaction mode */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + + /* Number of UNMOVABLE destination pageblocks skipped during scan */ + unsigned long nr_pageblocks_skipped; }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 84f2c599d5d..457b4de122f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -static void set_pageblock_migratetype(struct page *page, int migratetype) +void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled)) @@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone, return pages_moved; } -static int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) +int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -5657,7 +5657,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .sync = true, + .mode = COMPACT_SYNC, }; INIT_LIST_HEAD(&cc.migratepages); -- cgit v1.2.3-70-g09d2 From bde05d1ccd512696b09db9dd2e5f33ad19152605 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:38 -0700 Subject: shmem: replace page if mapping excludes its zone The GMA500 GPU driver uses GEM shmem objects, but with a new twist: the backing RAM has to be below 4GB. Not a problem while the boards supported only 4GB: but now Intel's D2700MUD boards support 8GB, and their GMA3600 is managed by the GMA500 driver. shmem/tmpfs has never pretended to support hardware restrictions on the backing memory, but it might have appeared to do so before v3.1, and even now it works fine until a page is swapped out then back in. When read_cache_page_gfp() supplied a freshly allocated page for copy, that compensated for whatever choice might have been made by earlier swapin readahead; but swapoff was likely to destroy the illusion. We'd like to continue to support GMA500, so now add a new shmem_should_replace_page() check on the zone when about to move a page from swapcache to filecache (in swapin and swapoff cases), with shmem_replace_page() to allocate and substitute a suitable page (given gma500/gem.c's mapping_set_gfp_mask GFP_KERNEL | __GFP_DMA32). This does involve a minor extension to mem_cgroup_replace_page_cache() (the page may or may not have already been charged); and I've removed a comment and call to mem_cgroup_uncharge_cache_page(), which in fact is always a no-op while PageSwapCache. Also removed optimization of an unlikely path in shmem_getpage_gfp(), now that we need to check PageSwapCache more carefully (a racing caller might already have made the copy). And at one point shmem_unuse_inode() needs to use the hitherto private page_swapcount(), to guard against racing with inode eviction. It would make sense to extend shmem_should_replace_page(), to cover cpuset and NUMA mempolicy restrictions too, but set that aside for now: needs a cleanup of shmem mempolicy handling, and more testing, and ought to handle swap faults in do_swap_page() as well as shmem. Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Acked-by: KAMEZAWA Hiroyuki Cc: Alan Cox Cc: Stephane Marchesin Cc: Andi Kleen Cc: Dave Airlie Cc: Daniel Vetter Cc: Rob Clark Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 +++ mm/memcontrol.c | 17 +++++-- mm/shmem.c | 141 ++++++++++++++++++++++++++++++++++++++++++++------- mm/swapfile.c | 2 +- 4 files changed, 142 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index bc3073ce95c..d965c4bfab3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -351,6 +351,7 @@ extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); +extern int page_swapcount(struct page *); extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -445,6 +446,11 @@ static inline void delete_from_swap_cache(struct page *page) { } +static inline int page_swapcount(struct page *page) +{ + return 0; +} + #define reuse_swap_page(page) (page_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4f71219cc53..d7ce417cae7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3373,7 +3373,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; @@ -3383,11 +3383,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, pc = lookup_page_cgroup(oldpage); /* fix accounting on old pages */ lock_page_cgroup(pc); - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, false, -1); - ClearPageCgroupUsed(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + mem_cgroup_charge_statistics(memcg, false, -1); + ClearPageCgroupUsed(pc); + } unlock_page_cgroup(pc); + /* + * When called from shmem_replace_page(), in some cases the + * oldpage has already been charged, and in some cases not. + */ + if (!memcg) + return; + if (PageSwapBacked(oldpage)) type = MEM_CGROUP_CHARGE_TYPE_SHMEM; diff --git a/mm/shmem.c b/mm/shmem.c index be5af34a070..db72d8e44ec 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -103,6 +103,9 @@ static unsigned long shmem_default_max_inodes(void) } #endif +static bool shmem_should_replace_page(struct page *page, gfp_t gfp); +static int shmem_replace_page(struct page **pagep, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); @@ -604,12 +607,13 @@ static void shmem_evict_inode(struct inode *inode) * If swap found in inode, free it and move page from swapcache to filecache. */ static int shmem_unuse_inode(struct shmem_inode_info *info, - swp_entry_t swap, struct page *page) + swp_entry_t swap, struct page **pagep) { struct address_space *mapping = info->vfs_inode.i_mapping; void *radswap; pgoff_t index; - int error; + gfp_t gfp; + int error = 0; radswap = swp_to_radix_entry(swap); index = radix_tree_locate_item(&mapping->page_tree, radswap); @@ -625,22 +629,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, if (shmem_swaplist.next != &info->swaplist) list_move_tail(&shmem_swaplist, &info->swaplist); + gfp = mapping_gfp_mask(mapping); + if (shmem_should_replace_page(*pagep, gfp)) { + mutex_unlock(&shmem_swaplist_mutex); + error = shmem_replace_page(pagep, gfp, info, index); + mutex_lock(&shmem_swaplist_mutex); + /* + * We needed to drop mutex to make that restrictive page + * allocation; but the inode might already be freed by now, + * and we cannot refer to inode or mapping or info to check. + * However, we do hold page lock on the PageSwapCache page, + * so can check if that still has our reference remaining. + */ + if (!page_swapcount(*pagep)) + error = -ENOENT; + } + /* * We rely on shmem_swaplist_mutex, not only to protect the swaplist, * but also to hold up shmem_evict_inode(): so inode cannot be freed * beneath us (pagelock doesn't help until the page is in pagecache). */ - error = shmem_add_to_page_cache(page, mapping, index, + if (!error) + error = shmem_add_to_page_cache(*pagep, mapping, index, GFP_NOWAIT, radswap); - /* which does mem_cgroup_uncharge_cache_page on error */ - if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which * only does trylock page: if we raced, best clean up here. */ - delete_from_swap_cache(page); - set_page_dirty(page); + delete_from_swap_cache(*pagep); + set_page_dirty(*pagep); if (!error) { spin_lock(&info->lock); info->swapped--; @@ -660,7 +679,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page) struct list_head *this, *next; struct shmem_inode_info *info; int found = 0; - int error; + int error = 0; + + /* + * There's a faint possibility that swap page was replaced before + * caller locked it: it will come back later with the right page. + */ + if (unlikely(!PageSwapCache(page))) + goto out; /* * Charge page using GFP_KERNEL while we can wait, before taking @@ -676,7 +702,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); if (info->swapped) - found = shmem_unuse_inode(info, swap, page); + found = shmem_unuse_inode(info, swap, &page); else list_del_init(&info->swaplist); cond_resched(); @@ -685,8 +711,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) } mutex_unlock(&shmem_swaplist_mutex); - if (!found) - mem_cgroup_uncharge_cache_page(page); if (found < 0) error = found; out: @@ -855,6 +879,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } #endif +/* + * When a page is moved from swapcache to shmem filecache (either by the + * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of + * shmem_unuse_inode()), it may have been read in earlier from swap, in + * ignorance of the mapping it belongs to. If that mapping has special + * constraints (like the gma500 GEM driver, which requires RAM below 4GB), + * we may need to copy to a suitable page before moving to filecache. + * + * In a future release, this may well be extended to respect cpuset and + * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); + * but for now it is a simple matter of zone. + */ +static bool shmem_should_replace_page(struct page *page, gfp_t gfp) +{ + return page_zonenum(page) > gfp_zone(gfp); +} + +static int shmem_replace_page(struct page **pagep, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct page *oldpage, *newpage; + struct address_space *swap_mapping; + pgoff_t swap_index; + int error; + + oldpage = *pagep; + swap_index = page_private(oldpage); + swap_mapping = page_mapping(oldpage); + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success by further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + newpage = shmem_alloc_page(gfp, info, index); + if (!newpage) + return -ENOMEM; + VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); + + *pagep = newpage; + page_cache_get(newpage); + copy_highpage(newpage, oldpage); + + VM_BUG_ON(!PageLocked(oldpage)); + __set_page_locked(newpage); + VM_BUG_ON(!PageUptodate(oldpage)); + SetPageUptodate(newpage); + VM_BUG_ON(!PageSwapBacked(oldpage)); + SetPageSwapBacked(newpage); + VM_BUG_ON(!swap_index); + set_page_private(newpage, swap_index); + VM_BUG_ON(!PageSwapCache(oldpage)); + SetPageSwapCache(newpage); + + /* + * Our caller will very soon move newpage out of swapcache, but it's + * a nice clean interface for us to replace oldpage by newpage there. + */ + spin_lock_irq(&swap_mapping->tree_lock); + error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, + newpage); + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + spin_unlock_irq(&swap_mapping->tree_lock); + BUG_ON(error); + + mem_cgroup_replace_page_cache(oldpage, newpage); + lru_cache_add_anon(newpage); + + ClearPageSwapCache(oldpage); + set_page_private(oldpage, 0); + + unlock_page(oldpage); + page_cache_release(oldpage); + page_cache_release(oldpage); + return 0; +} + /* * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * @@ -923,19 +1025,20 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); + if (!PageSwapCache(page) || page->mapping) { + error = -EEXIST; /* try again */ + goto failed; + } if (!PageUptodate(page)) { error = -EIO; goto failed; } wait_on_page_writeback(page); - /* Someone may have already done it for us */ - if (page->mapping) { - if (page->mapping == mapping && - page->index == index) - goto done; - error = -EEXIST; - goto failed; + if (shmem_should_replace_page(page, gfp)) { + error = shmem_replace_page(&page, gfp, info, index); + if (error) + goto failed; } error = mem_cgroup_cache_charge(page, current->mm, @@ -998,7 +1101,7 @@ repeat: if (sgp == SGP_DIRTY) set_page_dirty(page); } -done: + /* Perhaps the file has been truncated since we checked */ if (sgp != SGP_WRITE && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { diff --git a/mm/swapfile.c b/mm/swapfile.c index fafc26d1b1d..b0c86e92f42 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -601,7 +601,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -static inline int page_swapcount(struct page *page) +int page_swapcount(struct page *page) { int count = 0; struct swap_info_struct *p; -- cgit v1.2.3-70-g09d2 From 2f6e38f3cd17a7858112f538c1700c747170db1f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:38 -0700 Subject: tmpfs: enable NOSEC optimization Let tmpfs into the NOSEC optimization (avoiding file_remove_suid() overhead on most common writes): set MS_NOSEC on its superblocks. Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Andi Kleen Cc: Al Viro Cc: Cong Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index db72d8e44ec..fe5ae6962ab 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2373,6 +2373,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) } } sb->s_export_op = &shmem_export_ops; + sb->s_flags |= MS_NOSEC; #else sb->s_flags |= MS_NOUSER; #endif -- cgit v1.2.3-70-g09d2 From ec9516fbc5fa814014991e1ae7f8860127122105 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:39 -0700 Subject: tmpfs: optimize clearing when writing Nick proposed years ago that tmpfs should avoid clearing its pages where write will overwrite them with new data, as ramfs has long done. But I messed it up and just got bad data. Tried again recently, it works fine. Here's time output for writing 4GiB 16 times on this Core i5 laptop: before: real 0m21.169s user 0m0.028s sys 0m21.057s real 0m21.382s user 0m0.016s sys 0m21.289s real 0m21.311s user 0m0.020s sys 0m21.217s after: real 0m18.273s user 0m0.032s sys 0m18.165s real 0m18.354s user 0m0.020s sys 0m18.265s real 0m18.440s user 0m0.032s sys 0m18.337s ramfs: real 0m16.860s user 0m0.028s sys 0m16.765s real 0m17.382s user 0m0.040s sys 0m17.273s real 0m17.133s user 0m0.044s sys 0m17.021s Yes, I have done perf reports, but they need more explanation than they deserve: in summary, clear_page vanishes, its cache loading shifts into copy_user_generic_unrolled; shmem_getpage_gfp goes down, and surprisingly mark_page_accessed goes way up - I think because they are respectively where the cache gets to be reloaded after being purged by clear or copy. Suggested-by: Nick Piggin Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index fe5ae6962ab..45c26476f0f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1095,9 +1095,14 @@ repeat: shmem_recalc_inode(inode); spin_unlock(&info->lock); - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + /* + * Let SGP_WRITE caller clear ends if write does not fill page + */ + if (sgp != SGP_WRITE) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } if (sgp == SGP_DIRTY) set_page_dirty(page); } @@ -1307,6 +1312,14 @@ shmem_write_end(struct file *file, struct address_space *mapping, if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); + if (!PageUptodate(page)) { + if (copied < PAGE_CACHE_SIZE) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + zero_user_segments(page, 0, from, + from + copied, PAGE_CACHE_SIZE); + } + SetPageUptodate(page); + } set_page_dirty(page); unlock_page(page); page_cache_release(page); @@ -1768,6 +1781,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s kaddr = kmap_atomic(page); memcpy(kaddr, symname, len); kunmap_atomic(kaddr); + SetPageUptodate(page); set_page_dirty(page); unlock_page(page); page_cache_release(page); -- cgit v1.2.3-70-g09d2 From 83e4fa9c16e4af7122e31be3eca5d57881d236fe Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:40 -0700 Subject: tmpfs: support fallocate FALLOC_FL_PUNCH_HOLE tmpfs has supported hole-punching since 2.6.16, via madvise(,,MADV_REMOVE). But nowadays fallocate(,FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,,) is the agreed way to punch holes. So add shmem_fallocate() to support that, and tweak shmem_truncate_range() to support partial pages at both the beginning and end of range (never needed for madvise, which demands rounded addr and rounds up length). Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 45c26476f0f..7e54ff1c63e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include #include @@ -432,21 +433,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); - pgoff_t end = (lend >> PAGE_CACHE_SHIFT); + pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; + unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); + unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; long nr_swaps_freed = 0; pgoff_t index; int i; - BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + if (lend == -1) + end = -1; /* unsigned, so actually very big */ pagevec_init(&pvec, 0); index = start; - while (index <= end) { + while (index < end) { pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) break; @@ -455,7 +458,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct page *page = pvec.pages[i]; index = indices[i]; - if (index > end) + if (index >= end) break; if (radix_tree_exceptional_entry(page)) { @@ -479,22 +482,39 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) index++; } - if (partial) { + if (partial_start) { struct page *page = NULL; shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); if (page) { - zero_user_segment(page, partial, PAGE_CACHE_SIZE); + unsigned int top = PAGE_CACHE_SIZE; + if (start > end) { + top = partial_end; + partial_end = 0; + } + zero_user_segment(page, partial_start, top); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + } + } + if (partial_end) { + struct page *page = NULL; + shmem_getpage(inode, end, &page, SGP_READ, NULL); + if (page) { + zero_user_segment(page, 0, partial_end); set_page_dirty(page); unlock_page(page); page_cache_release(page); } } + if (start >= end) + return; index = start; for ( ; ; ) { cond_resched(); pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) { if (index == start) @@ -502,7 +522,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) index = start; continue; } - if (index == start && indices[0] > end) { + if (index == start && indices[0] >= end) { shmem_deswap_pagevec(&pvec); pagevec_release(&pvec); break; @@ -512,7 +532,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct page *page = pvec.pages[i]; index = indices[i]; - if (index > end) + if (index >= end) break; if (radix_tree_exceptional_entry(page)) { @@ -1578,6 +1598,31 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } +static long shmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + int error = -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + if (mode & FALLOC_FL_PUNCH_HOLE) { + struct address_space *mapping = file->f_mapping; + loff_t unmap_start = round_up(offset, PAGE_SIZE); + loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + shmem_truncate_range(inode, offset, offset + len - 1); + /* No need to unmap again: hole-punching leaves COWed pages */ + error = 0; + } + + mutex_unlock(&inode->i_mutex); + return error; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -2490,6 +2535,7 @@ static const struct file_operations shmem_file_operations = { .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = shmem_fallocate, #endif }; -- cgit v1.2.3-70-g09d2 From 3f31d07571eeea18a7d34db9af21d2285b807a17 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:40 -0700 Subject: mm/fs: route MADV_REMOVE to FALLOC_FL_PUNCH_HOLE Now tmpfs supports hole-punching via fallocate(), switch madvise_remove() to use do_fallocate() instead of vmtruncate_range(): which extends madvise(,,MADV_REMOVE) support from tmpfs to ext4, ocfs2 and xfs. There is one more user of vmtruncate_range() in our tree, staging/android's ashmem_shrink(): convert it to use do_fallocate() too (but if its unpinned areas are already unmapped - I don't know - then it would do better to use shmem_truncate_range() directly). Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Al Viro Cc: Colin Cross Cc: John Stultz Cc: Greg Kroah-Hartman Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Mark Fasheh Cc: Joel Becker Cc: Dave Chinner Cc: Ben Myers Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/android/ashmem.c | 8 +++++--- mm/madvise.c | 15 +++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 4511420849b..e84dbecd099 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -363,11 +364,12 @@ static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc) mutex_lock(&ashmem_mutex); list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) { - struct inode *inode = range->asma->file->f_dentry->d_inode; loff_t start = range->pgstart * PAGE_SIZE; - loff_t end = (range->pgend + 1) * PAGE_SIZE - 1; + loff_t end = (range->pgend + 1) * PAGE_SIZE; - vmtruncate_range(inode, start, end); + do_fallocate(range->asma->file, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + start, end - start); range->purged = ASHMEM_WAS_PURGED; lru_del(range); diff --git a/mm/madvise.c b/mm/madvise.c index 1ccbba5b667..deff1b64a08 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,8 +11,10 @@ #include #include #include +#include #include #include +#include /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -200,8 +202,7 @@ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { - struct address_space *mapping; - loff_t offset, endoff; + loff_t offset; int error; *prev = NULL; /* tell sys_madvise we drop mmap_sem */ @@ -217,16 +218,14 @@ static long madvise_remove(struct vm_area_struct *vma, if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) return -EACCES; - mapping = vma->vm_file->f_mapping; - offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - endoff = (loff_t)(end - vma->vm_start - 1) - + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* vmtruncate_range needs to take i_mutex */ + /* filesystem's fallocate may need to take i_mutex */ up_read(¤t->mm->mmap_sem); - error = vmtruncate_range(mapping->host, offset, endoff); + error = do_fallocate(vma->vm_file, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, end - start); down_read(¤t->mm->mmap_sem); return error; } -- cgit v1.2.3-70-g09d2 From 17cf28afea2a1112f240a3a2da8af883be024811 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:41 -0700 Subject: mm/fs: remove truncate_range Remove vmtruncate_range(), and remove the truncate_range method from struct inode_operations: only tmpfs ever supported it, and tmpfs has now converted over to using the fallocate method of file_operations. Update Documentation accordingly, adding (setlease and) fallocate lines. And while we're in mm.h, remove duplicate declarations of shmem_lock() and shmem_file_setup(): everyone is now using the ones in shmem_fs.h. Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 -- Documentation/filesystems/vfs.txt | 13 ++++++++----- fs/bad_inode.c | 1 - include/linux/fs.h | 1 - include/linux/mm.h | 4 ---- mm/shmem.c | 1 - mm/truncate.c | 25 ------------------------- 7 files changed, 8 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4fca82e5276..d449e632e6a 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -60,7 +60,6 @@ ata *); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); - void (*truncate_range)(struct inode *, loff_t, loff_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); locking rules: @@ -87,7 +86,6 @@ setxattr: yes getxattr: no listxattr: no removexattr: yes -truncate_range: yes fiemap: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 0d049202808..ef19f91a0f1 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -363,7 +363,6 @@ struct inode_operations { ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); - void (*truncate_range)(struct inode *, loff_t, loff_t); }; Again, all methods are called without any locks being held, unless @@ -472,9 +471,6 @@ otherwise noted. removexattr: called by the VFS to remove an extended attribute from a file. This method is called by removexattr(2) system call. - truncate_range: a method provided by the underlying filesystem to truncate a - range of blocks , i.e. punch a hole somewhere in a file. - The Address Space Object ======================== @@ -760,7 +756,7 @@ struct file_operations ---------------------- This describes how the VFS can manipulate an open file. As of kernel -2.6.22, the following members are defined: +3.5, the following members are defined: struct file_operations { struct module *owner; @@ -790,6 +786,8 @@ struct file_operations { int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int); + int (*setlease)(struct file *, long arg, struct file_lock **); + long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len); }; Again, all methods are called without any locks being held, unless @@ -858,6 +856,11 @@ otherwise noted. splice_read: called by the VFS to splice data from file to a pipe. This method is used by the splice(2) system call + setlease: called by the VFS to set or release a file lock lease. + setlease has the file_lock_lock held and must not sleep. + + fallocate: called by the VFS to preallocate blocks or punch a hole. + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 37268c5bb98..1b35d6bd06b 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -292,7 +292,6 @@ static const struct inode_operations bad_inode_ops = .getxattr = bad_inode_getxattr, .listxattr = bad_inode_listxattr, .removexattr = bad_inode_removexattr, - /* truncate_range returns void */ }; diff --git a/include/linux/fs.h b/include/linux/fs.h index cdc1a963094..038076b27ea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1681,7 +1681,6 @@ struct inode_operations { ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); - void (*truncate_range)(struct inode *, loff_t, loff_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); } ____cacheline_aligned; diff --git a/include/linux/mm.h b/include/linux/mm.h index 7d5c37f24c6..aa20bafa40f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -871,8 +871,6 @@ extern void pagefault_out_of_memory(void); extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); -int shmem_lock(struct file *file, int lock, struct user_struct *user); -struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); int shmem_zero_setup(struct vm_area_struct *); extern int can_do_mlock(void); @@ -951,11 +949,9 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); extern int vmtruncate(struct inode *inode, loff_t offset); -extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); - int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU diff --git a/mm/shmem.c b/mm/shmem.c index 7e54ff1c63e..f368d0acb52 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2541,7 +2541,6 @@ static const struct file_operations shmem_file_operations = { static const struct inode_operations shmem_inode_operations = { .setattr = shmem_setattr, - .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, diff --git a/mm/truncate.c b/mm/truncate.c index 61a183b89df..75801acdaac 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize) } EXPORT_SYMBOL(vmtruncate); -int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) -{ - struct address_space *mapping = inode->i_mapping; - loff_t holebegin = round_up(lstart, PAGE_SIZE); - loff_t holelen = 1 + lend - holebegin; - - /* - * If the underlying filesystem is not going to provide - * a way to truncate a range of blocks (punch a hole) - - * we should return failure right now. - */ - if (!inode->i_op->truncate_range) - return -ENOSYS; - - mutex_lock(&inode->i_mutex); - inode_dio_wait(inode); - unmap_mapping_range(mapping, holebegin, holelen, 1); - inode->i_op->truncate_range(inode, lstart, lend); - /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(mapping, holebegin, holelen, 1); - mutex_unlock(&inode->i_mutex); - - return 0; -} - /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode -- cgit v1.2.3-70-g09d2 From e2d12e22c59ce714008aa5266d769f8568d74eac Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:41 -0700 Subject: tmpfs: support fallocate preallocation The systemd plumbers expressed a wish that tmpfs support preallocation. Cong Wang wrote a patch, but several kernel guys expressed scepticism: https://lkml.org/lkml/2011/11/18/137 Christoph Hellwig: What for exactly? Please explain why preallocating on tmpfs would make any sense. Kay Sievers: To be able to safely use mmap(), regarding SIGBUS, on files on the /dev/shm filesystem. The glibc fallback loop for -ENOSYS [or -EOPNOTSUPP] on fallocate is just ugly. Hugh Dickins: If tmpfs is going to support fallocate(FALLOC_FL_PUNCH_HOLE), it would seem perverse to permit the deallocation but fail the allocation. Christoph Hellwig: Agreed. Now that we do have shmem_fallocate() for hole-punching, plumb in basic support for preallocation mode too. It's fairly straightforward (though quite a few details needed attention), except for when it fails part way through. What a pity that fallocate(2) was not specified to return the length allocated, permitting short fallocations! As it is, when it fails part way through, we ought to free what has just been allocated by this system call; but must be very sure not to free any allocated earlier, or any allocated by racing accesses (not all excluded by i_mutex). But we cannot distinguish them: so in this patch simply leak allocations on partial failure (they will be freed later if the file is removed). An attractive alternative approach would have been for fallocate() not to allocate pages at all, but note reservations by entries in the radix-tree. But that would give less assurance, and, critically, would be hard to fit with mem cgroups (who owns the reservations?): allocating pages lets fallocate() behave in just the same way as write(). Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index f368d0acb52..9b90d89e54c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1602,7 +1602,9 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file->f_path.dentry->d_inode; - int error = -EOPNOTSUPP; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + pgoff_t start, index, end; + int error; mutex_lock(&inode->i_mutex); @@ -1617,8 +1619,65 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ error = 0; + goto out; } + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + start = offset >> PAGE_CACHE_SHIFT; + end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* Try to avoid a swapstorm if len is impossible to satisfy */ + if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { + error = -ENOSPC; + goto out; + } + + for (index = start; index < end; index++) { + struct page *page; + + /* + * Good, the fallocate(2) manpage permits EINTR: we may have + * been interrupted because we are using up too much memory. + */ + if (signal_pending(current)) + error = -EINTR; + else + error = shmem_getpage(inode, index, &page, SGP_WRITE, + NULL); + if (error) { + /* + * We really ought to free what we allocated so far, + * but it would be wrong to free pages allocated + * earlier, or already now in use: i_mutex does not + * exclude all cases. We do not know what to free. + */ + goto ctime; + } + + if (!PageUptodate(page)) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + /* + * set_page_dirty so that memory pressure will swap rather + * than free the pages we are allocating (and SGP_CACHE pages + * might still be clean: we now need to mark those dirty too). + */ + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + cond_resched(); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); +ctime: + inode->i_ctime = CURRENT_TIME; +out: mutex_unlock(&inode->i_mutex); return error; } -- cgit v1.2.3-70-g09d2 From 1635f6a74152f1dcd1b888231609d64875f0a81a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:42 -0700 Subject: tmpfs: undo fallocation on failure In the previous episode, we left the already-fallocated pages attached to the file when shmem_fallocate() fails part way through. Now try to do better, by extending the earlier optimization of !Uptodate pages (then always under page lock) to !Uptodate pages (outside of page lock), representing fallocated pages. And don't waste time clearing them at the time of fallocate(), leave that until later if necessary. Adapt shmem_truncate_range() to shmem_undo_range(), so that a failing fallocate can recognize and remove precisely those !Uptodate allocations which it added (and were not independently allocated by racing tasks). But unless we start playing with swapfile.c and memcontrol.c too, once one of our fallocated pages reaches shmem_writepage(), we do then have to instantiate it as an ordinarily allocated page, before swapping out. This is unsatisfactory, but improved in the next episode. Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 105 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 9b90d89e54c..793dcd1bac8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -89,7 +89,8 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ - SGP_WRITE, /* may exceed i_size, may allocate page */ + SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ + SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; #ifdef CONFIG_TMPFS @@ -427,8 +428,10 @@ void shmem_unlock_mapping(struct address_space *mapping) /* * Remove range of pages and swap entries from radix tree, and free them. + * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, + bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -462,6 +465,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) break; if (radix_tree_exceptional_entry(page)) { + if (unfalloc) + continue; nr_swaps_freed += !shmem_free_swap(mapping, index, page); continue; @@ -469,9 +474,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) if (!trylock_page(page)) continue; - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + if (!unfalloc || !PageUptodate(page)) { + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } } unlock_page(page); } @@ -517,12 +524,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) { - if (index == start) + if (index == start || unfalloc) break; index = start; continue; } - if (index == start && indices[0] >= end) { + if ((index == start || unfalloc) && indices[0] >= end) { shmem_deswap_pagevec(&pvec); pagevec_release(&pvec); break; @@ -536,15 +543,19 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) break; if (radix_tree_exceptional_entry(page)) { + if (unfalloc) + continue; nr_swaps_freed += !shmem_free_swap(mapping, index, page); continue; } lock_page(page); - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + if (!unfalloc || !PageUptodate(page)) { + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } } unlock_page(page); } @@ -558,7 +569,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) info->swapped -= nr_swaps_freed; shmem_recalc_inode(inode); spin_unlock(&info->lock); +} +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -771,6 +786,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ goto redirty; } + + /* + * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC + * value into swapfile.c, the only way we can correctly account for a + * fallocated page arriving here is now to initialize it and write it. + */ + if (!PageUptodate(page)) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + swap = get_swap_page(); if (!swap.val) goto redirty; @@ -994,6 +1021,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, swp_entry_t swap; int error; int once = 0; + int alloced = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; @@ -1005,19 +1033,21 @@ repeat: page = NULL; } - if (sgp != SGP_WRITE && + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto failed; } + /* fallocated page? */ + if (page && !PageUptodate(page)) { + if (sgp != SGP_READ) + goto clear; + unlock_page(page); + page_cache_release(page); + page = NULL; + } if (page || (sgp == SGP_READ && !swap.val)) { - /* - * Once we can get the page lock, it must be uptodate: - * if there were an error in reading back from swap, - * the page would not be inserted into the filecache. - */ - BUG_ON(page && !PageUptodate(page)); *pagep = page; return 0; } @@ -1114,9 +1144,18 @@ repeat: inode->i_blocks += BLOCKS_PER_PAGE; shmem_recalc_inode(inode); spin_unlock(&info->lock); + alloced = true; /* - * Let SGP_WRITE caller clear ends if write does not fill page + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; +clear: + /* + * Let SGP_WRITE caller clear ends if write does not fill page; + * but SGP_FALLOC on a page fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE) { clear_highpage(page); @@ -1128,10 +1167,13 @@ repeat: } /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; - goto trunc; + if (alloced) + goto trunc; + else + goto failed; } *pagep = page; return 0; @@ -1140,6 +1182,7 @@ repeat: * Error recovery. */ trunc: + info = SHMEM_I(inode); ClearPageDirty(page); delete_from_page_cache(page); spin_lock(&info->lock); @@ -1147,6 +1190,7 @@ trunc: inode->i_blocks -= BLOCKS_PER_PAGE; spin_unlock(&info->lock); decused: + sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) percpu_counter_add(&sbinfo->used_blocks, -1); unacct: @@ -1645,25 +1689,20 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (signal_pending(current)) error = -EINTR; else - error = shmem_getpage(inode, index, &page, SGP_WRITE, + error = shmem_getpage(inode, index, &page, SGP_FALLOC, NULL); if (error) { - /* - * We really ought to free what we allocated so far, - * but it would be wrong to free pages allocated - * earlier, or already now in use: i_mutex does not - * exclude all cases. We do not know what to free. - */ + /* Remove the !PageUptodate pages we added */ + shmem_undo_range(inode, + (loff_t)start << PAGE_CACHE_SHIFT, + (loff_t)index << PAGE_CACHE_SHIFT, true); goto ctime; } - if (!PageUptodate(page)) { - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); - } /* - * set_page_dirty so that memory pressure will swap rather + * If !PageUptodate, leave it that way so that freeable pages + * can be recognized if we need to rollback on error later. + * But set_page_dirty so that memory pressure will swap rather * than free the pages we are allocating (and SGP_CACHE pages * might still be clean: we now need to mark those dirty too). */ -- cgit v1.2.3-70-g09d2 From 1aac1400319d30786f32b9290e9cc923937b3d57 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:42 -0700 Subject: tmpfs: quit when fallocate fills memory As it stands, a large fallocate() on tmpfs is liable to fill memory with pages, freed on failure except when they run into swap, at which point they become fixed into the file despite the failure. That feels quite wrong, to be consuming resources precisely when they're in short supply. Go the other way instead: shmem_fallocate() indicate the range it has fallocated to shmem_writepage(), keeping count of pages it's allocating; shmem_writepage() reactivate instead of swapping out pages fallocated by this syscall (but happily swap out those from earlier occasions), keeping count; shmem_fallocate() compare counts and give up once the reactivated pages have started to coming back to writepage (approximately: some zones would in fact recycle faster than others). This is a little unusual, but works well: although we could consider the failure to swap as a bug, and fix it later with SWAP_MAP_FALLOC handling added in swapfile.c and memcontrol.c, I doubt that we shall ever want to. (If there's no swap, an over-large fallocate() on tmpfs is limited in the same way as writing: stopped by rlimit, or by tmpfs mount size if that was set sensibly, or by __vm_enough_memory() heuristics if OVERCOMMIT_GUESS or OVERCOMMIT_NEVER. If OVERCOMMIT_ALWAYS, then it is liable to OOM-kill others as writing would, but stops and frees if interrupted.) Now that everything is freed on failure, we can then skip updating ctime. Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 793dcd1bac8..191663a8def 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -84,6 +84,18 @@ struct shmem_xattr { char value[0]; }; +/* + * shmem_fallocate and shmem_writepage communicate via inode->i_private + * (with i_mutex making sure that it has only one user at a time): + * we would prefer not to enlarge the shmem inode just for that. + */ +struct shmem_falloc { + pgoff_t start; /* start of range currently being fallocated */ + pgoff_t next; /* the next page offset to be fallocated */ + pgoff_t nr_falloced; /* how many new pages have been fallocated */ + pgoff_t nr_unswapped; /* how often writepage refused to swap out */ +}; + /* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ @@ -791,8 +803,28 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a * fallocated page arriving here is now to initialize it and write it. + * + * That's okay for a page already fallocated earlier, but if we have + * not yet completed the fallocation, then (a) we want to keep track + * of this page in case we have to undo it, and (b) it may not be a + * good idea to continue anyway, once we're pushing into swap. So + * reactivate the page, and let shmem_fallocate() quit when too many. */ if (!PageUptodate(page)) { + if (inode->i_private) { + struct shmem_falloc *shmem_falloc; + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + index >= shmem_falloc->start && + index < shmem_falloc->next) + shmem_falloc->nr_unswapped++; + else + shmem_falloc = NULL; + spin_unlock(&inode->i_lock); + if (shmem_falloc) + goto redirty; + } clear_highpage(page); flush_dcache_page(page); SetPageUptodate(page); @@ -1647,6 +1679,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, { struct inode *inode = file->f_path.dentry->d_inode; struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_falloc shmem_falloc; pgoff_t start, index, end; int error; @@ -1679,6 +1712,14 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto out; } + shmem_falloc.start = start; + shmem_falloc.next = start; + shmem_falloc.nr_falloced = 0; + shmem_falloc.nr_unswapped = 0; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + for (index = start; index < end; index++) { struct page *page; @@ -1688,6 +1729,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, */ if (signal_pending(current)) error = -EINTR; + else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) + error = -ENOMEM; else error = shmem_getpage(inode, index, &page, SGP_FALLOC, NULL); @@ -1696,9 +1739,17 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, shmem_undo_range(inode, (loff_t)start << PAGE_CACHE_SHIFT, (loff_t)index << PAGE_CACHE_SHIFT, true); - goto ctime; + goto undone; } + /* + * Inform shmem_writepage() how far we have reached. + * No need for lock or barrier: we have the page lock. + */ + shmem_falloc.next++; + if (!PageUptodate(page)) + shmem_falloc.nr_falloced++; + /* * If !PageUptodate, leave it that way so that freeable pages * can be recognized if we need to rollback on error later. @@ -1714,8 +1765,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); -ctime: inode->i_ctime = CURRENT_TIME; +undone: + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); out: mutex_unlock(&inode->i_mutex); return error; -- cgit v1.2.3-70-g09d2 From 4fb5ef089b288942c6fc3f85c4ecb4016c1aa4c3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:43 -0700 Subject: tmpfs: support SEEK_DATA and SEEK_HOLE It's quite easy for tmpfs to scan the radix_tree to support llseek's new SEEK_DATA and SEEK_HOLE options: so add them while the minutiae are still on my mind (in particular, the !PageUptodate-ness of pages fallocated but still unwritten). But I don't know who actually uses SEEK_DATA or SEEK_HOLE, and whether it would be of any use to them on tmpfs. This code adds 92 lines and 752 bytes on x86_64 - is that bloat or worthwhile? [akpm@linux-foundation.org: fix warning with CONFIG_TMPFS=n] Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Josef Bacik Cc: Andi Kleen Cc: Andreas Dilger Cc: Dave Chinner Cc: Marco Stornelli Cc: Jeff liu Cc: Chris Mason Cc: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 191663a8def..d576b84d913 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1674,6 +1674,98 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } +/* + * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. + */ +static pgoff_t shmem_seek_hole_data(struct address_space *mapping, + pgoff_t index, pgoff_t end, int origin) +{ + struct page *page; + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + bool done = false; + int i; + + pagevec_init(&pvec, 0); + pvec.nr = 1; /* start small: we may be there already */ + while (!done) { + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + pvec.nr, pvec.pages, indices); + if (!pvec.nr) { + if (origin == SEEK_DATA) + index = end; + break; + } + for (i = 0; i < pvec.nr; i++, index++) { + if (index < indices[i]) { + if (origin == SEEK_HOLE) { + done = true; + break; + } + index = indices[i]; + } + page = pvec.pages[i]; + if (page && !radix_tree_exceptional_entry(page)) { + if (!PageUptodate(page)) + page = NULL; + } + if (index >= end || + (page && origin == SEEK_DATA) || + (!page && origin == SEEK_HOLE)) { + done = true; + break; + } + } + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); + pvec.nr = PAGEVEC_SIZE; + cond_resched(); + } + return index; +} + +static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) +{ + struct address_space *mapping; + struct inode *inode; + pgoff_t start, end; + loff_t new_offset; + + if (origin != SEEK_DATA && origin != SEEK_HOLE) + return generic_file_llseek_size(file, offset, origin, + MAX_LFS_FILESIZE); + mapping = file->f_mapping; + inode = mapping->host; + mutex_lock(&inode->i_mutex); + /* We're holding i_mutex so we can access i_size directly */ + + if (offset < 0) + offset = -EINVAL; + else if (offset >= inode->i_size) + offset = -ENXIO; + else { + start = offset >> PAGE_CACHE_SHIFT; + end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + new_offset = shmem_seek_hole_data(mapping, start, end, origin); + new_offset <<= PAGE_CACHE_SHIFT; + if (new_offset > offset) { + if (new_offset < inode->i_size) + offset = new_offset; + else if (origin == SEEK_DATA) + offset = -ENXIO; + else + offset = inode->i_size; + } + } + + if (offset >= 0 && offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + mutex_unlock(&inode->i_mutex); + return offset; +} + static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -2679,7 +2771,7 @@ static const struct address_space_operations shmem_aops = { static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS - .llseek = generic_file_llseek, + .llseek = shmem_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = shmem_file_aio_read, -- cgit v1.2.3-70-g09d2 From 782182e53a6cdb3e3d04cc40516e173046942a32 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 29 May 2012 15:06:43 -0700 Subject: mm: move readahead syscall to mm/readahead.c It is better to define readahead(2) in mm/readahead.c than in mm/filemap.c. Signed-off-by: Cong Wang Cc: Fengguang Wu Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 39 --------------------------------------- mm/readahead.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 79c4b2b0b14..64b48f934b8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include /* for BUG_ON(!in_atomic()) only */ #include @@ -1478,44 +1477,6 @@ out: } EXPORT_SYMBOL(generic_file_aio_read); -static ssize_t -do_readahead(struct address_space *mapping, struct file *filp, - pgoff_t index, unsigned long nr) -{ - if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) - return -EINVAL; - - force_page_cache_readahead(mapping, filp, index, nr); - return 0; -} - -SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) -{ - ssize_t ret; - struct file *file; - - ret = -EBADF; - file = fget(fd); - if (file) { - if (file->f_mode & FMODE_READ) { - struct address_space *mapping = file->f_mapping; - pgoff_t start = offset >> PAGE_CACHE_SHIFT; - pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; - unsigned long len = end - start + 1; - ret = do_readahead(mapping, file, start, len); - } - fput(file); - } - return ret; -} -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_readahead(long fd, loff_t offset, long count) -{ - return SYSC_readahead((int) fd, offset, (size_t) count); -} -SYSCALL_ALIAS(sys_readahead, SyS_readahead); -#endif - #ifdef CONFIG_MMU /** * page_cache_read - adds requested page to the page cache if not already there diff --git a/mm/readahead.c b/mm/readahead.c index cbcbb02f3e2..ea8f8fa2164 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include /* * Initialise a struct file's readahead state. Assumes that the caller has @@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping, ondemand_readahead(mapping, ra, filp, true, offset, req_size); } EXPORT_SYMBOL_GPL(page_cache_async_readahead); + +static ssize_t +do_readahead(struct address_space *mapping, struct file *filp, + pgoff_t index, unsigned long nr) +{ + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + return -EINVAL; + + force_page_cache_readahead(mapping, filp, index, nr); + return 0; +} + +SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct file *file; + + ret = -EBADF; + file = fget(fd); + if (file) { + if (file->f_mode & FMODE_READ) { + struct address_space *mapping = file->f_mapping; + pgoff_t start = offset >> PAGE_CACHE_SHIFT; + pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + unsigned long len = end - start + 1; + ret = do_readahead(mapping, file, start, len); + } + fput(file); + } + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_readahead(long fd, loff_t offset, long count) +{ + return SYSC_readahead((int) fd, offset, (size_t) count); +} +SYSCALL_ALIAS(sys_readahead, SyS_readahead); +#endif -- cgit v1.2.3-70-g09d2 From be9cd873e2a706a688e37224d48e135efd8c0d26 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 29 May 2012 15:06:44 -0700 Subject: mm/buddy: dump PG_compound_lock page flag The array pageflag_names[] does conversion from page flags into their corresponding names so that a meaningful representation of the corresponding page flag can be printed. This mechanism is used while dumping page frames. However, the array missed PG_compound_lock. So the PG_compound_lock page flag would be printed as a digital number instead of a meaningful string. The patch fixes that and prints "compound_lock" for the PG_compound_lock page flag. Signed-off-by: Gavin Shan Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 457b4de122f..72869ea5c51 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5972,6 +5972,9 @@ static struct trace_print_flags pageflag_names[] = { #endif #ifdef CONFIG_MEMORY_FAILURE {1UL << PG_hwpoison, "hwpoison" }, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + {1UL << PG_compound_lock, "compound_lock" }, #endif {-1UL, NULL }, }; -- cgit v1.2.3-70-g09d2 From acc50c110cf6f1a8bd1af410b2c7bc29ca624678 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:44 -0700 Subject: mm: page_alloc: catch out-of-date list of page flag names String tables with names of enum items are always prone to go out of sync with the enums themselves. Ensure during compile time that the name table of page flags has the same size as the page flags enum. Signed-off-by: Johannes Weiner Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 72869ea5c51..5712898c951 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5985,6 +5985,8 @@ static void dump_page_flags(unsigned long flags) unsigned long mask; int i; + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) - 1 != __NR_PAGEFLAGS); + printk(KERN_ALERT "page flags: %#lx(", flags); /* remove zone id */ -- cgit v1.2.3-70-g09d2 From 51300cef41c9355a7d428fe0714888d3c72a6f2f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 May 2012 15:06:44 -0700 Subject: mm/page_alloc.c: cleanups - make pageflag_names[] const - remove null termination of pageflag_names[] Cc: Johannes Weiner Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5712898c951..4fc462b5fcf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5938,7 +5938,7 @@ bool is_free_buddy_page(struct page *page) } #endif -static struct trace_print_flags pageflag_names[] = { +static const struct trace_print_flags pageflag_names[] = { {1UL << PG_locked, "locked" }, {1UL << PG_error, "error" }, {1UL << PG_referenced, "referenced" }, @@ -5976,7 +5976,6 @@ static struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE {1UL << PG_compound_lock, "compound_lock" }, #endif - {-1UL, NULL }, }; static void dump_page_flags(unsigned long flags) @@ -5985,14 +5984,14 @@ static void dump_page_flags(unsigned long flags) unsigned long mask; int i; - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) - 1 != __NR_PAGEFLAGS); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); printk(KERN_ALERT "page flags: %#lx(", flags); /* remove zone id */ flags &= (1UL << NR_PAGEFLAGS) - 1; - for (i = 0; pageflag_names[i].name && flags; i++) { + for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { mask = pageflag_names[i].mask; if ((flags & mask) != mask) -- cgit v1.2.3-70-g09d2 From e48982734ea0500d1eba4f9d96195acc5406cad6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 29 May 2012 15:06:45 -0700 Subject: mm: consider all swapped back pages in used-once logic Commit 645747462435 ("vmscan: detect mapped file pages used only once") made mapped pages have another round in inactive list because they might be just short lived and so we could consider them again next time. This heuristic helps to reduce pressure on the active list with a streaming IO worklods. This patch fixes a regression introduced by this commit for heavy shmem based workloads because unlike Anon pages, which are excluded from this heuristic because they are usually long lived, shmem pages are handled as a regular page cache. This doesn't work quite well, unfortunately, if the workload is mostly backed by shmem (in memory database sitting on 80% of memory) with a streaming IO in the background (backup - up to 20% of memory). Anon inactive list is full of (dirty) shmem pages when watermarks are hit. Shmem pages are kept in the inactive list (they are referenced) in the first round and it is hard to reclaim anything else so we reach lower scanning priorities very quickly which leads to an excessive swap out. Let's fix this by excluding all swap backed pages (they tend to be long lived wrt. the regular page cache anyway) from used-once heuristic and rather activate them if they are referenced. The customer's workload is shmem backed database (80% of RAM) and they are measuring transactions/s with an IO in the background (20%). Transactions touch more or less random rows in the table. The transaction rate fell by a factor of 3 (in the worst case) because of commit 64574746. This patch restores the previous numbers. Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Reviewed-by: Rik van Riel Cc: [2.6.34+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 44f04364a30..67a4fd4792d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -657,7 +657,7 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; if (referenced_ptes) { - if (PageAnon(page)) + if (PageSwapBacked(page)) return PAGEREF_ACTIVATE; /* * All mapped pages start out with page table -- cgit v1.2.3-70-g09d2 From 5c2b8a162b5f8616f709bf20d5ec88f709485522 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 29 May 2012 15:06:46 -0700 Subject: mm/bootmem.c: cleanup on addition to bootmem data list The objects of "struct bootmem_data_t" are linked together to form double-linked list sequentially based on its minimal page frame number. The current implementation implicitly supports the following cases, which means the inserting point for current bootmem data depends on how "list_for_each" works. That makes the code a little hard to read. Besides, "list_for_each" and "list_entry" can be replaced with "list_for_each_entry". - The linked list is empty. - There has no entry in the linked list, whose minimal page frame number is bigger than current one. Signed-off-by: Gavin Shan Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index d1c7a79d6f3..ec4fcb7a56c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages) */ static void __init link_bootmem(bootmem_data_t *bdata) { - struct list_head *iter; + bootmem_data_t *ent; - list_for_each(iter, &bdata_list) { - bootmem_data_t *ent; - - ent = list_entry(iter, bootmem_data_t, list); - if (bdata->node_min_pfn < ent->node_min_pfn) - break; + list_for_each_entry(ent, &bdata_list, list) { + if (bdata->node_min_pfn < ent->node_min_pfn) { + list_add_tail(&bdata->list, &ent->list); + return; + } } - list_add_tail(&bdata->list, iter); + + list_add_tail(&bdata->list, &bdata_list); } /* -- cgit v1.2.3-70-g09d2 From c50ac050811d6485616a193eb0f37bfbd191cc89 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 29 May 2012 15:06:46 -0700 Subject: hugetlb: fix resv_map leak in error path When called for anonymous (non-shared) mappings, hugetlb_reserve_pages() does a resv_map_alloc(). It depends on code in hugetlbfs's vm_ops->close() to release that allocation. However, in the mmap() failure path, we do a plain unmap_region() without the remove_vma() which actually calls vm_ops->close(). This is a decent fix. This leak could get reintroduced if new code (say, after hugetlb_reserve_pages() in hugetlbfs_file_mmap()) decides to return an error. But, I think it would have to unroll the reservation anyway. Christoph's test case: http://marc.info/?l=linux-mm&m=133728900729735 This patch applies to 3.4 and later. A version for earlier kernels is at https://lkml.org/lkml/2012/5/22/418. Signed-off-by: Dave Hansen Acked-by: Mel Gorman Acked-by: KOSAKI Motohiro Reported-by: Christoph Lameter Tested-by: Christoph Lameter Cc: Andrea Arcangeli Cc: [2.6.32+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41a647dfb73..285a81e87ec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) kref_get(&reservations->refs); } +static void resv_map_put(struct vm_area_struct *vma) +{ + struct resv_map *reservations = vma_resv_map(vma); + + if (!reservations) + return; + kref_put(&reservations->refs, resv_map_release); +} + static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); @@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) reserve = (end - start) - region_count(&reservations->regions, start, end); - kref_put(&reservations->refs, resv_map_release); + resv_map_put(vma); if (reserve) { hugetlb_acct_memory(h, -reserve); @@ -2991,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode, set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } - if (chg < 0) - return chg; + if (chg < 0) { + ret = chg; + goto out_err; + } /* There must be enough pages in the subpool for the mapping */ - if (hugepage_subpool_get_pages(spool, chg)) - return -ENOSPC; + if (hugepage_subpool_get_pages(spool, chg)) { + ret = -ENOSPC; + goto out_err; + } /* * Check enough hugepages are available for the reservation. @@ -3005,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode, ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugepage_subpool_put_pages(spool, chg); - return ret; + goto out_err; } /* @@ -3022,6 +3035,9 @@ int hugetlb_reserve_pages(struct inode *inode, if (!vma || vma->vm_flags & VM_MAYSHARE) region_add(&inode->i_mapping->private_list, from, to); return 0; +out_err: + resv_map_put(vma); + return ret; } void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) -- cgit v1.2.3-70-g09d2 From fe35004fbf9eaf67482b074a2e032abb9c89b1dd Mon Sep 17 00:00:00 2001 From: Satoru Moriya Date: Tue, 29 May 2012 15:06:47 -0700 Subject: mm: avoid swapping out with swappiness==0 Sometimes we'd like to avoid swapping out anonymous memory. In particular, avoid swapping out pages of important process or process groups while there is a reasonable amount of pagecache on RAM so that we can satisfy our customers' requirements. OTOH, we can control how aggressive the kernel will swap memory pages with /proc/sys/vm/swappiness for global and /sys/fs/cgroup/memory/memory.swappiness for each memcg. But with current reclaim implementation, the kernel may swap out even if we set swappiness=0 and there is pagecache in RAM. This patch changes the behavior with swappiness==0. If we set swappiness==0, the kernel does not swap out completely (for global reclaim until the amount of free pages and filebacked pages in a zone has been reduced to something very very small (nr_free + nr_filebacked < high watermark)). Signed-off-by: Satoru Moriya Acked-by: Minchan Kim Reviewed-by: Rik van Riel Acked-by: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 67a4fd4792d..ee975302877 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1761,10 +1761,10 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. */ - ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); + ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); ap /= reclaim_stat->recent_rotated[0] + 1; - fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); + fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; spin_unlock_irq(&mz->zone->lru_lock); @@ -1777,7 +1777,7 @@ out: unsigned long scan; scan = zone_nr_lru_pages(mz, lru); - if (priority || noswap) { + if (priority || noswap || !vmscan_swappiness(mz, sc)) { scan >>= priority; if (!scan && force_scan) scan = SWAP_CLUSTER_MAX; -- cgit v1.2.3-70-g09d2 From a7f638f999ff42310e9582273b1fe25ea6e469ba Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 29 May 2012 15:06:47 -0700 Subject: mm, oom: normalize oom scores to oom_score_adj scale only for userspace The oom_score_adj scale ranges from -1000 to 1000 and represents the proportion of memory available to the process at allocation time. This means an oom_score_adj value of 300, for example, will bias a process as though it was using an extra 30.0% of available memory and a value of -350 will discount 35.0% of available memory from its usage. The oom killer badness heuristic also uses this scale to report the oom score for each eligible process in determining the "best" process to kill. Thus, it can only differentiate each process's memory usage by 0.1% of system RAM. On large systems, this can end up being a large amount of memory: 256MB on 256GB systems, for example. This can be fixed by having the badness heuristic to use the actual memory usage in scoring threads and then normalizing it to the oom_score_adj scale for userspace. This results in better comparison between eligible threads for kill and no change from the userspace perspective. Suggested-by: KOSAKI Motohiro Tested-by: Dave Jones Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 5 +++-- include/linux/oom.h | 5 +++-- mm/oom_kill.c | 44 ++++++++++++++++---------------------------- 3 files changed, 22 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/fs/proc/base.c b/fs/proc/base.c index d2d3108a611..d7d711876b6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -411,12 +411,13 @@ static const struct file_operations proc_lstats_operations = { static int proc_oom_score(struct task_struct *task, char *buffer) { + unsigned long totalpages = totalram_pages + total_swap_pages; unsigned long points = 0; read_lock(&tasklist_lock); if (pid_alive(task)) - points = oom_badness(task, NULL, NULL, - totalram_pages + total_swap_pages); + points = oom_badness(task, NULL, NULL, totalpages) * + 1000 / totalpages; read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } diff --git a/include/linux/oom.h b/include/linux/oom.h index 3d7647536b0..e4c29bc72e7 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -43,8 +43,9 @@ enum oom_constraint { extern void compare_swap_oom_score_adj(int old_val, int new_val); extern int test_set_oom_score_adj(int new_val); -extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, - const nodemask_t *nodemask, unsigned long totalpages); +extern unsigned long oom_badness(struct task_struct *p, + struct mem_cgroup *memcg, const nodemask_t *nodemask, + unsigned long totalpages); extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9f09a1fde9f..ed0e1967736 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -180,10 +180,10 @@ static bool oom_unkillable_task(struct task_struct *p, * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ -unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, - const nodemask_t *nodemask, unsigned long totalpages) +unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, + const nodemask_t *nodemask, unsigned long totalpages) { - long points; + unsigned long points; if (oom_unkillable_task(p, memcg, nodemask)) return 0; @@ -197,22 +197,12 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, return 0; } - /* - * The memory controller may have a limit of 0 bytes, so avoid a divide - * by zero, if necessary. - */ - if (!totalpages) - totalpages = 1; - /* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + p->mm->nr_ptes; - points += get_mm_counter(p->mm, MM_SWAPENTS); - - points *= 1000; - points /= totalpages; + points = get_mm_rss(p->mm) + p->mm->nr_ptes + + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); /* @@ -220,23 +210,20 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - points -= 30; + points -= 30 * totalpages / 1000; /* * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may * either completely disable oom killing or always prefer a certain * task. */ - points += p->signal->oom_score_adj; + points += p->signal->oom_score_adj * totalpages / 1000; /* - * Never return 0 for an eligible task that may be killed since it's - * possible that no single user task uses more than 0.1% of memory and - * no single admin tasks uses more than 3.0%. + * Never return 0 for an eligible task regardless of the root bonus and + * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). */ - if (points <= 0) - return 1; - return (points < 1000) ? points : 1000; + return points ? points : 1; } /* @@ -314,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, { struct task_struct *g, *p; struct task_struct *chosen = NULL; - *ppoints = 0; + unsigned long chosen_points = 0; do_each_thread(g, p) { unsigned int points; @@ -354,7 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, */ if (p == current) { chosen = p; - *ppoints = 1000; + chosen_points = ULONG_MAX; } else if (!force_kill) { /* * If this task is not being ptraced on exit, @@ -367,12 +354,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, } points = oom_badness(p, memcg, nodemask, totalpages); - if (points > *ppoints) { + if (points > chosen_points) { chosen = p; - *ppoints = points; + chosen_points = points; } } while_each_thread(g, p); + *ppoints = chosen_points * 1000 / totalpages; return chosen; } @@ -572,7 +560,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); - limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; + limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; read_lock(&tasklist_lock); p = select_bad_process(&points, limit, memcg, NULL, false); if (p && PTR_ERR(p) != -1UL) -- cgit v1.2.3-70-g09d2 From dbda591d920b4c7692725b13e3f68ecb251e9080 Mon Sep 17 00:00:00 2001 From: KyongHo Date: Tue, 29 May 2012 15:06:49 -0700 Subject: mm: fix faulty initialization in vmalloc_init() The transfer of ->flags causes some of the static mapping virtual addresses to be prematurely freed (before the mapping is removed) because VM_LAZY_FREE gets "set" if tmp->flags has VM_IOREMAP set. This might cause subsequent vmalloc/ioremap calls to fail because it might allocate one of the freed virtual address ranges that aren't unmapped. va->flags has different types of flags from tmp->flags. If a region with VM_IOREMAP set is registered with vm_area_add_early(), it will be removed by __purge_vmap_area_lazy(). Fix vmalloc_init() to correctly initialize vmap_area for the given vm_struct. Also initialise va->vm. If it is not set, find_vm_area() for the early vm regions will always fail. Signed-off-by: KyongHo Cho Cc: "Olav Haugan" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c28b0b9e5cc..2aad49981b5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1185,9 +1185,10 @@ void __init vmalloc_init(void) /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); - va->flags = tmp->flags | VM_VM_AREA; + va->flags = VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; + va->vm = tmp; __insert_vmap_area(va); } -- cgit v1.2.3-70-g09d2 From 5bf5f03c271907978489868a4c72aeb42b5127d2 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 29 May 2012 15:06:49 -0700 Subject: mm: fix slab->page flags corruption Transparent huge pages can change page->flags (PG_compound_lock) without taking Slab lock. Since THP can not break slab pages we can safely access compound page without taking compound lock. Specifically this patch fixes a race between compound_unlock() and slab functions which perform page-flags updates. This can occur when get_page()/put_page() is called on a page from slab. [akpm@linux-foundation.org: tweak comment text, fix comment layout, fix label indenting] Reported-by: Amey Bhide Signed-off-by: Pravin B Shelar Reviewed-by: Christoph Lameter Acked-by: Andrea Arcangeli Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/swap.c | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index aa20bafa40f..ce26716238c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -321,6 +321,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) static inline void compound_lock(struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(PageSlab(page)); bit_spin_lock(PG_compound_lock, &page->flags); #endif } @@ -328,6 +329,7 @@ static inline void compound_lock(struct page *page) static inline void compound_unlock(struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(PageSlab(page)); bit_spin_unlock(PG_compound_lock, &page->flags); #endif } diff --git a/mm/swap.c b/mm/swap.c index 5c13f133897..6fdd72ec15b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -82,6 +82,25 @@ static void put_compound_page(struct page *page) if (likely(page != page_head && get_page_unless_zero(page_head))) { unsigned long flags; + + /* + * THP can not break up slab pages so avoid taking + * compound_lock(). Slab performs non-atomic bit ops + * on page->flags for better performance. In particular + * slab_unlock() in slub used to be a hot path. It is + * still hot on arches that do not support + * this_cpu_cmpxchg_double(). + */ + if (PageSlab(page_head)) { + if (PageTail(page)) { + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + + atomic_dec(&page->_mapcount); + goto skip_lock_tail; + } else + goto skip_lock; + } /* * page_head wasn't a dangling pointer but it * may not be a head page anymore by the time @@ -92,10 +111,10 @@ static void put_compound_page(struct page *page) if (unlikely(!PageTail(page))) { /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); - VM_BUG_ON(PageHead(page_head)); +skip_lock: if (put_page_testzero(page_head)) __put_single_page(page_head); - out_put_single: +out_put_single: if (put_page_testzero(page)) __put_single_page(page); return; @@ -115,6 +134,8 @@ static void put_compound_page(struct page *page) VM_BUG_ON(atomic_read(&page_head->_count) <= 0); VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); + +skip_lock_tail: if (put_page_testzero(page_head)) { if (PageHead(page_head)) __put_compound_page(page_head); @@ -162,6 +183,18 @@ bool __get_page_tail(struct page *page) struct page *page_head = compound_trans_head(page); if (likely(page != page_head && get_page_unless_zero(page_head))) { + + /* Ref to put_compound_page() comment. */ + if (PageSlab(page_head)) { + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + return true; + } else { + put_page(page_head); + return false; + } + } + /* * page_head wasn't a dangling pointer but it * may not be a head page anymore by the time -- cgit v1.2.3-70-g09d2 From 4e2f07750d9a94e8f23e86408df5ab95be88bf11 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 29 May 2012 15:06:50 -0700 Subject: mm/memblock: cleanup on duplicate VA/PA conversion The overall memblock has been organized into the memory regions and reserved regions. Initially, the memory regions and reserved regions are stored in the predetermined arrays of "struct memblock _region". It's possible for the arrays to be enlarged when we have newly added regions for them, but no enough space there. Under the situation, We will created double-sized array to meet the requirement. However, the original implementation converted the VA (Virtual Address) of the newly allocated array of regions to PA (Physical Address), then translate back when we allocates the new array from slab. That's actually unnecessary. The patch removes the duplicate VA/PA conversion. Signed-off-by: Gavin Shan Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index a44eab3157f..eae06ea3aa5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -212,14 +212,15 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; - } else + } else { addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); + new_array = addr ? __va(addr) : 0; + } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", memblock_type_name(type), type->max, type->max * 2); return -1; } - new_array = __va(addr); memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); -- cgit v1.2.3-70-g09d2 From 181eb39425f2b9275afcb015eaa547d11f71a02f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 29 May 2012 15:06:50 -0700 Subject: mm/memblock: fix memory leak on extending regions The overall memblock has been organized into the memory regions and reserved regions. Initially, the memory regions and reserved regions are stored in the predetermined arrays of "struct memblock _region". It's possible for the arrays to be enlarged when we have newly added regions, but no free space left there. The policy here is to create double-sized array either by slab allocator or memblock allocator. Unfortunately, we didn't free the old array, which might be allocated through slab allocator before. That would cause memory leak. The patch introduces 2 variables to trace where (slab or memblock) the memory and reserved regions come from. The memory for the memory or reserved regions will be deallocated by kfree() if that was allocated by slab allocator. Thus to fix the memory leak issue. Signed-off-by: Gavin Shan Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index eae06ea3aa5..952123eba43 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = { int memblock_debug __initdata_memblock; static int memblock_can_resize __initdata_memblock; +static int memblock_memory_in_slab __initdata_memblock = 0; +static int memblock_reserved_in_slab __initdata_memblock = 0; /* inline so we don't get a warning when pr_debug is compiled out */ static inline const char *memblock_type_name(struct memblock_type *type) @@ -187,6 +189,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) struct memblock_region *new_array, *old_array; phys_addr_t old_size, new_size, addr; int use_slab = slab_is_available(); + int *in_slab; /* We don't allow resizing until we know about the reserved regions * of memory that aren't suitable for allocation @@ -198,6 +201,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) old_size = type->max * sizeof(struct memblock_region); new_size = old_size << 1; + /* Retrieve the slab flag */ + if (type == &memblock.memory) + in_slab = &memblock_memory_in_slab; + else + in_slab = &memblock_reserved_in_slab; + /* Try to find some space for it. * * WARNING: We assume that either slab_is_available() and we use it or @@ -235,22 +244,24 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) type->regions = new_array; type->max <<= 1; - /* If we use SLAB that's it, we are done */ - if (use_slab) - return 0; - - /* Add the new reserved region now. Should not fail ! */ - BUG_ON(memblock_reserve(addr, new_size)); - - /* If the array wasn't our static init one, then free it. We only do - * that before SLAB is available as later on, we don't know whether - * to use kfree or free_bootmem_pages(). Shouldn't be a big deal - * anyways + /* Free old array. We needn't free it if the array is the + * static one */ - if (old_array != memblock_memory_init_regions && - old_array != memblock_reserved_init_regions) + if (*in_slab) + kfree(old_array); + else if (old_array != memblock_memory_init_regions && + old_array != memblock_reserved_init_regions) memblock_free(__pa(old_array), old_size); + /* Reserve the new array if that comes from the memblock. + * Otherwise, we needn't do it + */ + if (!use_slab) + BUG_ON(memblock_reserve(addr, new_size)); + + /* Update slab flag */ + *in_slab = use_slab; + return 0; } -- cgit v1.2.3-70-g09d2 From 4b91355e9dc9ac1eb3d69e56de093899ff2677ef Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 29 May 2012 15:06:51 -0700 Subject: memcg: fix/change behavior of shared anon at moving task This patch changes memcg's behavior at task_move(). At task_move(), the kernel scans a task's page table and move the changes for mapped pages from source cgroup to target cgroup. There has been a bug at handling shared anonymous pages for a long time. Before patch: - The spec says 'shared anonymous pages are not moved.' - The implementation was 'shared anonymoys pages may be moved'. If page_mapcount <=2, shared anonymous pages's charge were moved. After patch: - The spec says 'all anonymous pages are moved'. - The implementation is 'all anonymous pages are moved'. Considering usage of memcg, this will not affect user's experience. 'shared anonymous' pages only exists between a tree of processes which don't do exec(). Moving one of process without exec() seems not sane. For example, libcgroup will not be affected by this change. (Anyway, no one noticed the implementation for a long time...) Below is a discussion log: - current spec/implementation are complex - Now, shared file caches are moved - It adds unclear check as page_mapcount(). To do correct check, we should check swap users, etc. - No one notice this implementation behavior. So, no one get benefit from the design. - In general, once task is moved to a cgroup for running, it will not be moved.... - Finally, we have control knob as memory.move_charge_at_immigrate. Here is a patch to allow moving shared pages, completely. This makes memcg simpler and fix current broken code. Suggested-by: Hugh Dickins Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Naoya Horiguchi Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 9 ++++----- include/linux/swap.h | 9 --------- mm/memcontrol.c | 22 ++++++++++++++-------- mm/swapfile.c | 31 ------------------------------- 4 files changed, 18 insertions(+), 53 deletions(-) (limited to 'mm') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index e479007f1a7..6a066a270fc 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -184,12 +184,14 @@ behind this approach is that a cgroup that aggressively uses a shared page will eventually get charged for it (once it is uncharged from the cgroup that brought it in -- this will happen on memory pressure). +But see section 8.2: when moving a task to another cgroup, its pages may +be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. + Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used. When you do swapoff and make swapped-out pages of shmem(tmpfs) to be backed into memory in force, charges for pages are accounted against the caller of swapoff rather than the users of shmem. - 2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) Swap Extension allows you to record charge for swap. A swapped-in page is @@ -615,8 +617,7 @@ memory cgroup. bit | what type of charges would be moved ? -----+------------------------------------------------------------------------ 0 | A charge of an anonymous page(or swap of it) used by the target task. - | Those pages and swaps must be used only by the target task. You must - | enable Swap Extension(see 2.4) to enable move of swap charges. + | You must enable Swap Extension(see 2.4) to enable move of swap charges. -----+------------------------------------------------------------------------ 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory) | and swaps of tmpfs file) mmapped by the target task. Unlike the case of @@ -629,8 +630,6 @@ memory cgroup. 8.3 TODO -- Implement madvise(2) to let users decide the vma to be moved or not to be - moved. - All of moving charge operations are done under cgroup_mutex. It's not good behavior to hold the mutex too long, so we may need some trick. diff --git a/include/linux/swap.h b/include/linux/swap.h index d965c4bfab3..49c0fa9ef5c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -359,7 +359,6 @@ struct backing_dev_info; #ifdef CONFIG_CGROUP_MEM_RES_CTLR extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); -extern int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep); #else static inline void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) @@ -470,14 +469,6 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) { } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -static inline int -mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) -{ - return 0; -} -#endif - #endif /* CONFIG_SWAP */ #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d7ce417cae7..e7db70f3d2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5154,7 +5154,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return NULL; if (PageAnon(page)) { /* we don't move shared anon */ - if (!move_anon() || page_mapcount(page) > 2) + if (!move_anon()) return NULL; } else if (!move_file()) /* we ignore mapcount for file pages */ @@ -5165,26 +5165,32 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return page; } +#ifdef CONFIG_SWAP static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, swp_entry_t *entry) { - int usage_count; struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent); if (!move_anon() || non_swap_entry(ent)) return NULL; - usage_count = mem_cgroup_count_swap_user(ent, &page); - if (usage_count > 1) { /* we don't move shared anon */ - if (page) - put_page(page); - return NULL; - } + /* + * Because lookup_swap_cache() updates some statistics counter, + * we call find_get_page() with swapper_space directly. + */ + page = find_get_page(&swapper_space, ent.val); if (do_swap_account) entry->val = ent.val; return page; } +#else +static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + return NULL; +} +#endif static struct page *mc_handle_file_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, swp_entry_t *entry) diff --git a/mm/swapfile.c b/mm/swapfile.c index b0c86e92f42..457b10baef5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -717,37 +717,6 @@ int free_swap_and_cache(swp_entry_t entry) return p != NULL; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_count_swap_user - count the user of a swap entry - * @ent: the swap entry to be checked - * @pagep: the pointer for the swap cache page of the entry to be stored - * - * Returns the number of the user of the swap entry. The number is valid only - * for swaps of anonymous pages. - * If the entry is found on swap cache, the page is stored to pagep with - * refcount of it being incremented. - */ -int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) -{ - struct page *page; - struct swap_info_struct *p; - int count = 0; - - page = find_get_page(&swapper_space, ent.val); - if (page) - count += page_mapcount(page); - p = swap_info_get(ent); - if (p) { - count += swap_count(p->swap_map[swp_offset(ent)]); - spin_unlock(&swap_lock); - } - - *pagep = page; - return count; -} -#endif - #ifdef CONFIG_HIBERNATION /* * Find the swap type that corresponds to given device (if any). -- cgit v1.2.3-70-g09d2 From e91cbb42531626cd4fd0673ca01daf53e338d8f9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:51 -0700 Subject: memcg swap: mem_cgroup_move_swap_account never needs fixup The need_fixup arg to mem_cgroup_move_swap_account() is always false, so just remove it. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e7db70f3d2d..d5ba631e0ec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3165,7 +3165,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * @entry: swap entry to be moved * @from: mem_cgroup which the entry is moved from * @to: mem_cgroup which the entry is moved to - * @need_fixup: whether we should fixup res_counters and refcounts. * * It succeeds only when the swap_cgroup's record for this entry is the same * as the mem_cgroup's id of @from. @@ -3176,7 +3175,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * both res and memsw, and called css_get(). */ static int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) + struct mem_cgroup *from, struct mem_cgroup *to) { unsigned short old_id, new_id; @@ -3195,24 +3194,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, * swap-in, the refcount of @to might be decreased to 0. */ mem_cgroup_get(to); - if (need_fixup) { - if (!mem_cgroup_is_root(from)) - res_counter_uncharge(&from->memsw, PAGE_SIZE); - mem_cgroup_put(from); - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - if (!mem_cgroup_is_root(to)) - res_counter_uncharge(&to->res, PAGE_SIZE); - } return 0; } return -EINVAL; } #else static inline int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) + struct mem_cgroup *from, struct mem_cgroup *to) { return -EINVAL; } @@ -5546,8 +5534,7 @@ put: /* get_mctgt_type() gets the page */ break; case MC_TARGET_SWAP: ent = target.ent; - if (!mem_cgroup_move_swap_account(ent, - mc.from, mc.to, false)) { + if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { mc.precharge--; /* we fixup refcnts and charges later. */ mc.moved_swap++; -- cgit v1.2.3-70-g09d2 From 86493009d3b7e51eee38575f9537b754f5b6c536 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:52 -0700 Subject: memcg swap: use mem_cgroup_uncharge_swap() That stuff __mem_cgroup_commit_charge_swapin() does with a swap entry, it has a name and even a declaration: just use mem_cgroup_uncharge_swap(). Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5ba631e0ec..30f938c8645 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2855,24 +2855,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; - struct mem_cgroup *swap_memcg; - unsigned short id; - - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - swap_memcg = mem_cgroup_lookup(id); - if (swap_memcg) { - /* - * This recorded memcg can be obsolete one. So, avoid - * calling css_tryget - */ - if (!mem_cgroup_is_root(swap_memcg)) - res_counter_uncharge(&swap_memcg->memsw, - PAGE_SIZE); - mem_cgroup_swap_statistics(swap_memcg, false); - mem_cgroup_put(swap_memcg); - } - rcu_read_unlock(); + mem_cgroup_uncharge_swap(ent); } /* * At swapin, we may charge account against cgroup which has no tasks. -- cgit v1.2.3-70-g09d2 From c3c787e8c38557ccf44c670d73aebe630a2b1479 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:52 -0700 Subject: mm/memcg: scanning_global_lru means mem_cgroup_disabled Although one has to admire the skill with which it has been concealed, scanning_global_lru(mz) is actually just an interesting way to test mem_cgroup_disabled(). Too many developer hours have been wasted on confusing it with global_reclaim(): just use mem_cgroup_disabled(). Signed-off-by: Hugh Dickins Signed-off-by: Konstantin Khlebnikov Acked-by: KAMEZAWA Hiroyuki Acked-by: Glauber Costa Cc: Michal Hocko Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index ee975302877..52fac58b446 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -140,26 +140,16 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } - -static bool scanning_global_lru(struct mem_cgroup_zone *mz) -{ - return !mz->mem_cgroup; -} #else static bool global_reclaim(struct scan_control *sc) { return true; } - -static bool scanning_global_lru(struct mem_cgroup_zone *mz) -{ - return true; -} #endif static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) { - if (!scanning_global_lru(mz)) + if (!mem_cgroup_disabled()) return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); return &mz->zone->reclaim_stat; @@ -168,7 +158,7 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, enum lru_list lru) { - if (!scanning_global_lru(mz)) + if (!mem_cgroup_disabled()) return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, zone_to_nid(mz->zone), zone_idx(mz->zone), @@ -1589,7 +1579,7 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz) if (!total_swap_pages) return 0; - if (!scanning_global_lru(mz)) + if (!mem_cgroup_disabled()) return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, mz->zone); @@ -1628,7 +1618,7 @@ static int inactive_file_is_low_global(struct zone *zone) */ static int inactive_file_is_low(struct mem_cgroup_zone *mz) { - if (!scanning_global_lru(mz)) + if (!mem_cgroup_disabled()) return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, mz->zone); -- cgit v1.2.3-70-g09d2 From 89abfab133ef1f5902abafb744df72793213ac19 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:53 -0700 Subject: mm/memcg: move reclaim_stat into lruvec With mem_cgroup_disabled() now explicit, it becomes clear that the zone_reclaim_stat structure actually belongs in lruvec, per-zone when memcg is disabled but per-memcg per-zone when it's enabled. We can delete mem_cgroup_get_reclaim_stat(), and change update_page_reclaim_stat() to update just the one set of stats, the one which get_scan_count() will actually use. Signed-off-by: Hugh Dickins Signed-off-by: Konstantin Khlebnikov Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Reviewed-by: Minchan Kim Reviewed-by: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 --------- include/linux/mmzone.h | 29 ++++++++++++++--------------- mm/memcontrol.c | 27 +++++++-------------------- mm/page_alloc.c | 8 ++++---- mm/swap.c | 14 ++++---------- mm/vmscan.c | 5 +---- 6 files changed, 30 insertions(+), 62 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 18ea0b7baf3..cfe9050ad8d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -126,8 +126,6 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, unsigned int lrumask); -struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, - struct zone *zone); struct zone_reclaim_stat* mem_cgroup_get_reclaim_stat_from_page(struct page *page); extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, @@ -356,13 +354,6 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, return 0; } - -static inline struct zone_reclaim_stat* -mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) -{ - return NULL; -} - static inline struct zone_reclaim_stat* mem_cgroup_get_reclaim_stat_from_page(struct page *page) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4871e31ae27..1b89861eedc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -185,8 +185,22 @@ static inline int is_unevictable_lru(enum lru_list lru) return (lru == LRU_UNEVICTABLE); } +struct zone_reclaim_stat { + /* + * The pageout code in vmscan.c keeps track of how many of the + * mem/swap backed and file backed pages are refeferenced. + * The higher the rotated/scanned ratio, the more valuable + * that cache is. + * + * The anon LRU stats live in [0], file LRU stats in [1] + */ + unsigned long recent_rotated[2]; + unsigned long recent_scanned[2]; +}; + struct lruvec { struct list_head lists[NR_LRU_LISTS]; + struct zone_reclaim_stat reclaim_stat; }; /* Mask used at gathering information at once (see memcontrol.c) */ @@ -313,19 +327,6 @@ enum zone_type { #error ZONES_SHIFT -- too many zones configured adjust calculation #endif -struct zone_reclaim_stat { - /* - * The pageout code in vmscan.c keeps track of how many of the - * mem/swap backed and file backed pages are refeferenced. - * The higher the rotated/scanned ratio, the more valuable - * that cache is. - * - * The anon LRU stats live in [0], file LRU stats in [1] - */ - unsigned long recent_rotated[2]; - unsigned long recent_scanned[2]; -}; - struct zone { /* Fields commonly accessed by the page allocator */ @@ -407,8 +408,6 @@ struct zone { spinlock_t lru_lock; struct lruvec lruvec; - struct zone_reclaim_stat reclaim_stat; - unsigned long pages_scanned; /* since last reclaim */ unsigned long flags; /* zone flags, see below */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 30f938c8645..00c8898dbb8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -138,7 +138,6 @@ struct mem_cgroup_per_zone { struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; - struct zone_reclaim_stat reclaim_stat; struct rb_node tree_node; /* RB tree node */ unsigned long long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ @@ -1243,16 +1242,6 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) return (active > inactive); } -struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, - struct zone *zone) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - return &mz->reclaim_stat; -} - struct zone_reclaim_stat * mem_cgroup_get_reclaim_stat_from_page(struct page *page) { @@ -1268,7 +1257,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - return &mz->reclaim_stat; + return &mz->lruvec.reclaim_stat; } #define mem_cgroup_from_res_counter(counter, member) \ @@ -4216,21 +4205,19 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, { int nid, zid; struct mem_cgroup_per_zone *mz; + struct zone_reclaim_stat *rstat; unsigned long recent_rotated[2] = {0, 0}; unsigned long recent_scanned[2] = {0, 0}; for_each_online_node(nid) for (zid = 0; zid < MAX_NR_ZONES; zid++) { mz = mem_cgroup_zoneinfo(memcg, nid, zid); + rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += - mz->reclaim_stat.recent_rotated[0]; - recent_rotated[1] += - mz->reclaim_stat.recent_rotated[1]; - recent_scanned[0] += - mz->reclaim_stat.recent_scanned[0]; - recent_scanned[1] += - mz->reclaim_stat.recent_scanned[1]; + recent_rotated[0] += rstat->recent_rotated[0]; + recent_rotated[1] += rstat->recent_rotated[1]; + recent_scanned[0] += rstat->recent_scanned[0]; + recent_scanned[1] += rstat->recent_scanned[1]; } cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); cb->fill(cb, "recent_rotated_file", recent_rotated[1]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4fc462b5fcf..8cbfc38e68a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4410,10 +4410,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(lru) INIT_LIST_HEAD(&zone->lruvec.lists[lru]); - zone->reclaim_stat.recent_rotated[0] = 0; - zone->reclaim_stat.recent_rotated[1] = 0; - zone->reclaim_stat.recent_scanned[0] = 0; - zone->reclaim_stat.recent_scanned[1] = 0; + zone->lruvec.reclaim_stat.recent_rotated[0] = 0; + zone->lruvec.reclaim_stat.recent_rotated[1] = 0; + zone->lruvec.reclaim_stat.recent_scanned[0] = 0; + zone->lruvec.reclaim_stat.recent_scanned[1] = 0; zap_zone_vm_stats(zone); zone->flags = 0; if (!size) diff --git a/mm/swap.c b/mm/swap.c index 6fdd72ec15b..0503ad705e7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -312,21 +312,15 @@ void rotate_reclaimable_page(struct page *page) static void update_page_reclaim_stat(struct zone *zone, struct page *page, int file, int rotated) { - struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; - struct zone_reclaim_stat *memcg_reclaim_stat; + struct zone_reclaim_stat *reclaim_stat; - memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); + reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); + if (!reclaim_stat) + reclaim_stat = &zone->lruvec.reclaim_stat; reclaim_stat->recent_scanned[file]++; if (rotated) reclaim_stat->recent_rotated[file]++; - - if (!memcg_reclaim_stat) - return; - - memcg_reclaim_stat->recent_scanned[file]++; - if (rotated) - memcg_reclaim_stat->recent_rotated[file]++; } static void __activate_page(struct page *page, void *arg) diff --git a/mm/vmscan.c b/mm/vmscan.c index 52fac58b446..e234ada1874 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -149,10 +149,7 @@ static bool global_reclaim(struct scan_control *sc) static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) { - if (!mem_cgroup_disabled()) - return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); - - return &mz->zone->reclaim_stat; + return &mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup)->reclaim_stat; } static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, -- cgit v1.2.3-70-g09d2