From 86a595f961360c9c31d00fb111dd09e5d5ba9a40 Mon Sep 17 00:00:00 2001
From: Bob Liu
Date: Thu, 25 Oct 2012 13:37:56 -0700
Subject: mm/page_alloc.c:alloc_contig_range(): return early for err path

If start_isolate_page_range() fails, unset_migratetype_isolate() has
already been done inside it, so return the error directly instead of
falling through to the cleanup path.

Signed-off-by: Bob Liu
Cc: Ni zhan Chen
Cc: Marek Szyprowski
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bb90971182b..b0012ab372a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5825,7 +5825,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	ret = start_isolate_page_range(pfn_max_align_down(start),
 				       pfn_max_align_up(end), migratetype);
 	if (ret)
-		goto done;
+		return ret;
 
 	ret = __alloc_contig_migrate_range(&cc, start, end);
 	if (ret)
--
cgit v1.2.3-70-g09d2


From 6b187d0260b6cd1d0904309f32659b7ed5948af8 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Thu, 25 Oct 2012 13:38:08 -0700
Subject: mm, numa: avoid setting zone_reclaim_mode unless a node is sufficiently distant

Commit 957f822a0ab9 ("mm, numa: reclaim from all nodes within reclaim
distance") caused zone_reclaim_mode to be set for all systems where two
nodes are within RECLAIM_DISTANCE of each other.  This is the opposite
of what we actually want: zone_reclaim_mode should be set if two nodes
are sufficiently distant.

Signed-off-by: David Rientjes
Reported-by: Julian Wollrath
Tested-by: Julian Wollrath
Cc: Hugh Dickins
Cc: Patrik Kullman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b0012ab372a..5b74de6702e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1809,10 +1809,10 @@ static void __paginginit init_zone_allows_reclaim(int nid)
 	int i;
 
 	for_each_online_node(i)
-		if (node_distance(nid, i) <= RECLAIM_DISTANCE) {
+		if (node_distance(nid, i) <= RECLAIM_DISTANCE)
 			node_set(i, NODE_DATA(nid)->reclaim_nodes);
+		else
 			zone_reclaim_mode = 1;
-		}
 }
 
 #else	/* CONFIG_NUMA */
--
cgit v1.2.3-70-g09d2


From bea8c150a7efbc0f204e709b7274fe273f55e0d3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Fri, 16 Nov 2012 14:14:54 -0800
Subject: memcg: fix hotplugged memory zone oops

When MEMCG is configured on (even when it's disabled by boot option),
when adding or removing a page to/from its lru list, the zone pointer
used for stats updates is nowadays taken from the struct lruvec.  (On
many configurations, calculating zone from page is slower.)

But we have no code to update all the lruvecs (per zone, per memcg)
when a memory node is hotadded.  Here's an extract from the oops which
results when running numactl to bind a program to a newly onlined node:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
  IP: __mod_zone_page_state+0x9/0x60
  Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
  Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
  Call Trace:
    __pagevec_lru_add_fn+0xdf/0x140
    pagevec_lru_move_fn+0xb1/0x100
    __pagevec_lru_add+0x1c/0x30
    lru_add_drain_cpu+0xa3/0x130
    lru_add_drain+0x2f/0x40
  ...

The natural solution might be to use a memcg callback whenever memory
is hotadded; but that solution has not been scoped out, and it happens
that we do have an easy location at which to update lruvec->zone.
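[editor's note: in outline, the fix checks and (re)sets lruvec->zone on every
lookup instead of only at init time.  A condensed sketch of that pattern, with
a hypothetical helper name -- the real hunks below apply it inside
mem_cgroup_zone_lruvec() and mem_cgroup_page_lruvec():

	static struct lruvec *lookup_lruvec(struct zone *zone,
					    struct mem_cgroup *memcg)
	{
		struct lruvec *lruvec;

		if (mem_cgroup_disabled())
			lruvec = &zone->lruvec;
		else
			lruvec = &mem_cgroup_zoneinfo(memcg, zone_to_nid(zone),
						      zone_idx(zone))->lruvec;
		/*
		 * A node can be onlined (or offlined and reonlined) after
		 * the mem_cgroup was created, so lruvec->zone may be unset
		 * or stale here; fix it up lazily on each lookup.
		 */
		if (unlikely(lruvec->zone != zone))
			lruvec->zone = zone;
		return lruvec;
	}
]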
The lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or
by mem_cgroup_page_lruvec(), and both of those do know the right zone.
So check and set lruvec->zone in those; and remove the inadequate
attempt to set lruvec->zone from lruvec_init(), which is called before
NODE_DATA(node) has been allocated in such cases.

Ah, there was one exception.  For no particularly good reason,
mem_cgroup_force_empty_list() has its own code for deciding lruvec.
Change it to use the standard mem_cgroup_zone_lruvec() and
mem_cgroup_get_lru_size() too.  In fact it was already safe against
such an oops (the lru lists in danger could only be empty), but we're
better proofed against future changes this way.

I've marked this for stable (3.6) since we introduced the problem in
3.5 (now closed to stable); but I have no idea if this is the only fix
needed to get memory hotadd working with memcg in 3.6, and received no
answer when I enquired twice before.

Reported-by: Tang Chen
Signed-off-by: Hugh Dickins
Acked-by: Johannes Weiner
Acked-by: KAMEZAWA Hiroyuki
Cc: Konstantin Khlebnikov
Cc: Wen Congyang
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h |  2 +-
 mm/memcontrol.c        | 46 +++++++++++++++++++++++++++++++++++-----------
 mm/mmzone.c            |  6 +-----
 mm/page_alloc.c        |  2 +-
 4 files changed, 38 insertions(+), 18 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 50aaca81f63..a23923ba826 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -752,7 +752,7 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
 				     unsigned long size,
 				     enum memmap_context context);
 
-extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
+extern void lruvec_init(struct lruvec *lruvec);
 
 static inline struct zone *lruvec_zone(struct lruvec *lruvec)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 93a7e36ded8..dd39ba000b3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 				      struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 
-	if (mem_cgroup_disabled())
-		return &zone->lruvec;
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 
 	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
 }
 
 /*
@@ -1087,9 +1099,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
+	struct lruvec *lruvec;
 
-	if (mem_cgroup_disabled())
-		return &zone->lruvec;
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
@@ -1107,7 +1122,16 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 		pc->mem_cgroup = memcg = root_mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(memcg, page);
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
 }
 
 /**
@@ -3697,17 +3721,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 	unsigned long flags, loop;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
 
 	zone = &NODE_DATA(node)->node_zones[zid];
-	mz = mem_cgroup_zoneinfo(memcg, node, zid);
-	list = &mz->lruvec.lists[lru];
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	list = &lruvec->lists[lru];
 
-	loop = mz->lru_size[lru];
+	loop = mem_cgroup_get_lru_size(lruvec, lru);
 	/* give some margin against EBUSY etc...*/
 	loop += 256;
 	busy = NULL;
@@ -4745,7 +4769,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
+		lruvec_init(&mz->lruvec);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->memcg = memcg;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 3cef80f6ac7..4596d81b89b 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pfn,
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
 
-void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+void lruvec_init(struct lruvec *lruvec)
 {
 	enum lru_list lru;
 
@@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
 
 	for_each_lru(lru)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
-
-#ifdef CONFIG_MEMCG
-	lruvec->zone = zone;
-#endif
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b74de6702e..c91598b1b4c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4505,7 +4505,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone->zone_pgdat = pgdat;
 
 		zone_pcp_init(zone);
-		lruvec_init(&zone->lruvec, zone);
+		lruvec_init(&zone->lruvec);
 		if (!size)
 			continue;
 
--
cgit v1.2.3-70-g09d2


From 5576646f3c1abd60d72d19829de6f5d8c2ca8ecf Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Fri, 16 Nov 2012 14:15:06 -0800
Subject: revert "mm: fix-up zone present pages"

Revert commit 7f1290f2f2a4 ("mm: fix-up zone present pages").

That patch tried to fix an issue when calculating zone->present_pages,
but it caused a regression on 32bit systems with HIGHMEM.  With that
change, reset_zone_present_pages() resets all zone->present_pages to
zero, and fixup_zone_present_pages() is called to recalculate
zone->present_pages when the boot allocator frees core memory pages
into the buddy allocator.  Because highmem pages are not freed by the
bootmem allocator, all highmem zones' present_pages become zero.

Various options for improving the situation are being discussed but
for now, let's return to the 3.6 code.
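[editor's note: a toy userspace model of the regression being reverted; the
zone sizes are invented, and the two steps stand in for the reverted
reset_zone_present_pages()/fixup_zone_present_pages() helpers:

	#include <stdio.h>

	enum { ZONE_NORMAL, ZONE_HIGHMEM, NR_ZONES };

	int main(void)
	{
		long present[NR_ZONES] = { 225280, 786432 }; /* invented sizes */
		int z;

		/* reset_zone_present_pages(): zero every zone */
		for (z = 0; z < NR_ZONES; z++)
			present[z] = 0;

		/*
		 * fixup_zone_present_pages() runs only as the bootmem
		 * allocator frees pages into the buddy allocator -- and
		 * bootmem never owns highmem pages, so highmem is never
		 * re-credited.
		 */
		present[ZONE_NORMAL] += 225280;

		/* prints 0: highmem "disappears" on 32bit HIGHMEM systems */
		printf("highmem present_pages: %ld\n", present[ZONE_HIGHMEM]);
		return 0;
	}
]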
Cc: Jianguo Wu
Cc: Jiang Liu
Cc: Petr Tesarik
Cc: "Luck, Tony"
Cc: Mel Gorman
Cc: Yinghai Lu
Cc: Minchan Kim
Cc: Johannes Weiner
Acked-by: David Rientjes
Tested-by: Chris Clayton
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/ia64/mm/init.c |  1 -
 include/linux/mm.h  |  4 ----
 mm/bootmem.c        | 10 +---------
 mm/memory_hotplug.c |  7 -------
 mm/nobootmem.c      |  3 ---
 mm/page_alloc.c     | 34 ----------------------------------
 6 files changed, 1 insertion(+), 58 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index acd5b68e887..082e383c1b6 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -637,7 +637,6 @@ mem_init (void)
 
 	high_memory = __va(max_low_pfn * PAGE_SIZE);
 
-	reset_zone_present_pages();
 	for_each_online_pgdat(pgdat)
 		if (pgdat->bdata->node_bootmem_map)
 			totalram_pages += free_all_bootmem_node(pgdat);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa068040273..bcaab4e6fe9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1684,9 +1684,5 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
 static inline bool page_is_guard(struct page *page) { return false; }
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
-extern void reset_zone_present_pages(void);
-extern void fixup_zone_present_pages(int nid, unsigned long start_pfn,
-				     unsigned long end_pfn);
-
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 434be4ae7a0..f468185b3b2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,8 +198,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 			int order = ilog2(BITS_PER_LONG);
 
 			__free_pages_bootmem(pfn_to_page(start), order);
-			fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
-					start, start + BITS_PER_LONG);
 			count += BITS_PER_LONG;
 			start += BITS_PER_LONG;
 		} else {
@@ -210,9 +208,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 				if (vec & 1) {
 					page = pfn_to_page(start + off);
 					__free_pages_bootmem(page, 0);
-					fixup_zone_present_pages(
-						page_to_nid(page),
-						start + off, start + off + 1);
 					count++;
 				}
 				vec >>= 1;
@@ -226,11 +221,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 
 	pages = bdata->node_low_pfn - bdata->node_min_pfn;
 	pages = bootmem_bootmap_pages(pages);
 	count += pages;
-	while (pages--) {
-		fixup_zone_present_pages(page_to_nid(page),
-				page_to_pfn(page), page_to_pfn(page) + 1);
+	while (pages--)
 		__free_pages_bootmem(page++, 0);
-	}
 
 	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 56b758ae57d..e4eeacae2b9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,7 +106,6 @@ static void get_page_bootmem(unsigned long info, struct page *page,
 void __ref put_page_bootmem(struct page *page)
 {
 	unsigned long type;
-	struct zone *zone;
 
 	type = (unsigned long) page->lru.next;
 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -117,12 +116,6 @@ void __ref put_page_bootmem(struct page *page)
 		set_page_private(page, 0);
 		INIT_LIST_HEAD(&page->lru);
 		__free_pages_bootmem(page, 0);
-
-		zone = page_zone(page);
-		zone_span_writelock(zone);
-		zone->present_pages++;
-		zone_span_writeunlock(zone);
-		totalram_pages++;
 	}
 
 }
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 714d5d65047..bd82f6b3141 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,8 +116,6 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
 		return 0;
 
 	__free_pages_memory(start_pfn, end_pfn);
-	fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
-			start_pfn, end_pfn);
 
 	return end_pfn - start_pfn;
 }
@@ -128,7 +126,6 @@ unsigned long __init free_low_memory_core_early(int nodeid)
 	phys_addr_t start, end, size;
 	u64 i;
 
-	reset_zone_present_pages();
 	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
 		count += __free_memory_core(start, end);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c91598b1b4c..7bb35ac0964 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6098,37 +6098,3 @@ void dump_page(struct page *page)
 	dump_page_flags(page->flags);
 	mem_cgroup_print_bad_page(page);
 }
-
-/* reset zone->present_pages */
-void reset_zone_present_pages(void)
-{
-	struct zone *z;
-	int i, nid;
-
-	for_each_node_state(nid, N_HIGH_MEMORY) {
-		for (i = 0; i < MAX_NR_ZONES; i++) {
-			z = NODE_DATA(nid)->node_zones + i;
-			z->present_pages = 0;
-		}
-	}
-}
-
-/* calculate zone's present pages in buddy system */
-void fixup_zone_present_pages(int nid, unsigned long start_pfn,
-			      unsigned long end_pfn)
-{
-	struct zone *z;
-	unsigned long zone_start_pfn, zone_end_pfn;
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		z = NODE_DATA(nid)->node_zones + i;
-		zone_start_pfn = z->zone_start_pfn;
-		zone_end_pfn = zone_start_pfn + z->spanned_pages;
-
-		/* if the two regions intersect */
-		if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
-			z->present_pages += min(end_pfn, zone_end_pfn) -
-					    max(start_pfn, zone_start_pfn);
-	}
-}
--
cgit v1.2.3-70-g09d2


From ef6c5be658f6a70c1256fbd18e18ee0dc24c3386 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Wed, 21 Nov 2012 14:21:51 -0500
Subject: fix incorrect NR_FREE_PAGES accounting (appears like memory leak)

There have been some 3.7-rc reports of vm issues, including some kswapd
bugs and, more importantly, some memory "leaks":

	http://www.spinics.net/lists/linux-mm/msg46187.html
	https://bugzilla.kernel.org/show_bug.cgi?id=50181

Commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page
immediately when it is made available") took split_free_page() and
reused it for the compaction code.  It does something curious with
capture_free_page() (previously known as split_free_page()):

	int capture_free_page(struct page *page, int alloc_order,
	...
		__mod_zone_page_state(zone, NR_FREE_PAGES,
				      -(1UL << order));
	-	/* Split into individual pages */
	-	set_page_refcounted(page);
	-	split_page(page, order);
	+	if (alloc_order != order)
	+		expand(zone, page, alloc_order, order,
	+			&zone->free_area[order], migratetype);

Note that expand() puts the pages _back_ in the allocator, but it does
not bump NR_FREE_PAGES.  We "return" 'alloc_order' worth of pages, but
we accounted for removing 'order' in the __mod_zone_page_state() call.
For the old split_page()-style use (order==alloc_order) the bug will
not trigger.  But, when called from the compaction code where we
occasionally get a larger page out of the buddy allocator than we
need, we will run into this.

This patch simply changes the NR_FREE_PAGES manipulation to the correct
'alloc_order' instead of 'order'.

I've been able to repeatedly trigger this in my testing environment.
The amount "leaked" very closely tracks the imbalance I see in buddy
pages vs. NR_FREE_PAGES.
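[editor's note: to make the arithmetic concrete, a toy userspace model of one
buggy capture_free_page() call; the orders are invented and expand() is
modeled as plain subtraction:

	#include <stdio.h>

	int main(void)
	{
		long nr_free_pages = 1024;	/* pretend counter value */
		int order = 4;			/* block removed: 16 pages */
		int alloc_order = 2;		/* pages actually kept: 4 */

		/* buggy accounting: debits the whole 2^order block */
		nr_free_pages -= 1L << order;

		/*
		 * expand() returns the surplus 2^order - 2^alloc_order pages
		 * to the free lists without re-crediting the counter.
		 */
		long surplus = (1L << order) - (1L << alloc_order);

		/* 12 pages per call sit on the free lists yet uncounted */
		printf("pages leaked from NR_FREE_PAGES: %ld\n", surplus);
		return 0;
	}
]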
I have confirmed that this patch fixes the imbalance.

Signed-off-by: Dave Hansen
Acked-by: Mel Gorman
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7bb35ac0964..bcb72c6e2b2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1405,7 +1405,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 
 	mt = get_pageblock_migratetype(page);
 	if (unlikely(mt != MIGRATE_ISOLATE))
-		__mod_zone_freepage_state(zone, -(1UL << order), mt);
+		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
 
 	if (alloc_order != order)
 		expand(zone, page, alloc_order, order,
--
cgit v1.2.3-70-g09d2


From 82b212f40059bffd6808c07266a942d444d5558a Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Mon, 26 Nov 2012 16:29:45 -0800
Subject: Revert "mm: remove __GFP_NO_KSWAPD"

With "mm: vmscan: scale number of pages reclaimed by reclaim/compaction
based on failures" reverted, Zdenek Kabelac reported the following

	Hmm, so it's just took longer to hit the problem and observe
	kswapd0 spinning on my CPU again - it's not as endless like
	before - but still it easily eats minutes - it helps to turn off
	Firefox or TB (memory hungry apps) so kswapd0 stops soon - and
	restart those apps again.  (And I still have like >1GB of cached
	memory)

	kswapd0         R  running task        0    30      2 0x00000000
	Call Trace:
	  preempt_schedule+0x42/0x60
	  _raw_spin_unlock+0x55/0x60
	  put_super+0x31/0x40
	  drop_super+0x22/0x30
	  prune_super+0x149/0x1b0
	  shrink_slab+0xba/0x510

The sysrq+m indicates the system has no swap so it'll never reclaim
anonymous pages as part of reclaim/compaction.  That is one part of the
problem but not the root cause as file-backed pages could also be
reclaimed.

The likely underlying problem is that kswapd is woken up or kept awake
for each THP allocation request in the page allocator slow path.

If compaction fails for the requesting process then compaction will be
deferred for a time and direct reclaim is avoided.  However, if there
is a storm of THP requests that are simply rejected, it will still be
the case that kswapd is awake for a prolonged period of time as
pgdat->kswapd_max_order is updated each time.  This is noticed by the
main kswapd() loop and it will not call kswapd_try_to_sleep().  Instead
it will loop, shrinking a small number of pages and calling
shrink_slab() on each iteration.

The temptation is to supply a patch that checks if kswapd was woken for
THP and if so ignore pgdat->kswapd_max_order but it'll be a hack and
not backed up by proper testing.  As 3.7 is very close to release and
this is not a bug we should release with, a safer path is to revert
"mm: remove __GFP_NO_KSWAPD" for now and revisit it with the view to
ironing out the balance_pgdat() logic in general.

Signed-off-by: Mel Gorman
Cc: Zdenek Kabelac
Cc: Seth Jennings
Cc: Valdis Kletnieks
Cc: Jiri Slaby
Cc: Rik van Riel
Cc: Robert Jennings
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/mtd/mtdcore.c           | 6 ++++--
 include/linux/gfp.h             | 5 ++++-
 include/trace/events/gfpflags.h | 1 +
 mm/page_alloc.c                 | 7 ++++---
 4 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 374c46dff7d..ec794a72975 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1077,7 +1077,8 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  * until the request succeeds or until the allocation size falls below
  * the system page size. This attempts to make sure it does not adversely
  * impact system performance, so when allocating more than one page, we
- * ask the memory allocator to avoid re-trying.
+ * ask the memory allocator to avoid re-trying, swapping, writing back
+ * or performing I/O.
 *
 * Note, this function also makes sure that the allocated buffer is aligned to
 * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value.
@@ -1091,7 +1092,8 @@ EXPORT_SYMBOL_GPL(mtd_writev);
 */
 void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
 {
-	gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;
+	gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
+		      __GFP_NORETRY | __GFP_NO_KSWAPD;
 	size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
 	void *kbuf;
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 02c1c9710be..d0a79678f16 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -31,6 +31,7 @@ struct vm_area_struct;
 #define ___GFP_THISNODE		0x40000u
 #define ___GFP_RECLAIMABLE	0x80000u
 #define ___GFP_NOTRACK		0x200000u
+#define ___GFP_NO_KSWAPD	0x400000u
 #define ___GFP_OTHER_NODE	0x800000u
 #define ___GFP_WRITE		0x1000000u
 
@@ -85,6 +86,7 @@ struct vm_area_struct;
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
+#define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)	/* Allocator intends to dirty page */
 
@@ -114,7 +116,8 @@ struct vm_area_struct;
 			 __GFP_MOVABLE)
 #define GFP_IOFS	(__GFP_IO | __GFP_FS)
 #define GFP_TRANSHUGE	(GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN)
+			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
+			 __GFP_NO_KSWAPD)
 
 #ifdef CONFIG_NUMA
 #define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index 9391706e925..d6fd8e5b14b 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -36,6 +36,7 @@
 	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
 	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"},		\
 	{(unsigned long)__GFP_NOTRACK,		"GFP_NOTRACK"},		\
+	{(unsigned long)__GFP_NO_KSWAPD,	"GFP_NO_KSWAPD"},	\
 	{(unsigned long)__GFP_OTHER_NODE,	"GFP_OTHER_NODE"}	\
 	) : "GFP_NOWAIT"
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bcb72c6e2b2..92871579cbe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2416,8 +2416,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	wake_all_kswapd(order, zonelist, high_zoneidx,
-					zone_idx(preferred_zone));
+	if (!(gfp_mask & __GFP_NO_KSWAPD))
+		wake_all_kswapd(order, zonelist, high_zoneidx,
+						zone_idx(preferred_zone));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2494,7 +2495,7 @@ rebalance:
 	 * system then fail the allocation instead of entering direct reclaim.
 	 */
 	if ((deferred_compaction || contended_compaction) &&
-	    (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
+	    (gfp_mask & __GFP_NO_KSWAPD))
 		goto nopage;
 
 	/* Try direct reclaim and then allocating */
--
cgit v1.2.3-70-g09d2


From 58d002097b98664e2a39cc708f30d11549d870b2 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Thu, 29 Nov 2012 13:54:20 -0800
Subject: mm: compaction: fix return value of capture_free_page()

Commit ef6c5be658f6 ("fix incorrect NR_FREE_PAGES accounting (appears
like memory leak)") fixes an NR_FREE_PAGES accounting leak but missed
the return value, which was also missed by this reviewer until today.
That return value is used by compaction when adding pages to a list of
isolated free pages and without this follow-up fix, there is a risk of
free list corruption.

Signed-off-by: Mel Gorman
Cc: Dave Hansen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 92871579cbe..7e208f0ad68 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1422,7 +1422,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		}
 	}
 
-	return 1UL << order;
+	return 1UL << alloc_order;
 }
 
 /*
--
cgit v1.2.3-70-g09d2


From a50915394f1fc02c2861d3b7ce7014788aa5066e Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Thu, 29 Nov 2012 13:54:27 -0800
Subject: revert "Revert "mm: remove __GFP_NO_KSWAPD""

It appears that this patch was innocent, and we hope that "mm: avoid
waking kswapd for THP allocations when compaction is deferred or
contended" will fix the final kswapd-spinning cause.

Cc: Zdenek Kabelac
Cc: Seth Jennings
Cc: Valdis Kletnieks
Cc: Jiri Slaby
Cc: Rik van Riel
Cc: Robert Jennings
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/mtd/mtdcore.c           |  6 ++----
 include/linux/gfp.h             | 13 +++++--------
 include/trace/events/gfpflags.h |  1 -
 mm/page_alloc.c                 |  7 +++----
 4 files changed, 10 insertions(+), 17 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index ec794a72975..374c46dff7d 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1077,8 +1077,7 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  * until the request succeeds or until the allocation size falls below
  * the system page size. This attempts to make sure it does not adversely
  * impact system performance, so when allocating more than one page, we
- * ask the memory allocator to avoid re-trying, swapping, writing back
- * or performing I/O.
+ * ask the memory allocator to avoid re-trying.
 *
 * Note, this function also makes sure that the allocated buffer is aligned to
 * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value.
@@ -1092,8 +1091,7 @@ EXPORT_SYMBOL_GPL(mtd_writev);
 */
 void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
 {
-	gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
-		      __GFP_NORETRY | __GFP_NO_KSWAPD;
+	gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;
 	size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
 	void *kbuf;
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index d0a79678f16..76e1aa206f5 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -30,10 +30,9 @@ struct vm_area_struct;
 #define ___GFP_HARDWALL		0x20000u
 #define ___GFP_THISNODE		0x40000u
 #define ___GFP_RECLAIMABLE	0x80000u
-#define ___GFP_NOTRACK		0x200000u
-#define ___GFP_NO_KSWAPD	0x400000u
-#define ___GFP_OTHER_NODE	0x800000u
-#define ___GFP_WRITE		0x1000000u
+#define ___GFP_NOTRACK		0x100000u
+#define ___GFP_OTHER_NODE	0x200000u
+#define ___GFP_WRITE		0x400000u
 
 /*
  * GFP bitmasks..
@@ -86,7 +85,6 @@ struct vm_area_struct;
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
-#define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)	/* Allocator intends to dirty page */
 
@@ -96,7 +94,7 @@ struct vm_area_struct;
 */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 25	/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 23	/* Room for N __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
@@ -116,8 +114,7 @@ struct vm_area_struct;
 			 __GFP_MOVABLE)
 #define GFP_IOFS	(__GFP_IO | __GFP_FS)
 #define GFP_TRANSHUGE	(GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
-			 __GFP_NO_KSWAPD)
+			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN)
 
 #ifdef CONFIG_NUMA
 #define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index d6fd8e5b14b..9391706e925 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -36,7 +36,6 @@
 	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
 	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"},		\
 	{(unsigned long)__GFP_NOTRACK,		"GFP_NOTRACK"},		\
-	{(unsigned long)__GFP_NO_KSWAPD,	"GFP_NO_KSWAPD"},	\
 	{(unsigned long)__GFP_OTHER_NODE,	"GFP_OTHER_NODE"}	\
 	) : "GFP_NOWAIT"
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e208f0ad68..8193809f3de 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2416,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	if (!(gfp_mask & __GFP_NO_KSWAPD))
-		wake_all_kswapd(order, zonelist, high_zoneidx,
-						zone_idx(preferred_zone));
+	wake_all_kswapd(order, zonelist, high_zoneidx,
+					zone_idx(preferred_zone));
 
 	/*
	 * OK, we're below the kswapd watermark and have kicked background
@@ -2495,7 +2494,7 @@ rebalance:
 	 * system then fail the allocation instead of entering direct reclaim.
 	 */
 	if ((deferred_compaction || contended_compaction) &&
-	    (gfp_mask & __GFP_NO_KSWAPD))
+	    (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
 		goto nopage;
 
 	/* Try direct reclaim and then allocating */
--
cgit v1.2.3-70-g09d2


From 782fd30406ecb9d9b082816abe0c6008fc72a7b0 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Thu, 29 Nov 2012 13:54:30 -0800
Subject: mm: avoid waking kswapd for THP allocations when compaction is deferred or contended

With "mm: vmscan: scale number of pages reclaimed by reclaim/compaction
based on failures" reverted, Zdenek Kabelac reported the following

	Hmm, so it's just took longer to hit the problem and observe
	kswapd0 spinning on my CPU again - it's not as endless like
	before - but still it easily eats minutes - it helps to turn off
	Firefox or TB (memory hungry apps) so kswapd0 stops soon - and
	restart those apps again.  (And I still have like >1GB of cached
	memory)

	kswapd0         R  running task        0    30      2 0x00000000
	Call Trace:
	  preempt_schedule+0x42/0x60
	  _raw_spin_unlock+0x55/0x60
	  put_super+0x31/0x40
	  drop_super+0x22/0x30
	  prune_super+0x149/0x1b0
	  shrink_slab+0xba/0x510

The sysrq+m indicates the system has no swap so it'll never reclaim
anonymous pages as part of reclaim/compaction.  That is one part of the
problem but not the root cause as file-backed pages could also be
reclaimed.

The likely underlying problem is that kswapd is woken up or kept awake
for each THP allocation request in the page allocator slow path.

If compaction fails for the requesting process then compaction will be
deferred for a time and direct reclaim is avoided.  However, if there
is a storm of THP requests that are simply rejected, it will still be
the case that kswapd is awake for a prolonged period of time as
pgdat->kswapd_max_order is updated each time.  This is noticed by the
main kswapd() loop and it will not call kswapd_try_to_sleep().  Instead
it will loop, shrinking a small number of pages and calling
shrink_slab() on each iteration.

This patch defers when kswapd gets woken up for THP allocations.  For
!THP allocations, kswapd is always woken up.  For THP allocations,
kswapd is woken up iff the process is willing to enter into direct
reclaim/compaction.
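[editor's note: for reference, the heuristic this patch introduces, reproduced
from the diff below with an added comment on why the mask test identifies THP
-- GFP_TRANSHUGE sets __GFP_MOVABLE but deliberately not __GFP_REPEAT:

	/*
	 * A THP fault allocates at pageblock_order with a movable mask that
	 * opted out of retrying, which is exactly what this two-bit test
	 * matches; callers that ask for __GFP_REPEAT are not treated as THP.
	 */
	static bool is_thp_alloc(gfp_t gfp_mask, unsigned int order)
	{
		if (order == pageblock_order &&
		    (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
			return true;
		return false;
	}
]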
[akpm@linux-foundation.org: fix typo in comment]
Signed-off-by: Mel Gorman
Cc: Zdenek Kabelac
Cc: Seth Jennings
Cc: Jiri Slaby
Cc: Rik van Riel
Cc: Robert Jennings
Cc: Valdis Kletnieks
Cc: Glauber Costa
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8193809f3de..a8f2c87792c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2378,6 +2378,15 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
 }
 
+/* Returns true if the allocation is likely for THP */
+static bool is_thp_alloc(gfp_t gfp_mask, unsigned int order)
+{
+	if (order == pageblock_order &&
+	    (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
+		return true;
+	return false;
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2416,7 +2425,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	wake_all_kswapd(order, zonelist, high_zoneidx,
+	/* The decision whether to wake kswapd for THP is made later */
+	if (!is_thp_alloc(gfp_mask, order))
+		wake_all_kswapd(order, zonelist, high_zoneidx,
 					zone_idx(preferred_zone));
 
 	/*
@@ -2487,15 +2498,21 @@ rebalance:
 		goto got_pg;
 	sync_migration = true;
 
-	/*
-	 * If compaction is deferred for high-order allocations, it is because
-	 * sync compaction recently failed. In this is the case and the caller
-	 * requested a movable allocation that does not heavily disrupt the
-	 * system then fail the allocation instead of entering direct reclaim.
-	 */
-	if ((deferred_compaction || contended_compaction) &&
-	    (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
-		goto nopage;
+	if (is_thp_alloc(gfp_mask, order)) {
+		/*
+		 * If compaction is deferred for high-order allocations, it is
+		 * because sync compaction recently failed. If this is the case
+		 * and the caller requested a movable allocation that does not
+		 * heavily disrupt the system then fail the allocation instead
+		 * of entering direct reclaim.
+		 */
+		if (deferred_compaction || contended_compaction)
+			goto nopage;
+
+		/* If process is willing to reclaim/compact then wake kswapd */
+		wake_all_kswapd(order, zonelist, high_zoneidx,
+				zone_idx(preferred_zone));
+	}
 
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
--
cgit v1.2.3-70-g09d2