From e19318116048d5fbdb8d230d6d37625834b503cd Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:59 -0700 Subject: mm/page_alloc.c: add __meminit to alloc_pages_exact_nid() alloc_pages_exact_nid() is only called by __meminit alloc_page_cgroup() Signed-off-by: Fabian Frederick Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef44ad736ca..fd4322cc096 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2962,7 +2962,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * Note this is not alloc_pages_exact_node() which allocates on a specific node, * but is not exact. */ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); -- cgit v1.2.3-70-g09d2 From b95b4e1ed92a203f4bdfc55f53d6e9c2773e3b6d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 6 Aug 2014 16:05:01 -0700 Subject: mm/page_alloc.c: unexport alloc_pages_exact_nid() It is only called by mm/page_cgroup.c which cannot be modular. Reported-by: David Rientjes Cc: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd4322cc096..8c4f1b220ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2970,7 +2970,6 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); } -EXPORT_SYMBOL(alloc_pages_exact_nid); /** * free_pages_exact - release memory allocated via alloc_pages_exact() -- cgit v1.2.3-70-g09d2 From 7be12fc9f8dcae7f4c9647d495bd64fc4ffb6836 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 6 Aug 2014 16:05:15 -0700 Subject: mm: page_alloc: simplify drain_zone_pages by using min() Instead of open-coding taking the minimum of two values, just use the min() macro. That is what it is there for. While changing the function, also change the type of the batch local variable to match the type of per_cpu_pages::batch (which is int). Signed-off-by: Michal Nazarewicz Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8c4f1b220ab..c1c6cb78e5c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1257,15 +1257,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; - int to_drain; - unsigned long batch; + int to_drain, batch; local_irq_save(flags); batch = ACCESS_ONCE(pcp->batch); - if (pcp->count >= batch) - to_drain = batch; - else - to_drain = pcp->count; + to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; -- cgit v1.2.3-70-g09d2 From cc7452b6dca384400960d40090a98d0eb920ab22 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Wed, 6 Aug 2014 16:06:38 -0700 Subject: mm: export NR_SHMEM via sysinfo(2) / si_meminfo() interfaces Historically, we exported shared pages to userspace via sysinfo(2) sharedram and /proc/meminfo's "MemShared" fields.
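For reference, sysinfo(2) is the user-visible surface of the field in question; a minimal probe looks like the sketch below (illustrative only; output values are machine-specific):

#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
	struct sysinfo si;

	if (sysinfo(&si) != 0) {
		perror("sysinfo");
		return 1;
	}
	/* Scale by mem_unit; sharedram is the field this patch fixes up. */
	printf("totalram:  %lu kB\n", si.totalram * si.mem_unit / 1024);
	printf("sharedram: %lu kB\n", si.sharedram * si.mem_unit / 1024);
	return 0;
}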
With the advent of tmpfs, from kernel v2.4 onward, that old way of accounting shared memory was deemed inaccurate and we started to export a hard-coded 0 for sysinfo.sharedram. Later on, during the 2.6 timeframe, "MemShared" got re-introduced to /proc/meminfo, re-branded as "Shmem", but we're still reporting sysinfo.sharedram as that old hard-coded zero, which makes the "shared memory" report inconsistent across interfaces. This patch leverages the addition of explicit accounting for pages used by shmem/tmpfs -- "4b02108 mm: oom analysis: add shmem vmstat" -- in order to make the users of sysinfo(2) and si_meminfo*() friends aware of that vmstat entry and make them report it consistently across the interfaces, as well as to make the data returned by sysinfo(2) consistent with what our current API documentation states. Signed-off-by: Rafael Aquini Acked-by: Rik van Riel Cc: Mel Gorman Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 2 +- fs/proc/meminfo.c | 2 +- mm/page_alloc.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index 8f7ed9933a7..c6d3ae05f1c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -126,7 +126,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(node_page_state(nid, NR_ANON_PAGES)), - nid, K(node_page_state(nid, NR_SHMEM)), + nid, K(i.sharedram), nid, node_page_state(nid, NR_KERNEL_STACK) * THREAD_SIZE / 1024, nid, K(node_page_state(nid, NR_PAGETABLE)), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 7445af0b1aa..aa1eee06420 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_WRITEBACK)), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SHMEM)), + K(i.sharedram), K(global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c1c6cb78e5c..0987ac9f0a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3047,7 +3047,7 @@ static inline void show_node(struct zone *zone) void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = 0; + val->sharedram = global_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -3067,6 +3067,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; + val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; -- cgit v1.2.3-70-g09d2 From 3484b2de9499df23c4604a513b36f96326ae81ad Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:14 -0700 Subject: mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines The arrangement of struct zone has changed over time and now it has reached the point where there is some inappropriate sharing going on.
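Background for the reshuffle below: the problem is false sharing, where fields written by different hot paths land on the same cache line, and the remedy is explicit alignment and padding so each write-intensive group gets its own line. A rough user-space rendition of the pattern (the 64-byte line size and the field names here are illustrative, not the real struct zone):

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

#define CACHE_LINE 64	/* assumed x86-64 cache-line size */

struct hot_cold {
	/* Read-mostly fields: safe to replicate in every CPU's cache. */
	int node;
	unsigned long watermark[3];

	/* Write-hot allocator fields, forced onto their own line. */
	pthread_spinlock_t lock __attribute__((aligned(CACHE_LINE)));
	unsigned long free_count;

	/* Write-hot reclaim fields on a third line. */
	pthread_spinlock_t lru_lock __attribute__((aligned(CACHE_LINE)));
	unsigned long scanned;
};

int main(void)
{
	printf("lock at offset %zu, lru_lock at offset %zu\n",
	       offsetof(struct hot_cold, lock),
	       offsetof(struct hot_cold, lru_lock));
	return 0;
}

In the kernel the same effect is achieved with the ZONE_PADDING() markers visible in the diff.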
On x86-64 for example o The zone->node field is shared with the zone lock and zone->node is accessed frequently from the page allocator due to the fair zone allocation policy. o span_seqlock is almost never used but shares a line with free_area o Some zone statistics share a cache line with the LRU lock so reclaim-intensive and allocator-intensive workloads can bounce the cache line on a stat update This patch rearranges struct zone to put read-only and read-mostly fields together and then splits the page allocator intensive fields, the zone statistics and the page reclaim intensive fields into their own cache lines. Note that the type of lowmem_reserve changes due to the watermark calculations being signed and avoiding a signed/unsigned conversion there. On the test configuration I used the overall size of struct zone shrunk by one cache line. On smaller machines, this is not likely to be noticeable. However, on a 4-node NUMA machine running tiobench the system CPU overhead is reduced by this patch. 3.16.0-rc3 3.16.0-rc3 vanilla rearrange-v5r9 User 746.94 759.78 System 65336.22 58350.98 Elapsed 27553.52 27282.02 Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 211 +++++++++++++++++++++++++------------------------ mm/page_alloc.c | 7 +- mm/vmstat.c | 4 +- 3 files changed, 113 insertions(+), 109 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 559e659288f..ed0876bb902 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -324,18 +324,11 @@ enum zone_type { #ifndef __GENERATING_BOUNDS_H struct zone { - /* Fields commonly accessed by the page allocator */ + /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long watermark[NR_WMARK]; - /* - * When free pages are below this point, additional steps are taken - * when reading the number of free pages to avoid per-cpu counter - * drift allowing watermarks to be breached - */ - unsigned long percpu_drift_mark; - /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several @@ -344,41 +337,26 @@ struct zone { * on the higher zones). This array is recalculated at runtime if the * sysctl_lowmem_reserve_ratio sysctl changes. */ - unsigned long lowmem_reserve[MAX_NR_ZONES]; - - /* - * This is a per-zone reserve of pages that should not be - * considered dirtyable memory. - */ - unsigned long dirty_balance_reserve; + long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; +#endif + /* - * zone reclaim becomes active if more unmapped pages exist. + * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on + * this zone's LRU. Maintained by the pageout code. */ - unsigned long min_unmapped_pages; - unsigned long min_slab_pages; -#endif + unsigned int inactive_ratio; + + struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; + /* - * free areas of different sizes + * This is a per-zone reserve of pages that should not be + * considered dirtyable memory.
*/ - spinlock_t lock; -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - /* Set to true when the PG_migrate_skip bits should be cleared */ - bool compact_blockskip_flush; - - /* pfn where compaction free scanner should start */ - unsigned long compact_cached_free_pfn; - /* pfn where async and sync compaction migration scanner should start */ - unsigned long compact_cached_migrate_pfn[2]; -#endif -#ifdef CONFIG_MEMORY_HOTPLUG - /* see spanned/present_pages for more description */ - seqlock_t span_seqlock; -#endif - struct free_area free_area[MAX_ORDER]; + unsigned long dirty_balance_reserve; #ifndef CONFIG_SPARSEMEM /* @@ -388,74 +366,14 @@ struct zone { unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ -#ifdef CONFIG_COMPACTION - /* - * On compaction failure, 1<> PAGE_SHIFT */ unsigned long zone_start_pfn; @@ -500,9 +418,11 @@ struct zone { * adjust_managed_page_count() should be used instead of directly * touching zone->managed_pages and totalram_pages. */ + unsigned long managed_pages; unsigned long spanned_pages; unsigned long present_pages; - unsigned long managed_pages; + + const char *name; /* * Number of MIGRATE_RESEVE page block. To maintain for just @@ -510,10 +430,95 @@ struct zone { */ int nr_migrate_reserve_block; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif + /* - * rarely used fields: + * wait_table -- the array holding the hash table + * wait_table_hash_nr_entries -- the size of the hash table array + * wait_table_bits -- wait_table_size == (1 << wait_table_bits) + * + * The purpose of all these is to keep track of the people + * waiting for a page to become available and make them + * runnable again when possible. The trouble is that this + * consumes a lot of space, especially when so few things + * wait on pages at a given time. So instead of using + * per-page waitqueues, we use a waitqueue hash table. + * + * The bucket discipline is to sleep on the same queue when + * colliding and wake all in that wait queue when removing. + * When something wakes, it must check to be sure its page is + * truly available, a la thundering herd. The cost of a + * collision is great, but given the expected load of the + * table, they should be so rare as to be outweighed by the + * benefits from the saved space. + * + * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the + * primary users of these fields, and in mm/page_alloc.c + * free_area_init_core() performs the initialization of them. 
*/ - const char *name; + wait_queue_head_t *wait_table; + unsigned long wait_table_hash_nr_entries; + unsigned long wait_table_bits; + + ZONE_PADDING(_pad1_) + + /* Write-intensive fields used from the page allocator */ + spinlock_t lock; + + /* free areas of different sizes */ + struct free_area free_area[MAX_ORDER]; + + /* zone flags, see below */ + unsigned long flags; + + ZONE_PADDING(_pad2_) + + /* Write-intensive fields used by page reclaim */ + + /* Fields commonly accessed by the page reclaim scanner */ + spinlock_t lru_lock; + unsigned long pages_scanned; /* since last reclaim */ + struct lruvec lruvec; + + /* Evictions & activations on the inactive file list */ + atomic_long_t inactive_age; + + /* + * When free pages are below this point, additional steps are taken + * when reading the number of free pages to avoid per-cpu counter + * drift allowing watermarks to be breached + */ + unsigned long percpu_drift_mark; + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + /* pfn where compaction free scanner should start */ + unsigned long compact_cached_free_pfn; + /* pfn where async and sync compaction migration scanner should start */ + unsigned long compact_cached_migrate_pfn[2]; +#endif + +#ifdef CONFIG_COMPACTION + /* + * On compaction failure, 1<lowmem_reserve[classzone_idx]; int o; long free_cma = 0; @@ -1723,7 +1722,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + lowmem_reserve) + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -3254,7 +3253,7 @@ void show_free_areas(unsigned int filter) ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); + printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } @@ -5575,7 +5574,7 @@ static void calculate_totalreserve_pages(void) for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - unsigned long max = 0; + long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { diff --git a/mm/vmstat.c b/mm/vmstat.c index b37bd49bfd5..8267f77d187 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1077,10 +1077,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); seq_printf(m, - "\n protection: (%lu", + "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_printf(m, ")" "\n pagesets"); -- cgit v1.2.3-70-g09d2 From 0d5d823ab4e608ec7b52ac4410de4cb74bbe0edd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:16 -0700 Subject: mm: move zone->pages_scanned into a vmstat counter zone->pages_scanned is a write-intensive cache line during page reclaim and it's also updated during page free. Move the counter into vmstat to take advantage of the per-cpu updates and do not update it in the free paths unless necessary. On a small UMA machine running tiobench the difference is marginal. On a 4-node machine the overhead is more noticeable. Note that automatic NUMA balancing was disabled for this test as otherwise the system CPU overhead is unpredictable.
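The mechanism being adopted is the usual vmstat one: each CPU buffers small deltas privately and folds them into the shared counter only when a threshold is crossed, so the shared cache line is written rarely rather than on every page. A stripped-down, single-process model of the idea (threshold and names invented for illustration; the kernel's version is per-CPU, not per-thread):

#include <stdatomic.h>
#include <stdio.h>

#define THRESHOLD 32			/* the kernel tunes this per zone */

static atomic_long vm_stat;		/* shared counter: the contended line */
static _Thread_local long delta;	/* private buffer, per CPU in reality */

/* Models __mod_zone_page_state(): accumulate locally, flush rarely. */
static void mod_state(long nr)
{
	delta += nr;
	if (delta > THRESHOLD || delta < -THRESHOLD) {
		atomic_fetch_add(&vm_stat, delta);	/* one shared write */
		delta = 0;
	}
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		mod_state(1);		/* 1000 local adds, ~30 shared writes */
	atomic_fetch_add(&vm_stat, delta);	/* final fold */
	printf("%ld\n", atomic_load(&vm_stat));
	return 0;
}

With pages_scanned moved into that scheme, the tiobench numbers: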
3.16.0-rc3 3.16.0-rc3 3.16.0-rc3 vanilla rearrange-v5 vmstat-v5 User 746.94 759.78 774.56 System 65336.22 58350.98 32847.27 Elapsed 27553.52 27282.02 27415.04 Note that the overhead reduction will vary depending on where exactly pages are allocated and freed. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 12 +++++++++--- mm/vmscan.c | 7 ++++--- mm/vmstat.c | 3 ++- 4 files changed, 16 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ed0876bb902..0bd77f730b3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -143,6 +143,7 @@ enum zone_stat_item { NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ + NR_PAGES_SCANNED, /* pages scanned since last reclaim */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ @@ -480,7 +481,6 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; - unsigned long pages_scanned; /* since last reclaim */ struct lruvec lruvec; /* Evictions & activations on the inactive file list */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b7381d11f02..daa01606379 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, int migratetype = 0; int batch_free = 0; int to_free = count; + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); while (to_free) { struct page *page; @@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) @@ -3248,7 +3254,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - zone->pages_scanned, + K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ?
"yes" : "no") ); printk("lowmem_reserve[]:"); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5fec1ba9951..9c8222b499b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -174,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) bool zone_reclaimable(struct zone *zone) { - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; } static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -1508,7 +1509,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else @@ -1698,7 +1699,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); reclaim_stat->recent_scanned[file] += nr_taken; diff --git a/mm/vmstat.c b/mm/vmstat.c index 8267f77d187..e574e883fa7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -763,6 +763,7 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", + "nr_pages_scanned", #ifdef CONFIG_NUMA "numa_hit", @@ -1067,7 +1068,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone->pages_scanned, + zone_page_state(zone, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); -- cgit v1.2.3-70-g09d2 From f7b5d647946aae1647bf5cd26c16b3a793c1ac49 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:20 -0700 Subject: mm: page_alloc: abort fair zone allocation policy when remotes nodes are encountered The purpose of numa_zonelist_order=zone is to preserve lower zones for use with 32-bit devices. If locality is preferred then the numa_zonelist_order=node policy should be used. Unfortunately, the fair zone allocation policy overrides this by skipping zones on remote nodes until the lower one is found. While this makes sense from a page aging and performance perspective, it breaks the expected zonelist policy. This patch restores the expected behaviour for zone-list ordering. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index daa01606379..6e5e8f76253 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1965,7 +1965,7 @@ zonelist_scan: */ if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) - continue; + break; if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) continue; } -- cgit v1.2.3-70-g09d2 From 4ffeaf3560a52b4a69cc7909873d08c0ef5909d4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:22 -0700 Subject: mm: page_alloc: reduce cost of the fair zone allocation policy The fair zone allocation policy round-robins allocations between zones within a node to avoid age inversion problems during reclaim. If the first allocation fails, the batch counts are reset and a second attempt made before entering the slow path. 
One assumption made with this scheme is that batches expire at roughly the same time and the resets each time are justified. This assumption does not hold when zones reach their low watermark as the batches will be consumed at uneven rates. Allocation failures due to watermark depletion result in additional zonelist scans for the reset and another watermark check before hitting the slowpath. On UMA, the benefit is negligible -- around 0.25%. On a 4-socket NUMA machine it's variable due to the variability of measuring overhead with the vmstat changes. The system CPU overhead comparison looks like 3.16.0-rc3 3.16.0-rc3 3.16.0-rc3 vanilla vmstat-v5 lowercost-v5 User 746.94 774.56 802.00 System 65336.22 32847.27 40852.33 Elapsed 27553.52 27415.04 27368.46 However it is worth noting that the overall benchmark still completed faster and intuitively it makes sense to take as few passes as possible through the zonelists. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 +++ mm/page_alloc.c | 101 ++++++++++++++++++++++++++----------------------- 2 files changed, 59 insertions(+), 48 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0bd77f730b3..318df705185 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -534,6 +534,7 @@ typedef enum { ZONE_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ + ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -571,6 +572,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone) return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); } +static inline int zone_is_fair_depleted(const struct zone *zone) +{ + return test_bit(ZONE_FAIR_DEPLETED, &zone->flags); +} + static inline int zone_is_oom_locked(const struct zone *zone) { return test_bit(ZONE_OOM_LOCKED, &zone->flags); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e5e8f76253..fb990814847 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1612,6 +1612,9 @@ again: } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && + !zone_is_fair_depleted(zone)) + zone_set_flag(zone, ZONE_FAIR_DEPLETED); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1923,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) #endif /* CONFIG_NUMA */ +static void reset_alloc_batches(struct zone *preferred_zone) +{ + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + } while (zone++ != preferred_zone); +} + /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -1940,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int did_zlc_setup = 0; /* just call zlc_setup() one time */ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1966,8 +1985,10 @@ zonelist_scan: if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) break; - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + if (zone_is_fair_depleted(zone)) { + nr_fair_skipped++; continue; + } } /* * When allocating a page cache page for writing, we @@ -2073,13 +2094,7 @@ this_zone_full: zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2088,8 +2103,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2410,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void reset_alloc_batches(struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - /* - * Only reset the batches of zones that were actually - * considered in the fairness pass, we don't want to - * trash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - } -} - static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2767,28 +2789,11 @@ retry_cpuset: if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif -retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { - /* - * The first pass makes sure allocations are spread - * fairly within the local node. However, the local - * node might have free pages left after the fairness - * batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without - * fairness, and include remote zones now, before - * entering the slowpath and waking kswapd: prefer - * spilling to a remote zone over swapping locally. 
- */ - if (alloc_flags & ALLOC_FAIR) { - reset_alloc_batches(zonelist, high_zoneidx, - preferred_zone); - alloc_flags &= ~ALLOC_FAIR; - goto retry; - } /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not -- cgit v1.2.3-70-g09d2 From e972a070e2d3296cd2e2cc2fd0561ce89a1d5ebf Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:52 -0700 Subject: mm, oom: rename zonelist locking functions try_set_zonelist_oom() and clear_zonelist_oom() are not named properly to imply that they require locking semantics to avoid out_of_memory() being reordered. zone_scan_lock is required for both functions to ensure that there is proper locking synchronization. Rename try_set_zonelist_oom() to oom_zonelist_trylock() and rename clear_zonelist_oom() to oom_zonelist_unlock() to imply there is proper locking semantics. At the same time, convert oom_zonelist_trylock() to return bool instead of int since only success and failure are tested. Signed-off-by: David Rientjes Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 4 ++-- mm/oom_kill.c | 30 +++++++++++++----------------- mm/page_alloc.c | 6 +++--- 3 files changed, 18 insertions(+), 22 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/oom.h b/include/linux/oom.h index 4cd62677feb..647395a1a55 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -55,8 +55,8 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct mem_cgroup *memcg, nodemask_t *nodemask, const char *message); -extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); -extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags); +extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, int order, const nodemask_t *nodemask); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b0a1e1ff035..d33aca1552a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -559,28 +559,25 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; - int ret = 1; + bool ret = true; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) if (zone_is_oom_locked(zone)) { - ret = 0; + ret = false; goto out; } - } - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { - /* - * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zonelist_oom() doesn't succeed - * when it shouldn't. - */ + /* + * Lock each zone in the zonelist under zone_scan_lock so a parallel + * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. + */ + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) zone_set_flag(zone, ZONE_OOM_LOCKED); - } out: spin_unlock(&zone_scan_lock); @@ -592,15 +589,14 @@ out: * allocation attempts with zonelists containing them may now recall the OOM * killer, if necessary. 
*/ -void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) zone_clear_flag(zone, ZONE_OOM_LOCKED); - } spin_unlock(&zone_scan_lock); } @@ -695,8 +691,8 @@ void pagefault_out_of_memory(void) return; zonelist = node_zonelist(first_memory_node, GFP_KERNEL); - if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { + if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); - clear_zonelist_oom(zonelist, GFP_KERNEL); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fb990814847..578236089ec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2246,8 +2246,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, { struct page *page; - /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zonelist_oom(zonelist, gfp_mask)) { + /* Acquire the per-zone oom lock for each zone */ + if (!oom_zonelist_trylock(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } @@ -2285,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, out_of_memory(zonelist, gfp_mask, order, nodemask, false); out: - clear_zonelist_oom(zonelist, gfp_mask); + oom_zonelist_unlock(zonelist, gfp_mask); return page; } -- cgit v1.2.3-70-g09d2 From 8fe780484d2674eec27e12bb29c07d3e98a7ad21 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:54 -0700 Subject: mm, thp: restructure thp avoidance of light synchronous migration __GFP_NO_KSWAPD, once the way to determine if an allocation was for thp or not, has gained more users. Their use is not necessarily wrong, they are trying to do a memory allocation that can easily fail without disturbing kswapd, so the bit has gained additional usecases. This restructures the check to determine whether MIGRATE_SYNC_LIGHT should be used for memory compaction in the page allocator. Rather than testing solely for __GFP_NO_KSWAPD, test for all bits that must be set for thp allocations. This also moves the check to be done only after the page allocator is aborted for deferred or contended memory compaction since setting migration_mode for this case is pointless. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 578236089ec..18cee0d4c8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2638,14 +2638,6 @@ rebalance: if (page) goto got_pg; - /* - * It can become very expensive to allocate transparent hugepages at - * fault, so use asynchronous memory compaction for THP unless it is - * khugepaged trying to collapse. - */ - if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) - migration_mode = MIGRATE_SYNC_LIGHT; - /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently failed. 
In this is the case and the caller @@ -2656,6 +2648,15 @@ rebalance: (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. + */ + if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || + (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, -- cgit v1.2.3-70-g09d2 From 8b375f64dcf45ba5cfb36398b69b877dc35410fa Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Fri, 22 Aug 2014 13:27:36 -0700 Subject: x86/mm/numa: Drop dead code and rename setup_node_data() to alloc_node_data() The setup_node_data() function allocates a pg_data_t object, inserts it into the node_data[] array and initializes the following fields: node_id, node_start_pfn and node_spanned_pages. However, a few function calls later during the kernel boot, free_area_init_node() re-initializes those fields, meaning that the initialization done by setup_node_data() is not actually used. This causes a small glitch when running Linux as a hyperv numa guest: SRAT: PXM 0 -> APIC 0x00 -> Node 0 SRAT: PXM 0 -> APIC 0x01 -> Node 0 SRAT: PXM 1 -> APIC 0x02 -> Node 1 SRAT: PXM 1 -> APIC 0x03 -> Node 1 SRAT: Node 0 PXM 0 [mem 0x00000000-0x7fffffff] SRAT: Node 1 PXM 1 [mem 0x80200000-0xf7ffffff] SRAT: Node 1 PXM 1 [mem 0x100000000-0x1081fffff] NUMA: Node 1 [mem 0x80200000-0xf7ffffff] + [mem 0x100000000-0x1081fffff] -> [mem 0x80200000-0x1081fffff] Initmem setup node 0 [mem 0x00000000-0x7fffffff] NODE_DATA [mem 0x7ffdc000-0x7ffeffff] Initmem setup node 1 [mem 0x80800000-0x1081fffff] NODE_DATA [mem 0x1081ea000-0x1081fdfff] crashkernel: memory value expected [ffffea0000000000-ffffea0001ffffff] PMD -> [ffff88007de00000-ffff88007fdfffff] on node 0 [ffffea0002000000-ffffea00043fffff] PMD -> [ffff880105600000-ffff8801077fffff] on node 1 Zone ranges: DMA [mem 0x00001000-0x00ffffff] DMA32 [mem 0x01000000-0xffffffff] Normal [mem 0x100000000-0x1081fffff] Movable zone start for each node Early memory node ranges node 0: [mem 0x00001000-0x0009efff] node 0: [mem 0x00100000-0x7ffeffff] node 1: [mem 0x80200000-0xf7ffffff] node 1: [mem 0x100000000-0x1081fffff] On node 0 totalpages: 524174 DMA zone: 64 pages used for memmap DMA zone: 21 pages reserved DMA zone: 3998 pages, LIFO batch:0 DMA32 zone: 8128 pages used for memmap DMA32 zone: 520176 pages, LIFO batch:31 On node 1 totalpages: 524288 DMA32 zone: 7672 pages used for memmap DMA32 zone: 491008 pages, LIFO batch:31 Normal zone: 520 pages used for memmap Normal zone: 33280 pages, LIFO batch:7 In this dmesg, the SRAT table reports that the memory range for node 1 starts at 0x80200000. However, the line starting with "Initmem" reports that node 1's memory range starts at 0x80800000. The "Initmem" line is reported by setup_node_data() and is wrong, because the kernel ends up using the range as reported in the SRAT table. This commit drops all that dead code from setup_node_data(), renames it to alloc_node_data() and adds a printk() to free_area_init_node() so that we report a node's memory range accurately.
Here's the same dmesg section with this patch applied: SRAT: PXM 0 -> APIC 0x00 -> Node 0 SRAT: PXM 0 -> APIC 0x01 -> Node 0 SRAT: PXM 1 -> APIC 0x02 -> Node 1 SRAT: PXM 1 -> APIC 0x03 -> Node 1 SRAT: Node 0 PXM 0 [mem 0x00000000-0x7fffffff] SRAT: Node 1 PXM 1 [mem 0x80200000-0xf7ffffff] SRAT: Node 1 PXM 1 [mem 0x100000000-0x1081fffff] NUMA: Node 1 [mem 0x80200000-0xf7ffffff] + [mem 0x100000000-0x1081fffff] -> [mem 0x80200000-0x1081fffff] NODE_DATA(0) allocated [mem 0x7ffdc000-0x7ffeffff] NODE_DATA(1) allocated [mem 0x1081ea000-0x1081fdfff] crashkernel: memory value expected [ffffea0000000000-ffffea0001ffffff] PMD -> [ffff88007de00000-ffff88007fdfffff] on node 0 [ffffea0002000000-ffffea00043fffff] PMD -> [ffff880105600000-ffff8801077fffff] on node 1 Zone ranges: DMA [mem 0x00001000-0x00ffffff] DMA32 [mem 0x01000000-0xffffffff] Normal [mem 0x100000000-0x1081fffff] Movable zone start for each node Early memory node ranges node 0: [mem 0x00001000-0x0009efff] node 0: [mem 0x00100000-0x7ffeffff] node 1: [mem 0x80200000-0xf7ffffff] node 1: [mem 0x100000000-0x1081fffff] Initmem setup node 0 [mem 0x00001000-0x7ffeffff] On node 0 totalpages: 524174 DMA zone: 64 pages used for memmap DMA zone: 21 pages reserved DMA zone: 3998 pages, LIFO batch:0 DMA32 zone: 8128 pages used for memmap DMA32 zone: 520176 pages, LIFO batch:31 Initmem setup node 1 [mem 0x80200000-0x1081fffff] On node 1 totalpages: 524288 DMA32 zone: 7672 pages used for memmap DMA32 zone: 491008 pages, LIFO batch:31 Normal zone: 520 pages used for memmap Normal zone: 33280 pages, LIFO batch:7 This commit was tested on a two node bare-metal NUMA machine and Linux as a numa guest on hyperv and qemu/kvm. PS: The wrong memory range reported by setup_node_data() seems to be harmless in the current kernel because it's just not used. However, that bad range is used in kernel 2.6.32 to initialize the old boot memory allocator, which causes a crash during boot. Signed-off-by: Luiz Capitulino Acked-by: Rik van Riel Cc: Andi Kleen Cc: David Rientjes Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/include/asm/numa.h | 1 - arch/x86/mm/numa.c | 34 ++++++++++++++-------------------- mm/page_alloc.c | 2 ++ 3 files changed, 16 insertions(+), 21 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 4064acae625..01b493e5a99 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -9,7 +9,6 @@ #ifdef CONFIG_NUMA #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) /* * Too small node sizes may confuse the VM badly. 
Usually they diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a32b706c401..d221374d5ce 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -185,26 +185,14 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return numa_add_memblk_to(nid, start, end, &numa_meminfo); } -/* Initialize NODE_DATA for a node on the local memory */ -static void __init setup_node_data(int nid, u64 start, u64 end) +/* Allocate NODE_DATA for a node on the local memory */ +static void __init alloc_node_data(int nid) { const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); u64 nd_pa; void *nd; int tnid; - /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - return; - - start = roundup(start, ZONE_ALIGN); - - printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", - nid, start, end - 1); - /* * Allocate node data. Try node-local memory and then any node. * Never allocate in DMA zone. @@ -222,7 +210,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) nd = __va(nd_pa); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n", + printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); if (tnid != nid) @@ -230,9 +218,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end) node_data[nid] = nd; memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); - NODE_DATA(nid)->node_id = nid; - NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; - NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; node_set_online(nid); } @@ -523,8 +508,17 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) end = max(mi->blk[i].end, end); } - if (start < end) - setup_node_data(nid, start, end); + if (start >= end) + continue; + + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + continue; + + alloc_node_data(nid); } /* Dump memblock with node info and return. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 18cee0d4c8a..d0e3d2fee58 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4976,6 +4976,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, + (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); -- cgit v1.2.3-70-g09d2 From abe5f972912d086c080be4bde67750630b6fb38b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 2 Oct 2014 16:21:10 -0700 Subject: mm: page_alloc: fix zone allocation fairness on UP The zone allocation batches can easily underflow due to higher-order allocations or spills to remote nodes. On SMP that's fine, because underflows are expected from concurrency and dealt with by returning 0. But on UP, zone_page_state will just return a wrapped unsigned long, which will get past the <= 0 check and then consider the zone eligible until its watermarks are hit. Commit 3a025760fc15 ("mm: page_alloc: spill to remote nodes before waking kswapd") already made the counter-resetting use atomic_long_read() to accommodate underflows from remote spills, but it didn't go all the way with it.
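The arithmetic at fault is plain unsigned wraparound and is easy to demonstrate in isolation: once the batch dips below zero, an unsigned read looks enormous and sails past the <= 0 test, while a signed read (which is what atomic_long_read() amounts to here) behaves as intended:

#include <stdio.h>

int main(void)
{
	unsigned long batch = 1;

	batch -= 4;	/* a higher-order allocation overshoots the batch */

	/* What the UP zone_page_state() returned: */
	printf("unsigned: %lu -> depleted? %s\n",
	       batch, batch <= 0 ? "yes" : "no");		/* "no": the bug */

	/* What the fix reads instead: */
	printf("signed:   %ld -> depleted? %s\n",
	       (long)batch, (long)batch <= 0 ? "yes" : "no");	/* "yes" */
	return 0;
}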
Make it clear that these batches are expected to go negative regardless of concurrency, and use atomic_long_read() everywhere. Fixes: 81c0a2bb515f ("mm: page_alloc: fair zone allocator policy") Reported-by: Vlastimil Babka Reported-by: Leon Romanovsky Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 18cee0d4c8a..eee96195802 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1612,7 +1612,7 @@ again: } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); - if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && + if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && !zone_is_fair_depleted(zone)) zone_set_flag(zone, ZONE_FAIR_DEPLETED); @@ -5701,9 +5701,8 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); __mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - - low_wmark_pages(zone) - - zone_page_state(zone, NR_ALLOC_BATCH)); + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); -- cgit v1.2.3-70-g09d2 From ad2c8144418c6a81cefe65379fd47bbe8344cef2 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 9 Oct 2014 15:26:13 -0700 Subject: topology: add support for node_to_mem_node() to determine the fallback node Anton noticed (http://www.spinics.net/lists/linux-mm/msg67489.html) that on ppc LPARs with memoryless nodes, a large amount of memory was consumed by slabs and was marked unreclaimable. He tracked it down to slab deactivations in the SLUB core when we allocate remotely, leading to consistently poor efficiency when memoryless nodes are present. After much discussion, Joonsoo provided a few patches that help significantly. They don't resolve the problem altogether: - memory hotplug still needs testing, that is, when a memoryless node becomes memory-ful, we want to do the right thing - there are other reasons for going off-node than memoryless nodes, e.g., fully exhausted local nodes Neither case is resolved with this series, but I don't think that should block their acceptance, as they can be explored/resolved with follow-on patches. The series consists of: [1/3] topology: add support for node_to_mem_node() to determine the fallback node [2/3] slub: fallback to node_to_mem_node() node if allocating on memoryless node - Joonsoo's patches to cache the nearest node with memory for each NUMA node [3/3] Partial revert of 81c98869faa5 ("kthread: ensure locality of task_struct allocations") - At Tejun's request, keep the knowledge of memoryless-node fallback in the allocator core. This patch (of 3): We need to determine the fallback node in the SLUB allocator if the allocation target node is a memoryless node. Without it, SLUB wrongly selects a node which has no memory and can't use a partial slab, because of the node mismatch. The introduced function, node_to_mem_node(X), returns a node Y with memory that is nearest in distance. If X is a memoryless node, it returns the nearest node with memory; if X is a normal node, it returns itself. We will use this function in the following patch to determine the fallback node.
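A rough model of the mapping being introduced (the topology, distance table and node count here are invented for illustration; in the kernel, _node_numa_mem_[] is filled in via set_numa_mem()/set_cpu_numa_mem() during boot):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical topology: node 2 is memoryless. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 40 },
	{ 20, 10, 30, 40 },
	{ 30, 30, 10, 20 },
	{ 40, 40, 20, 10 },
};
static const bool has_memory[NR_NODES] = { true, true, false, true };

static int node_numa_mem[NR_NODES];	/* models _node_numa_mem_[] */

/* Boot-time fill: a node with memory maps to itself, a memoryless
 * node to its nearest neighbour that has memory. */
static void init_node_numa_mem(void)
{
	for (int n = 0; n < NR_NODES; n++) {
		int best = n, best_dist = INT_MAX;

		if (has_memory[n]) {
			node_numa_mem[n] = n;
			continue;
		}
		for (int m = 0; m < NR_NODES; m++) {
			if (has_memory[m] && distance[n][m] < best_dist) {
				best = m;
				best_dist = distance[n][m];
			}
		}
		node_numa_mem[n] = best;
	}
}

/* Models node_to_mem_node(): a constant-time lookup on allocation paths. */
static int node_to_mem_node(int node)
{
	return node_numa_mem[node];
}

int main(void)
{
	init_node_numa_mem();
	printf("node 2 falls back to node %d\n", node_to_mem_node(2)); /* 3 */
	return 0;
}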
Signed-off-by: Joonsoo Kim Signed-off-by: Nishanth Aravamudan Cc: David Rientjes Cc: Han Pingtian Cc: Pekka Enberg Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Anton Blanchard Cc: Christoph Lameter Cc: Wanpeng Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/topology.h | 17 +++++++++++++++++ mm/page_alloc.c | 1 + 2 files changed, 18 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/include/linux/topology.h b/include/linux/topology.h index dda6ee521e7..909b6e43b69 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -119,11 +119,20 @@ static inline int numa_node_id(void) * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). */ DECLARE_PER_CPU(int, _numa_mem_); +extern int _node_numa_mem_[MAX_NUMNODES]; #ifndef set_numa_mem static inline void set_numa_mem(int node) { this_cpu_write(_numa_mem_, node); + _node_numa_mem_[numa_node_id()] = node; +} +#endif + +#ifndef node_to_mem_node +static inline int node_to_mem_node(int node) +{ + return _node_numa_mem_[node]; } #endif @@ -146,6 +155,7 @@ static inline int cpu_to_mem(int cpu) static inline void set_cpu_numa_mem(int cpu, int node) { per_cpu(_numa_mem_, cpu) = node; + _node_numa_mem_[cpu_to_node(cpu)] = node; } #endif @@ -159,6 +169,13 @@ static inline int numa_mem_id(void) } #endif +#ifndef node_to_mem_node +static inline int node_to_mem_node(int node) +{ + return node; +} +#endif + #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee96195802..f3bc59f2ed5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -85,6 +85,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node); */ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); +int _node_numa_mem_[MAX_NUMNODES]; #endif /* -- cgit v1.2.3-70-g09d2 From 21bb9bd19430a43e6462ce75030fd7fac4b766ef Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Oct 2014 15:26:51 -0700 Subject: mm: page_alloc: determine migratetype only once The check for ALLOC_CMA in __alloc_pages_nodemask() derives migratetype from gfp_mask in each retry pass, although the migratetype variable already has the value determined and it does not change. Use the variable and perform the check only once. Also convert #ifdef CONFIG_CMA to IS_ENABLED. Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Cc: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: "Srivatsa S. 
Bhat" Cc: Hugh Dickins Cc: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f3bc59f2ed5..e63bf7744a0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2776,6 +2776,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; + retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2787,10 +2790,6 @@ retry_cpuset: goto out; classzone_idx = zonelist_zone_idx(preferred_zoneref); -#ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, -- cgit v1.2.3-70-g09d2 From 53853e2d2bfb748a8b5aa2fd1de15699266865e0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Oct 2014 15:27:02 -0700 Subject: mm, compaction: defer each zone individually instead of preferred zone When direct sync compaction is often unsuccessful, it may become deferred for some time to avoid further useless attempts, both sync and async. Successful high-order allocations un-defer compaction, while further unsuccessful compaction attempts prolong the compaction deferred period. Currently the checking and setting deferred status is performed only on the preferred zone of the allocation that invoked direct compaction. But compaction itself is attempted on all eligible zones in the zonelist, so the behavior is suboptimal and may lead both to scenarios where 1) compaction is attempted uselessly, or 2) where it's not attempted despite good chances of succeeding, as shown on the examples below: 1) A direct compaction with Normal preferred zone failed and set deferred compaction for the Normal zone. Another unrelated direct compaction with DMA32 as preferred zone will attempt to compact DMA32 zone even though the first compaction attempt also included DMA32 zone. In another scenario, compaction with Normal preferred zone failed to compact Normal zone, but succeeded in the DMA32 zone, so it will not defer compaction. In the next attempt, it will try Normal zone which will fail again, instead of skipping Normal zone and trying DMA32 directly. 2) Kswapd will balance DMA32 zone and reset defer status based on watermarks looking good. A direct compaction with preferred Normal zone will skip compaction of all zones including DMA32 because Normal was still deferred. The allocation might have succeeded in DMA32, but won't. This patch makes compaction deferring work on individual zone basis instead of preferred zone. For each zone, it checks compaction_deferred() to decide if the zone should be skipped. If watermarks fail after compacting the zone, defer_compaction() is called. The zone where watermarks passed can still be deferred when the allocation attempt is unsuccessful. When allocation is successful, compaction_defer_reset() is called for the zone containing the allocated page. This approach should approximate calling defer_compaction() only on zones where compaction was attempted and did not yield allocated page. There might be corner cases but that is inevitable as long as the decision to stop compacting dues not guarantee that a page will be allocated. 
Due to a new COMPACT_DEFERRED return value, some functions relying implicitly on COMPACT_SKIPPED = 0 had to be updated, with comments made more accurate. The did_some_progress output parameter of __alloc_pages_direct_compact() is removed completely, as the caller actually does not use it after compaction sets it - it is only considered when direct reclaim sets it. During testing on a two-node machine with a single very small Normal zone on node 1, this patch has improved success rates in the stress-highalloc mmtests benchmark. The success rates here were previously made worse by commit 3a025760fc15 ("mm: page_alloc: spill to remote nodes before waking kswapd") as kswapd was no longer resetting often enough the deferred compaction for the Normal zone, and DMA32 zones on both nodes were thus not considered for compaction. On a different machine, success rates were improved with __GFP_NO_KSWAPD allocations. [akpm@linux-foundation.org: fix CONFIG_COMPACTION=n build] Signed-off-by: Vlastimil Babka Acked-by: Minchan Kim Reviewed-by: Zhang Yanfei Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 16 ++++++++----- mm/compaction.c | 32 ++++++++++++++++++++------ mm/page_alloc.c | 57 +++++++++++++++++++++++--------------------- mm/vmscan.c | 14 ++++++++---- 4 files changed, 76 insertions(+), 43 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 01e3132820d..b2e4c92d044 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -2,14 +2,16 @@ #define _LINUX_COMPACTION_H /* Return values for compact_zone() and try_to_compact_pages() */ +/* compaction didn't start as it was deferred due to past failures */ +#define COMPACT_DEFERRED 0 /* compaction didn't start as it was not possible or direct reclaim was more suitable */ -#define COMPACT_SKIPPED 0 +#define COMPACT_SKIPPED 1 /* compaction should continue to another pageblock */ -#define COMPACT_CONTINUE 1 +#define COMPACT_CONTINUE 2 /* direct compaction partially compacted a zone and there are suitable pages */ -#define COMPACT_PARTIAL 2 +#define COMPACT_PARTIAL 3 /* The full zone was compacted */ -#define COMPACT_COMPLETE 3 +#define COMPACT_COMPLETE 4 #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; @@ -22,7 +24,8 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - enum migrate_mode mode, bool *contended); + enum migrate_mode mode, bool *contended, + struct zone **candidate_zone); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -91,7 +94,8 @@ static inline bool compaction_restarting(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, bool *contended) + enum migrate_mode mode, bool *contended, + struct zone **candidate_zone) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index 21bf292b642..1c7195d42e8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1125,27 +1125,26 @@ int
sysctl_extfrag_threshold = 500; * @nodemask: The allowed nodes to allocate from * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that is true if compaction was aborted due to lock contention - * @page: Optionally capture a free page of the requested order during compaction + * @candidate_zone: Return the zone where we think allocation should succeed * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, bool *contended) + enum migrate_mode mode, bool *contended, + struct zone **candidate_zone) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - int rc = COMPACT_SKIPPED; + int rc = COMPACT_DEFERRED; int alloc_flags = 0; /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) - return rc; - - count_compact_event(COMPACTSTALL); + return COMPACT_SKIPPED; #ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) @@ -1156,14 +1155,33 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; + if (compaction_deferred(zone, order)) + continue; + status = compact_zone_order(zone, order, gfp_mask, mode, contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, - alloc_flags)) + alloc_flags)) { + *candidate_zone = zone; + /* + * We think the allocation will succeed in this zone, + * but it is not certain, hence the false. The caller + * will repeat this with true if allocation indeed + * succeeds in this zone. + */ + compaction_defer_reset(zone, order, false); break; + } else if (mode != MIGRATE_ASYNC) { + /* + * We think that allocation won't succeed in this zone + * so we defer compaction there. If it ends up + * succeeding after all, it will be reset. 
+ */ + defer_compaction(zone, order); + } } return rc; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e63bf7744a0..514fd800811 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2297,24 +2297,28 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int classzone_idx, int migratetype, enum migrate_mode mode, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + bool *contended_compaction, bool *deferred_compaction) { - if (!order) - return NULL; + struct zone *last_compact_zone = NULL; + unsigned long compact_result; - if (compaction_deferred(preferred_zone, order)) { - *deferred_compaction = true; + + if (!order) return NULL; - } current->flags |= PF_MEMALLOC; - *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, + compact_result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, mode, - contended_compaction); + contended_compaction, + &last_compact_zone); current->flags &= ~PF_MEMALLOC; - if (*did_some_progress != COMPACT_SKIPPED) { + if (compact_result > COMPACT_DEFERRED) + count_vm_event(COMPACTSTALL); + else + *deferred_compaction = true; + + if (compact_result > COMPACT_SKIPPED) { struct page *page; /* Page migration frees to the PCP lists but we want merging */ @@ -2325,13 +2329,24 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, classzone_idx, migratetype); + if (page) { - preferred_zone->compact_blockskip_flush = false; - compaction_defer_reset(preferred_zone, order, true); + struct zone *zone = page_zone(page); + + zone->compact_blockskip_flush = false; + compaction_defer_reset(zone, order, true); count_vm_event(COMPACTSUCCESS); return page; } + /* + * last_compact_zone is where try_to_compact_pages thought + * allocation should succeed, so it did not defer compaction. + * But now we know that it didn't succeed, so we do the defer. + */ + if (last_compact_zone && mode != MIGRATE_ASYNC) + defer_compaction(last_compact_zone, order); + /* * It's bad if compaction run occurs and fails. * The most likely reason is that pages exist, @@ -2339,13 +2354,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTFAIL); - /* - * As async compaction considers a subset of pageblocks, only - * defer if the failure was a sync compaction failure. 
- */ - if (mode != MIGRATE_ASYNC) - defer_compaction(preferred_zone, order); - cond_resched(); } @@ -2356,9 +2364,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, - enum migrate_mode mode, bool *contended_compaction, - bool *deferred_compaction, unsigned long *did_some_progress) + int classzone_idx, int migratetype, enum migrate_mode mode, + bool *contended_compaction, bool *deferred_compaction) { return NULL; } @@ -2634,8 +2641,7 @@ rebalance: preferred_zone, classzone_idx, migratetype, migration_mode, &contended_compaction, - &deferred_compaction, - &did_some_progress); + &deferred_compaction); if (page) goto got_pg; @@ -2727,8 +2733,7 @@ rebalance: preferred_zone, classzone_idx, migratetype, migration_mode, &contended_compaction, - &deferred_compaction, - &did_some_progress); + &deferred_compaction); if (page) goto got_pg; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 2836b5373b2..1a71b8b1ea3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2315,7 +2315,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) return reclaimable; } -/* Returns true if compaction should go ahead for a high-order request */ +/* + * Returns true if compaction should go ahead for a high-order request, or + * the high-order allocation would succeed without compaction. + */ static inline bool compaction_ready(struct zone *zone, int order) { unsigned long balance_gap, watermark; @@ -2339,8 +2342,11 @@ static inline bool compaction_ready(struct zone *zone, int order) if (compaction_deferred(zone, order)) return watermark_ok; - /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, order)) + /* + * If compaction is not ready to start and allocation is not likely + * to succeed without it, then keep reclaiming. + */ + if (compaction_suitable(zone, order) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2818,7 +2824,7 @@ static bool zone_balanced(struct zone *zone, int order, return false; if (IS_ENABLED(CONFIG_COMPACTION) && order && - !compaction_suitable(zone, order)) + compaction_suitable(zone, order) == COMPACT_SKIPPED) return false; return true; -- cgit v1.2.3-70-g09d2 From 98dd3b48a7b8e8277f14c2b7d879477efc1ed0d0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Oct 2014 15:27:04 -0700 Subject: mm, compaction: do not count compact_stall if all zones skipped compaction The compact_stall vmstat counter counts the number of allocations stalled by direct compaction. It does not count when all attempted zones had deferred compaction, but it does count when all zones skipped compaction. The skipping is decided based on a very early check of compaction_suitable(), which looks at watermarks and memory fragmentation. Therefore it makes sense not to count skipped compactions as stalls. Moreover, compact_success or compact_fail is also already not being counted when compaction was skipped, so this patch changes the compact_stall counting to match the other two. Additionally, restructure __alloc_pages_direct_compact() code for better readability.
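As a hedged illustration of the counting rule (standalone C, not the kernel code; count_event() is a made-up stand-in for count_vm_event()):

#include <stdio.h>

/* Mirrors the return values in include/linux/compaction.h after this series. */
enum compact_result {
	COMPACT_DEFERRED = 0,
	COMPACT_SKIPPED  = 1,
	COMPACT_CONTINUE = 2,
	COMPACT_PARTIAL  = 3,
	COMPACT_COMPLETE = 4,
};

/* Hypothetical stand-in for count_vm_event(COMPACTSTALL). */
static void count_event(const char *name)
{
	printf("counted: %s\n", name);
}

/* Only results beyond DEFERRED and SKIPPED mean compaction actually ran. */
static int counts_as_stall(enum compact_result result)
{
	switch (result) {
	case COMPACT_DEFERRED: /* all zones deferred: caller backs off */
	case COMPACT_SKIPPED:  /* all zones skipped: no stall */
		return 0;
	default:
		count_event("compact_stall");
		return 1;
	}
}

int main(void)
{
	counts_as_stall(COMPACT_SKIPPED); /* nothing counted */
	counts_as_stall(COMPACT_PARTIAL); /* counted: compact_stall */
	return 0;
}

This models the switch the patch below introduces in __alloc_pages_direct_compact(), where the DEFERRED case also sets *deferred_compaction and both early cases return NULL.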
Signed-off-by: Vlastimil Babka Cc: Minchan Kim Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 76 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 35 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 514fd800811..822babd808f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2301,7 +2301,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, { struct zone *last_compact_zone = NULL; unsigned long compact_result; - + struct page *page; if (!order) return NULL; @@ -2313,49 +2313,55 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, &last_compact_zone); current->flags &= ~PF_MEMALLOC; - if (compact_result > COMPACT_DEFERRED) - count_vm_event(COMPACTSTALL); - else + switch (compact_result) { + case COMPACT_DEFERRED: *deferred_compaction = true; + /* fall-through */ + case COMPACT_SKIPPED: + return NULL; + default: + break; + } - if (compact_result > COMPACT_SKIPPED) { - struct page *page; + /* + * At least in one zone compaction wasn't deferred or skipped, so let's + * count a compaction stall + */ + count_vm_event(COMPACTSTALL); - /* Page migration frees to the PCP lists but we want merging */ - drain_pages(get_cpu()); - put_cpu(); + /* Page migration frees to the PCP lists but we want merging */ + drain_pages(get_cpu()); + put_cpu(); - page = get_page_from_freelist(gfp_mask, nodemask, - order, zonelist, high_zoneidx, - alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask, nodemask, + order, zonelist, high_zoneidx, + alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, classzone_idx, migratetype); - if (page) { - struct zone *zone = page_zone(page); + if (page) { + struct zone *zone = page_zone(page); - zone->compact_blockskip_flush = false; - compaction_defer_reset(zone, order, true); - count_vm_event(COMPACTSUCCESS); - return page; - } + zone->compact_blockskip_flush = false; + compaction_defer_reset(zone, order, true); + count_vm_event(COMPACTSUCCESS); + return page; + } - /* - * last_compact_zone is where try_to_compact_pages thought - * allocation should succeed, so it did not defer compaction. - * But now we know that it didn't succeed, so we do the defer. - */ - if (last_compact_zone && mode != MIGRATE_ASYNC) - defer_compaction(last_compact_zone, order); + /* + * last_compact_zone is where try_to_compact_pages thought allocation + * should succeed, so it did not defer compaction. But here we know + * that it didn't succeed, so we do the defer. + */ + if (last_compact_zone && mode != MIGRATE_ASYNC) + defer_compaction(last_compact_zone, order); - /* - * It's bad if compaction run occurs and fails. - * The most likely reason is that pages exist, - * but not enough to satisfy watermarks. - */ - count_vm_event(COMPACTFAIL); + /* + * It's bad if compaction run occurs and fails. The most likely reason + * is that pages exist, but not enough to satisfy watermarks. 
+ */ + count_vm_event(COMPACTFAIL); - cond_resched(); - } + cond_resched(); return NULL; } -- cgit v1.2.3-70-g09d2 From edc2ca61249679298c1f343cd9c549964b8df4b4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Oct 2014 15:27:09 -0700 Subject: mm, compaction: move pageblock checks up from isolate_migratepages_range() isolate_migratepages_range() is the main function of the compaction scanner, called either on a single pageblock by isolate_migratepages() during regular compaction, or on an arbitrary range by CMA's __alloc_contig_migrate_range(). It currently performs two pageblock-wide compaction suitability checks, and because of the CMA callpath, it tracks if it crossed a pageblock boundary in order to repeat those checks. However, closer inspection shows that those checks are always true for CMA: - isolation_suitable() is true because CMA sets cc->ignore_skip_hint to true - migrate_async_suitable() check is skipped because CMA uses sync compaction We can therefore move the compaction-specific checks to isolate_migratepages() and simplify isolate_migratepages_range(). Furthermore, we can mimic the freepage scanner family of functions, which has the isolate_freepages_block() function called both by compaction from isolate_freepages() and by CMA from isolate_freepages_range(), where each use-case adds its own specific glue code. This allows further code simplification. Thus, we rename isolate_migratepages_range() to isolate_migratepages_block() and limit its functionality to a single pageblock (or its subset). For CMA, a new, different isolate_migratepages_range() is created as a CMA-specific wrapper for the _block() function. The checks specific to compaction are moved to isolate_migratepages(). As part of the unification of these two families of functions, we remove the redundant zone parameter where applicable, since the zone pointer is already passed in cc->zone. Furthermore, going back to compact_zone() and compact_finished() when a pageblock is found unsuitable (now by isolate_migratepages()) is wasteful - the checks are meant to skip pageblocks quickly. The patch therefore also introduces a simple loop into isolate_migratepages() so that it does not return immediately on failed pageblock checks, but keeps going until isolate_migratepages_block() gets called once. Similarly to isolate_freepages(), the function periodically checks if it needs to reschedule or abort async compaction.
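The block-by-block walk performed by the new isolate_migratepages_range() wrapper can be sketched standalone as below. BLOCK_PAGES, ALIGN_UP() and the stubbed pfn_valid() are assumptions of this example, not kernel definitions:

#include <stdbool.h>
#include <stdio.h>

#define BLOCK_PAGES 512UL /* stand-in for pageblock_nr_pages */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

static bool pfn_valid(unsigned long pfn) /* stubbed memory hole check */
{
	return pfn < 2048 || pfn >= 2560; /* pretend a hole at [2048, 2560) */
}

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long start_pfn = 100, end_pfn = 3000;
	unsigned long pfn = start_pfn;
	unsigned long block_end = ALIGN_UP(pfn + 1, BLOCK_PAGES);

	/* First and last block may be incomplete, as in the kernel loop. */
	for (; pfn < end_pfn; pfn = block_end, block_end += BLOCK_PAGES) {
		block_end = min_ul(block_end, end_pfn);

		if (!pfn_valid(pfn)) /* skip the whole pageblock on a hole */
			continue;

		printf("isolate block [%lu, %lu)\n", pfn, block_end);
	}
	return 0;
}

Each printed range corresponds to one isolate_migratepages_block() call in the real wrapper; there, a fatal failure additionally puts back everything isolated so far, as the CMA glue in the patch below shows.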
[iamjoonsoo.kim@lge.com: fix isolated page counting bug in compaction] Signed-off-by: Vlastimil Babka Cc: Minchan Kim Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 254 ++++++++++++++++++++++++++++++++------------------------ mm/internal.h | 4 +- mm/page_alloc.c | 3 +- 3 files changed, 149 insertions(+), 112 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/compaction.c b/mm/compaction.c index 7bf150d4e1c..8058e3f98f0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -132,7 +132,7 @@ void reset_isolation_suitable(pg_data_t *pgdat) */ static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool set_unsuitable, bool migrate_scanner) + bool migrate_scanner) { struct zone *zone = cc->zone; unsigned long pfn; @@ -146,12 +146,7 @@ static void update_pageblock_skip(struct compact_control *cc, if (nr_isolated) return; - /* - * Only skip pageblocks when all forms of compaction will be known to - * fail in the near future. - */ - if (set_unsuitable) - set_pageblock_skip(page); + set_pageblock_skip(page); pfn = page_to_pfn(page); @@ -180,7 +175,7 @@ static inline bool isolation_suitable(struct compact_control *cc, static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool set_unsuitable, bool migrate_scanner) + bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ @@ -348,8 +343,7 @@ isolate_fail: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, true, - false); + update_pageblock_skip(cc, valid_page, total_isolated, false); count_compact_events(COMPACTFREE_SCANNED, nr_scanned); if (total_isolated) @@ -420,22 +414,19 @@ isolate_freepages_range(struct compact_control *cc, } /* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) +static void acct_isolated(struct zone *zone, struct compact_control *cc) { struct page *page; unsigned int count[2] = { 0, }; + if (list_empty(&cc->migratepages)) + return; + list_for_each_entry(page, &cc->migratepages, lru) count[!!page_is_file_cache(page)]++; - /* If locked we can use the interrupt unsafe versions */ - if (locked) { - __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); - } else { - mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); - } + mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); } /* Similar to reclaim, but different enough that they don't share logic */ @@ -454,40 +445,34 @@ static bool too_many_isolated(struct zone *zone) } /** - * isolate_migratepages_range() - isolate all migrate-able pages in range. - * @zone: Zone pages are in. + * isolate_migratepages_block() - isolate all migrate-able pages within + * a single pageblock * @cc: Compaction control structure. - * @low_pfn: The first PFN of the range. - * @end_pfn: The one-past-the-last PFN of the range. - * @unevictable: true if it allows to isolate unevictable pages + * @low_pfn: The first PFN to isolate + * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock + * @isolate_mode: Isolation mode to be used. 
* * Isolate all pages that can be migrated from the range specified by - * [low_pfn, end_pfn). Returns zero if there is a fatal signal - * pending), otherwise PFN of the first page that was not scanned - * (which may be both less, equal to or more then end_pfn). + * [low_pfn, end_pfn). The range is expected to be within same pageblock. + * Returns zero if there is a fatal signal pending, otherwise PFN of the + * first page that was not scanned (which may be both less, equal to or more + * than end_pfn). * - * Assumes that cc->migratepages is empty and cc->nr_migratepages is - * zero. - * - * Apart from cc->migratepages and cc->nr_migratetypes this function - * does not modify any cc's fields, in particular it does not modify - * (or read for that matter) cc->migrate_pfn. + * The pages are isolated on cc->migratepages list (not required to be empty), + * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field + * is neither read nor updated. */ -unsigned long -isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn, bool unevictable) +static unsigned long +isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, + unsigned long end_pfn, isolate_mode_t isolate_mode) { - unsigned long last_pageblock_nr = 0, pageblock_nr; + struct zone *zone = cc->zone; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; struct lruvec *lruvec; unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; - bool set_unsuitable = true; - const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? - ISOLATE_ASYNC_MIGRATE : 0) | - (unevictable ? ISOLATE_UNEVICTABLE : 0); /* * Ensure that there are not too many pages isolated from the LRU @@ -518,19 +503,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, } } - /* - * migrate_pfn does not necessarily start aligned to a - * pageblock. Ensure that pfn_valid is called when moving - * into a new MAX_ORDER_NR_PAGES range in case of large - * memory holes within the zone - */ - if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { - if (!pfn_valid(low_pfn)) { - low_pfn += MAX_ORDER_NR_PAGES - 1; - continue; - } - } - if (!pfn_valid_within(low_pfn)) continue; nr_scanned++; @@ -548,28 +520,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (!valid_page) valid_page = page; - /* If isolation recently failed, do not retry */ - pageblock_nr = low_pfn >> pageblock_order; - if (last_pageblock_nr != pageblock_nr) { - int mt; - - last_pageblock_nr = pageblock_nr; - if (!isolation_suitable(cc, page)) - goto next_pageblock; - - /* - * For async migration, also only scan in MOVABLE - * blocks. Async migration is optimistic to see if - * the minimum amount of work satisfies the allocation - */ - mt = get_pageblock_migratetype(page); - if (cc->mode == MIGRATE_ASYNC && - !migrate_async_suitable(mt)) { - set_unsuitable = false; - goto next_pageblock; - } - } - /* * Skip if free. page_order cannot be used without zone->lock * as nothing prevents parallel allocations or buddy merging. 
@@ -604,8 +554,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ if (PageTransHuge(page)) { if (!locked) - goto next_pageblock; - low_pfn += (1 << compound_order(page)) - 1; + low_pfn = ALIGN(low_pfn + 1, + pageblock_nr_pages) - 1; + else + low_pfn += (1 << compound_order(page)) - 1; + continue; } @@ -635,7 +588,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ - if (__isolate_lru_page(page, mode) != 0) + if (__isolate_lru_page(page, isolate_mode) != 0) continue; VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -654,15 +607,8 @@ isolate_success: ++low_pfn; break; } - - continue; - -next_pageblock: - low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; } - acct_isolated(zone, locked, cc); - if (locked) spin_unlock_irqrestore(&zone->lru_lock, flags); @@ -671,8 +617,7 @@ next_pageblock: * if the whole pageblock was scanned without isolating any page. */ if (low_pfn == end_pfn) - update_pageblock_skip(cc, valid_page, nr_isolated, - set_unsuitable, true); + update_pageblock_skip(cc, valid_page, nr_isolated, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -683,15 +628,63 @@ next_pageblock: return low_pfn; } +/** + * isolate_migratepages_range() - isolate migrate-able pages in a PFN range + * @cc: Compaction control structure. + * @start_pfn: The first PFN to start isolating. + * @end_pfn: The one-past-last PFN. + * + * Returns zero if isolation fails fatally due to e.g. pending signal. + * Otherwise, function returns one-past-the-last PFN of isolated page + * (which may be greater than end_pfn if end fell in a middle of a THP page). + */ +unsigned long +isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn, block_end_pfn; + + /* Scan block by block. First and last block may be incomplete */ + pfn = start_pfn; + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + + for (; pfn < end_pfn; pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, end_pfn); + + /* Skip whole pageblock in case of a memory hole */ + if (!pfn_valid(pfn)) + continue; + + pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, + ISOLATE_UNEVICTABLE); + + /* + * In case of fatal failure, release everything that might + * have been isolated in the previous iteration, and signal + * the failure back to caller. + */ + if (!pfn) { + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; + break; + } + } + acct_isolated(cc->zone, cc); + + return pfn; +} + #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION /* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. 
*/ -static void isolate_freepages(struct zone *zone, - struct compact_control *cc) +static void isolate_freepages(struct compact_control *cc) { + struct zone *zone = cc->zone; struct page *page; unsigned long block_start_pfn; /* start of current pageblock */ unsigned long block_end_pfn; /* end of current pageblock */ @@ -809,7 +802,7 @@ static struct page *compaction_alloc(struct page *migratepage, */ if (list_empty(&cc->freepages)) { if (!cc->contended) - isolate_freepages(cc->zone, cc); + isolate_freepages(cc); if (list_empty(&cc->freepages)) return NULL; @@ -843,34 +836,82 @@ typedef enum { } isolate_migrate_t; /* - * Isolate all pages that can be migrated from the block pointed to by - * the migrate scanner within compact_control. + * Isolate all pages that can be migrated from the first suitable block, + * starting at the block pointed to by the migrate scanner pfn within + * compact_control. */ static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + struct page *page; + const isolate_mode_t isolate_mode = + (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); - /* Do not scan outside zone boundaries */ - low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); + /* + * Start at where we last stopped, or beginning of the zone as + * initialized by compact_zone() + */ + low_pfn = cc->migrate_pfn; /* Only scan within a pageblock boundary */ end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); - /* Do not cross the free scanner or scan within a memory hole */ - if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { - cc->migrate_pfn = end_pfn; - return ISOLATE_NONE; - } + /* + * Iterate over whole pageblocks until we find the first suitable. + * Do not cross the free scanner. + */ + for (; end_pfn <= cc->free_pfn; + low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { - /* Perform the isolation */ - low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); - if (!low_pfn || cc->contended) - return ISOLATE_ABORT; + /* + * This can potentially iterate a massively long zone with + * many pageblocks unsuitable, so periodically check if we + * need to schedule, or even abort async compaction. + */ + if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) + && compact_should_abort(cc)) + break; + + /* Skip whole pageblock in case of a memory hole */ + if (!pfn_valid(low_pfn)) + continue; + page = pfn_to_page(low_pfn); + + /* If isolation recently failed, do not retry */ + if (!isolation_suitable(cc, page)) + continue; + + /* + * For async compaction, also only scan in MOVABLE blocks. + * Async compaction is optimistic to see if the minimum amount + * of work satisfies the allocation. + */ + if (cc->mode == MIGRATE_ASYNC && + !migrate_async_suitable(get_pageblock_migratetype(page))) + continue; + + /* Perform the isolation */ + low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, + isolate_mode); + + if (!low_pfn || cc->contended) + return ISOLATE_ABORT; + + /* + * Either we isolated something and proceed with migration. Or + * we failed and compact_zone should decide if we should + * continue or not. + */ + break; + } + + acct_isolated(zone, cc); + /* Record where migration scanner will be restarted */ cc->migrate_pfn = low_pfn; - return ISOLATE_SUCCESS; + return cc->nr_migratepages ? 
ISOLATE_SUCCESS : ISOLATE_NONE; } static int compact_finished(struct zone *zone, @@ -1043,9 +1084,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - if (!cc->nr_migratepages) - continue; - err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); diff --git a/mm/internal.h b/mm/internal.h index a1b651b11c5..5a0738fa649 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -154,8 +154,8 @@ unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); unsigned long -isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn, bool unevictable); +isolate_migratepages_range(struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn); #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 822babd808f..dfbf54b5164 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6288,8 +6288,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; - pfn = isolate_migratepages_range(cc->zone, cc, - pfn, end, true); + pfn = isolate_migratepages_range(cc, pfn, end); if (!pfn) { ret = -EINTR; break; } -- cgit v1.2.3-70-g09d2 From 1f9efdef4f3f1d2a073e524113fd0038af636f2b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Oct 2014 15:27:14 -0700 Subject: mm, compaction: khugepaged should not give up due to need_resched() Async compaction aborts when it detects zone lock contention or need_resched() is true. David Rientjes has reported that in practice, most direct async compactions for THP allocation abort due to need_resched(). This means that a second direct compaction is never attempted, which might be OK for a page fault, but khugepaged is intended to attempt a sync compaction in such cases, and with the current code it won't. This patch replaces "bool contended" in compact_control with an int that distinguishes between aborting due to need_resched() and aborting due to lock contention. This allows propagating the abort through all compaction functions as before, but passing the abort reason up to __alloc_pages_slowpath() which decides when to continue with direct reclaim and another compaction attempt. Another problem is that try_to_compact_pages() did not act upon the reported contention (either need_resched() or lock contention) immediately and would proceed with another zone from the zonelist. When need_resched() is true, that means initializing another zone compaction, only to check need_resched() again in isolate_migratepages() and abort. For zone lock contention, the unintended consequence is that the lock contended status reported back to the allocator is determined from the last zone where compaction was attempted, which is rather arbitrary. This patch fixes the problem in the following way: - async compaction of a zone aborting due to need_resched() or fatal signal pending means that further zones should not be tried. We report COMPACT_CONTENDED_SCHED to the allocator. - aborting zone compaction due to lock contention means we can still try another zone, since it has a different set of locks. We report back COMPACT_CONTENDED_LOCK only if compaction was aborted due to lock contention in *all* zones where it was attempted. As a result of these fixes, khugepaged will proceed with a second sync compaction as intended, when the preceding async compaction aborted due to need_resched().
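A simplified model of the resulting aggregation (standalone C; the enum values are illustrative and the early return stands in for the kernel's break_loop handling):

#include <stdio.h>

enum {
	CONTENDED_NONE  = 0,
	CONTENDED_SCHED = 1, /* need_resched() or fatal signal */
	CONTENDED_LOCK  = 2, /* zone or lru lock contended */
};

/*
 * LOCK is reported only if every attempted zone was lock contended;
 * SCHED aborts the zonelist walk immediately.
 */
static int aggregate(const int *zone_contended, int nr_zones)
{
	int all = CONTENDED_LOCK; /* identity value for the &= below */
	int i;

	for (i = 0; i < nr_zones; i++) {
		if (zone_contended[i] == CONTENDED_SCHED)
			return CONTENDED_SCHED; /* stop trying further zones */
		all &= zone_contended[i];
	}
	return all ? CONTENDED_LOCK : CONTENDED_NONE;
}

int main(void)
{
	int all_locked[] = { CONTENDED_LOCK, CONTENDED_LOCK };
	int one_free[]   = { CONTENDED_LOCK, CONTENDED_NONE };

	printf("all locked -> %d\n", aggregate(all_locked, 2)); /* 2 */
	printf("one free   -> %d\n", aggregate(one_free, 2));   /* 0 */
	return 0;
}

A single zone that finishes without lock contention clears the aggregate, so the allocator backs off for lock contention only when every attempted zone reported it.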
Page fault compactions aborting due to need_resched() will spare some cycles previously wasted by initializing another zone compaction only to abort again. Lock contention will be reported only when compaction in all zones was aborted due to lock contention, and therefore it's not a good idea to try again after reclaim. In stress-highalloc from mmtests configured to use __GFP_NO_KSWAPD, this has improved the number of THP collapse allocations by 10%, which shows a positive effect on khugepaged. The benchmark's success rates are unchanged, as its allocations are not recognized as coming from khugepaged. The numbers of compact_stall and compact_fail events have, however, decreased by 20%, with compact_success still a bit improved, which is good. With the benchmark configured not to use __GFP_NO_KSWAPD, there is a 6% improvement in THP collapse allocations, and only a slight improvement in stalls and failures. [akpm@linux-foundation.org: fix warnings] Reported-by: David Rientjes Signed-off-by: Vlastimil Babka Cc: Minchan Kim Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 12 +++++-- mm/compaction.c | 87 ++++++++++++++++++++++++++++++++++++++++------ mm/internal.h | 4 +-- mm/page_alloc.c | 45 +++++++++++++++++------- 4 files changed, 121 insertions(+), 27 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index b2e4c92d044..60bdf8dc02a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -13,6 +13,14 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 4 +/* Used to signal whether compaction detected need_sched() or lock contention */ +/* No contention detected */ +#define COMPACT_CONTENDED_NONE 0 +/* Either need_sched() was true or fatal signal pending */ +#define COMPACT_CONTENDED_SCHED 1 +/* Zone lock or lru_lock was contended in async compaction */ +#define COMPACT_CONTENDED_LOCK 2 + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -24,7 +32,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - enum migrate_mode mode, bool *contended, + enum migrate_mode mode, int *contended, struct zone **candidate_zone); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); @@ -94,7 +102,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, bool *contended, + enum migrate_mode mode, int *contended, struct zone **candidate_zone) { return COMPACT_CONTINUE; diff --git a/mm/compaction.c b/mm/compaction.c index 1067c07cb33..26bb20ef853 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -223,9 +223,21 @@ static void update_pageblock_skip(struct compact_control *cc, } #endif /* CONFIG_COMPACTION */ -static inline bool should_release_lock(spinlock_t *lock) +static int should_release_lock(spinlock_t *lock) { - return need_resched() || spin_is_contended(lock); + /* + * Sched contention has higher priority here as we may potentially + * have to abort whole compaction ASAP.
Returning with lock contention + * means we will try another zone, and further decisions are + * influenced only when all zones are lock contended. That means + * potentially missing a lock contention is less critical. + */ + if (need_resched()) + return COMPACT_CONTENDED_SCHED; + else if (spin_is_contended(lock)) + return COMPACT_CONTENDED_LOCK; + + return COMPACT_CONTENDED_NONE; } /* @@ -240,7 +252,9 @@ static inline bool should_release_lock(spinlock_t *lock) static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, bool locked, struct compact_control *cc) { - if (should_release_lock(lock)) { + int contended = should_release_lock(lock); + + if (contended) { if (locked) { spin_unlock_irqrestore(lock, *flags); locked = false; @@ -248,7 +262,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, /* async aborts if taking too long or contended */ if (cc->mode == MIGRATE_ASYNC) { - cc->contended = true; + cc->contended = contended; return false; } @@ -274,7 +288,7 @@ static inline bool compact_should_abort(struct compact_control *cc) /* async compaction aborts if contended */ if (need_resched()) { if (cc->mode == MIGRATE_ASYNC) { - cc->contended = true; + cc->contended = COMPACT_CONTENDED_SCHED; return true; } @@ -1140,7 +1154,7 @@ out: } static unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, enum migrate_mode mode, bool *contended) + gfp_t gfp_mask, enum migrate_mode mode, int *contended) { unsigned long ret; struct compact_control cc = { @@ -1172,14 +1186,15 @@ int sysctl_extfrag_threshold = 500; * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from * @mode: The migration mode for async, sync light, or sync migration - * @contended: Return value that is true if compaction was aborted due to lock contention + * @contended: Return value that determines if compaction was aborted due to + * need_resched() or lock contention * @candidate_zone: Return the zone where we think allocation should succeed * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, bool *contended, + enum migrate_mode mode, int *contended, struct zone **candidate_zone) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); @@ -1189,6 +1204,9 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zone *zone; int rc = COMPACT_DEFERRED; int alloc_flags = 0; + int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ + + *contended = COMPACT_CONTENDED_NONE; /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) @@ -1202,13 +1220,19 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { int status; + int zone_contended; if (compaction_deferred(zone, order)) continue; status = compact_zone_order(zone, order, gfp_mask, mode, - contended); + &zone_contended); rc = max(status, rc); + /* + * It takes at least one zone that wasn't lock contended + * to clear all_zones_contended. + */ + all_zones_contended &= zone_contended; /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, @@ -1221,8 +1245,21 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, * succeeds in this zone. 
*/ compaction_defer_reset(zone, order, false); - break; - } else if (mode != MIGRATE_ASYNC) { + /* + * It is possible that async compaction aborted due to + * need_resched() and the watermarks were ok thanks to + * somebody else freeing memory. The allocation can + * however still fail so we better signal the + * need_resched() contention anyway (this will not + * prevent the allocation attempt). + */ + if (zone_contended == COMPACT_CONTENDED_SCHED) + *contended = COMPACT_CONTENDED_SCHED; + + goto break_loop; + } + + if (mode != MIGRATE_ASYNC) { /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up @@ -1230,8 +1267,36 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, */ defer_compaction(zone, order); } + + /* + * We might have stopped compacting due to need_resched() in + * async compaction, or due to a fatal signal detected. In that + * case do not try further zones and signal need_resched() + * contention. + */ + if ((zone_contended == COMPACT_CONTENDED_SCHED) + || fatal_signal_pending(current)) { + *contended = COMPACT_CONTENDED_SCHED; + goto break_loop; + } + + continue; +break_loop: + /* + * We might not have tried all the zones, so be conservative + * and assume they are not all lock contended. + */ + all_zones_contended = 0; + break; } + /* + * If at least one zone wasn't deferred or skipped, we report if all + * zones that were tried were lock contended. + */ + if (rc > COMPACT_SKIPPED && all_zones_contended) + *contended = COMPACT_CONTENDED_LOCK; + return rc; } diff --git a/mm/internal.h b/mm/internal.h index 5a0738fa649..4c1d604c396 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -144,8 +144,8 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool contended; /* True if a lock was contended, or - * need_resched() true during async + int contended; /* Signal need_sched() or lock + * contention detected during * compaction */ }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dfbf54b5164..313338d7409 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2297,7 +2297,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int classzone_idx, int migratetype, enum migrate_mode mode, - bool *contended_compaction, bool *deferred_compaction) + int *contended_compaction, bool *deferred_compaction) { struct zone *last_compact_zone = NULL; unsigned long compact_result; @@ -2371,7 +2371,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int classzone_idx, int migratetype, enum migrate_mode mode, - bool *contended_compaction, bool *deferred_compaction) + int *contended_compaction, bool *deferred_compaction) { return NULL; } @@ -2547,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; - bool contended_compaction = false; + int contended_compaction = COMPACT_CONTENDED_NONE; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2651,15 +2651,36 @@ rebalance: if (page) goto got_pg; - /* - * If compaction is deferred for high-order allocations, it is because - * sync compaction recently failed. 
In this is the case and the caller - * requested a movable allocation that does not heavily disrupt the - * system then fail the allocation instead of entering direct reclaim. - */ - if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) - goto nopage; + /* Checks for THP-specific high-order allocations */ + if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + /* + * If compaction is deferred for high-order allocations, it is + * because sync compaction recently failed. If this is the case + * and the caller requested a THP allocation, we do not want + * to heavily disrupt the system, so we fail the allocation + * instead of entering direct reclaim. + */ + if (deferred_compaction) + goto nopage; + + /* + * In all zones where compaction was attempted (and not + * deferred or skipped), lock contention has been detected. + * For THP allocation we do not want to disrupt the others + * so we fallback to base pages instead. + */ + if (contended_compaction == COMPACT_CONTENDED_LOCK) + goto nopage; + + /* + * If compaction was aborted due to need_resched(), we do not + * want to further increase allocation latency, unless it is + * khugepaged trying to collapse. + */ + if (contended_compaction == COMPACT_CONTENDED_SCHED + && !(current->flags & PF_KTHREAD)) + goto nopage; + } /* * It can become very expensive to allocate transparent hugepages at -- cgit v1.2.3-70-g09d2 From 43e7a34d265e884b7cf34f9b05e6f2e0c05bf120 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 9 Oct 2014 15:27:25 -0700 Subject: mm: rename allocflags_to_migratetype for clarity The page allocator has gfp flags (like __GFP_WAIT) and alloc flags (like ALLOC_CPUSET) that have separate semantics. The function allocflags_to_migratetype() actually takes gfp flags, not alloc flags, and returns a migratetype. Rename it to gfpflags_to_migratetype(). 
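The mapping itself is just a two-bit selection on the gfp flags, independent of alloc flags. A standalone sketch with made-up bit values (the kernel's real __GFP_MOVABLE/__GFP_RECLAIMABLE definitions differ):

#include <stdio.h>

/* Illustrative stand-ins for the relevant gfp bits. */
#define GFPBIT_MOVABLE     0x1u
#define GFPBIT_RECLAIMABLE 0x2u

enum migratetype {
	MIGRATE_UNMOVABLE   = 0,
	MIGRATE_RECLAIMABLE = 1,
	MIGRATE_MOVABLE     = 2,
};

/* Two gfp bits select the migratetype; alloc flags play no part in it. */
static int gfpflags_to_migratetype(unsigned int gfp_flags)
{
	return (((gfp_flags & GFPBIT_MOVABLE) != 0) << 1) |
	       ((gfp_flags & GFPBIT_RECLAIMABLE) != 0);
}

int main(void)
{
	printf("%d %d %d\n",
	       gfpflags_to_migratetype(0),                  /* 0: unmovable */
	       gfpflags_to_migratetype(GFPBIT_RECLAIMABLE), /* 1: reclaimable */
	       gfpflags_to_migratetype(GFPBIT_MOVABLE));    /* 2: movable */
	return 0;
}

Since the input is purely gfp flags, the gfpflags_ prefix describes the function accurately where allocflags_ did not.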
Signed-off-by: David Rientjes Signed-off-by: Vlastimil Babka Reviewed-by: Zhang Yanfei Reviewed-by: Naoya Horiguchi Acked-by: Minchan Kim Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Christoph Lameter Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- mm/compaction.c | 4 ++-- mm/page_alloc.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5e7219dc0fa..41b30fd4d04 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -156,7 +156,7 @@ struct vm_area_struct; #define GFP_DMA32 __GFP_DMA32 /* Convert GFP flags to their corresponding migrate type */ -static inline int allocflags_to_migratetype(gfp_t gfp_flags) +static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) { WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); diff --git a/mm/compaction.c b/mm/compaction.c index b9cf751cc00..7c687c0eef6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1242,7 +1242,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .nr_freepages = 0, .nr_migratepages = 0, .order = order, - .migratetype = allocflags_to_migratetype(gfp_mask), + .migratetype = gfpflags_to_migratetype(gfp_mask), .zone = zone, .mode = mode, }; @@ -1294,7 +1294,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_SKIPPED; #ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif /* Compact each zone in the list */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 313338d7409..f07588b11d5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2523,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_NO_WATERMARKS; } #ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -2786,7 +2786,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zone *preferred_zone; struct zoneref *preferred_zoneref; struct page *page = NULL; - int migratetype = allocflags_to_migratetype(gfp_mask); + int migratetype = gfpflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; int classzone_idx; -- cgit v1.2.3-70-g09d2 From 0bf55139782db1fa96af66e37cc84afde18443ef Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 9 Oct 2014 15:28:06 -0700 Subject: mm: introduce dump_vma Introduce a helper to dump information about a VMA. This also makes dump_page_flags more generic and reuses it, so the output looks very similar to dump_page: [ 61.903437] vma ffff88070f88be00 start 00007fff25970000 end 00007fff25992000 [ 61.903437] next ffff88070facd600 prev ffff88070face400 mm ffff88070fade000 [ 61.903437] prot 8000000000000025 anon_vma ffff88070fa1e200 vm_ops (null) [ 61.903437] pgoff 7ffffffdd file (null) private_data (null) [ 61.909129] flags: 0x100173(read|write|mayread|maywrite|mayexec|growsdown|account) [akpm@linux-foundation.org: make dump_vma() require CONFIG_DEBUG_VM] [swarren@nvidia.com: fix dump_vma() compilation] Signed-off-by: Sasha Levin Reviewed-by: Naoya Horiguchi Cc: Kirill A.
Shutemov Cc: Konstantin Khlebnikov Cc: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: Hugh Dickins Cc: Vlastimil Babka Cc: Michel Lespinasse Cc: Minchan Kim Signed-off-by: Stephen Warren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 2 ++ mm/page_alloc.c | 82 +++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 75 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 2f348d02f64..dfb93333fc6 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -4,10 +4,12 @@ #include struct page; +struct vm_area_struct; extern void dump_page(struct page *page, const char *reason); extern void dump_page_badflags(struct page *page, const char *reason, unsigned long badflags); +void dump_vma(const struct vm_area_struct *vma); #ifdef CONFIG_DEBUG_VM #define VM_BUG_ON(cond) BUG_ON(cond) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f07588b11d5..3a950144f80 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6626,27 +6626,26 @@ static const struct trace_print_flags pageflag_names[] = { #endif }; -static void dump_page_flags(unsigned long flags) +static void dump_flags(unsigned long flags, + const struct trace_print_flags *names, int count) { const char *delim = ""; unsigned long mask; int i; - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); - - printk(KERN_ALERT "page flags: %#lx(", flags); + printk(KERN_ALERT "flags: %#lx(", flags); /* remove zone id */ flags &= (1UL << NR_PAGEFLAGS) - 1; - for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { + for (i = 0; i < count && flags; i++) { - mask = pageflag_names[i].mask; + mask = names[i].mask; if ((flags & mask) != mask) continue; flags &= ~mask; - printk("%s%s", delim, pageflag_names[i].name); + printk("%s%s", delim, names[i].name); delim = "|"; } @@ -6664,12 +6663,14 @@ void dump_page_badflags(struct page *page, const char *reason, "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); - dump_page_flags(page->flags); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); + dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); if (reason) pr_alert("page dumped because: %s\n", reason); if (page->flags & badflags) { pr_alert("bad because of flags:\n"); - dump_page_flags(page->flags & badflags); + dump_flags(page->flags & badflags, + pageflag_names, ARRAY_SIZE(pageflag_names)); } mem_cgroup_print_bad_page(page); } @@ -6679,3 +6680,66 @@ void dump_page(struct page *page, const char *reason) dump_page_badflags(page, reason, 0); } EXPORT_SYMBOL(dump_page); + +#ifdef CONFIG_DEBUG_VM + +static const struct trace_print_flags vmaflags_names[] = { + {VM_READ, "read" }, + {VM_WRITE, "write" }, + {VM_EXEC, "exec" }, + {VM_SHARED, "shared" }, + {VM_MAYREAD, "mayread" }, + {VM_MAYWRITE, "maywrite" }, + {VM_MAYEXEC, "mayexec" }, + {VM_MAYSHARE, "mayshare" }, + {VM_GROWSDOWN, "growsdown" }, + {VM_PFNMAP, "pfnmap" }, + {VM_DENYWRITE, "denywrite" }, + {VM_LOCKED, "locked" }, + {VM_IO, "io" }, + {VM_SEQ_READ, "seqread" }, + {VM_RAND_READ, "randread" }, + {VM_DONTCOPY, "dontcopy" }, + {VM_DONTEXPAND, "dontexpand" }, + {VM_ACCOUNT, "account" }, + {VM_NORESERVE, "noreserve" }, + {VM_HUGETLB, "hugetlb" }, + {VM_NONLINEAR, "nonlinear" }, +#if defined(CONFIG_X86) + {VM_PAT, "pat" }, +#elif defined(CONFIG_PPC) + {VM_SAO, "sao" }, +#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) + 
{VM_GROWSUP, "growsup" }, +#elif !defined(CONFIG_MMU) + {VM_MAPPED_COPY, "mappedcopy" }, +#else + {VM_ARCH_1, "arch_1" }, +#endif + {VM_DONTDUMP, "dontdump" }, +#ifdef CONFIG_MEM_SOFT_DIRTY + {VM_SOFTDIRTY, "softdirty" }, +#endif + {VM_MIXEDMAP, "mixedmap" }, + {VM_HUGEPAGE, "hugepage" }, + {VM_NOHUGEPAGE, "nohugepage" }, + {VM_MERGEABLE, "mergeable" }, +}; + +void dump_vma(const struct vm_area_struct *vma) +{ + printk(KERN_ALERT + "vma %p start %p end %p\n" + "next %p prev %p mm %p\n" + "prot %lx anon_vma %p vm_ops %p\n" + "pgoff %lx file %p private_data %p\n", + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, + vma->vm_prev, vma->vm_mm, + (unsigned long)pgprot_val(vma->vm_page_prot), + vma->anon_vma, vma->vm_ops, vma->vm_pgoff, + vma->vm_file, vma->vm_private_data); + dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); +} +EXPORT_SYMBOL(dump_vma); + +#endif /* CONFIG_DEBUG_VM */ -- cgit v1.2.3-70-g09d2 From 7ade3c997208566c5bf50ece8fc319a8caf0d41a Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Thu, 9 Oct 2014 15:28:12 -0700 Subject: mm: page_alloc: avoid wakeup kswapd on the unintended node When entering the page_alloc slowpath, we wake up kswapd on every pgdat according to the zonelist and high_zoneidx. However, this doesn't take nodemask into account, and could prematurely wake up kswapd on some unintended nodes. This patch uses for_each_zone_zonelist_nodemask() instead of for_each_zone_zonelist() in wake_all_kswapds() to avoid the above situation. Signed-off-by: Weijie Yang Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3a950144f80..ae2f8474273 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2471,12 +2471,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - struct zone *preferred_zone) + struct zone *preferred_zone, + nodemask_t *nodemask) { struct zoneref *z; struct zone *zone; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) wakeup_kswapd(zone, order, zone_idx(preferred_zone)); } @@ -2574,7 +2576,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, restart: if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); + wake_all_kswapds(order, zonelist, high_zoneidx, + preferred_zone, nodemask); /* * OK, we're below the kswapd watermark and have kicked background -- cgit v1.2.3-70-g09d2 From 5705465174686d007473e017b76c4b64b44aa690 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 9 Oct 2014 15:28:17 -0700 Subject: mm: clean up zone flags Page reclaim tests zone_is_reclaim_dirty(), but the site that actually sets this state does zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY), sending the reader through layers of indirection just to track down a simple bit. Remove all zone flag wrappers and just use bitops against zone->flags directly. It's just as readable and the lines are barely any longer. Also rename ZONE_TAIL_LRU_DIRTY to ZONE_DIRTY to match ZONE_WRITEBACK, and remove the zone_flags_t typedef.
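What replaces the wrappers is plain bitops on zone->flags. A minimal userspace analogue, with C11 atomics standing in for the kernel's set_bit()/clear_bit()/test_bit():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum zone_flags {
	ZONE_CONGESTED,
	ZONE_DIRTY,      /* was ZONE_TAIL_LRU_DIRTY */
	ZONE_WRITEBACK,
};

struct zone {
	atomic_ulong flags;
};

/* Userspace stand-ins for the kernel's atomic bitops. */
static void set_bit(int nr, atomic_ulong *word)
{
	atomic_fetch_or(word, 1UL << nr);
}

static void clear_bit(int nr, atomic_ulong *word)
{
	atomic_fetch_and(word, ~(1UL << nr));
}

static bool test_bit(int nr, atomic_ulong *word)
{
	return atomic_load(word) & (1UL << nr);
}

int main(void)
{
	struct zone z = { .flags = 0 };

	/* Before: zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY). After: */
	set_bit(ZONE_DIRTY, &z.flags);
	printf("dirty: %d\n", test_bit(ZONE_DIRTY, &z.flags));
	clear_bit(ZONE_DIRTY, &z.flags);
	printf("dirty: %d\n", test_bit(ZONE_DIRTY, &z.flags));
	return 0;
}

The call sites stay one line each, and the flag name alone now tells the reader which bit is being tested.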
Signed-off-by: Johannes Weiner Acked-by: David Rientjes Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 51 +++----------------------------------------------- mm/backing-dev.c | 2 +- mm/oom_kill.c | 6 +++--- mm/page_alloc.c | 8 ++++---- mm/vmscan.c | 28 +++++++++++++-------------- 5 files changed, 25 insertions(+), 70 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 318df705185..48bf12ef662 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -521,13 +521,13 @@ struct zone { atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; } ____cacheline_internodealigned_in_smp; -typedef enum { +enum zone_flags { ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ ZONE_CONGESTED, /* zone has many dirty pages backed by * a congested BDI */ - ZONE_TAIL_LRU_DIRTY, /* reclaim scanning has recently found + ZONE_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ @@ -535,52 +535,7 @@ typedef enum { * many pages under writeback */ ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ -} zone_flags_t; - -static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) -{ - set_bit(flag, &zone->flags); -} - -static inline int zone_test_and_set_flag(struct zone *zone, zone_flags_t flag) -{ - return test_and_set_bit(flag, &zone->flags); -} - -static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag) -{ - clear_bit(flag, &zone->flags); -} - -static inline int zone_is_reclaim_congested(const struct zone *zone) -{ - return test_bit(ZONE_CONGESTED, &zone->flags); -} - -static inline int zone_is_reclaim_dirty(const struct zone *zone) -{ - return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags); -} - -static inline int zone_is_reclaim_writeback(const struct zone *zone) -{ - return test_bit(ZONE_WRITEBACK, &zone->flags); -} - -static inline int zone_is_reclaim_locked(const struct zone *zone) -{ - return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); -} - -static inline int zone_is_fair_depleted(const struct zone *zone) -{ - return test_bit(ZONE_FAIR_DEPLETED, &zone->flags); -} - -static inline int zone_is_oom_locked(const struct zone *zone) -{ - return test_bit(ZONE_OOM_LOCKED, &zone->flags); -} +}; static inline unsigned long zone_end_pfn(const struct zone *zone) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbbdf5f..b27714f1b40 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -631,7 +631,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) * of sleeping on the congestion queue */ if (atomic_read(&nr_bdi_congested[sync]) == 0 || - !zone_is_reclaim_congested(zone)) { + !test_bit(ZONE_CONGESTED, &zone->flags)) { cond_resched(); /* In case we scheduled, work out time remaining */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e11df8fa7e..bbf405a3a18 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -565,7 +565,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) spin_lock(&zone_scan_lock); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - if (zone_is_oom_locked(zone)) { + if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) { ret = false; goto out; } @@ -575,7 +575,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. 
*/ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - zone_set_flag(zone, ZONE_OOM_LOCKED); + set_bit(ZONE_OOM_LOCKED, &zone->flags); out: spin_unlock(&zone_scan_lock); @@ -594,7 +594,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) spin_lock(&zone_scan_lock); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - zone_clear_flag(zone, ZONE_OOM_LOCKED); + clear_bit(ZONE_OOM_LOCKED, &zone->flags); spin_unlock(&zone_scan_lock); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ae2f8474273..f3769f0fce3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1614,8 +1614,8 @@ again: __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && - !zone_is_fair_depleted(zone)) - zone_set_flag(zone, ZONE_FAIR_DEPLETED); + !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) + set_bit(ZONE_FAIR_DEPLETED, &zone->flags); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1935,7 +1935,7 @@ static void reset_alloc_batches(struct zone *preferred_zone) mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); } while (zone++ != preferred_zone); } @@ -1986,7 +1986,7 @@ zonelist_scan: if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) break; - if (zone_is_fair_depleted(zone)) { + if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { nr_fair_skipped++; continue; } diff --git a/mm/vmscan.c b/mm/vmscan.c index af72fe8e8d7..06123f20a32 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -920,7 +920,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Case 1 above */ if (current_is_kswapd() && PageReclaim(page) && - zone_is_reclaim_writeback(zone)) { + test_bit(ZONE_WRITEBACK, &zone->flags)) { nr_immediate++; goto keep_locked; @@ -1002,7 +1002,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_is_file_cache(page) && (!current_is_kswapd() || - !zone_is_reclaim_dirty(zone))) { + !test_bit(ZONE_DIRTY, &zone->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -1563,7 +1563,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * are encountered in the nr_immediate check below. */ if (nr_writeback && nr_writeback == nr_taken) - zone_set_flag(zone, ZONE_WRITEBACK); + set_bit(ZONE_WRITEBACK, &zone->flags); /* * memcg will stall in page writeback so only consider forcibly @@ -1575,16 +1575,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * backed by a congested BDI and wait_iff_congested will stall. */ if (nr_dirty && nr_dirty == nr_congested) - zone_set_flag(zone, ZONE_CONGESTED); + set_bit(ZONE_CONGESTED, &zone->flags); /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag - * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing - * pages from reclaim context. + * the zone ZONE_DIRTY and kswapd will start writing pages from + * reclaim context. 
*/ if (nr_unqueued_dirty == nr_taken) - zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + set_bit(ZONE_DIRTY, &zone->flags); /* * If kswapd scans pages marked marked for immediate @@ -2984,7 +2984,7 @@ static bool kswapd_shrink_zone(struct zone *zone, /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; - zone_clear_flag(zone, ZONE_WRITEBACK); + clear_bit(ZONE_WRITEBACK, &zone->flags); /* * If a zone reaches its high watermark, consider it to be no longer @@ -2994,8 +2994,8 @@ static bool kswapd_shrink_zone(struct zone *zone, */ if (zone_reclaimable(zone) && zone_balanced(zone, testorder, 0, classzone_idx)) { - zone_clear_flag(zone, ZONE_CONGESTED); - zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + clear_bit(ZONE_CONGESTED, &zone->flags); + clear_bit(ZONE_DIRTY, &zone->flags); } return sc->nr_scanned >= sc->nr_to_reclaim; @@ -3086,8 +3086,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * If balanced, clear the dirty and congested * flags */ - zone_clear_flag(zone, ZONE_CONGESTED); - zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + clear_bit(ZONE_CONGESTED, &zone->flags); + clear_bit(ZONE_DIRTY, &zone->flags); } } @@ -3714,11 +3714,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) if (node_state(node_id, N_CPU) && node_id != numa_node_id()) return ZONE_RECLAIM_NOSCAN; - if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) + if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) return ZONE_RECLAIM_NOSCAN; ret = __zone_reclaim(zone, gfp_mask, order); - zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); if (!ret) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); -- cgit v1.2.3-70-g09d2 From 97ee4ba7cbd30f1858f0d16911e042737c53f2ef Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 9 Oct 2014 15:28:28 -0700 Subject: mm: page_alloc: Make paranoid check in move_freepages a VM_BUG_ON Since 2.6.24 there has been a paranoid check in move_freepages that looks up the zone of two pages. This is a very slow path and the only time I've seen this bug trigger recently is when memory initialisation was broken during patch development. Despite the fact it's a slow path, this patch converts the check to a VM_BUG_ON anyway as it has served its purpose by now. Signed-off-by: Mel Gorman Acked-by: David Rientjes Acked-by: Rik van Riel Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f3769f0fce3..eac31a6059c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1015,7 +1015,7 @@ int move_freepages(struct zone *zone, * Remove at a later date when no bug reports exist related to * grouping pages by mobility */ - BUG_ON(page_zone(start_page) != page_zone(end_page)); + VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif for (page = start_page; page <= end_page;) { -- cgit v1.2.3-70-g09d2 From 3193913ce62c63056bc67a6ae378beaf494afa66 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 9 Oct 2014 15:28:30 -0700 Subject: mm: page_alloc: default node-ordering on 64-bit NUMA, zone-ordering on 32-bit Zones are allocated by the page allocator in either node or zone order. Node ordering is preferred in terms of locality and is applied automatically in one of three cases: 1. If a node has only low memory 2. If DMA/DMA32 is a high percentage of memory 3. 
If low memory on a single node is greater than 70% of the node size.

Otherwise, zone ordering is used to preserve low memory for devices that require it. Unfortunately, a consequence of this is that applications running on a machine with balanced NUMA nodes will experience different performance characteristics depending on which node they happen to start from.

The point of zone ordering is to protect lower zones for devices that require DMA/DMA32 memory. When NUMA was first introduced, this was critical as 32-bit NUMA machines existed and exhausting low memory triggered OOMs easily as so many allocations required low memory. On 64-bit machines the primary concern is devices that are 32-bit only, which is less severe than the low memory exhaustion problem on 32-bit NUMA. It seems there are really few devices that depend on it.

AGP -- I assume this is getting more rare but even then I think the allocations happen early in boot time where lowmem pressure is less of a problem.

DRM -- If the device is 32-bit only then there may be low pressure. I didn't evaluate these in detail but it looks like some of these are mobile graphics cards. Not many NUMA laptops out there. DRM folk should know better though.

Some TV cards -- Much demand for 32-bit capable TV cards on NUMA machines?

B43 wireless card -- again not really a NUMA thing.

I cannot find a good reason to incur a performance penalty on all 64-bit NUMA machines in case someone throws a brain-damaged TV or graphics card in there. This patch defaults to node-ordering on 64-bit NUMA machines. I was tempted to make it the default everywhere but I understand that some embedded arches may be using 32-bit NUMA where I cannot predict the consequences.

The performance impact depends on the workload and the characteristics of the machine, and the machine I tested on had a large Normal zone on node 0 so the impact is within the noise for the majority of tests. The allocation stats show more allocation requests were from DMA32 and the local node.
Running SpecJBB with multiple JVMs and automatic NUMA balancing disabled, the results were:

specjbb
                     3.17.0-rc2            3.17.0-rc2
                        vanilla        nodeorder-v1r1
Min    1      29534.00 (  0.00%)     30020.00 (  1.65%)
Min    10    115717.00 (  0.00%)    134038.00 ( 15.83%)
Min    19    109718.00 (  0.00%)    114186.00 (  4.07%)
Min    28    104459.00 (  0.00%)    103639.00 ( -0.78%)
Min    37     98245.00 (  0.00%)    103756.00 (  5.61%)
Min    46     97198.00 (  0.00%)     96197.00 ( -1.03%)
Mean   1      30953.25 (  0.00%)     31917.75 (  3.12%)
Mean   10    124432.50 (  0.00%)    140904.00 ( 13.24%)
Mean   19    116033.50 (  0.00%)    119294.75 (  2.81%)
Mean   28    108365.25 (  0.00%)    106879.50 ( -1.37%)
Mean   37    102984.75 (  0.00%)    106924.25 (  3.83%)
Mean   46    100783.25 (  0.00%)    105368.50 (  4.55%)
Stddev 1       1260.38 (  0.00%)      1109.66 ( 11.96%)
Stddev 10      7434.03 (  0.00%)      5171.91 ( 30.43%)
Stddev 19      8453.84 (  0.00%)      5309.59 ( 37.19%)
Stddev 28      4184.55 (  0.00%)      2906.63 ( 30.54%)
Stddev 37      5409.49 (  0.00%)      3192.12 ( 40.99%)
Stddev 46      4521.95 (  0.00%)      7392.52 (-63.48%)
Max    1      32738.00 (  0.00%)     32719.00 ( -0.06%)
Max    10    136039.00 (  0.00%)    148614.00 (  9.24%)
Max    19    130566.00 (  0.00%)    127418.00 ( -2.41%)
Max    28    115404.00 (  0.00%)    111254.00 ( -3.60%)
Max    37    112118.00 (  0.00%)    111732.00 ( -0.34%)
Max    46    108541.00 (  0.00%)    116849.00 (  7.65%)
TPut   1     123813.00 (  0.00%)    127671.00 (  3.12%)
TPut   10    497730.00 (  0.00%)    563616.00 ( 13.24%)
TPut   19    464134.00 (  0.00%)    477179.00 (  2.81%)
TPut   28    433461.00 (  0.00%)    427518.00 ( -1.37%)
TPut   37    411939.00 (  0.00%)    427697.00 (  3.83%)
TPut   46    403133.00 (  0.00%)    421474.00 (  4.55%)

                         3.17.0-rc2      3.17.0-rc2
                            vanilla  nodeorder-v1r1
DMA allocs                        0               0
DMA32 allocs                     57         1491992
Normal allocs              32543566        30026383
Movable allocs                    0               0
Direct pages scanned              0               0
Kswapd pages scanned              0               0
Kswapd pages reclaimed            0               0
Direct pages reclaimed            0               0
Kswapd efficiency              100%            100%
Kswapd velocity               0.000           0.000
Direct efficiency              100%            100%
Direct velocity               0.000           0.000
Percentage direct scans          0%              0%
Zone normal velocity          0.000           0.000
Zone dma32 velocity           0.000           0.000
Zone dma velocity             0.000           0.000
THP fault alloc               55164           52987
THP collapse alloc              139             147
THP splits                       26              21
NUMA alloc hit              4169066         4250692
NUMA alloc miss                   0               0

Note that there were more DMA32 allocations with the patch applied. In this particular case there was no difference in numa_hit and numa_miss. The expectation is that DMA32 was being used at the low watermark instead of falling into the slow path. kswapd was not woken but it's not woken for THP allocations.

On 32-bit, this patch defaults to zone-ordering as low memory depletion can be a serious problem on 32-bit large memory machines. If the default ordering were node then processes on node 0 would deplete the Normal zone due to normal activity. The problem is worse if CONFIG_HIGHPTE is not set. If combined with large amounts of dirty/writeback pages in the Normal zone then there is also a high risk of OOM.

The heuristics are removed as it's not clear they were ever important on 32-bit. They were only relevant for setting node-ordering on 64-bit.
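To picture what the two orderings mean in practice, here is an illustrative sketch (not the kernel's zonelist builder; the struct and layout are invented for the example) of the fallback order a two-node machine with Normal and DMA32 zones would see under each policy:

/* Illustrative only -- an invented stand-in for the kernel's zonelist
 * construction, for a machine with two nodes, each with Normal and
 * DMA32 zones. */
struct zref { int nid; const char *zone; };

/* Node order: exhaust every zone of the local node before spilling
 * over to the next node; preferred for locality on 64-bit NUMA. */
static const struct zref node_order_example[] = {
	{ 0, "Normal" }, { 0, "DMA32" },
	{ 1, "Normal" }, { 1, "DMA32" },
};

/* Zone order: exhaust a zone type across all nodes before falling
 * back to a lower zone; preserves DMA32 at the cost of locality. */
static const struct zref zone_order_example[] = {
	{ 0, "Normal" }, { 1, "Normal" },
	{ 0, "DMA32" }, { 1, "DMA32" },
};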
Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Cc: Rik van Riel Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 78 +++++++++++++++------------------------------------------ 1 file changed, 20 insertions(+), 58 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eac31a6059c..bfb73e025e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3614,68 +3614,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) zonelist->_zonerefs[pos].zone_idx = 0; } +#if defined(CONFIG_64BIT) +/* + * Devices that require DMA32/DMA are relatively rare and do not justify a + * penalty to every machine in case the specialised case applies. Default + * to Node-ordering on 64-bit NUMA machines + */ +static int default_zonelist_order(void) +{ + return ZONELIST_ORDER_NODE; +} +#else +/* + * On 32-bit, the Normal zone needs to be preserved for allocations accessible + * by the kernel. If processes running on node 0 deplete the low memory zone + * then reclaim will occur more frequency increasing stalls and potentially + * be easier to OOM if a large percentage of the zone is under writeback or + * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. + * Hence, default to zone ordering on 32-bit. + */ static int default_zonelist_order(void) { - int nid, zone_type; - unsigned long low_kmem_size, total_size; - struct zone *z; - int average_size; - /* - * ZONE_DMA and ZONE_DMA32 can be very small area in the system. - * If they are really small and used heavily, the system can fall - * into OOM very easily. - * This function detect ZONE_DMA/DMA32 size and configures zone order. - */ - /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ - low_kmem_size = 0; - total_size = 0; - for_each_online_node(nid) { - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - z = &NODE_DATA(nid)->node_zones[zone_type]; - if (populated_zone(z)) { - if (zone_type < ZONE_NORMAL) - low_kmem_size += z->managed_pages; - total_size += z->managed_pages; - } else if (zone_type == ZONE_NORMAL) { - /* - * If any node has only lowmem, then node order - * is preferred to allow kernel allocations - * locally; otherwise, they can easily infringe - * on other nodes when there is an abundance of - * lowmem available to allocate from. - */ - return ZONELIST_ORDER_NODE; - } - } - } - if (!low_kmem_size || /* there are no DMA area. */ - low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ - return ZONELIST_ORDER_NODE; - /* - * look into each node's config. - * If there is a node whose DMA/DMA32 memory is very big area on - * local memory, NODE_ORDER may be suitable. 
- */ - average_size = total_size / - (nodes_weight(node_states[N_MEMORY]) + 1); - for_each_online_node(nid) { - low_kmem_size = 0; - total_size = 0; - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - z = &NODE_DATA(nid)->node_zones[zone_type]; - if (populated_zone(z)) { - if (zone_type < ZONE_NORMAL) - low_kmem_size += z->present_pages; - total_size += z->present_pages; - } - } - if (low_kmem_size && - total_size > average_size && /* ignore small node */ - low_kmem_size > total_size * 70/100) - return ZONELIST_ORDER_NODE; - } return ZONELIST_ORDER_ZONE; } +#endif /* CONFIG_64BIT */ static void set_zonelist_order(void) { -- cgit v1.2.3-70-g09d2 From 82742a3a5152195edd69528c0c9a1a6fb9caa293 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 9 Oct 2014 15:28:34 -0700 Subject: mm: move debug code out of page_alloc.c dump_page() and dump_vma() are not specific to page_alloc.c, move them out so page_alloc.c won't turn into the unofficial debug repository. Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 2 +- mm/debug.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 160 ------------------------------------------------------- 3 files changed, 163 insertions(+), 161 deletions(-) create mode 100644 mm/debug.c (limited to 'mm/page_alloc.c') diff --git a/mm/Makefile b/mm/Makefile index fe7a053c0f4..f8ed7ab417b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -18,7 +18,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o balloon_compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - iov_iter.o $(mmu-y) + iov_iter.o debug.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/debug.c b/mm/debug.c new file mode 100644 index 00000000000..697df905019 --- /dev/null +++ b/mm/debug.c @@ -0,0 +1,162 @@ +#include +#include +#include +#include + +static const struct trace_print_flags pageflag_names[] = { + {1UL << PG_locked, "locked" }, + {1UL << PG_error, "error" }, + {1UL << PG_referenced, "referenced" }, + {1UL << PG_uptodate, "uptodate" }, + {1UL << PG_dirty, "dirty" }, + {1UL << PG_lru, "lru" }, + {1UL << PG_active, "active" }, + {1UL << PG_slab, "slab" }, + {1UL << PG_owner_priv_1, "owner_priv_1" }, + {1UL << PG_arch_1, "arch_1" }, + {1UL << PG_reserved, "reserved" }, + {1UL << PG_private, "private" }, + {1UL << PG_private_2, "private_2" }, + {1UL << PG_writeback, "writeback" }, +#ifdef CONFIG_PAGEFLAGS_EXTENDED + {1UL << PG_head, "head" }, + {1UL << PG_tail, "tail" }, +#else + {1UL << PG_compound, "compound" }, +#endif + {1UL << PG_swapcache, "swapcache" }, + {1UL << PG_mappedtodisk, "mappedtodisk" }, + {1UL << PG_reclaim, "reclaim" }, + {1UL << PG_swapbacked, "swapbacked" }, + {1UL << PG_unevictable, "unevictable" }, +#ifdef CONFIG_MMU + {1UL << PG_mlocked, "mlocked" }, +#endif +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + {1UL << PG_uncached, "uncached" }, +#endif +#ifdef CONFIG_MEMORY_FAILURE + {1UL << PG_hwpoison, "hwpoison" }, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + {1UL << PG_compound_lock, "compound_lock" }, +#endif +}; + +static void dump_flags(unsigned long flags, + const struct trace_print_flags *names, int count) +{ + const char *delim = ""; + unsigned long mask; + int i; + + printk(KERN_ALERT "flags: %#lx(", flags); + + /* remove zone id */ + flags &= (1UL << NR_PAGEFLAGS) - 1; + + for (i = 0; i < count && flags; i++) { + + mask = names[i].mask; + if ((flags & mask) != mask) + continue; + + flags &= ~mask; + printk("%s%s", delim, 
names[i].name); + delim = "|"; + } + + /* check for left over flags */ + if (flags) + printk("%s%#lx", delim, flags); + + printk(")\n"); +} + +void dump_page_badflags(struct page *page, const char *reason, + unsigned long badflags) +{ + printk(KERN_ALERT + "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + page, atomic_read(&page->_count), page_mapcount(page), + page->mapping, page->index); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); + dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); + if (reason) + pr_alert("page dumped because: %s\n", reason); + if (page->flags & badflags) { + pr_alert("bad because of flags:\n"); + dump_flags(page->flags & badflags, + pageflag_names, ARRAY_SIZE(pageflag_names)); + } + mem_cgroup_print_bad_page(page); +} + +void dump_page(struct page *page, const char *reason) +{ + dump_page_badflags(page, reason, 0); +} +EXPORT_SYMBOL(dump_page); + +#ifdef CONFIG_DEBUG_VM + +static const struct trace_print_flags vmaflags_names[] = { + {VM_READ, "read" }, + {VM_WRITE, "write" }, + {VM_EXEC, "exec" }, + {VM_SHARED, "shared" }, + {VM_MAYREAD, "mayread" }, + {VM_MAYWRITE, "maywrite" }, + {VM_MAYEXEC, "mayexec" }, + {VM_MAYSHARE, "mayshare" }, + {VM_GROWSDOWN, "growsdown" }, + {VM_PFNMAP, "pfnmap" }, + {VM_DENYWRITE, "denywrite" }, + {VM_LOCKED, "locked" }, + {VM_IO, "io" }, + {VM_SEQ_READ, "seqread" }, + {VM_RAND_READ, "randread" }, + {VM_DONTCOPY, "dontcopy" }, + {VM_DONTEXPAND, "dontexpand" }, + {VM_ACCOUNT, "account" }, + {VM_NORESERVE, "noreserve" }, + {VM_HUGETLB, "hugetlb" }, + {VM_NONLINEAR, "nonlinear" }, +#if defined(CONFIG_X86) + {VM_PAT, "pat" }, +#elif defined(CONFIG_PPC) + {VM_SAO, "sao" }, +#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) + {VM_GROWSUP, "growsup" }, +#elif !defined(CONFIG_MMU) + {VM_MAPPED_COPY, "mappedcopy" }, +#else + {VM_ARCH_1, "arch_1" }, +#endif + {VM_DONTDUMP, "dontdump" }, +#ifdef CONFIG_MEM_SOFT_DIRTY + {VM_SOFTDIRTY, "softdirty" }, +#endif + {VM_MIXEDMAP, "mixedmap" }, + {VM_HUGEPAGE, "hugepage" }, + {VM_NOHUGEPAGE, "nohugepage" }, + {VM_MERGEABLE, "mergeable" }, +}; + +void dump_vma(const struct vm_area_struct *vma) +{ + printk(KERN_ALERT + "vma %p start %p end %p\n" + "next %p prev %p mm %p\n" + "prot %lx anon_vma %p vm_ops %p\n" + "pgoff %lx file %p private_data %p\n", + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, + vma->vm_prev, vma->vm_mm, + (unsigned long)pgprot_val(vma->vm_page_prot), + vma->anon_vma, vma->vm_ops, vma->vm_pgoff, + vma->vm_file, vma->vm_private_data); + dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); +} +EXPORT_SYMBOL(dump_vma); + +#endif /* CONFIG_DEBUG_VM */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bfb73e025e0..c9710c9bbee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,8 +53,6 @@ #include #include #include -#include -#include #include #include #include @@ -6550,161 +6548,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } #endif - -static const struct trace_print_flags pageflag_names[] = { - {1UL << PG_locked, "locked" }, - {1UL << PG_error, "error" }, - {1UL << PG_referenced, "referenced" }, - {1UL << PG_uptodate, "uptodate" }, - {1UL << PG_dirty, "dirty" }, - {1UL << PG_lru, "lru" }, - {1UL << PG_active, "active" }, - {1UL << PG_slab, "slab" }, - {1UL << PG_owner_priv_1, "owner_priv_1" }, - {1UL << PG_arch_1, "arch_1" }, - {1UL << PG_reserved, "reserved" }, - {1UL << PG_private, "private" }, - {1UL << PG_private_2, "private_2" }, - {1UL << 
PG_writeback, "writeback" }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED - {1UL << PG_head, "head" }, - {1UL << PG_tail, "tail" }, -#else - {1UL << PG_compound, "compound" }, -#endif - {1UL << PG_swapcache, "swapcache" }, - {1UL << PG_mappedtodisk, "mappedtodisk" }, - {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_swapbacked, "swapbacked" }, - {1UL << PG_unevictable, "unevictable" }, -#ifdef CONFIG_MMU - {1UL << PG_mlocked, "mlocked" }, -#endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - {1UL << PG_uncached, "uncached" }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - {1UL << PG_hwpoison, "hwpoison" }, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - {1UL << PG_compound_lock, "compound_lock" }, -#endif -}; - -static void dump_flags(unsigned long flags, - const struct trace_print_flags *names, int count) -{ - const char *delim = ""; - unsigned long mask; - int i; - - printk(KERN_ALERT "flags: %#lx(", flags); - - /* remove zone id */ - flags &= (1UL << NR_PAGEFLAGS) - 1; - - for (i = 0; i < count && flags; i++) { - - mask = names[i].mask; - if ((flags & mask) != mask) - continue; - - flags &= ~mask; - printk("%s%s", delim, names[i].name); - delim = "|"; - } - - /* check for left over flags */ - if (flags) - printk("%s%#lx", delim, flags); - - printk(")\n"); -} - -void dump_page_badflags(struct page *page, const char *reason, - unsigned long badflags) -{ - printk(KERN_ALERT - "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", - page, atomic_read(&page->_count), page_mapcount(page), - page->mapping, page->index); - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); - dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); - if (reason) - pr_alert("page dumped because: %s\n", reason); - if (page->flags & badflags) { - pr_alert("bad because of flags:\n"); - dump_flags(page->flags & badflags, - pageflag_names, ARRAY_SIZE(pageflag_names)); - } - mem_cgroup_print_bad_page(page); -} - -void dump_page(struct page *page, const char *reason) -{ - dump_page_badflags(page, reason, 0); -} -EXPORT_SYMBOL(dump_page); - -#ifdef CONFIG_DEBUG_VM - -static const struct trace_print_flags vmaflags_names[] = { - {VM_READ, "read" }, - {VM_WRITE, "write" }, - {VM_EXEC, "exec" }, - {VM_SHARED, "shared" }, - {VM_MAYREAD, "mayread" }, - {VM_MAYWRITE, "maywrite" }, - {VM_MAYEXEC, "mayexec" }, - {VM_MAYSHARE, "mayshare" }, - {VM_GROWSDOWN, "growsdown" }, - {VM_PFNMAP, "pfnmap" }, - {VM_DENYWRITE, "denywrite" }, - {VM_LOCKED, "locked" }, - {VM_IO, "io" }, - {VM_SEQ_READ, "seqread" }, - {VM_RAND_READ, "randread" }, - {VM_DONTCOPY, "dontcopy" }, - {VM_DONTEXPAND, "dontexpand" }, - {VM_ACCOUNT, "account" }, - {VM_NORESERVE, "noreserve" }, - {VM_HUGETLB, "hugetlb" }, - {VM_NONLINEAR, "nonlinear" }, -#if defined(CONFIG_X86) - {VM_PAT, "pat" }, -#elif defined(CONFIG_PPC) - {VM_SAO, "sao" }, -#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) - {VM_GROWSUP, "growsup" }, -#elif !defined(CONFIG_MMU) - {VM_MAPPED_COPY, "mappedcopy" }, -#else - {VM_ARCH_1, "arch_1" }, -#endif - {VM_DONTDUMP, "dontdump" }, -#ifdef CONFIG_MEM_SOFT_DIRTY - {VM_SOFTDIRTY, "softdirty" }, -#endif - {VM_MIXEDMAP, "mixedmap" }, - {VM_HUGEPAGE, "hugepage" }, - {VM_NOHUGEPAGE, "nohugepage" }, - {VM_MERGEABLE, "mergeable" }, -}; - -void dump_vma(const struct vm_area_struct *vma) -{ - printk(KERN_ALERT - "vma %p start %p end %p\n" - "next %p prev %p mm %p\n" - "prot %lx anon_vma %p vm_ops %p\n" - "pgoff %lx file %p private_data %p\n", - vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, - vma->vm_prev, vma->vm_mm, - 
(unsigned long)pgprot_val(vma->vm_page_prot), - vma->anon_vma, vma->vm_ops, vma->vm_pgoff, - vma->vm_file, vma->vm_private_data); - dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); -} -EXPORT_SYMBOL(dump_vma); - -#endif /* CONFIG_DEBUG_VM */ -- cgit v1.2.3-70-g09d2 From 5695be142e203167e3cb515ef86a88424f3524eb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 20 Oct 2014 18:12:32 +0200 Subject: OOM, PM: OOM killed task shouldn't escape PM suspend

The PM freezer relies on having all tasks frozen by the time devices are getting frozen so that no task will touch them while they are getting frozen. But the OOM killer is allowed to kill an already frozen task in order to handle an OOM situation. In order to protect from late wake-ups, the OOM killer is disabled after all tasks are frozen. This, however, still keeps a window open when a killed task didn't manage to die by the time freeze_processes finishes.

Reduce the race window by checking all tasks after the OOM killer has been disabled. This is still not completely race free, unfortunately, because oom_killer_disable cannot stop an already ongoing OOM killer, so a task might still wake up from the fridge and get killed without freeze_processes noticing. Full synchronization of OOM and freezer is, however, too heavyweight for this highly unlikely case.

Introduce and check an oom_kills counter which gets incremented early when the allocator enters the __alloc_pages_may_oom path, and only check all the tasks if the counter changes during the freezing attempt. The counter is updated so early to reduce the race window, since the allocator checked oom_killer_disabled, which is set by the PM-freezing code. A false positive will push the PM-freezer into a slow path but that is not a big deal.

Changes since v1: push the re-check loop out of freeze_processes into check_frozen_processes and invert the condition to make the code more readable, as per Rafael.

Fixes: f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) Cc: 3.2+ # 3.2+ Signed-off-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- include/linux/oom.h | 3 +++ kernel/power/process.c | 40 +++++++++++++++++++++++++++++++++++++++- mm/oom_kill.c | 17 +++++++++++++++++ mm/page_alloc.c | 8 ++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/oom.h b/include/linux/oom.h index 647395a1a55..e8d6e105872 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -50,6 +50,9 @@ static inline bool oom_task_origin(const struct task_struct *p) extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); + +extern int oom_kills_count(void); +extern void note_oom_kill(void); extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, struct mem_cgroup *memcg, nodemask_t *nodemask, diff --git a/kernel/power/process.c b/kernel/power/process.c index 7b323221b9e..5cc588c1aba 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,6 +108,28 @@ static int try_to_freeze_tasks(bool user_only) return todo ?
-EBUSY : 0; } +/* + * Returns true if all freezable tasks (except for current) are frozen already + */ +static bool check_frozen_processes(void) +{ + struct task_struct *g, *p; + bool ret = true; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p != current && !freezer_should_skip(p) && + !frozen(p)) { + ret = false; + goto done; + } + } +done: + read_unlock(&tasklist_lock); + + return ret; +} + /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -118,6 +140,7 @@ static int try_to_freeze_tasks(bool user_only) int freeze_processes(void) { int error; + int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -132,12 +155,27 @@ int freeze_processes(void) pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; + oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); + + /* + * There might have been an OOM kill while we were + * freezing tasks and the killed task might be still + * on the way out so we have to double check for race. + */ + if (oom_kills_count() != oom_kills_saved && + !check_frozen_processes()) { + __usermodehelper_set_disable_depth(UMH_ENABLED); + printk("OOM in progress."); + error = -EBUSY; + goto done; + } + printk("done."); } +done: printk("\n"); BUG_ON(in_atomic()); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bbf405a3a18..5340f6b9131 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, dump_tasks(memcg, nodemask); } +/* + * Number of OOM killer invocations (including memcg OOM killer). + * Primarily used by PM freezer to check for potential races with + * OOM killed frozen task. + */ +static atomic_t oom_kills = ATOMIC_INIT(0); + +int oom_kills_count(void) +{ + return atomic_read(&oom_kills); +} + +void note_oom_kill(void) +{ + atomic_inc(&oom_kills); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 736d8e1b638..9cd36b82244 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2251,6 +2251,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } + /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. + */ + note_oom_kill(); + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if -- cgit v1.2.3-70-g09d2 From ad53f92eb416d81e469fa8ea57153e59455e7175 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 13 Nov 2014 15:19:11 -0800 Subject: mm/page_alloc: fix incorrect isolation behavior by rechecking migratetype

Before describing the bugs themselves, I will first explain the definition of a freepage.

1. pages on buddy list are counted as freepage.
2. pages on isolate migratetype buddy list are *not* counted as freepage.
3. pages on cma buddy list are counted as CMA freepage, too.

Now, I will describe the problems and the related patches.
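As a reference for those three rules, here is a minimal sketch of the counting rule in code (a simplification of what __mod_zone_freepage_state() does in this era; not the kernel's exact code):

/* Simplified sketch of the freepage counting rules above, not the
 * kernel's exact code: isolate pages are never counted (rule 2) and
 * CMA pages are additionally counted as CMA freepages (rule 3). */
static void account_freepages(struct zone *zone, int nr_pages, int migratetype)
{
	if (is_migrate_isolate(migratetype))
		return;
	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
	if (is_migrate_cma(migratetype))
		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
}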
Patch 1: There are race conditions on getting the pageblock migratetype that result in misplacement of freepages on the buddy list, an incorrect freepage count and unavailability of freepages.

Patch 2: Freepages on the pcp list could have stale cached information used to determine the migratetype of the buddy list to go to. This causes misplacement of freepages on the buddy list and an incorrect freepage count.

Patch 4: Merging between freepages on different migratetypes of pageblocks will cause a freepage accounting problem. This patch fixes it.

Without patchset [3], the above problem doesn't happen on my CMA allocation test, because CMA reserved pages aren't used at all. So there is no chance for the above race. With patchset [3], I did a simple CMA allocation test and got the result below:

- Virtual machine, 4 cpus, 1024 MB memory, 256 MB CMA reservation
- run kernel build (make -j16) on background
- 30 times CMA allocation(8MB * 30 = 240MB) attempts in 5 sec interval
- Result: more than 5000 pages were missing from the freepage count

With patchset [3] and this patchset, I found that no freepage count was missed, so I conclude that the problems are solved. These problems also occur in my simple memory offlining test environment, too.

This patch (of 4):

There are two paths to reach the core free function of the buddy allocator, __free_one_page(): one is free_one_page()->__free_one_page() and the other is free_hot_cold_page()->free_pcppages_bulk()->__free_one_page(). Each path has a race condition causing serious problems. At first, this patch focuses on the first type of freepath. A following patch will then solve the problem in the second type of freepath.

In the first type of freepath, we get the migratetype of the freeing page without holding the zone lock, so it could be racy. There are two cases of this race.

1. pages are added to the isolate buddy list after restoring the original migratetype

    CPU1                                    CPU2

    get migratetype => return MIGRATE_ISOLATE
    call free_one_page() with MIGRATE_ISOLATE

                                            grab the zone lock
                                            unisolate pageblock
                                            release the zone lock

    grab the zone lock
    call __free_one_page() with MIGRATE_ISOLATE
    freepage goes into the isolate buddy list,
    although the pageblock is already unisolated

This may cause two problems. One is that we can't use this page anymore until the next isolation attempt on this pageblock, because the freepage is on the isolate buddy list. The other is that the freepage accounting could be wrong due to merging between different buddy lists. Freepages on the isolate buddy list aren't counted as freepages, but ones on a normal buddy list are counted as freepages. If a merge happens, a buddy freepage on a normal buddy list is inevitably moved to the isolate buddy list without any consideration of freepage accounting, so it could be incorrect.

2. pages are added to a normal buddy list while the pageblock is isolated.

It is similar to the above case. This also may cause two problems. One is that we can't keep these freepages from being allocated. Although this pageblock is isolated, the freepage would be added to a normal buddy list so that it could be allocated without any restriction. And the other problem is the same as case 1, that is, incorrect freepage accounting.

This race condition would be prevented by checking the migratetype again while holding the zone lock. Because it is a somewhat heavy operation and it isn't needed in the common case, we want to avoid rechecking as much as possible. So this patch introduces a new variable, nr_isolate_pageblock, in struct zone to check if there is an isolated pageblock.
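Condensed from the free_one_page() change in the diff below, the recheck pattern this variable enables looks like this:

/* Only pay for the pageblock lookup when an isolation could be in
 * flight; the common case stays a single flag test. */
if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype)))
	migratetype = get_pfnblock_migratetype(page, pfn);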
With this variable, we can avoid re-checking the migratetype in the common case and do it only if there is an isolated pageblock or the migratetype is MIGRATE_ISOLATE. This solves the above-mentioned problems.

Changes from v3: Add one more check in free_one_page() that checks whether migratetype is MIGRATE_ISOLATE or not. Without this, the above-mentioned case 1 could happen.

Signed-off-by: Joonsoo Kim Acked-by: Minchan Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Johannes Weiner Cc: Yasuaki Ishimatsu Cc: Zhang Yanfei Cc: Tang Chen Cc: Naoya Horiguchi Cc: Bartlomiej Zolnierkiewicz Cc: Wen Congyang Cc: Marek Szyprowski Cc: Laura Abbott Cc: Heesub Shin Cc: "Aneesh Kumar K.V" Cc: Ritesh Harjani Cc: Gioh Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 9 +++++++++ include/linux/page-isolation.h | 8 ++++++++ mm/page_alloc.c | 11 +++++++++-- mm/page_isolation.c | 2 ++ 4 files changed, 28 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 48bf12ef662..ffe66e381c0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -431,6 +431,15 @@ struct zone { */ int nr_migrate_reserve_block; +#ifdef CONFIG_MEMORY_ISOLATION + /* + * Number of isolated pageblock. It is used to solve incorrect + * freepage counting problem due to racy retrieving migratetype + * of pageblock. Protected by zone->lock. + */ + unsigned long nr_isolate_pageblock; +#endif + #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 3fff8e77406..2dc1e1697b4 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -2,6 +2,10 @@ #define __LINUX_PAGEISOLATION_H #ifdef CONFIG_MEMORY_ISOLATION +static inline bool has_isolate_pageblock(struct zone *zone) +{ + return zone->nr_isolate_pageblock; +} static inline bool is_migrate_isolate_page(struct page *page) { return get_pageblock_migratetype(page) == MIGRATE_ISOLATE; @@ -11,6 +15,10 @@ static inline bool is_migrate_isolate(int migratetype) { return migratetype == MIGRATE_ISOLATE; } #else +static inline bool has_isolate_pageblock(struct zone *zone) +{ + return false; +} static inline bool is_migrate_isolate_page(struct page *page) { return false; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9cd36b82244..df1da25b309 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -739,9 +739,16 @@ static void free_one_page(struct zone *zone, if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + if (is_migrate_isolate(migratetype)) + goto skip_counting; + } + __mod_zone_freepage_state(zone, 1 << order, migratetype); + +skip_counting: __free_one_page(page, pfn, zone, order, migratetype); - if (unlikely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index d1473b2e948..1fa4a4d72ec 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -60,6 +60,7 @@ out: int migratetype = get_pageblock_migratetype(page); set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages,
migratetype); @@ -83,6 +84,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) nr_pages = move_freepages_block(zone, page, migratetype); __mod_zone_freepage_state(zone, nr_pages, migratetype); set_pageblock_migratetype(page, migratetype); + zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); } -- cgit v1.2.3-70-g09d2 From 51bb1a4093cc68bc16b282548d9cee6104be0ef1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 13 Nov 2014 15:19:14 -0800 Subject: mm/page_alloc: add freepage on isolate pageblock to correct buddy list

In free_pcppages_bulk(), we use the cached migratetype of a freepage to determine the type of buddy list where the freepage will be added. This information is stored when the freepage is added to the pcp list, so if isolation of this freepage's pageblock begins after storing, this cached information could be stale. In other words, it has the original migratetype rather than MIGRATE_ISOLATE.

There are two problems caused by this stale information. One is that we can't keep these freepages from being allocated. Although this pageblock is isolated, the freepage will be added to a normal buddy list so that it could be allocated without any restriction. And the other problem is incorrect freepage accounting. Freepages on an isolate pageblock should not be counted in the number of freepages.

Following is the code snippet in free_pcppages_bulk():

/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
if (likely(!is_migrate_isolate_page(page))) {
	__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
	if (is_migrate_cma(mt))
		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
}

As you can see in the above snippet, the current code already handles the second problem, incorrect freepage accounting, by re-fetching the pageblock migratetype through is_migrate_isolate_page(page). But, because this re-fetched information isn't used for __free_one_page(), the first problem is not solved. This patch tries to solve this situation by re-fetching the pageblock migratetype before __free_one_page() and using it for __free_one_page().

In addition to moving up the position of this re-fetch, this patch uses an optimization technique: re-fetching the migratetype only if there is an isolate pageblock. Pageblock isolation is a rare event, so we can avoid re-fetching in the common case with this optimization.

This patch also corrects the migratetype of the tracepoint output.

Signed-off-by: Joonsoo Kim Acked-by: Minchan Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: "Kirill A.
Shutemov" Cc: Mel Gorman Cc: Johannes Weiner Cc: Yasuaki Ishimatsu Cc: Zhang Yanfei Cc: Tang Chen Cc: Naoya Horiguchi Cc: Bartlomiej Zolnierkiewicz Cc: Wen Congyang Cc: Marek Szyprowski Cc: Laura Abbott Cc: Heesub Shin Cc: "Aneesh Kumar K.V" Cc: Ritesh Harjani Cc: Gioh Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df1da25b309..58923bea0d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -715,14 +715,17 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* must delete as __free_one_page list manipulates */ list_del(&page->lru); mt = get_freepage_migratetype(page); + if (unlikely(has_isolate_pageblock(zone))) { + mt = get_pageblock_migratetype(page); + if (is_migrate_isolate(mt)) + goto skip_counting; + } + __mod_zone_freepage_state(zone, 1, mt); + +skip_counting: /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - if (likely(!is_migrate_isolate_page(page))) { - __mod_zone_page_state(zone, NR_FREE_PAGES, 1); - if (is_migrate_cma(mt)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); - } } while (--to_free && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); -- cgit v1.2.3-70-g09d2 From 8f82b55dd558a74fc33d69a1f2c2605d0cd2c908 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 13 Nov 2014 15:19:18 -0800 Subject: mm/page_alloc: move freepage counting logic to __free_one_page() All the caller of __free_one_page() has similar freepage counting logic, so we can move it to __free_one_page(). This reduce line of code and help future maintenance. This is also preparation step for "mm/page_alloc: restrict max order of merging on isolated pageblock" which fix the freepage counting problem on freepage with more than pageblock order. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Yasuaki Ishimatsu Cc: Zhang Yanfei Cc: Tang Chen Cc: Naoya Horiguchi Cc: Bartlomiej Zolnierkiewicz Cc: Wen Congyang Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Laura Abbott Cc: Heesub Shin Cc: "Aneesh Kumar K.V" Cc: Ritesh Harjani Cc: Gioh Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 58923bea0d8..9f689f16b5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -577,6 +577,8 @@ static inline void __free_one_page(struct page *page, return; VM_BUG_ON(migratetype == -1); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, 1 << order, migratetype); page_idx = pfn & ((1 << MAX_ORDER) - 1); @@ -715,14 +717,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* must delete as __free_one_page list manipulates */ list_del(&page->lru); mt = get_freepage_migratetype(page); - if (unlikely(has_isolate_pageblock(zone))) { + if (unlikely(has_isolate_pageblock(zone))) mt = get_pageblock_migratetype(page); - if (is_migrate_isolate(mt)) - goto skip_counting; - } - __mod_zone_freepage_state(zone, 1, mt); -skip_counting: /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); @@ -745,12 +742,7 @@ static void free_one_page(struct zone *zone, if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); - if (is_migrate_isolate(migratetype)) - goto skip_counting; } - __mod_zone_freepage_state(zone, 1 << order, migratetype); - -skip_counting: __free_one_page(page, pfn, zone, order, migratetype); spin_unlock(&zone->lock); } -- cgit v1.2.3-70-g09d2 From 3c605096d3158216ba9326a16266f6ba128c2c8d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 13 Nov 2014 15:19:21 -0800 Subject: mm/page_alloc: restrict max order of merging on isolated pageblock Current pageblock isolation logic could isolate each pageblock individually. This causes freepage accounting problem if freepage with pageblock order on isolate pageblock is merged with other freepage on normal pageblock. We can prevent merging by restricting max order of merging to pageblock order if freepage is on isolate pageblock. A side-effect of this change is that there could be non-merged buddy freepage even if finishing pageblock isolation, because undoing pageblock isolation is just to move freepage from isolate buddy list to normal buddy list rather than to consider merging. So, the patch also makes undoing pageblock isolation consider freepage merge. When un-isolation, freepage with more than pageblock order and it's buddy are checked. If they are on normal pageblock, instead of just moving, we isolate the freepage and free it in order to get merged. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Yasuaki Ishimatsu Cc: Zhang Yanfei Cc: Tang Chen Cc: Naoya Horiguchi Cc: Bartlomiej Zolnierkiewicz Cc: Wen Congyang Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Laura Abbott Cc: Heesub Shin Cc: "Aneesh Kumar K.V" Cc: Ritesh Harjani Cc: Gioh Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 25 +++++++++++++++++++++++++ mm/page_alloc.c | 41 ++++++++++++++--------------------------- mm/page_isolation.c | 41 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 78 insertions(+), 29 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 829304090b9..a4f90ba7068 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -108,6 +108,31 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c */ + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). + * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER + */ +static inline unsigned long +__find_buddy_index(unsigned long page_idx, unsigned int order) +{ + return page_idx ^ (1 << order); +} + +extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9f689f16b5a..fd11b913779 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -466,29 +466,6 @@ static inline void rmv_page_order(struct page *page) set_page_private(page, 0); } -/* - * Locate the struct page for both the matching buddy in our - * pair (buddy1) and the combined O(n+1) page they form (page). - * - * 1) Any buddy B1 will have an order O twin B2 which satisfies - * the following equation: - * B2 = B1 ^ (1 << O) - * For example, if the starting buddy (buddy2) is #8 its order - * 1 buddy is #10: - * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 - * - * 2) Any buddy B will have an order O+1 parent P which - * satisfies the following equation: - * P = B & ~(1 << O) - * - * Assumption: *_mem_map is contiguous at least up to MAX_ORDER - */ -static inline unsigned long -__find_buddy_index(unsigned long page_idx, unsigned int order) -{ - return page_idx ^ (1 << order); -} - /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if @@ -569,6 +546,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; + int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); @@ -577,15 +555,24 @@ static inline void __free_one_page(struct page *page, return; VM_BUG_ON(migratetype == -1); - if (!is_migrate_isolate(migratetype)) + if (is_migrate_isolate(migratetype)) { + /* + * We restrict max order of merging to prevent merge + * between freepages on isolate pageblock and normal + * pageblock. Without this, pageblock isolation + * could cause incorrect freepage accounting. 
+ */ + max_order = min(MAX_ORDER, pageblock_order + 1); + } else { __mod_zone_freepage_state(zone, 1 << order, migratetype); + } - page_idx = pfn & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << max_order) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER-1) { + while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) @@ -1486,7 +1473,7 @@ void split_page(struct page *page, unsigned int order) } EXPORT_SYMBOL_GPL(split_page); -static int __isolate_free_page(struct page *page, unsigned int order) +int __isolate_free_page(struct page *page, unsigned int order) { unsigned long watermark; struct zone *zone; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 1fa4a4d72ec..c8778f7e208 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -76,17 +76,54 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags, nr_pages; + struct page *isolated_page = NULL; + unsigned int order; + unsigned long page_idx, buddy_idx; + struct page *buddy; zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - nr_pages = move_freepages_block(zone, page, migratetype); - __mod_zone_freepage_state(zone, nr_pages, migratetype); + + /* + * Because freepage with more than pageblock_order on isolated + * pageblock is restricted to merge due to freepage counting problem, + * it is possible that there is free buddy page. + * move_freepages_block() doesn't care of merge so we need other + * approach in order to merge them. Isolation and free will make + * these pages to be merged. + */ + if (PageBuddy(page)) { + order = page_order(page); + if (order >= pageblock_order) { + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + + if (!is_migrate_isolate_page(buddy)) { + __isolate_free_page(page, order); + set_page_refcounted(page); + isolated_page = page; + } + } + } + + /* + * If we isolate freepage with more than pageblock_order, there + * should be no freepage in the range, so we could avoid costly + * pageblock scanning for freepage moving. + */ + if (!isolated_page) { + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_freepage_state(zone, nr_pages, migratetype); + } set_pageblock_migratetype(page, migratetype); zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); + if (isolated_page) + __free_pages(isolated_page, order); } static inline struct page * -- cgit v1.2.3-70-g09d2 From dae803e165a11bc88ca8dbc07a11077caf97bbcb Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Thu, 13 Nov 2014 15:19:27 -0800 Subject: mm: alloc_contig_range: demote pages busy message from warn to info Having test_pages_isolated failure message as a warning confuses users into thinking that it is more serious than it really is. In reality, if called via CMA, allocation will be retried so a single test_pages_isolated failure does not prevent allocation from succeeding. Demote the warning message to an info message and reformat it such that the text "failed" does not appear and instead a less worrying "PFNS busy" is used. This message is trivially reproducible on a 10GB x86 machine on 3.16.y kernels configured with CONFIG_DMA_CMA. 
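The retry behaviour referred to above can be sketched as follows (an illustrative caller only, not actual CMA code; max_tries and the fixed range are invented, alloc_contig_range() is the real interface):

/* Illustrative caller only: a transient -EBUSY from
 * alloc_contig_range() is retried (real CMA code moves on to another
 * candidate range), which is why one busy range merits only pr_info. */
static int try_alloc_range(unsigned long start_pfn, unsigned long end_pfn,
			   unsigned int max_tries)
{
	int ret = -EBUSY;
	unsigned int i;

	for (i = 0; i < max_tries && ret == -EBUSY; i++)
		ret = alloc_contig_range(start_pfn, end_pfn, MIGRATE_CMA);
	return ret;
}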
Signed-off-by: Michal Nazarewicz Cc: Laurent Pinchart Cc: Peter Hurley Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd11b913779..181dc593962 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6397,13 +6397,12 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { - pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", - outer_start, end); + pr_info("%s: [%lx, %lx) PFNs busy\n", + __func__, outer_start, end); ret = -EBUSY; goto done; } - /* Grab isolated pages from freelists. */ outer_end = isolate_freepages_range(&cc, outer_start, end); if (!outer_end) { -- cgit v1.2.3-70-g09d2 From 57cbc87e03c2f473d8f0579186c078ee06f48f2c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 13 Nov 2014 15:19:36 -0800 Subject: mm/debug-pagealloc: correct freepage accounting and order resetting

One thing this patch fixes is freepage accounting. If we clear a guard page and link it onto the isolate buddy list, we should not increase the freepage count. This patch adds a conditional branch to skip the counting in this case. Without this patch, this overcounting happens frequently if guard order is set and CMA is used.

Another thing fixed in this patch is the target whose order is reset. In __free_one_page(), we check whether the buddy page is a guard page or not, and if so, we should clear the guard attribute on the buddy page and reset its order to 0. But the current code resets the original page's order rather than the buddy's. Maybe this doesn't cause any problem, because the whole merged page's order will be re-assigned soon, but it is better to correct the code.

Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Gioh Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 181dc593962..616a2c956b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -583,9 +583,11 @@ static inline void __free_one_page(struct page *page, */ if (page_is_guard(buddy)) { clear_page_guard_flag(buddy); - set_page_private(page, 0); - __mod_zone_freepage_state(zone, 1 << order, - migratetype); + set_page_private(buddy, 0); + if (!is_migrate_isolate(migratetype)) { + __mod_zone_freepage_state(zone, 1 << order, + migratetype); + } } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; -- cgit v1.2.3-70-g09d2
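As a closing illustration of the buddy arithmetic these merging patches rely on (the XOR relation documented at __find_buddy_index() above), here is a standalone userspace sketch, not kernel code:

#include <stdio.h>

/* Same relation as __find_buddy_index() above: an order-O buddy pair
 * differs only in bit O of the page index. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	/* The example from the comment: page #8 at order 1 pairs with #10,
	 * and their combined order-2 parent starts at 8 & ~(1 << 1) = 8. */
	printf("buddy of 8 at order 1: %lu\n", find_buddy_index(8, 1));
	printf("parent index: %lu\n", 8UL & ~(1UL << 1));
	return 0;
}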