From 52383431b37cdbec63944e953ffc2698a7ad9722 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:39 -0700 Subject: mm: get rid of __GFP_KMEMCG Currently to allocate a page that should be charged to kmemcg (e.g. threadinfo), we pass __GFP_KMEMCG flag to the page allocator. The page allocated is then to be freed by free_memcg_kmem_pages. Apart from looking asymmetrical, this also requires intrusion to the general allocation path. So let's introduce separate functions that will alloc/free pages charged to kmemcg. The new functions are called alloc_kmem_pages and free_kmem_pages. They should be used when the caller actually would like to use kmalloc, but has to fall back to the page allocator for the allocation is large. They only differ from alloc_pages and free_pages in that besides allocating or freeing pages they also charge them to the kmem resource counter of the current memory cgroup. [sfr@canb.auug.org.au: export kmalloc_order() to modules] Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 56 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c..7cfdcd808f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2697,7 +2697,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2716,13 +2715,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2782,8 +2774,6 @@ out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2837,27 +2827,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. - * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. + */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. */ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } -- cgit v1.2.3-70-g09d2 From 4f9b16a64753d0bb607454347036dc997fd03b82 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:14 -0700 Subject: mm: disable zone_reclaim_mode by default When it was introduced, zone_reclaim_mode made sense as NUMA distances punished and workloads were generally partitioned to fit into a NUMA node. NUMA machines are now common but few of the workloads are NUMA-aware and it's routine to see major performance degradation due to zone_reclaim_mode being enabled but relatively few can identify the problem. Those that require zone_reclaim_mode are likely to be able to detect when it needs to be enabled and tune appropriately so lets have a sensible default for the bulk of users. This patch (of 2): zone_reclaim_mode causes processes to prefer reclaiming memory from local node instead of spilling over to other nodes. This made sense initially when NUMA machines were almost exclusively HPC and the workload was partitioned into nodes. The NUMA penalties were sufficiently high to justify reclaiming the memory. On current machines and workloads it is often the case that zone_reclaim_mode destroys performance but not all users know how to detect this. Favour the common case and disable it by default. Users that are sophisticated enough to know they need zone_reclaim_mode will detect it. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Zhang Yanfei Acked-by: Michal Hocko Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 17 +++++++++-------- arch/ia64/include/asm/topology.h | 3 ++- arch/powerpc/include/asm/topology.h | 8 ++------ include/linux/topology.h | 3 ++- mm/page_alloc.c | 2 -- 5 files changed, 15 insertions(+), 18 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index dd9d0e33b44..5b6da0fb5fb 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -772,16 +772,17 @@ This is value ORed together of 2 = Zone reclaim writes dirty pages out 4 = Zone reclaim swaps pages -zone_reclaim_mode is set during bootup to 1 if it is determined that pages -from remote zones will cause a measurable performance reduction. The -page allocator will then reclaim easily reusable pages (those page -cache pages that are currently not used) before allocating off node pages. - -It may be beneficial to switch off zone reclaim if the system is -used for a file server and all of memory should be used for caching files -from disk. In that case the caching effect is more important than +zone_reclaim_mode is disabled by default. For file servers or workloads +that benefit from having their data cached, zone_reclaim_mode should be +left disabled as the caching effect is likely to be more important than data locality. +zone_reclaim may be enabled if it's known that the workload is partitioned +such that each partition fits within a NUMA node and that accessing remote +memory would cause a measurable performance reduction. The page allocator +will then reclaim easily reusable pages (those page cache pages that are +currently not used) before allocating off node pages. + Allowing zone reclaim to write out pages stops processes that are writing large amounts of data from dirtying pages on other nodes. Zone reclaim will write out dirty pages if a zone fills up and so effectively diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 3202aa74e0d..6437ca21f61 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -21,7 +21,8 @@ #define PENALTY_FOR_NODE_WITH_CPUS 255 /* - * Distance above which we begin to use zone reclaim + * Nodes within this distance are eligible for reclaim by zone_reclaim() when + * zone_reclaim_mode is enabled. */ #define RECLAIM_DISTANCE 15 diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index c9202151079..6c8a8c5a37a 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -9,12 +9,8 @@ struct device_node; #ifdef CONFIG_NUMA /* - * Before going off node we want the VM to try and reclaim from the local - * node. It does this if the remote distance is larger than RECLAIM_DISTANCE. - * With the default REMOTE_DISTANCE of 20 and the default RECLAIM_DISTANCE of - * 20, we never reclaim and go off node straight away. - * - * To fix this we choose a smaller value of RECLAIM_DISTANCE. + * If zone_reclaim_mode is enabled, a RECLAIM_DISTANCE of 10 will mean that + * all zones on all nodes will be eligible for zone_reclaim(). */ #define RECLAIM_DISTANCE 10 diff --git a/include/linux/topology.h b/include/linux/topology.h index 973671ff9e7..dda6ee521e7 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -58,7 +58,8 @@ int arch_update_cpu_topology(void); /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) - * then switch on zone reclaim on boot. + * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim() + * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cfdcd808f5..dfe954fbb48 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1860,8 +1860,6 @@ static void __paginginit init_zone_allows_reclaim(int nid) for_each_node_state(i, N_MEMORY) if (node_distance(nid, i) <= RECLAIM_DISTANCE) node_set(i, NODE_DATA(nid)->reclaim_nodes); - else - zone_reclaim_mode = 1; } #else /* CONFIG_NUMA */ -- cgit v1.2.3-70-g09d2 From 5f7a75acdb24c7b9c436b3a0a66eec12e101d19c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:15 -0700 Subject: mm: page_alloc: do not cache reclaim distances pgdat->reclaim_nodes tracks if a remote node is allowed to be reclaimed by zone_reclaim due to its distance. As it is expected that zone_reclaim_mode will be rarely enabled it is unreasonable for all machines to take a penalty. Fortunately, the zone_reclaim_mode() path is already slow and it is the path that takes the hit. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Zhang Yanfei Acked-by: Michal Hocko Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - mm/page_alloc.c | 17 ++--------------- 2 files changed, 2 insertions(+), 16 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fac5509c18f..c1dbe0ba9f8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -763,7 +763,6 @@ typedef struct pglist_data { unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; - nodemask_t reclaim_nodes; /* Nodes allowed to reclaim from */ wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dfe954fbb48..9f13bcfb676 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1850,16 +1850,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); -} - -static void __paginginit init_zone_allows_reclaim(int nid) -{ - int i; - - for_each_node_state(i, N_MEMORY) - if (node_distance(nid, i) <= RECLAIM_DISTANCE) - node_set(i, NODE_DATA(nid)->reclaim_nodes); + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -1893,9 +1885,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return true; } -static inline void init_zone_allows_reclaim(int nid) -{ -} #endif /* CONFIG_NUMA */ /* @@ -4933,8 +4922,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - if (node_state(nid, N_MEMORY)) - init_zone_allows_reclaim(nid); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #endif -- cgit v1.2.3-70-g09d2 From 5bcc9f86ef09a933255ee66bd899d4601785dad5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:07:22 -0700 Subject: mm/page_alloc: prevent MIGRATE_RESERVE pages from being misplaced For the MIGRATE_RESERVE pages, it is useful when they do not get misplaced on free_list of other migratetype, otherwise they might get allocated prematurely and e.g. fragment the MIGRATE_RESEVE pageblocks. While this cannot be avoided completely when allocating new MIGRATE_RESERVE pageblocks in min_free_kbytes sysctl handler, we should prevent the misplacement where possible. Currently, it is possible for the misplacement to happen when a MIGRATE_RESERVE page is allocated on pcplist through rmqueue_bulk() as a fallback for other desired migratetype, and then later freed back through free_pcppages_bulk() without being actually used. This happens because free_pcppages_bulk() uses get_freepage_migratetype() to choose the free_list, and rmqueue_bulk() calls set_freepage_migratetype() with the *desired* migratetype and not the page's original MIGRATE_RESERVE migratetype. This patch fixes the problem by moving the call to set_freepage_migratetype() from rmqueue_bulk() down to __rmqueue_smallest() and __rmqueue_fallback() where the actual page's migratetype (e.g. from which free_list the page is taken from) is used. Note that this migratetype might be different from the pageblock's migratetype due to freepage stealing decisions. This is OK, as page stealing never uses MIGRATE_RESERVE as a fallback, and also takes care to leave all MIGRATE_CMA pages on the correct freelist. Therefore, as an additional benefit, the call to get_pageblock_migratetype() from rmqueue_bulk() when CMA is enabled, can be removed completely. This relies on the fact that MIGRATE_CMA pageblocks are created only during system init, and the above. The related is_migrate_isolate() check is also unnecessary, as memory isolation has other ways to move pages between freelists, and drain pcp lists containing pages that should be isolated. The buffered_rmqueue() can also benefit from calling get_freepage_migratetype() instead of get_pageblock_migratetype(). Signed-off-by: Vlastimil Babka Reported-by: Yong-Taek Lee Reported-by: Bartlomiej Zolnierkiewicz Suggested-by: Joonsoo Kim Acked-by: Joonsoo Kim Suggested-by: Mel Gorman Acked-by: Minchan Kim Cc: KOSAKI Motohiro Cc: Marek Szyprowski Cc: Hugh Dickins Cc: Rik van Riel Cc: Michal Nazarewicz Cc: "Wang, Yalin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9f13bcfb676..ab46f794509 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -931,6 +931,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); + set_freepage_migratetype(page, migratetype); return page; } @@ -1057,7 +1058,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. + * buddy pages to CMA itself. We also ensure the freepage_migratetype + * is set to CMA so it is returned to the correct freelist in case + * the page ends up being not actually allocated from the pcp lists. */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1125,6 +1128,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) expand(zone, page, order, current_order, area, new_type); + /* The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages. This is OK as long as it does + * not differ for MIGRATE_CMA type. + */ + set_freepage_migratetype(page, new_type); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype, new_type); @@ -1175,7 +1184,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, int cold) { - int mt = migratetype, i; + int i; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -1196,14 +1205,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add(&page->lru, list); else list_add_tail(&page->lru, list); - if (IS_ENABLED(CONFIG_CMA)) { - mt = get_pageblock_migratetype(page); - if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) - mt = migratetype; - } - set_freepage_migratetype(page, mt); list = &page->lru; - if (is_migrate_cma(mt)) + if (is_migrate_cma(get_freepage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1572,7 +1575,7 @@ again: if (!page) goto failed; __mod_zone_freepage_state(zone, -(1 << order), - get_pageblock_migratetype(page)); + get_freepage_migratetype(page)); } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); -- cgit v1.2.3-70-g09d2 From 613813e8985bb76bd27937bfa54faf9e9be95a52 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:07:27 -0700 Subject: mm: debug: make bad_range() output more usable and readable Nobody outputs memory addresses in decimal. PFNs are essentially addresses, and they're gibberish in decimal. Output them in hex. Also, add the nid and zone name to give a little more context to the message. Signed-off-by: Dave Hansen Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ab46f794509..132c337dbe5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -261,8 +261,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) } while (zone_span_seqretry(zone, seq)); if (ret) - pr_err("page %lu outside zone [ %lu - %lu ]\n", - pfn, start_pfn, start_pfn + sp); + pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", + pfn, zone_to_nid(zone), zone->name, + start_pfn, start_pfn + sp); return ret; } -- cgit v1.2.3-70-g09d2 From 68711a746345c44ae00c64d8dbac6a9ce13ac54a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:25 -0700 Subject: mm, migration: add destination page freeing callback Memory migration uses a callback defined by the caller to determine how to allocate destination pages. When migration fails for a source page, however, it frees the destination page back to the system. This patch adds a memory migration callback defined by the caller to determine how to free destination pages. If a caller, such as memory compaction, builds its own freelist for migration targets, this can reuse already freed memory instead of scanning additional memory. If the caller provides a function to handle freeing of destination pages, it is called when page migration fails. If the caller passes NULL then freeing back to the system will be handled as usual. This patch introduces no functional change. Signed-off-by: David Rientjes Reviewed-by: Naoya Horiguchi Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 11 ++++++---- mm/compaction.c | 2 +- mm/memory-failure.c | 4 ++-- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 4 ++-- mm/migrate.c | 55 +++++++++++++++++++++++++++++++++++-------------- mm/page_alloc.c | 2 +- 7 files changed, 53 insertions(+), 27 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 84a31ad0b79..a2901c41466 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -5,7 +5,9 @@ #include #include -typedef struct page *new_page_t(struct page *, unsigned long private, int **); +typedef struct page *new_page_t(struct page *page, unsigned long private, + int **reason); +typedef void free_page_t(struct page *page, unsigned long private); /* * Return values from addresss_space_operations.migratepage(): @@ -38,7 +40,7 @@ enum migrate_reason { extern void putback_movable_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); -extern int migrate_pages(struct list_head *l, new_page_t x, +extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern int migrate_prep(void); @@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping, #else static inline void putback_movable_pages(struct list_head *l) {} -static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, enum migrate_mode mode, int reason) +static inline int migrate_pages(struct list_head *l, new_page_t new, + free_page_t free, unsigned long private, enum migrate_mode mode, + int reason) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index 6010aabde28..f74a362d2e2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1016,7 +1016,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } nr_migrate = cc->nr_migratepages; - err = migrate_pages(&cc->migratepages, compaction_alloc, + err = migrate_pages(&cc->migratepages, compaction_alloc, NULL, (unsigned long)cc, cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, MR_COMPACTION); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d50f17fb9be..3cd1b652821 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1503,7 +1503,7 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_move(&hpage->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", @@ -1584,7 +1584,7 @@ static int __soft_offline_page(struct page *page, int flags) inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { if (!list_empty(&pagelist)) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cbb7ca0ac44..469bbf505f8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1394,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * alloc_migrate_target should be improooooved!! * migrate_pages returns # of failed pages. */ - ret = migrate_pages(&source, alloc_migrate_target, 0, + ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7f7864b95e8..16bc9fa4299 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, + err = migrate_pages(&pagelist, new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, + NULL, (unsigned long)vma, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c index 6247be7fa30..2a459675eea 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -938,8 +938,9 @@ out: * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, enum migrate_mode mode) +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct page *page, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -983,11 +984,17 @@ out: page_is_file_cache(page)); putback_lru_page(page); } + /* - * Move the new page to the LRU. If migration was not successful - * then this will free the page. + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, putback_lru_page() will drop the reference grabbed + * during isolation. */ - putback_lru_page(newpage); + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + put_new_page(newpage, private); + else + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; @@ -1016,8 +1023,9 @@ out: * will wait in the page fault for migration to complete. */ static int unmap_and_move_huge_page(new_page_t get_new_page, - unsigned long private, struct page *hpage, - int force, enum migrate_mode mode) + free_page_t put_new_page, unsigned long private, + struct page *hpage, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, 1, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) remove_migration_ptes(hpage, hpage); if (anon_vma) put_anon_vma(anon_vma); - if (!rc) + if (rc == MIGRATEPAGE_SUCCESS) hugetlb_cgroup_migrate(hpage, new_hpage); unlock_page(hpage); out: if (rc != -EAGAIN) putback_active_hugepage(hpage); - put_page(new_hpage); + + /* + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, put_page() will drop the reference grabbed during + * isolation. + */ + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + put_new_page(new_hpage, private); + else + put_page(new_hpage); + if (result) { if (rc) *result = rc; @@ -1086,6 +1104,8 @@ out: * @from: The list of pages to be migrated. * @get_new_page: The function used to allocate free pages to be used * as the target of the page migration. + * @put_new_page: The function used to free target pages if migration + * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_page() * @mode: The migration mode that specifies the constraints for * page migration, if any. @@ -1099,7 +1119,8 @@ out: * Returns the number of pages that were not migrated, or an error code. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, - unsigned long private, enum migrate_mode mode, int reason) + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; @@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, mode); + put_new_page, private, page, + pass > 2, mode); else - rc = unmap_and_move(get_new_page, private, - page, pass > 2, mode); + rc = unmap_and_move(get_new_page, put_new_page, + private, page, pass > 2, mode); switch(rc) { case -ENOMEM: @@ -1273,7 +1295,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, + err = migrate_pages(&pagelist, new_page_node, NULL, (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, - node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); + NULL, node, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 132c337dbe5..027d0294413 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6218,7 +6218,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, MIGRATE_SYNC, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); -- cgit v1.2.3-70-g09d2 From e0b9daeb453e602a95ea43853dc12d385558ce1f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:28 -0700 Subject: mm, compaction: embed migration mode in compact_control We're going to want to manipulate the migration mode for compaction in the page allocator, and currently compact_control's sync field is only a bool. Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending on the value of this bool. Convert the bool to enum migrate_mode and pass the migration mode in directly. Later, we'll want to avoid MIGRATE_SYNC_LIGHT for thp allocations in the pagefault patch to avoid unnecessary latency. This also alters compaction triggered from sysfs, either for the entire system or for a node, to force MIGRATE_SYNC. [akpm@linux-foundation.org: fix build] [iamjoonsoo.kim@lge.com: use MIGRATE_SYNC in alloc_contig_range()] Signed-off-by: David Rientjes Suggested-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Greg Thelen Cc: Naoya Horiguchi Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 ++-- mm/compaction.c | 36 +++++++++++++++++++----------------- mm/internal.h | 2 +- mm/page_alloc.c | 39 +++++++++++++++++---------------------- 4 files changed, 39 insertions(+), 42 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 7e1c76e3cd6..01e3132820d 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - bool sync, bool *contended); + enum migrate_mode mode, bool *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + enum migrate_mode mode, bool *contended) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index 70c0f8cda33..217a6ad9a20 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -161,7 +161,8 @@ static void update_pageblock_skip(struct compact_control *cc, return; if (pfn > zone->compact_cached_migrate_pfn[0]) zone->compact_cached_migrate_pfn[0] = pfn; - if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1]) + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) zone->compact_cached_migrate_pfn[1] = pfn; } else { if (cc->finished_update_free) @@ -208,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, } /* async aborts if taking too long or contended */ - if (!cc->sync) { + if (cc->mode == MIGRATE_ASYNC) { cc->contended = true; return false; } @@ -473,7 +474,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, bool locked = false; struct page *page = NULL, *valid_page = NULL; bool set_unsuitable = true; - const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | + const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? + ISOLATE_ASYNC_MIGRATE : 0) | (unevictable ? ISOLATE_UNEVICTABLE : 0); /* @@ -483,7 +485,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (!cc->sync) + if (cc->mode == MIGRATE_ASYNC) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -548,7 +550,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * the minimum amount of work satisfies the allocation */ mt = get_pageblock_migratetype(page); - if (!cc->sync && !migrate_async_suitable(mt)) { + if (cc->mode == MIGRATE_ASYNC && + !migrate_async_suitable(mt)) { set_unsuitable = false; goto next_pageblock; } @@ -981,6 +984,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) int ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); + const bool sync = cc->mode != MIGRATE_ASYNC; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -1006,7 +1010,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. */ - cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync]; + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); @@ -1040,8 +1044,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - compaction_free, (unsigned long)cc, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -1074,9 +1077,8 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync, bool *contended) +static unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, enum migrate_mode mode, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1085,7 +1087,7 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .sync = sync, + .mode = mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1107,7 +1109,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from - * @sync: Whether migration is synchronous or not + * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that is true if compaction was aborted due to lock contention * @page: Optionally capture a free page of the requested order during compaction * @@ -1115,7 +1117,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + enum migrate_mode mode, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1140,7 +1142,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, + status = compact_zone_order(zone, order, gfp_mask, mode, contended); rc = max(status, rc); @@ -1190,7 +1192,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .sync = false, + .mode = MIGRATE_ASYNC, }; if (!order) @@ -1203,7 +1205,7 @@ static void compact_node(int nid) { struct compact_control cc = { .order = -1, - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; diff --git a/mm/internal.h b/mm/internal.h index 6ee580d69dd..a25424a24e0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -134,7 +134,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ + enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool finished_update_free; /* True when the zone cached pfns are * no longer being updated diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 027d0294413..afb29da0576 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2217,7 +2217,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, + int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2231,7 +2231,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration, + nodemask, mode, contended_compaction); current->flags &= ~PF_MEMALLOC; @@ -2264,7 +2264,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * As async compaction considers a subset of pageblocks, only * defer if the failure was a sync compaction failure. */ - if (sync_migration) + if (mode != MIGRATE_ASYNC) defer_compaction(preferred_zone, order); cond_resched(); @@ -2277,9 +2277,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int migratetype, enum migrate_mode mode, bool *contended_compaction, + bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; } @@ -2474,7 +2473,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - bool sync_migration = false; + enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; bool contended_compaction = false; @@ -2568,17 +2567,15 @@ rebalance: * Try direct compaction. The first pass is asynchronous. Subsequent * attempts after direct reclaim are synchronous */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) goto got_pg; - sync_migration = true; + migration_mode = MIGRATE_SYNC_LIGHT; /* * If compaction is deferred for high-order allocations, it is because @@ -2653,12 +2650,10 @@ rebalance: * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -6218,7 +6213,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - NULL, 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, cc->mode, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -6257,7 +6252,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; INIT_LIST_HEAD(&cc.migratepages); -- cgit v1.2.3-70-g09d2 From 75f30861a12a6b09b759dfeeb9290b681af89057 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:30 -0700 Subject: mm, thp: avoid excessive compaction latency during fault Synchronous memory compaction can be very expensive: it can iterate an enormous amount of memory without aborting, constantly rescheduling, waiting on page locks and lru_lock, etc, if a pageblock cannot be defragmented. Unfortunately, it's too expensive for transparent hugepage page faults and it's much better to simply fallback to pages. On 128GB machines, we find that synchronous memory compaction can take O(seconds) for a single thp fault. Now that async compaction remembers where it left off without strictly relying on sync compaction, this makes thp allocations best-effort without causing egregious latency during fault. We still need to retry async compaction after reclaim, but this won't stall for seconds. Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: Greg Thelen Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index afb29da0576..d88d6758476 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2575,7 +2575,14 @@ rebalance: &did_some_progress); if (page) goto got_pg; - migration_mode = MIGRATE_SYNC_LIGHT; + + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. + */ + if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; /* * If compaction is deferred for high-order allocations, it is because -- cgit v1.2.3-70-g09d2 From 65bb371984d6a2c909244eb749e482bb40b72e36 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:05 -0700 Subject: mm: page_alloc: do not update zlc unless the zlc is active The zlc is used on NUMA machines to quickly skip over zones that are full. However it is always updated, even for the first zone scanned when the zlc might not even be active. As it's a write to a bitmap that potentially bounces cache line it's deceptively expensive and most machines will not care. Only update the zlc if it was active. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d88d6758476..8e766241cf5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2035,7 +2035,7 @@ try_this_zone: if (page) break; this_zone_full: - if (IS_ENABLED(CONFIG_NUMA)) + if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } -- cgit v1.2.3-70-g09d2 From 800a1e750c7b04c2aa2459afca77e936e01c0029 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:06 -0700 Subject: mm: page_alloc: do not treat a zone that cannot be used for dirty pages as "full" If a zone cannot be used for a dirty page then it gets marked "full" which is cached in the zlc and later potentially skipped by allocation requests that have nothing to do with dirty zones. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e766241cf5..b4381eaee71 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1967,7 +1967,7 @@ zonelist_scan: */ if ((alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) - goto this_zone_full; + continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, -- cgit v1.2.3-70-g09d2 From 664eeddeef6539247691197c1ac124d4aa872ab6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:08 -0700 Subject: mm: page_alloc: use jump labels to avoid checking number_of_cpusets If cpusets are not in use then we still check a global variable on every page allocation. Use jump labels to avoid the overhead. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 29 ++++++++++++++++++++++++++--- kernel/cpuset.c | 14 ++++---------- mm/page_alloc.c | 3 ++- 3 files changed, 32 insertions(+), 14 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index b19d3dc2e65..ade2390ffe9 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -12,10 +12,31 @@ #include #include #include +#include #ifdef CONFIG_CPUSETS -extern int number_of_cpusets; /* How many cpusets are defined in system? */ +extern struct static_key cpusets_enabled_key; +static inline bool cpusets_enabled(void) +{ + return static_key_false(&cpusets_enabled_key); +} + +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key) + 1; +} + +static inline void cpuset_inc(void) +{ + static_key_slow_inc(&cpusets_enabled_key); +} + +static inline void cpuset_dec(void) +{ + static_key_slow_dec(&cpusets_enabled_key); +} extern int cpuset_init(void); extern void cpuset_init_smp(void); @@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - return number_of_cpusets <= 1 || + return nr_cpusets() <= 1 || __cpuset_node_allowed_softwall(node, gfp_mask); } static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { - return number_of_cpusets <= 1 || + return nr_cpusets() <= 1 || __cpuset_node_allowed_hardwall(node, gfp_mask); } @@ -124,6 +145,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) #else /* !CONFIG_CPUSETS */ +static inline bool cpusets_enabled(void) { return false; } + static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d54c418bd0..13001784389 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,12 +61,7 @@ #include #include -/* - * Tracks how many cpusets are currently defined in system. - * When there is only one cpuset (the root cpuset) we can - * short circuit some hooks. - */ -int number_of_cpusets __read_mostly; +struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; /* See "Frequency meter" comments, below. */ @@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; } - csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); + csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; @@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); - number_of_cpusets++; + cpuset_inc(); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - number_of_cpusets--; + cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); @@ -1992,7 +1987,6 @@ int __init cpuset_init(void) if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) BUG(); - number_of_cpusets = 1; return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b4381eaee71..a2955e10171 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1921,7 +1921,8 @@ zonelist_scan: if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - if ((alloc_flags & ALLOC_CPUSET) && + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); -- cgit v1.2.3-70-g09d2 From d34c5fa06fade08a689fc171bf756fba2858ae73 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:10 -0700 Subject: mm: page_alloc: only check the zone id check if pages are buddies A node/zone index is used to check if pages are compatible for merging but this happens unconditionally even if the buddy page is not free. Defer the calculation as long as possible. Ideally we would check the zone boundary but nodes can overlap. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2955e10171..da526905b4a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -509,16 +509,26 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (!pfn_valid_within(page_to_pfn(buddy))) return 0; - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + /* + * zone check is done late to avoid uselessly + * calculating zone/node ids for pages that could + * never merge. + */ + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } return 0; -- cgit v1.2.3-70-g09d2 From a6e21b14f22041382e832d30deda6f26f37b1097 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:12 -0700 Subject: mm: page_alloc: only check the alloc flags and gfp_mask for dirty once Currently it's calculated once per zone in the zonelist. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index da526905b4a..30f327a720f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1917,6 +1917,8 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ + bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE); classzone_idx = zone_idx(preferred_zone); zonelist_scan: @@ -1976,8 +1978,7 @@ zonelist_scan: * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) + if (consider_zone_dirty && !zone_dirty_ok(zone)) continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; -- cgit v1.2.3-70-g09d2 From 5dab29113ca56335c78be3f98bf5ddf2ef8eb6a6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:14 -0700 Subject: mm: page_alloc: take the ALLOC_NO_WATERMARK check out of the fast path ALLOC_NO_WATERMARK is set in a few cases. Always by kswapd, always for __GFP_MEMALLOC, sometimes for swap-over-nfs, tasks etc. Each of these cases are relatively rare events but the ALLOC_NO_WATERMARK check is an unlikely branch in the fast path. This patch moves the check out of the fast path and after it has been determined that the watermarks have not been met. This helps the common fast path at the cost of making the slow path slower and hitting kswapd with a performance cost. It's a reasonable tradeoff. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30f327a720f..485932c577e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1937,9 +1937,6 @@ zonelist_scan: (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) - goto try_this_zone; /* * Distribute pages in proportion to the individual * zone size to ensure fair page aging. The zone a @@ -1986,6 +1983,11 @@ zonelist_scan: classzone_idx, alloc_flags)) { int ret; + /* Checked here to keep the fast path fast */ + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; + if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* -- cgit v1.2.3-70-g09d2 From e58469bafd0524e848c3733bc3918d854595e20f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:16 -0700 Subject: mm: page_alloc: use word-based accesses for get/set pageblock bitmaps The test_bit operations in get/set pageblock flags are expensive. This patch reads the bitmap on a word basis and use shifts and masks to isolate the bits of interest. Similarly masks are used to set a local copy of the bitmap and then use cmpxchg to update the bitmap if there have been no other changes made in parallel. In a test running dd onto tmpfs the overhead of the pageblock-related functions went from 1.27% in profiles to 0.5%. In addition to the performance benefits, this patch closes races that are possible between: a) get_ and set_pageblock_migratetype(), where get_pageblock_migratetype() reads part of the bits before and other part of the bits after set_pageblock_migratetype() has updated them. b) set_pageblock_migratetype() and set_pageblock_skip(), where the non-atomic read-modify-update set bit operation in set_pageblock_skip() will cause lost updates to some bits changed in the set_pageblock_migratetype(). Joonsoo Kim first reported the case a) via code inspection. Vlastimil Babka's testing with a debug patch showed that either a) or b) occurs roughly once per mmtests' stress-highalloc benchmark (although not necessarily in the same pageblock). Furthermore during development of unrelated compaction patches, it was observed that frequent calls to {start,undo}_isolate_page_range() the race occurs several thousands of times and has resulted in NULL pointer dereferences in move_freepages() and free_one_page() in places where free_list[migratetype] is manipulated by e.g. list_move(). Further debugging confirmed that migratetype had invalid value of 6, causing out of bounds access to the free_list array. That confirmed that the race exist, although it may be extremely rare, and currently only fatal where page isolation is performed due to memory hot remove. Races on pageblocks being updated by set_pageblock_migratetype(), where both old and new migratetype are lower MIGRATE_RESERVE, currently cannot result in an invalid value being observed, although theoretically they may still lead to unexpected creation or destruction of MIGRATE_RESERVE pageblocks. Furthermore, things could get suddenly worse when memory isolation is used more, or when new migratetypes are added. After this patch, the race has no longer been observed in testing. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Reported-by: Joonsoo Kim Reported-and-tested-by: Vlastimil Babka Cc: Johannes Weiner Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Rik van Riel Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++- include/linux/pageblock-flags.h | 37 ++++++++++++++++++++++++----- mm/page_alloc.c | 52 +++++++++++++++++++++++++---------------- 3 files changed, 68 insertions(+), 27 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 10a96ee6831..8ef1e3f71e0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -75,9 +75,13 @@ enum { extern int page_group_by_mobility_disabled; +#define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1) +#define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1) + static inline int get_pageblock_migratetype(struct page *page) { - return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); + BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); + return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK); } struct free_area { diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 2ee8cd2466b..c08730c10c7 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -30,9 +30,12 @@ enum pageblock_bits { PB_migrate, PB_migrate_end = PB_migrate + 3 - 1, /* 3 bits required for migrate types */ -#ifdef CONFIG_COMPACTION PB_migrate_skip,/* If set the block is skipped by compaction */ -#endif /* CONFIG_COMPACTION */ + + /* + * Assume the bits will always align on a word. If this assumption + * changes then get/set pageblock needs updating. + */ NR_PAGEBLOCK_BITS }; @@ -62,11 +65,33 @@ extern int pageblock_order; /* Forward declaration */ struct page; +unsigned long get_pageblock_flags_mask(struct page *page, + unsigned long end_bitidx, + unsigned long mask); +void set_pageblock_flags_mask(struct page *page, + unsigned long flags, + unsigned long end_bitidx, + unsigned long mask); + /* Declarations for getting and setting flags. See mm/page_alloc.c */ -unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx); -void set_pageblock_flags_group(struct page *page, unsigned long flags, - int start_bitidx, int end_bitidx); +static inline unsigned long get_pageblock_flags_group(struct page *page, + int start_bitidx, int end_bitidx) +{ + unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; + unsigned long mask = (1 << nr_flag_bits) - 1; + + return get_pageblock_flags_mask(page, end_bitidx, mask); +} + +static inline void set_pageblock_flags_group(struct page *page, + unsigned long flags, + int start_bitidx, int end_bitidx) +{ + unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; + unsigned long mask = (1 << nr_flag_bits) - 1; + + set_pageblock_flags_mask(page, flags, end_bitidx, mask); +} #ifdef CONFIG_COMPACTION #define get_pageblock_skip(page) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 485932c577e..6e937809c87 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6028,53 +6028,65 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) * @end_bitidx: The last bit of interest * returns pageblock_bits flags */ -unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx) +unsigned long get_pageblock_flags_mask(struct page *page, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long flags = 0; - unsigned long value = 1; + unsigned long pfn, bitidx, word_bitidx; + unsigned long word; zone = page_zone(page); pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (test_bit(bitidx + start_bitidx, bitmap)) - flags |= value; - - return flags; + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; } /** - * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest * @start_bitidx: The first bit of interest * @end_bitidx: The last bit of interest * @flags: The flags to set */ -void set_pageblock_flags_group(struct page *page, unsigned long flags, - int start_bitidx, int end_bitidx) +void set_pageblock_flags_mask(struct page *page, unsigned long flags, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long value = 1; + unsigned long pfn, bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (flags & value) - __set_bit(bitidx + start_bitidx, bitmap); - else - __clear_bit(bitidx + start_bitidx, bitmap); + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = ACCESS_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } } /* -- cgit v1.2.3-70-g09d2 From dc4b0caff24d9b2918e9f27bc65499ee63187eba Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:17 -0700 Subject: mm: page_alloc: reduce number of times page_to_pfn is called In the free path we calculate page_to_pfn multiple times. Reduce that. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Acked-by: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 9 +++++++-- include/linux/pageblock-flags.h | 33 +++++++++++++-------------------- mm/page_alloc.c | 34 +++++++++++++++++++--------------- 3 files changed, 39 insertions(+), 37 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8ef1e3f71e0..472426ac96a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -78,10 +78,15 @@ extern int page_group_by_mobility_disabled; #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1) #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1) -static inline int get_pageblock_migratetype(struct page *page) +#define get_pageblock_migratetype(page) \ + get_pfnblock_flags_mask(page, page_to_pfn(page), \ + PB_migrate_end, MIGRATETYPE_MASK) + +static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) { BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); - return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK); + return get_pfnblock_flags_mask(page, pfn, PB_migrate_end, + MIGRATETYPE_MASK); } struct free_area { diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index c08730c10c7..2baeee12f48 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -65,33 +65,26 @@ extern int pageblock_order; /* Forward declaration */ struct page; -unsigned long get_pageblock_flags_mask(struct page *page, +unsigned long get_pfnblock_flags_mask(struct page *page, + unsigned long pfn, unsigned long end_bitidx, unsigned long mask); -void set_pageblock_flags_mask(struct page *page, + +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, unsigned long end_bitidx, unsigned long mask); /* Declarations for getting and setting flags. See mm/page_alloc.c */ -static inline unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx) -{ - unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; - unsigned long mask = (1 << nr_flag_bits) - 1; - - return get_pageblock_flags_mask(page, end_bitidx, mask); -} - -static inline void set_pageblock_flags_group(struct page *page, - unsigned long flags, - int start_bitidx, int end_bitidx) -{ - unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; - unsigned long mask = (1 << nr_flag_bits) - 1; - - set_pageblock_flags_mask(page, flags, end_bitidx, mask); -} +#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \ + get_pfnblock_flags_mask(page, page_to_pfn(page), \ + end_bitidx, \ + (1 << (end_bitidx - start_bitidx + 1)) - 1) +#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \ + set_pfnblock_flags_mask(page, flags, page_to_pfn(page), \ + end_bitidx, \ + (1 << (end_bitidx - start_bitidx + 1)) - 1) #ifdef CONFIG_COMPACTION #define get_pageblock_skip(page) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e937809c87..6cadc8678e2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -560,6 +560,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, + unsigned long pfn, struct zone *zone, unsigned int order, int migratetype) { @@ -576,7 +577,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(migratetype == -1); - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); @@ -711,7 +712,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, list_del(&page->lru); mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); if (likely(!is_migrate_isolate_page(page))) { __mod_zone_page_state(zone, NR_FREE_PAGES, 1); @@ -723,13 +724,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock(&zone->lock); } -static void free_one_page(struct zone *zone, struct page *page, int order, +static void free_one_page(struct zone *zone, + struct page *page, unsigned long pfn, + int order, int migratetype) { spin_lock(&zone->lock); zone->pages_scanned = 0; - __free_one_page(page, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); @@ -766,15 +769,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; + unsigned long pfn = page_to_pfn(page); if (!free_pages_prepare(page, order)) return; local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - migratetype = get_pageblock_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); - free_one_page(page_zone(page), page, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); } @@ -1380,12 +1384,13 @@ void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pages_prepare(page, 0)) return; - migratetype = get_pageblock_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); local_irq_save(flags); __count_vm_event(PGFREE); @@ -1399,7 +1404,7 @@ void free_hot_cold_page(struct page *page, int cold) */ if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, 0, migratetype); + free_one_page(zone, page, pfn, 0, migratetype); goto out; } migratetype = MIGRATE_MOVABLE; @@ -6028,17 +6033,16 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) * @end_bitidx: The last bit of interest * returns pageblock_bits flags */ -unsigned long get_pageblock_flags_mask(struct page *page, +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx, word_bitidx; + unsigned long bitidx, word_bitidx; unsigned long word; zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); word_bitidx = bitidx / BITS_PER_LONG; @@ -6050,25 +6054,25 @@ unsigned long get_pageblock_flags_mask(struct page *page, } /** - * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest * @start_bitidx: The first bit of interest * @end_bitidx: The last bit of interest * @flags: The flags to set */ -void set_pageblock_flags_mask(struct page *page, unsigned long flags, +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx, word_bitidx; + unsigned long bitidx, word_bitidx; unsigned long old_word, word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); word_bitidx = bitidx / BITS_PER_LONG; -- cgit v1.2.3-70-g09d2 From cfc47a2803db42140167b92d991ef04018e162c7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:19 -0700 Subject: mm: page_alloc: lookup pageblock migratetype with IRQs enabled during free get_pageblock_migratetype() is called during free with IRQs disabled. This is unnecessary and disables IRQs for longer than necessary. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Acked-by: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cadc8678e2..ce4d3716214 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -774,9 +774,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) if (!free_pages_prepare(page, order)) return; + migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); -- cgit v1.2.3-70-g09d2 From 7aeb09f9104b760fc53c98cb7d20d06640baf9e6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:21 -0700 Subject: mm: page_alloc: use unsigned int for order in more places X86 prefers the use of unsigned types for iterators and there is a tendency to mix whether a signed or unsigned type if used for page order. This converts a number of sites in mm/page_alloc.c to use unsigned int for order where possible. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 ++++---- mm/page_alloc.c | 43 +++++++++++++++++++++++-------------------- 2 files changed, 27 insertions(+), 24 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 472426ac96a..6cbd1b6c3d2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -817,10 +817,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); -bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags); -bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags); +bool zone_watermark_ok(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags); +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags); enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce4d3716214..37ef1b87f1f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -409,7 +409,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) return bad; } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +static inline void prep_zero_page(struct page *page, unsigned int order, + gfp_t gfp_flags) { int i; @@ -453,7 +454,7 @@ static inline void set_page_guard_flag(struct page *page) { } static inline void clear_page_guard_flag(struct page *page) { } #endif -static inline void set_page_order(struct page *page, int order) +static inline void set_page_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); @@ -504,7 +505,7 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * For recording page's order, we use page_private(page). */ static inline int page_is_buddy(struct page *page, struct page *buddy, - int order) + unsigned int order) { if (!pfn_valid_within(page_to_pfn(buddy))) return 0; @@ -726,7 +727,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, - int order, + unsigned int order, int migratetype) { spin_lock(&zone->lock); @@ -897,7 +898,7 @@ static inline int check_new_page(struct page *page) return 0; } -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) { int i; @@ -1108,16 +1109,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; - int current_order; + unsigned int current_order; struct page *page; int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ - for (current_order = MAX_ORDER-1; current_order >= order; - --current_order) { + for (current_order = MAX_ORDER-1; + current_order >= order && current_order <= MAX_ORDER-1; + --current_order) { for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; @@ -1345,7 +1347,7 @@ void mark_free_pages(struct zone *zone) { unsigned long pfn, max_zone_pfn; unsigned long flags; - int order, t; + unsigned int order, t; struct list_head *curr; if (zone_is_empty(zone)) @@ -1541,8 +1543,8 @@ int split_free_page(struct page *page) */ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags, - int migratetype) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) { unsigned long flags; struct page *page; @@ -1691,8 +1693,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags, long free_pages) +static bool __zone_watermark_ok(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags, + long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; @@ -1726,15 +1729,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } -bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); @@ -4121,7 +4124,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, static void __meminit zone_init_free_lists(struct zone *zone) { - int order, t; + unsigned int order, t; for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; @@ -6444,7 +6447,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; struct zone *zone; - int order, i; + unsigned int order, i; unsigned long pfn; unsigned long flags; /* find the first valid pfn */ @@ -6496,7 +6499,7 @@ bool is_free_buddy_page(struct page *page) struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); unsigned long flags; - int order; + unsigned int order; spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { -- cgit v1.2.3-70-g09d2 From b745bc85f21ea707e4ea1a91948055fa3e72c77b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:22 -0700 Subject: mm: page_alloc: convert hot/cold parameter and immediate callers to bool cold is a bool, make it one. Make the likely case the "if" part of the block instead of the else as according to the optimisation manual this is preferred. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/mm/homecache.c | 2 +- fs/fuse/dev.c | 2 +- include/linux/gfp.h | 4 ++-- include/linux/pagemap.h | 2 +- include/linux/swap.h | 2 +- mm/page_alloc.c | 20 ++++++++++---------- mm/swap.c | 4 ++-- mm/swap_state.c | 2 +- mm/vmscan.c | 6 +++--- 9 files changed, 22 insertions(+), 22 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c index 004ba568d93..33294fdc402 100644 --- a/arch/tile/mm/homecache.c +++ b/arch/tile/mm/homecache.c @@ -417,7 +417,7 @@ void __homecache_free_pages(struct page *page, unsigned int order) if (put_page_testzero(page)) { homecache_change_page_home(page, order, PAGE_HOME_HASH); if (order == 0) { - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); } else { init_page_count(page); __free_pages(page, order); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index aac71ce373e..098f97bdcf1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1614,7 +1614,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - release_pages(req->pages, req->num_pages, 0); + release_pages(req->pages, req->num_pages, false); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d382db71e30..454c99fdb79 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -371,8 +371,8 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); -extern void free_hot_cold_page(struct page *page, int cold); -extern void free_hot_cold_page_list(struct list_head *list, int cold); +extern void free_hot_cold_page(struct page *page, bool cold); +extern void free_hot_cold_page_list(struct list_head *list, bool cold); extern void __free_kmem_pages(struct page *page, unsigned int order); extern void free_kmem_pages(unsigned long addr, unsigned int order); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 718214c5584..c16fb6d06e3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -110,7 +110,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) #define page_cache_get(page) get_page(page) #define page_cache_release(page) put_page(page) -void release_pages(struct page **pages, int nr, int cold); +void release_pages(struct page **pages, int nr, bool cold); /* * speculatively take a reference to a page. diff --git a/include/linux/swap.h b/include/linux/swap.h index 9155bcdcce1..97cf16164c4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -477,7 +477,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #define free_page_and_swap_cache(page) \ page_cache_release(page) #define free_pages_and_swap_cache(pages, nr) \ - release_pages((pages), (nr), 0); + release_pages((pages), (nr), false); static inline void show_swap_cache_info(void) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 37ef1b87f1f..09345ab7fb6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1199,7 +1199,7 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, int cold) + int migratetype, bool cold) { int i; @@ -1218,7 +1218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - if (likely(cold == 0)) + if (likely(!cold)) list_add(&page->lru, list); else list_add_tail(&page->lru, list); @@ -1379,9 +1379,9 @@ void mark_free_pages(struct zone *zone) /* * Free a 0-order page - * cold == 1 ? free a cold page : free a hot page + * cold == true ? free a cold page : free a hot page */ -void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -1413,10 +1413,10 @@ void free_hot_cold_page(struct page *page, int cold) } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (cold) - list_add_tail(&page->lru, &pcp->lists[migratetype]); - else + if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); + else + list_add_tail(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = ACCESS_ONCE(pcp->batch); @@ -1431,7 +1431,7 @@ out: /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, int cold) +void free_hot_cold_page_list(struct list_head *list, bool cold) { struct page *page, *next; @@ -1548,7 +1548,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, { unsigned long flags; struct page *page; - int cold = !!(gfp_flags & __GFP_COLD); + bool cold = ((gfp_flags & __GFP_COLD) != 0); again: if (likely(order == 0)) { @@ -2823,7 +2823,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); else __free_pages_ok(page, order); } diff --git a/mm/swap.c b/mm/swap.c index c8d6df556ce..11ebb9714f4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -67,7 +67,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); } static void __put_compound_page(struct page *page) @@ -860,7 +860,7 @@ void lru_add_drain_all(void) * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() * will free it. */ -void release_pages(struct page **pages, int nr, int cold) +void release_pages(struct page **pages, int nr, bool cold) { int i; LIST_HEAD(pages_to_free); diff --git a/mm/swap_state.c b/mm/swap_state.c index e76ace30d43..2972eee184a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -270,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) for (i = 0; i < todo; i++) free_swap_cache(pagep[i]); - release_pages(pagep, todo, 0); + release_pages(pagep, todo, false); pagep += todo; nr -= todo; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9253e188000..494cd632178 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1121,7 +1121,7 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - free_hot_cold_page_list(&free_pages, 1); + free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1532,7 +1532,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&page_list, 1); + free_hot_cold_page_list(&page_list, true); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1755,7 +1755,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&l_hold, 1); + free_hot_cold_page_list(&l_hold, true); } #ifdef CONFIG_SWAP -- cgit v1.2.3-70-g09d2 From d8846374a85f4290a473a4e2a64c1ba046c4a0e1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:33 -0700 Subject: mm: page_alloc: calculate classzone_idx once from the zonelist ref There is no need to calculate zone_idx(preferred_zone) multiple times or use the pgdat to figure it out. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 59 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 25 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 09345ab7fb6..8f785b1534a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1916,11 +1916,10 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone, int migratetype) + struct zone *preferred_zone, int classzone_idx, int migratetype) { struct zoneref *z; struct page *page = NULL; - int classzone_idx; struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ @@ -1928,7 +1927,6 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); - classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. @@ -2186,7 +2184,7 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; @@ -2204,7 +2202,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto out; @@ -2239,7 +2237,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, enum migrate_mode mode, + int classzone_idx, int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2267,7 +2265,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; compaction_defer_reset(preferred_zone, order, true); @@ -2299,7 +2297,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, enum migrate_mode mode, bool *contended_compaction, + int classzone_idx, int migratetype, + enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; @@ -2339,7 +2338,7 @@ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int classzone_idx, int migratetype, unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; @@ -2357,7 +2356,8 @@ retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, + migratetype); /* * If an allocation failed after direct reclaim, it could be because @@ -2380,14 +2380,14 @@ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); @@ -2488,7 +2488,7 @@ static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; @@ -2537,15 +2537,18 @@ restart: * Find the true preferred zone if the allocation is unconstrained by * cpusets. */ - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) - first_zones_zonelist(zonelist, high_zoneidx, NULL, - &preferred_zone); + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { + struct zoneref *preferred_zoneref; + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, + NULL, &preferred_zone); + classzone_idx = zonelist_zone_idx(preferred_zoneref); + } rebalance: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto got_pg; @@ -2560,7 +2563,7 @@ rebalance: page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { goto got_pg; } @@ -2591,7 +2594,8 @@ rebalance: */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, - preferred_zone, migratetype, + preferred_zone, + classzone_idx, migratetype, migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); @@ -2621,7 +2625,8 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + classzone_idx, migratetype, + &did_some_progress); if (page) goto got_pg; @@ -2640,7 +2645,7 @@ rebalance: page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, - migratetype); + classzone_idx, migratetype); if (page) goto got_pg; @@ -2681,7 +2686,8 @@ rebalance: */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, - preferred_zone, migratetype, + preferred_zone, + classzone_idx, migratetype, migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); @@ -2708,10 +2714,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; + struct zoneref *preferred_zoneref; struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + int classzone_idx; gfp_mask &= gfp_allowed_mask; @@ -2734,11 +2742,12 @@ retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); if (!preferred_zone) goto out; + classzone_idx = zonelist_zone_idx(preferred_zoneref); #ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) @@ -2748,7 +2757,7 @@ retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* * The first pass makes sure allocations are spread @@ -2774,7 +2783,7 @@ retry: gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); } trace_mm_page_alloc(page, order, gfp_mask, migratetype); -- cgit v1.2.3-70-g09d2 From 7d018176e6d50510b142bccbd60d8c6ed5e72e56 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Wed, 4 Jun 2014 16:10:53 -0700 Subject: mm/page_alloc.c: cleanup add_active_range() related comments add_active_range() has been repalced by memblock_set_node(). Clean up the comments to comply with that change. Signed-off-by: Zhang Zhen Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f785b1534a..a59bdb65395 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4387,9 +4387,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - * Architectures may implement their own version but if add_active_range() - * was used and there are no special requirements, this is a convenient - * alternative */ int __meminit __early_pfn_to_nid(unsigned long pfn) { @@ -4444,10 +4441,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling memblock_free_early_nid() - * manually. + * If an architecture guarantees that all ranges registered contain no holes + * and may be freed, this this function may be used instead of calling + * memblock_free_early_nid() manually. */ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { @@ -4469,9 +4465,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * function may be used instead of calling memory_present() manually. + * If an architecture guarantees that all ranges registered contain no holes and may + * be freed, this function may be used instead of calling memory_present() manually. */ void __init sparse_memory_present_with_active_regions(int nid) { @@ -4489,7 +4484,7 @@ void __init sparse_memory_present_with_active_regions(int nid) * @end_pfn: Passed by reference. On return, it will have the node end_pfn. * * It returns the start and end page frame of a node based on information - * provided by an arch calling add_active_range(). If called for a node + * provided by memblock_set_node(). If called for a node * with no available memory, a warning is printed and the start and end * PFNs will be 0. */ @@ -5066,7 +5061,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) * find_min_pfn_with_active_regions - Find the minimum PFN registered * * It returns the minimum PFN based on information provided via - * add_active_range(). + * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) { @@ -5287,7 +5282,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. - * Using the page ranges provided by add_active_range(), the size of each + * Using the page ranges provided by memblock_set_node(), the size of each * zone in each node and their holes is calculated. If the maximum PFN * between two adjacent zones match, it is assumed that the zone is empty. * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed -- cgit v1.2.3-70-g09d2 From cccad5b983d2b0aa453879591ac4ab1c54ff9db6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 6 Jun 2014 14:38:09 -0700 Subject: mm: convert use of typedef ctl_table to struct ctl_table This typedef is unnecessary and should just be removed. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- mm/page_alloc.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 533fa60c9ac..7d9a4ef0a07 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1664,7 +1664,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(ctl_table *table, int write, +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a59bdb65395..4f59fa29eda 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3389,7 +3389,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(ctl_table *table, int write, +int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5805,7 +5805,7 @@ module_init(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5822,7 +5822,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, } #ifdef CONFIG_NUMA -int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5838,7 +5838,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return 0; } -int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5864,7 +5864,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ -int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -5877,7 +5877,7 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. */ -int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; -- cgit v1.2.3-70-g09d2 From 7cd2b0a34ab8e4db971920eef8982f985441adfb Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 23 Jun 2014 13:22:04 -0700 Subject: mm, pcp: allow restoring percpu_pagelist_fraction default Oleg reports a division by zero error on zero-length write() to the percpu_pagelist_fraction sysctl: divide error: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000 RIP: 0010: percpu_pagelist_fraction_sysctl_handler+0x84/0x120 RSP: 0018:ffff8800d87a3e78 EFLAGS: 00010246 RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010 RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50 R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060 R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800 FS: 00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0 Call Trace: proc_sys_call_handler+0xb3/0xc0 proc_sys_write+0x14/0x20 vfs_write+0xba/0x1e0 SyS_write+0x46/0xb0 tracesys+0xe1/0xe6 However, if the percpu_pagelist_fraction sysctl is set by the user, it is also impossible to restore it to the kernel default since the user cannot write 0 to the sysctl. This patch allows the user to write 0 to restore the default behavior. It still requires a fraction equal to or larger than 8, however, as stated by the documentation for sanity. If a value in the range [1, 7] is written, the sysctl will return EINVAL. This successfully solves the divide by zero issue at the same time. Signed-off-by: David Rientjes Reported-by: Oleg Drokin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 3 ++- kernel/sysctl.c | 3 +-- mm/page_alloc.c | 40 ++++++++++++++++++++++++++++------------ 3 files changed, 31 insertions(+), 15 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index bd4b34c0373..4415aa91568 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) The initial value is zero. Kernel does not use this value at boot time to set -the high water marks for each per cpu page list. +the high water marks for each per cpu page list. If the user writes '0' to this +sysctl, it will revert to this default behavior. ============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7de6555cfea..075d1903138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; -static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -1317,7 +1316,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &min_percpu_pagelist_fract, + .extra1 = &zero, }, #ifdef CONFIG_MMU { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f59fa29eda..20d17f8266f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@ /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION (8) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -4145,7 +4146,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int __meminit zone_batchsize(struct zone *zone) +static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -4261,8 +4262,8 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) +static void pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { if (percpu_pagelist_fraction) pageset_set_high(pcp, @@ -5881,23 +5882,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; - unsigned int cpu; + int old_percpu_pagelist_fraction; int ret; + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_fraction = percpu_pagelist_fraction; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || (ret < 0)) - return ret; + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_fraction && + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { + percpu_pagelist_fraction = old_percpu_pagelist_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) + goto out; - mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned int cpu; + for_each_possible_cpu(cpu) - pageset_set_high(per_cpu_ptr(zone->pageset, cpu), - high); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); } +out: mutex_unlock(&pcp_batch_high_lock); - return 0; + return ret; } int hashdist = HASHDIST_DEFAULT; -- cgit v1.2.3-70-g09d2 From dc78327c0ea7da5186d8cbc1647bd6088c5c9fa5 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 2 Jul 2014 15:22:35 -0700 Subject: mm: page_alloc: fix CMA area initialisation when pageblock > MAX_ORDER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With a kernel configured with ARM64_64K_PAGES && !TRANSPARENT_HUGEPAGE, the following is triggered at early boot: SMP: Total of 8 processors activated. devtmpfs: initialized Unable to handle kernel NULL pointer dereference at virtual address 00000008 pgd = fffffe0000050000 [00000008] *pgd=00000043fba00003, *pmd=00000043fba00003, *pte=00e0000078010407 Internal error: Oops: 96000006 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.15.0-rc864k+ #44 task: fffffe03bc040000 ti: fffffe03bc080000 task.ti: fffffe03bc080000 PC is at __list_add+0x10/0xd4 LR is at free_one_page+0x270/0x638 ... Call trace: __list_add+0x10/0xd4 free_one_page+0x26c/0x638 __free_pages_ok.part.52+0x84/0xbc __free_pages+0x74/0xbc init_cma_reserved_pageblock+0xe8/0x104 cma_init_reserved_areas+0x190/0x1e4 do_one_initcall+0xc4/0x154 kernel_init_freeable+0x204/0x2a8 kernel_init+0xc/0xd4 This happens because init_cma_reserved_pageblock() calls __free_one_page() with pageblock_order as page order but it is bigger than MAX_ORDER. This in turn causes accesses past zone->free_list[]. Fix the problem by changing init_cma_reserved_pageblock() such that it splits pageblock into individual MAX_ORDER pages if pageblock is bigger than a MAX_ORDER page. In cases where !CONFIG_HUGETLB_PAGE_SIZE_VARIABLE, which is all architectures expect for ia64, powerpc and tile at the moment, the “pageblock_order > MAX_ORDER” condition will be optimised out since both sides of the operator are constants. In cases where pageblock size is variable, the performance degradation should not be significant anyway since init_cma_reserved_pageblock() is called only at boot time at most MAX_CMA_AREAS times which by default is eight. Signed-off-by: Michal Nazarewicz Reported-by: Mark Salter Tested-by: Mark Salter Tested-by: Christopher Covington Cc: Mel Gorman Cc: David Rientjes Cc: Marek Szyprowski Cc: Catalin Marinas Cc: [3.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 20d17f8266f..0ea758b898f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -816,9 +816,21 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_count(p, 0); } while (++p, --i); - set_page_refcounted(page); set_pageblock_migratetype(page, MIGRATE_CMA); - __free_pages(page, pageblock_order); + + if (pageblock_order >= MAX_ORDER) { + i = pageblock_nr_pages; + p = page; + do { + set_page_refcounted(p); + __free_pages(p, MAX_ORDER - 1); + p += MAX_ORDER_NR_PAGES; + } while (i -= MAX_ORDER_NR_PAGES); + } else { + set_page_refcounted(page); + __free_pages(page, pageblock_order); + } + adjust_managed_page_count(page, pageblock_nr_pages); } #endif -- cgit v1.2.3-70-g09d2