diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 634 |
1 files changed, 357 insertions, 277 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c..18cee0d4c8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@ /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION (8) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -261,8 +262,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) } while (zone_span_seqretry(zone, seq)); if (ret) - pr_err("page %lu outside zone [ %lu - %lu ]\n", - pfn, start_pfn, start_pfn + sp); + pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", + pfn, zone_to_nid(zone), zone->name, + start_pfn, start_pfn + sp); return ret; } @@ -408,7 +410,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) return bad; } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +static inline void prep_zero_page(struct page *page, unsigned int order, + gfp_t gfp_flags) { int i; @@ -452,7 +455,7 @@ static inline void set_page_guard_flag(struct page *page) { } static inline void clear_page_guard_flag(struct page *page) { } #endif -static inline void set_page_order(struct page *page, int order) +static inline void set_page_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); @@ -503,21 +506,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * For recording page's order, we use page_private(page). */ static inline int page_is_buddy(struct page *page, struct page *buddy, - int order) + unsigned int order) { if (!pfn_valid_within(page_to_pfn(buddy))) return 0; - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + /* + * zone check is done late to avoid uselessly + * calculating zone/node ids for pages that could + * never merge. + */ + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } return 0; @@ -549,6 +562,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, + unsigned long pfn, struct zone *zone, unsigned int order, int migratetype) { @@ -565,7 +579,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(migratetype == -1); - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); @@ -666,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, int migratetype = 0; int batch_free = 0; int to_free = count; + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); while (to_free) { struct page *page; @@ -700,7 +717,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, list_del(&page->lru); mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); if (likely(!is_migrate_isolate_page(page))) { __mod_zone_page_state(zone, NR_FREE_PAGES, 1); @@ -712,13 +729,18 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock(&zone->lock); } -static void free_one_page(struct zone *zone, struct page *page, int order, +static void free_one_page(struct zone *zone, + struct page *page, unsigned long pfn, + unsigned int order, int migratetype) { + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); - __free_one_page(page, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); @@ -755,15 +777,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; + unsigned long pfn = page_to_pfn(page); if (!free_pages_prepare(page, order)) return; + migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); - free_one_page(page_zone(page), page, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); } @@ -799,9 +822,21 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_count(p, 0); } while (++p, --i); - set_page_refcounted(page); set_pageblock_migratetype(page, MIGRATE_CMA); - __free_pages(page, pageblock_order); + + if (pageblock_order >= MAX_ORDER) { + i = pageblock_nr_pages; + p = page; + do { + set_page_refcounted(p); + __free_pages(p, MAX_ORDER - 1); + p += MAX_ORDER_NR_PAGES; + } while (i -= MAX_ORDER_NR_PAGES); + } else { + set_page_refcounted(page); + __free_pages(page, pageblock_order); + } + adjust_managed_page_count(page, pageblock_nr_pages); } #endif @@ -882,7 +917,7 @@ static inline int check_new_page(struct page *page) return 0; } -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) { int i; @@ -931,6 +966,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); + set_freepage_migratetype(page, migratetype); return page; } @@ -1057,7 +1093,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. + * buddy pages to CMA itself. We also ensure the freepage_migratetype + * is set to CMA so it is returned to the correct freelist in case + * the page ends up being not actually allocated from the pcp lists. */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1090,16 +1128,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; - int current_order; + unsigned int current_order; struct page *page; int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ - for (current_order = MAX_ORDER-1; current_order >= order; - --current_order) { + for (current_order = MAX_ORDER-1; + current_order >= order && current_order <= MAX_ORDER-1; + --current_order) { for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; @@ -1125,6 +1164,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) expand(zone, page, order, current_order, area, new_type); + /* The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages. This is OK as long as it does + * not differ for MIGRATE_CMA type. + */ + set_freepage_migratetype(page, new_type); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype, new_type); @@ -1173,9 +1218,9 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, int cold) + int migratetype, bool cold) { - int mt = migratetype, i; + int i; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -1192,18 +1237,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - if (likely(cold == 0)) + if (likely(!cold)) list_add(&page->lru, list); else list_add_tail(&page->lru, list); - if (IS_ENABLED(CONFIG_CMA)) { - mt = get_pageblock_migratetype(page); - if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) - mt = migratetype; - } - set_freepage_migratetype(page, mt); list = &page->lru; - if (is_migrate_cma(mt)) + if (is_migrate_cma(get_freepage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1224,15 +1263,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; - int to_drain; - unsigned long batch; + int to_drain, batch; local_irq_save(flags); batch = ACCESS_ONCE(pcp->batch); - if (pcp->count >= batch) - to_drain = batch; - else - to_drain = pcp->count; + to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; @@ -1327,7 +1362,7 @@ void mark_free_pages(struct zone *zone) { unsigned long pfn, max_zone_pfn; unsigned long flags; - int order, t; + unsigned int order, t; struct list_head *curr; if (zone_is_empty(zone)) @@ -1359,19 +1394,20 @@ void mark_free_pages(struct zone *zone) /* * Free a 0-order page - * cold == 1 ? free a cold page : free a hot page + * cold == true ? free a cold page : free a hot page */ -void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pages_prepare(page, 0)) return; - migratetype = get_pageblock_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); local_irq_save(flags); __count_vm_event(PGFREE); @@ -1385,17 +1421,17 @@ void free_hot_cold_page(struct page *page, int cold) */ if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, 0, migratetype); + free_one_page(zone, page, pfn, 0, migratetype); goto out; } migratetype = MIGRATE_MOVABLE; } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (cold) - list_add_tail(&page->lru, &pcp->lists[migratetype]); - else + if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); + else + list_add_tail(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = ACCESS_ONCE(pcp->batch); @@ -1410,7 +1446,7 @@ out: /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, int cold) +void free_hot_cold_page_list(struct list_head *list, bool cold) { struct page *page, *next; @@ -1522,12 +1558,12 @@ int split_free_page(struct page *page) */ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags, - int migratetype) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) { unsigned long flags; struct page *page; - int cold = !!(gfp_flags & __GFP_COLD); + bool cold = ((gfp_flags & __GFP_COLD) != 0); again: if (likely(order == 0)) { @@ -1572,10 +1608,13 @@ again: if (!page) goto failed; __mod_zone_freepage_state(zone, -(1 << order), - get_pageblock_migratetype(page)); + get_freepage_migratetype(page)); } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && + !zone_is_fair_depleted(zone)) + zone_set_flag(zone, ZONE_FAIR_DEPLETED); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1672,12 +1711,12 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags, long free_pages) +static bool __zone_watermark_ok(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags, + long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; long free_cma = 0; @@ -1692,7 +1731,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + lowmem_reserve) + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -1707,15 +1746,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } -bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); @@ -1850,18 +1889,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); -} - -static void __paginginit init_zone_allows_reclaim(int nid) -{ - int i; - - for_each_node_state(i, N_MEMORY) - if (node_distance(nid, i) <= RECLAIM_DISTANCE) - node_set(i, NODE_DATA(nid)->reclaim_nodes); - else - zone_reclaim_mode = 1; + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -1895,10 +1924,19 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return true; } -static inline void init_zone_allows_reclaim(int nid) +#endif /* CONFIG_NUMA */ + +static void reset_alloc_batches(struct zone *preferred_zone) { + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + } while (zone++ != preferred_zone); } -#endif /* CONFIG_NUMA */ /* * get_page_from_freelist goes through the zonelist trying to allocate @@ -1907,18 +1945,22 @@ static inline void init_zone_allows_reclaim(int nid) static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone, int migratetype) + struct zone *preferred_zone, int classzone_idx, int migratetype) { struct zoneref *z; struct page *page = NULL; - int classzone_idx; struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ + bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; - classzone_idx = zone_idx(preferred_zone); zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. @@ -1930,12 +1972,10 @@ zonelist_scan: if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - if ((alloc_flags & ALLOC_CPUSET) && + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) - goto try_this_zone; /* * Distribute pages in proportion to the individual * zone size to ensure fair page aging. The zone a @@ -1944,9 +1984,11 @@ zonelist_scan: */ if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) + break; + if (zone_is_fair_depleted(zone)) { + nr_fair_skipped++; continue; - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + } } /* * When allocating a page cache page for writing, we @@ -1974,15 +2016,19 @@ zonelist_scan: * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) - goto this_zone_full; + if (consider_zone_dirty && !zone_dirty_ok(zone)) + continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) { int ret; + /* Checked here to keep the fast path fast */ + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; + if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* @@ -2044,17 +2090,11 @@ try_this_zone: if (page) break; this_zone_full: - if (IS_ENABLED(CONFIG_NUMA)) + if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2063,8 +2103,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2173,12 +2242,12 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; - /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zonelist_oom(zonelist, gfp_mask)) { + /* Acquire the per-zone oom lock for each zone */ + if (!oom_zonelist_trylock(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } @@ -2191,7 +2260,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto out; @@ -2216,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, out_of_memory(zonelist, gfp_mask, order, nodemask, false); out: - clear_zonelist_oom(zonelist, gfp_mask); + oom_zonelist_unlock(zonelist, gfp_mask); return page; } @@ -2226,7 +2295,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, + int classzone_idx, int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2240,7 +2309,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration, + nodemask, mode, contended_compaction); current->flags &= ~PF_MEMALLOC; @@ -2254,7 +2323,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; compaction_defer_reset(preferred_zone, order, true); @@ -2273,7 +2342,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * As async compaction considers a subset of pageblocks, only * defer if the failure was a sync compaction failure. */ - if (sync_migration) + if (mode != MIGRATE_ASYNC) defer_compaction(preferred_zone, order); cond_resched(); @@ -2286,9 +2355,9 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int classzone_idx, int migratetype, + enum migrate_mode mode, bool *contended_compaction, + bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; } @@ -2327,7 +2396,7 @@ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int classzone_idx, int migratetype, unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; @@ -2345,7 +2414,8 @@ retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, + migratetype); /* * If an allocation failed after direct reclaim, it could be because @@ -2368,14 +2438,14 @@ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); @@ -2384,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void reset_alloc_batches(struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - /* - * Only reset the batches of zones that were actually - * considered in the fairness pass, we don't want to - * trash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - } -} - static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2422,7 +2470,7 @@ static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - const gfp_t wait = gfp_mask & __GFP_WAIT; + const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -2431,20 +2479,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); - if (!wait) { + if (atomic) { /* - * Not worth trying to allocate harder for - * __GFP_NOMEMALLOC even if it can't schedule. + * Not worth trying to allocate harder for __GFP_NOMEMALLOC even + * if it can't schedule. */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (!(gfp_mask & __GFP_NOMEMALLOC)) alloc_flags |= ALLOC_HARDER; /* - * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the + * comment for __cpuset_node_allowed_softwall(). */ alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && !in_interrupt()) @@ -2476,14 +2524,14 @@ static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - bool sync_migration = false; + enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; bool contended_compaction = false; @@ -2525,15 +2573,18 @@ restart: * Find the true preferred zone if the allocation is unconstrained by * cpusets. */ - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) - first_zones_zonelist(zonelist, high_zoneidx, NULL, - &preferred_zone); + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { + struct zoneref *preferred_zoneref; + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, + NULL, &preferred_zone); + classzone_idx = zonelist_zone_idx(preferred_zoneref); + } rebalance: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto got_pg; @@ -2548,7 +2599,7 @@ rebalance: page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { goto got_pg; } @@ -2577,17 +2628,15 @@ rebalance: * Try direct compaction. The first pass is asynchronous. Subsequent * attempts after direct reclaim are synchronous */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, + classzone_idx, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) goto got_pg; - sync_migration = true; /* * If compaction is deferred for high-order allocations, it is because @@ -2599,12 +2648,22 @@ rebalance: (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. + */ + if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || + (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + classzone_idx, migratetype, + &did_some_progress); if (page) goto got_pg; @@ -2623,7 +2682,7 @@ rebalance: page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, - migratetype); + classzone_idx, migratetype); if (page) goto got_pg; @@ -2662,12 +2721,11 @@ rebalance: * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, + classzone_idx, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2693,11 +2751,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; + struct zoneref *preferred_zoneref; struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - struct mem_cgroup *memcg = NULL; + int classzone_idx; gfp_mask &= gfp_allowed_mask; @@ -2716,50 +2775,27 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); if (!preferred_zone) goto out; + classzone_idx = zonelist_zone_idx(preferred_zoneref); #ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif -retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* - * The first pass makes sure allocations are spread - * fairly within the local node. However, the local - * node might have free pages left after the fairness - * batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without - * fairness, and include remote zones now, before - * entering the slowpath and waking kswapd: prefer - * spilling to a remote zone over swapping locally. - */ - if (alloc_flags & ALLOC_FAIR) { - reset_alloc_batches(zonelist, high_zoneidx, - preferred_zone); - alloc_flags &= ~ALLOC_FAIR; - goto retry; - } - /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. @@ -2767,7 +2803,7 @@ retry: gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); } trace_mm_page_alloc(page, order, gfp_mask, migratetype); @@ -2782,8 +2818,6 @@ out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2818,7 +2852,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); else __free_pages_ok(page, order); } @@ -2837,27 +2871,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. - * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. */ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. + */ +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } @@ -2911,7 +2969,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * Note this is not alloc_pages_exact_node() which allocates on a specific node, * but is not exact. */ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); @@ -2919,7 +2977,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); } -EXPORT_SYMBOL(alloc_pages_exact_nid); /** * free_pages_exact - release memory allocated via alloc_pages_exact() @@ -3001,7 +3058,7 @@ static inline void show_node(struct zone *zone) void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = 0; + val->sharedram = global_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -3021,6 +3078,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; + val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; @@ -3202,12 +3260,12 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - zone->pages_scanned, + K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); + printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } @@ -3351,7 +3409,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(ctl_table *table, int write, +int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -4095,7 +4153,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, static void __meminit zone_init_free_lists(struct zone *zone) { - int order, t; + unsigned int order, t; for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; @@ -4107,7 +4165,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int __meminit zone_batchsize(struct zone *zone) +static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -4223,8 +4281,8 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) +static void pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { if (percpu_pagelist_fraction) pageset_set_high(pcp, @@ -4349,9 +4407,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - * Architectures may implement their own version but if add_active_range() - * was used and there are no special requirements, this is a convenient - * alternative */ int __meminit __early_pfn_to_nid(unsigned long pfn) { @@ -4406,10 +4461,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling memblock_free_early_nid() - * manually. + * If an architecture guarantees that all ranges registered contain no holes + * and may be freed, this this function may be used instead of calling + * memblock_free_early_nid() manually. */ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { @@ -4431,9 +4485,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * function may be used instead of calling memory_present() manually. + * If an architecture guarantees that all ranges registered contain no holes and may + * be freed, this function may be used instead of calling memory_present() manually. */ void __init sparse_memory_present_with_active_regions(int nid) { @@ -4451,7 +4504,7 @@ void __init sparse_memory_present_with_active_regions(int nid) * @end_pfn: Passed by reference. On return, it will have the node end_pfn. * * It returns the start and end page frame of a node based on information - * provided by an arch calling add_active_range(). If called for a node + * provided by memblock_set_node(). If called for a node * with no available memory, a warning is printed and the start and end * PFNs will be 0. */ @@ -4921,8 +4974,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - if (node_state(nid, N_MEMORY)) - init_zone_allows_reclaim(nid); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #endif @@ -5030,7 +5081,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) * find_min_pfn_with_active_regions - Find the minimum PFN registered * * It returns the minimum PFN based on information provided via - * add_active_range(). + * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) { @@ -5251,7 +5302,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. - * Using the page ranges provided by add_active_range(), the size of each + * Using the page ranges provided by memblock_set_node(), the size of each * zone in each node and their holes is calculated. If the maximum PFN * between two adjacent zones match, it is assumed that the zone is empty. * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed @@ -5535,7 +5586,7 @@ static void calculate_totalreserve_pages(void) for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - unsigned long max = 0; + long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -5774,7 +5825,7 @@ module_init(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5791,7 +5842,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, } #ifdef CONFIG_NUMA -int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5807,7 +5858,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return 0; } -int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5833,7 +5884,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ -int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -5846,27 +5897,42 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. */ -int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; - unsigned int cpu; + int old_percpu_pagelist_fraction; int ret; + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_fraction = percpu_pagelist_fraction; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || (ret < 0)) - return ret; + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_fraction && + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { + percpu_pagelist_fraction = old_percpu_pagelist_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) + goto out; - mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned int cpu; + for_each_possible_cpu(cpu) - pageset_set_high(per_cpu_ptr(zone->pageset, cpu), - high); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); } +out: mutex_unlock(&pcp_batch_high_lock); - return 0; + return ret; } int hashdist = HASHDIST_DEFAULT; @@ -6003,59 +6069,73 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) } /** - * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages * @page: The page within the block of interest - * @start_bitidx: The first bit of interest to retrieve - * @end_bitidx: The last bit of interest - * returns pageblock_bits flags + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags */ -unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx) +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long flags = 0; - unsigned long value = 1; + unsigned long bitidx, word_bitidx; + unsigned long word; zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (test_bit(bitidx + start_bitidx, bitmap)) - flags |= value; - - return flags; + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; } /** - * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest - * @start_bitidx: The first bit of interest - * @end_bitidx: The last bit of interest * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in */ -void set_pageblock_flags_group(struct page *page, unsigned long flags, - int start_bitidx, int end_bitidx) +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long value = 1; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (flags & value) - __set_bit(bitidx + start_bitidx, bitmap); - else - __clear_bit(bitidx + start_bitidx, bitmap); + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = ACCESS_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } } /* @@ -6215,7 +6295,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, cc->mode, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -6254,7 +6334,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; INIT_LIST_HEAD(&cc.migratepages); @@ -6409,7 +6489,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; struct zone *zone; - int order, i; + unsigned int order, i; unsigned long pfn; unsigned long flags; /* find the first valid pfn */ @@ -6461,7 +6541,7 @@ bool is_free_buddy_page(struct page *page) struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); unsigned long flags; - int order; + unsigned int order; spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { |