Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 344
1 file changed, 235 insertions, 109 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9f8a97b9a35..1dbcf8888f1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -30,6 +30,7 @@
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include <linux/oom.h>
 #include <linux/notifier.h>
 #include <linux/topology.h>
@@ -39,6 +40,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
+#include <linux/vmstat.h>
 #include <linux/mempolicy.h>
 #include <linux/stop_machine.h>
 #include <linux/sort.h>
@@ -54,6 +56,7 @@
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
+#include <linux/prefetch.h>

 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1367,21 +1370,12 @@ failed:

 #ifdef CONFIG_FAIL_PAGE_ALLOC

-static struct fail_page_alloc_attr {
+static struct {
         struct fault_attr attr;

         u32 ignore_gfp_highmem;
         u32 ignore_gfp_wait;
         u32 min_order;
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-        struct dentry *ignore_gfp_highmem_file;
-        struct dentry *ignore_gfp_wait_file;
-        struct dentry *min_order_file;
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
 } fail_page_alloc = {
         .attr = FAULT_ATTR_INITIALIZER,
         .ignore_gfp_wait = 1,
@@ -1421,30 +1415,24 @@ static int __init fail_page_alloc_debugfs(void)
                                "fail_page_alloc");
         if (err)
                 return err;
-        dir = fail_page_alloc.attr.dentries.dir;
-
-        fail_page_alloc.ignore_gfp_wait_file =
-                debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                                      &fail_page_alloc.ignore_gfp_wait);
-
-        fail_page_alloc.ignore_gfp_highmem_file =
-                debugfs_create_bool("ignore-gfp-highmem", mode, dir,
-                                      &fail_page_alloc.ignore_gfp_highmem);
-        fail_page_alloc.min_order_file =
-                debugfs_create_u32("min-order", mode, dir,
-                                   &fail_page_alloc.min_order);
-
-        if (!fail_page_alloc.ignore_gfp_wait_file ||
-            !fail_page_alloc.ignore_gfp_highmem_file ||
-            !fail_page_alloc.min_order_file) {
-                err = -ENOMEM;
-                debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
-                debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
-                debugfs_remove(fail_page_alloc.min_order_file);
-                cleanup_fault_attr_dentries(&fail_page_alloc.attr);
-        }
-        return err;
+        dir = fail_page_alloc.attr.dir;
+
+        if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+                                &fail_page_alloc.ignore_gfp_wait))
+                goto fail;
+        if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+                                &fail_page_alloc.ignore_gfp_highmem))
+                goto fail;
+        if (!debugfs_create_u32("min-order", mode, dir,
+                                &fail_page_alloc.min_order))
+                goto fail;
+
+        return 0;
+fail:
+        cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+
+        return -ENOMEM;
 }

 late_initcall(fail_page_alloc_debugfs);

@@ -1613,6 +1601,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
         set_bit(i, zlc->fullzones);
 }

+/*
+ * clear all zones full, called after direct reclaim makes progress so that
+ * a zone that was recently full is not skipped over for up to a second
+ */
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+        struct zonelist_cache *zlc;    /* cached zonelist speedup info */
+
+        zlc = zonelist->zlcache_ptr;
+        if (!zlc)
+                return;
+
+        bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+}
+
 #else        /* CONFIG_NUMA */

 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1629,6 +1632,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 {
 }
+
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+}
 #endif        /* CONFIG_NUMA */

 /*
@@ -1661,7 +1668,7 @@ zonelist_scan:
                                 continue;
                 if ((alloc_flags & ALLOC_CPUSET) &&
                         !cpuset_zone_allowed_softwall(zone, gfp_mask))
-                                goto try_next_zone;
+                                continue;

                 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1673,17 +1680,36 @@ zonelist_scan:
                                     classzone_idx, alloc_flags))
                                 goto try_this_zone;

+                        if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+                                /*
+                                 * we do zlc_setup if there are multiple nodes
+                                 * and before considering the first zone allowed
+                                 * by the cpuset.
+                                 */
+                                allowednodes = zlc_setup(zonelist, alloc_flags);
+                                zlc_active = 1;
+                                did_zlc_setup = 1;
+                        }
+
                         if (zone_reclaim_mode == 0)
                                 goto this_zone_full;

+                        /*
+                         * As we may have just activated ZLC, check if the first
+                         * eligible zone has failed zone_reclaim recently.
+                         */
+                        if (NUMA_BUILD && zlc_active &&
+                                !zlc_zone_worth_trying(zonelist, z, allowednodes))
+                                continue;
+
                         ret = zone_reclaim(zone, gfp_mask, order);
                         switch (ret) {
                         case ZONE_RECLAIM_NOSCAN:
                                 /* did not scan */
-                                goto try_next_zone;
+                                continue;
                         case ZONE_RECLAIM_FULL:
                                 /* scanned but unreclaimable */
-                                goto this_zone_full;
+                                continue;
                         default:
                                 /* did we reclaim enough */
                                 if (!zone_watermark_ok(zone, order, mark,
@@ -1700,16 +1726,6 @@ try_this_zone:
 this_zone_full:
                 if (NUMA_BUILD)
                         zlc_mark_zone_full(zonelist, z);
-try_next_zone:
-                if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
-                        /*
-                         * we do zlc_setup after the first zone is tried but only
-                         * if there are multiple nodes make it worthwhile
-                         */
-                        allowednodes = zlc_setup(zonelist, alloc_flags);
-                        zlc_active = 1;
-                        did_zlc_setup = 1;
-                }
         }

         if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1734,6 +1750,45 @@ static inline bool should_suppress_show_mem(void)
         return ret;
 }

+static DEFINE_RATELIMIT_STATE(nopage_rs,
+                DEFAULT_RATELIMIT_INTERVAL,
+                DEFAULT_RATELIMIT_BURST);
+
+void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+{
+        va_list args;
+        unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+        if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+                return;
+
+        /*
+         * This documents exceptions given to allocations in certain
+         * contexts that are allowed to allocate outside current's set
+         * of allowed nodes.
+         */
+        if (!(gfp_mask & __GFP_NOMEMALLOC))
+                if (test_thread_flag(TIF_MEMDIE) ||
+                    (current->flags & (PF_MEMALLOC | PF_EXITING)))
+                        filter &= ~SHOW_MEM_FILTER_NODES;
+        if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+                filter &= ~SHOW_MEM_FILTER_NODES;
+
+        if (fmt) {
+                printk(KERN_WARNING);
+                va_start(args, fmt);
+                vprintk(fmt, args);
+                va_end(args);
+        }
+
+        pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
+                   current->comm, order, gfp_mask);
+
+        dump_stack();
+        if (!should_suppress_show_mem())
+                show_mem(filter);
+}
+
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
                                 unsigned long pages_reclaimed)
@@ -1912,6 +1967,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
         if (unlikely(!(*did_some_progress)))
                 return NULL;

+        /* After successful reclaim, reconsider all zones for allocation */
+        if (NUMA_BUILD)
+                zlc_clear_zones_full(zonelist);
+
 retry:
         page = get_page_from_freelist(gfp_mask, nodemask, order,
                                         zonelist, high_zoneidx,
@@ -2064,6 +2123,7 @@ restart:
                 first_zones_zonelist(zonelist, high_zoneidx, NULL,
                                         &preferred_zone);

+rebalance:
         /* This is the last chance, in general, before the goto nopage. */
         page = get_page_from_freelist(gfp_mask, nodemask, order,
                         zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2071,7 +2131,6 @@ restart:
         if (page)
                 goto got_pg;

-rebalance:
         /* Allocate without watermarks if the context allows */
         if (alloc_flags & ALLOC_NO_WATERMARKS) {
                 page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2105,7 +2164,7 @@ rebalance:
                                         sync_migration);
         if (page)
                 goto got_pg;
-        sync_migration = !(gfp_mask & __GFP_NO_KSWAPD);
+        sync_migration = true;

         /* Try direct reclaim and then allocating */
         page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2176,27 +2235,7 @@ rebalance:
         }

 nopage:
-        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
-                unsigned int filter = SHOW_MEM_FILTER_NODES;
-
-                /*
-                 * This documents exceptions given to allocations in certain
-                 * contexts that are allowed to allocate outside current's set
-                 * of allowed nodes.
-                 */
-                if (!(gfp_mask & __GFP_NOMEMALLOC))
-                        if (test_thread_flag(TIF_MEMDIE) ||
-                            (current->flags & (PF_MEMALLOC | PF_EXITING)))
-                                filter &= ~SHOW_MEM_FILTER_NODES;
-                if (in_interrupt() || !wait)
-                        filter &= ~SHOW_MEM_FILTER_NODES;
-
-                pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n",
-                        current->comm, order, gfp_mask);
-                dump_stack();
-                if (!should_suppress_show_mem())
-                        show_mem(filter);
-        }
+        warn_alloc_failed(gfp_mask, order, NULL);
         return page;
 got_pg:
         if (kmemcheck_enabled)
@@ -2317,6 +2356,21 @@ void free_pages(unsigned long addr, unsigned int order)

 EXPORT_SYMBOL(free_pages);

+static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+{
+        if (addr) {
+                unsigned long alloc_end = addr + (PAGE_SIZE << order);
+                unsigned long used = addr + PAGE_ALIGN(size);
+
+                split_page(virt_to_page((void *)addr), order);
+                while (used < alloc_end) {
+                        free_page(used);
+                        used += PAGE_SIZE;
+                }
+        }
+        return (void *)addr;
+}
+
 /**
  * alloc_pages_exact - allocate an exact number physically-contiguous pages.
  * @size: the number of bytes to allocate
@@ -2336,22 +2390,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
         unsigned long addr;

         addr = __get_free_pages(gfp_mask, order);
-        if (addr) {
-                unsigned long alloc_end = addr + (PAGE_SIZE << order);
-                unsigned long used = addr + PAGE_ALIGN(size);
-
-                split_page(virt_to_page((void *)addr), order);
-                while (used < alloc_end) {
-                        free_page(used);
-                        used += PAGE_SIZE;
-                }
-        }
-
-        return (void *)addr;
+        return make_alloc_exact(addr, order, size);
 }
 EXPORT_SYMBOL(alloc_pages_exact);

 /**
+ * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
+ *                         pages on a node.
+ * @nid: the preferred node ID where memory should be allocated
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * Like alloc_pages_exact(), but try to allocate on node nid first before falling
+ * back.
+ * Note this is not alloc_pages_exact_node() which allocates on a specific node,
+ * but is not exact.
+ */
+void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+{
+        unsigned order = get_order(size);
+        struct page *p = alloc_pages_node(nid, gfp_mask, order);
+        if (!p)
+                return NULL;
+        return make_alloc_exact((unsigned long)page_address(p), order, size);
+}
+EXPORT_SYMBOL(alloc_pages_exact_nid);
+
+/**
  * free_pages_exact - release memory allocated via alloc_pages_exact()
  * @virt: the value returned by alloc_pages_exact.
  * @size: size of allocation, same value as passed to alloc_pages_exact().
@@ -2446,10 +2511,10 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 #endif

 /*
- * Determine whether the zone's node should be displayed or not, depending on
- * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas().
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
  */
-static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
+bool skip_free_areas_node(unsigned int flags, int nid)
 {
         bool ret = false;

@@ -2457,8 +2522,7 @@ static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
                 goto out;

         get_mems_allowed();
-        ret = !node_isset(zone->zone_pgdat->node_id,
-                                cpuset_current_mems_allowed);
+        ret = !node_isset(nid, cpuset_current_mems_allowed);
         put_mems_allowed();
 out:
         return ret;
@@ -2473,13 +2537,13 @@ out:
  * Suppresses nodes that are not allowed by current's cpuset if
  * SHOW_MEM_FILTER_NODES is passed.
  */
-void __show_free_areas(unsigned int filter)
+void show_free_areas(unsigned int filter)
 {
         int cpu;
         struct zone *zone;

         for_each_populated_zone(zone) {
-                if (skip_free_areas_zone(filter, zone))
+                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                         continue;
                 show_node(zone);
                 printk("%s per-cpu:\n", zone->name);
@@ -2522,7 +2586,7 @@ void __show_free_areas(unsigned int filter)
         for_each_populated_zone(zone) {
                 int i;

-                if (skip_free_areas_zone(filter, zone))
+                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                         continue;
                 show_node(zone);
                 printk("%s"
@@ -2591,7 +2655,7 @@ void __show_free_areas(unsigned int filter)
         for_each_populated_zone(zone) {
                 unsigned long nr[MAX_ORDER], flags, order, total = 0;

-                if (skip_free_areas_zone(filter, zone))
+                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                         continue;
                 show_node(zone);
                 printk("%s: ", zone->name);
@@ -2612,11 +2676,6 @@ void __show_free_areas(unsigned int filter)
         show_swap_cache_info();
 }

-void show_free_areas(void)
-{
-        __show_free_areas(0);
-}
-
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
         zoneref->zone = zone;
@@ -3287,6 +3346,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

 /*
+ * Check if a pageblock contains reserved pages
+ */
+static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
+{
+        unsigned long pfn;
+
+        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+                if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
+                        return 1;
+        }
+        return 0;
+}
+
+/*
  * Mark a number of pageblocks as MIGRATE_RESERVE. The number
  * of blocks reserved is based on min_wmark_pages(zone). The memory within
  * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3295,7 +3368,7 @@ static inline unsigned long wait_table_bits(unsigned long size)
  */
 static void setup_zone_migrate_reserve(struct zone *zone)
 {
-        unsigned long start_pfn, pfn, end_pfn;
+        unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
         struct page *page;
         unsigned long block_migratetype;
         int reserve;
@@ -3325,7 +3398,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                         continue;

                 /* Blocks with reserved pages will never free, skip them. */
-                if (PageReserved(page))
+                block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+                if (pageblock_is_reserved(pfn, block_end_pfn))
                         continue;

                 block_migratetype = get_pageblock_migratetype(page);
@@ -3514,7 +3588,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
                 pcp->batch = PAGE_SHIFT * 8;
 }

-static __meminit void setup_zone_pageset(struct zone *zone)
+static void setup_zone_pageset(struct zone *zone)
 {
         int cpu;

@@ -3564,7 +3638,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)

         if (!slab_is_available()) {
                 zone->wait_table = (wait_queue_head_t *)
-                        alloc_bootmem_node(pgdat, alloc_size);
+                        alloc_bootmem_node_nopanic(pgdat, alloc_size);
         } else {
                 /*
                  * This case means that a zone whose size was 0 gets new memory
@@ -4141,7 +4215,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
         unsigned long usemapsize = usemap_size(zonesize);
         zone->pageblock_flags = NULL;
         if (usemapsize)
-                zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
+                zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
+                                                                   usemapsize);
 }
 #else
 static inline void setup_usemap(struct pglist_data *pgdat,
@@ -4261,10 +4336,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                 zone->zone_pgdat = pgdat;

                 zone_pcp_init(zone);
-                for_each_lru(l) {
+                for_each_lru(l)
                         INIT_LIST_HEAD(&zone->lru[l].list);
-                        zone->reclaim_stat.nr_saved_scan[l] = 0;
-                }
                 zone->reclaim_stat.recent_rotated[0] = 0;
                 zone->reclaim_stat.recent_rotated[1] = 0;
                 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4307,7 +4380,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                 size = (end - start) * sizeof(struct page);
                 map = alloc_remap(pgdat->node_id, size);
                 if (!map)
-                        map = alloc_bootmem_node(pgdat, size);
+                        map = alloc_bootmem_node_nopanic(pgdat, size);
                 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
         }
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4529,6 +4602,60 @@ void __init sort_node_map(void)
                         cmp_node_active_region, NULL);
 }

+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
+ * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's.  0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+        unsigned long accl_mask = 0, last_end = 0;
+        int last_nid = -1;
+        int i;
+
+        for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+                int nid = early_node_map[i].nid;
+                unsigned long start = early_node_map[i].start_pfn;
+                unsigned long end = early_node_map[i].end_pfn;
+                unsigned long mask;
+
+                if (!start || last_nid < 0 || last_nid == nid) {
+                        last_nid = nid;
+                        last_end = end;
+                        continue;
+                }
+
+                /*
+                 * Start with a mask granular enough to pin-point to the
+                 * start pfn and tick off bits one-by-one until it becomes
+                 * too coarse to separate the current node from the last.
+                 */
+                mask = ~((1 << __ffs(start)) - 1);
+                while (mask && last_end <= (start & (mask << 1)))
+                        mask <<= 1;
+
+                /* accumulate all internode masks */
+                accl_mask |= mask;
+        }
+
+        /* convert mask to number of pages */
+        return ~accl_mask + 1;
+}
+
 /* Find the lowest pfn for a node */
 static unsigned long __init find_min_pfn_for_node(int nid)
 {
@@ -5072,7 +5199,7 @@ void setup_per_zone_wmarks(void)
  *    1TB     101        10GB
  *   10TB     320        32GB
  */
-void calculate_zone_inactive_ratio(struct zone *zone)
+static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
 {
         unsigned int gb, ratio;

@@ -5086,7 +5213,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
         zone->inactive_ratio = ratio;
 }

-static void __init setup_per_zone_inactive_ratio(void)
+static void __meminit setup_per_zone_inactive_ratio(void)
 {
         struct zone *zone;

@@ -5118,7 +5245,7 @@ static void __init setup_per_zone_inactive_ratio(void)
  *  8192MB:     11584k
  * 16384MB:     16384k
  */
-static int __init init_per_zone_wmark_min(void)
+int __meminit init_per_zone_wmark_min(void)
 {
         unsigned long lowmem_kbytes;

@@ -5130,6 +5257,7 @@ static int __init init_per_zone_wmark_min(void)
         if (min_free_kbytes > 65536)
                 min_free_kbytes = 65536;
         setup_per_zone_wmarks();
+        refresh_zone_stat_thresholds();
         setup_per_zone_lowmem_reserve();
         setup_per_zone_inactive_ratio();
         return 0;
@@ -5480,10 +5608,8 @@ int set_migratetype_isolate(struct page *page)
         struct memory_isolate_notify arg;
         int notifier_ret;
         int ret = -EBUSY;
-        int zone_idx;

         zone = page_zone(page);
-        zone_idx = zone_idx(zone);
         spin_lock_irqsave(&zone->lock, flags);