diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 193 |
1 files changed, 133 insertions, 60 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 90c1439549f..9f8a97b9a35 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,7 @@ #include <linux/compaction.h> #include <trace/events/kmem.h> #include <linux/ftrace_event.h> +#include <linux/memcontrol.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -286,7 +287,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - __ClearPageBuddy(page); + reset_page_mapcount(page); /* remove PageBuddy */ return; } @@ -317,7 +318,7 @@ static void bad_page(struct page *page) dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - __ClearPageBuddy(page); + reset_page_mapcount(page); /* remove PageBuddy */ add_taint(TAINT_BAD_PAGE); } @@ -565,7 +566,8 @@ static inline int free_pages_check(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { + (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | + (mem_cgroup_bad_page_check(page)))) { bad_page(page); return 1; } @@ -614,6 +616,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, list = &pcp->lists[migratetype]; } while (list_empty(list)); + /* This is the only non-empty list. Free them all. */ + if (batch_free == MIGRATE_PCPTYPES) + batch_free = to_free; + do { page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ @@ -750,7 +756,8 @@ static inline int check_new_page(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { + (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | + (mem_cgroup_bad_page_check(page)))) { bad_page(page); return 1; } @@ -863,9 +870,8 @@ static int move_freepages(struct zone *zone, } order = page_order(page); - list_del(&page->lru); - list_add(&page->lru, - &zone->free_area[order].free_list[migratetype]); + list_move(&page->lru, + &zone->free_area[order].free_list[migratetype]); page += 1 << order; pages_moved += 1 << order; } @@ -936,7 +942,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * If breaking a large block of pages, move all free * pages to the preferred allocation list. If falling * back for a reclaimable kernel allocation, be more - * agressive about taking ownership of free pages + * aggressive about taking ownership of free pages */ if (unlikely(current_order >= (pageblock_order >> 1)) || start_migratetype == MIGRATE_RECLAIMABLE || @@ -1088,8 +1094,10 @@ static void drain_pages(unsigned int cpu) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } local_irq_restore(flags); } } @@ -1331,7 +1339,7 @@ again: } __count_zone_vm_events(PGALLOC, zone, 1 << order); - zone_statistics(preferred_zone, zone); + zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); VM_BUG_ON(bad_range(zone, page)); @@ -1712,6 +1720,20 @@ try_next_zone: return page; } +/* + * Large machines with many possible nodes should not always dump per-node + * meminfo in irq context. + */ +static inline bool should_suppress_show_mem(void) +{ + bool ret = false; + +#if NODES_SHIFT > 8 + ret = in_interrupt(); +#endif + return ret; +} + static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed) @@ -2034,6 +2056,14 @@ restart: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) + first_zones_zonelist(zonelist, high_zoneidx, NULL, + &preferred_zone); + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2075,7 +2105,7 @@ rebalance: sync_migration); if (page) goto got_pg; - sync_migration = true; + sync_migration = !(gfp_mask & __GFP_NO_KSWAPD); /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, @@ -2147,11 +2177,25 @@ rebalance: nopage: if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { - printk(KERN_WARNING "%s: page allocation failure." - " order:%d, mode:0x%x\n", + unsigned int filter = SHOW_MEM_FILTER_NODES; + + /* + * This documents exceptions given to allocations in certain + * contexts that are allowed to allocate outside current's set + * of allowed nodes. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (test_thread_flag(TIF_MEMDIE) || + (current->flags & (PF_MEMALLOC | PF_EXITING))) + filter &= ~SHOW_MEM_FILTER_NODES; + if (in_interrupt() || !wait) + filter &= ~SHOW_MEM_FILTER_NODES; + + pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n", current->comm, order, gfp_mask); dump_stack(); - show_mem(); + if (!should_suppress_show_mem()) + show_mem(filter); } return page; got_pg: @@ -2192,7 +2236,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, get_mems_allowed(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + first_zones_zonelist(zonelist, high_zoneidx, + nodemask ? : &cpuset_current_mems_allowed, + &preferred_zone); if (!preferred_zone) { put_mems_allowed(); return NULL; @@ -2399,19 +2445,42 @@ void si_meminfo_node(struct sysinfo *val, int nid) } #endif +/* + * Determine whether the zone's node should be displayed or not, depending on + * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas(). + */ +static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) +{ + bool ret = false; + + if (!(flags & SHOW_MEM_FILTER_NODES)) + goto out; + + get_mems_allowed(); + ret = !node_isset(zone->zone_pgdat->node_id, + cpuset_current_mems_allowed); + put_mems_allowed(); +out: + return ret; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. + * Suppresses nodes that are not allowed by current's cpuset if + * SHOW_MEM_FILTER_NODES is passed. */ -void show_free_areas(void) +void __show_free_areas(unsigned int filter) { int cpu; struct zone *zone; for_each_populated_zone(zone) { + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -2453,6 +2522,8 @@ void show_free_areas(void) for_each_populated_zone(zone) { int i; + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s" " free:%lukB" @@ -2520,6 +2591,8 @@ void show_free_areas(void) for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s: ", zone->name); @@ -2539,6 +2612,11 @@ void show_free_areas(void) show_swap_cache_info(); } +void show_free_areas(void) +{ + __show_free_areas(0); +} + static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; @@ -3098,7 +3176,7 @@ static __init_refok int __build_all_zonelists(void *data) * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. */ -void build_all_zonelists(void *data) +void __ref build_all_zonelists(void *data) { set_zonelist_order(); @@ -3687,13 +3765,45 @@ void __init free_bootmem_with_active_regions(int nid, } #ifdef CONFIG_HAVE_MEMBLOCK +/* + * Basic iterator support. Return the last range of PFNs for a node + * Note: nid == MAX_NUMNODES returns last region regardless of node + */ +static int __meminit last_active_region_index_in_nid(int nid) +{ + int i; + + for (i = nr_nodemap_entries - 1; i >= 0; i--) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. Return the previous active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardless of node + */ +static int __meminit previous_active_region_index_in_nid(int index, int nid) +{ + for (index = index - 1; index >= 0; index--) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#define for_each_active_range_index_in_nid_reverse(i, nid) \ + for (i = last_active_region_index_in_nid(nid); i != -1; \ + i = previous_active_region_index_in_nid(i, nid)) + u64 __init find_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { int i; /* Need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { + for_each_active_range_index_in_nid_reverse(i, nid) { u64 addr; u64 ei_start, ei_last; u64 final_start, final_end; @@ -3736,34 +3846,6 @@ int __init add_from_early_node_map(struct range *range, int az, return nr_range; } -#ifdef CONFIG_NO_BOOTMEM -void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - void *ptr; - u64 addr; - - if (limit > memblock.current_limit) - limit = memblock.current_limit; - - addr = find_memory_core_early(nid, size, align, goal, limit); - - if (addr == MEMBLOCK_ERROR) - return NULL; - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; -} -#endif - - void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -3844,7 +3926,7 @@ static void __init find_usable_zone_for_movable(void) /* * The zone ranges provided by the architecture do not include ZONE_MOVABLE - * because it is sized independant of architecture. Unlike the other zones, + * because it is sized independent of architecture. Unlike the other zones, * the starting point for ZONE_MOVABLE is not fixed. It may be different * in each node depending on the size of each node and how evenly kernelcore * is distributed. This helper function adjusts the zone ranges @@ -4797,15 +4879,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { -#ifndef CONFIG_NO_BOOTMEM - .bdata = &bootmem_node_data[0] -#endif - }; -EXPORT_SYMBOL(contig_page_data); -#endif - void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, @@ -5364,10 +5437,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; - if (!pfn_valid_within(check)) { - iter++; + if (!pfn_valid_within(check)) continue; - } + page = pfn_to_page(check); if (!page_count(page)) { if (PageBuddy(page)) @@ -5615,4 +5687,5 @@ void dump_page(struct page *page) page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); + mem_cgroup_print_bad_page(page); } |