path: root/mm/page_alloc.c
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  294
1 file changed, 207 insertions, 87 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a873e61e312..a4e1db3f198 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -30,6 +30,7 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
@@ -39,6 +40,7 @@
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
@@ -53,6 +55,8 @@
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/ftrace_event.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -286,7 +290,7 @@ static void bad_page(struct page *page)
/* Don't complain about poisoned pages */
if (PageHWPoison(page)) {
- __ClearPageBuddy(page);
+ reset_page_mapcount(page); /* remove PageBuddy */
return;
}
@@ -317,7 +321,7 @@ static void bad_page(struct page *page)
dump_stack();
out:
/* Leave bad fields for debug, except PageBuddy could make trouble */
- __ClearPageBuddy(page);
+ reset_page_mapcount(page); /* remove PageBuddy */
add_taint(TAINT_BAD_PAGE);
}
@@ -565,7 +569,8 @@ static inline int free_pages_check(struct page *page)
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
+ (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
+ (mem_cgroup_bad_page_check(page)))) {
bad_page(page);
return 1;
}
@@ -614,6 +619,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
list = &pcp->lists[migratetype];
} while (list_empty(list));
+ /* This is the only non-empty list. Free them all. */
+ if (batch_free == MIGRATE_PCPTYPES)
+ batch_free = to_free;
+
do {
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
@@ -750,7 +759,8 @@ static inline int check_new_page(struct page *page)
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
+ (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
+ (mem_cgroup_bad_page_check(page)))) {
bad_page(page);
return 1;
}
@@ -863,9 +873,8 @@ static int move_freepages(struct zone *zone,
}
order = page_order(page);
- list_del(&page->lru);
- list_add(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+ list_move(&page->lru,
+ &zone->free_area[order].free_list[migratetype]);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -936,7 +945,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
* If breaking a large block of pages, move all free
* pages to the preferred allocation list. If falling
* back for a reclaimable kernel allocation, be more
- * agressive about taking ownership of free pages
+ * aggressive about taking ownership of free pages
*/
if (unlikely(current_order >= (pageblock_order >> 1)) ||
start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1333,7 +1342,7 @@ again:
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(preferred_zone, zone);
+ zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
VM_BUG_ON(bad_range(zone, page));
@@ -1714,6 +1723,59 @@ try_next_zone:
return page;
}
+/*
+ * Large machines with many possible nodes should not always dump per-node
+ * meminfo in irq context.
+ */
+static inline bool should_suppress_show_mem(void)
+{
+ bool ret = false;
+
+#if NODES_SHIFT > 8
+ ret = in_interrupt();
+#endif
+ return ret;
+}
+
+static DEFINE_RATELIMIT_STATE(nopage_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+{
+ va_list args;
+ unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+ return;
+
+ /*
+ * This documents exceptions given to allocations in certain
+ * contexts that are allowed to allocate outside current's set
+ * of allowed nodes.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (test_thread_flag(TIF_MEMDIE) ||
+ (current->flags & (PF_MEMALLOC | PF_EXITING)))
+ filter &= ~SHOW_MEM_FILTER_NODES;
+ if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+ filter &= ~SHOW_MEM_FILTER_NODES;
+
+ if (fmt) {
+ printk(KERN_WARNING);
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+ }
+
+ pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
+ current->comm, order, gfp_mask);
+
+ dump_stack();
+ if (!should_suppress_show_mem())
+ show_mem(filter);
+}
+
static inline int
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
unsigned long pages_reclaimed)
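
The new warn_alloc_failed() above centralises the ratelimited "page allocation failure" report that was previously open-coded at the nopage: label (replaced further down in this diff). Callers that have extra context can pass a printk-style format string in addition to the standard message; a hypothetical caller in another allocation path might look roughly like the sketch below (the message and variable names are illustrative only, not taken from this patch):

	/* Illustrative sketch, not part of the patch */
	if (!area) {
		warn_alloc_failed(gfp_mask, order,
				  "vmalloc: allocation failure: %lu bytes\n",
				  size);
		return NULL;
	}

Passing NULL as the format string, as the nopage: path does, simply skips the extra line and prints only the generic failure message.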
@@ -2044,6 +2106,7 @@ restart:
first_zones_zonelist(zonelist, high_zoneidx, NULL,
&preferred_zone);
+rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2051,7 +2114,6 @@ restart:
if (page)
goto got_pg;
-rebalance:
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2156,13 +2218,7 @@ rebalance:
}
nopage:
- if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
- printk(KERN_WARNING "%s: page allocation failure."
- " order:%d, mode:0x%x\n",
- current->comm, order, gfp_mask);
- dump_stack();
- show_mem();
- }
+ warn_alloc_failed(gfp_mask, order, NULL);
return page;
got_pg:
if (kmemcheck_enabled)
@@ -2191,6 +2247,10 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return NULL;
+#ifndef CONFIG_ZONE_DMA
+ if (WARN_ON_ONCE(gfp_mask & __GFP_DMA))
+ return NULL;
+#endif
/*
* Check the zones suitable for the gfp_mask contain at least one
@@ -2283,6 +2343,21 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
+static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+{
+ if (addr) {
+ unsigned long alloc_end = addr + (PAGE_SIZE << order);
+ unsigned long used = addr + PAGE_ALIGN(size);
+
+ split_page(virt_to_page((void *)addr), order);
+ while (used < alloc_end) {
+ free_page(used);
+ used += PAGE_SIZE;
+ }
+ }
+ return (void *)addr;
+}
+
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
@@ -2302,22 +2377,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
unsigned long addr;
addr = __get_free_pages(gfp_mask, order);
- if (addr) {
- unsigned long alloc_end = addr + (PAGE_SIZE << order);
- unsigned long used = addr + PAGE_ALIGN(size);
-
- split_page(virt_to_page((void *)addr), order);
- while (used < alloc_end) {
- free_page(used);
- used += PAGE_SIZE;
- }
- }
-
- return (void *)addr;
+ return make_alloc_exact(addr, order, size);
}
EXPORT_SYMBOL(alloc_pages_exact);
/**
+ * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
+ * pages on a node.
+ * @nid: the preferred node ID where memory should be allocated
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * Like alloc_pages_exact(), but try to allocate on node nid first before falling
+ * back.
+ * Note this is not alloc_pages_exact_node() which allocates on a specific node,
+ * but is not exact.
+ */
+void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+{
+ unsigned order = get_order(size);
+ struct page *p = alloc_pages_node(nid, gfp_mask, order);
+ if (!p)
+ return NULL;
+ return make_alloc_exact((unsigned long)page_address(p), order, size);
+}
+EXPORT_SYMBOL(alloc_pages_exact_nid);
+
+/**
* free_pages_exact - release memory allocated via alloc_pages_exact()
* @virt: the value returned by alloc_pages_exact.
* @size: size of allocation, same value as passed to alloc_pages_exact().
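
As a usage note, the new alloc_pages_exact_nid() pairs with the existing free_pages_exact() in the same way alloc_pages_exact() does. A hypothetical node-preferred allocation might look like the following sketch (the buffer and size names are illustrative, not from the diff):

	/* Illustrative sketch: exact-sized, node-preferred buffer */
	buf = alloc_pages_exact_nid(nid, nr_entries * sizeof(*buf),
				    GFP_KERNEL | __GFP_ZERO);
	if (!buf)
		return -ENOMEM;
	/* ... use buf ... */
	free_pages_exact(buf, nr_entries * sizeof(*buf));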
@@ -2411,19 +2497,41 @@ void si_meminfo_node(struct sysinfo *val, int nid)
}
#endif
+/*
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
+ */
+bool skip_free_areas_node(unsigned int flags, int nid)
+{
+ bool ret = false;
+
+ if (!(flags & SHOW_MEM_FILTER_NODES))
+ goto out;
+
+ get_mems_allowed();
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
+ put_mems_allowed();
+out:
+ return ret;
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
+ * Suppresses nodes that are not allowed by current's cpuset if
+ * SHOW_MEM_FILTER_NODES is passed.
*/
-void show_free_areas(void)
+void show_free_areas(unsigned int filter)
{
int cpu;
struct zone *zone;
for_each_populated_zone(zone) {
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ continue;
show_node(zone);
printk("%s per-cpu:\n", zone->name);
@@ -2465,6 +2573,8 @@ void show_free_areas(void)
for_each_populated_zone(zone) {
int i;
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ continue;
show_node(zone);
printk("%s"
" free:%lukB"
@@ -2532,6 +2642,8 @@ void show_free_areas(void)
for_each_populated_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ continue;
show_node(zone);
printk("%s: ", zone->name);
@@ -3110,7 +3222,7 @@ static __init_refok int __build_all_zonelists(void *data)
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*/
-void build_all_zonelists(void *data)
+void __ref build_all_zonelists(void *data)
{
set_zonelist_order();
@@ -3221,6 +3333,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
+ * Check if a pageblock contains reserved pages
+ */
+static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
+ return 1;
+ }
+ return 0;
+}
+
+/*
* Mark a number of pageblocks as MIGRATE_RESERVE. The number
* of blocks reserved is based on min_wmark_pages(zone). The memory within
* the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3229,7 +3355,7 @@ static inline unsigned long wait_table_bits(unsigned long size)
*/
static void setup_zone_migrate_reserve(struct zone *zone)
{
- unsigned long start_pfn, pfn, end_pfn;
+ unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
struct page *page;
unsigned long block_migratetype;
int reserve;
@@ -3259,7 +3385,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
continue;
/* Blocks with reserved pages will never free, skip them. */
- if (PageReserved(page))
+ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+ if (pageblock_is_reserved(pfn, block_end_pfn))
continue;
block_migratetype = get_pageblock_migratetype(page);
@@ -3448,7 +3575,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
pcp->batch = PAGE_SHIFT * 8;
}
-static __meminit void setup_zone_pageset(struct zone *zone)
+static void setup_zone_pageset(struct zone *zone)
{
int cpu;
@@ -3498,7 +3625,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, alloc_size);
+ alloc_bootmem_node_nopanic(pgdat, alloc_size);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
@@ -3699,13 +3826,45 @@ void __init free_bootmem_with_active_regions(int nid,
}
#ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Basic iterator support. Return the last range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns last region regardless of node
+ */
+static int __meminit last_active_region_index_in_nid(int nid)
+{
+ int i;
+
+ for (i = nr_nodemap_entries - 1; i >= 0; i--)
+ if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+ return i;
+
+ return -1;
+}
+
+/*
+ * Basic iterator support. Return the previous active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
+ */
+static int __meminit previous_active_region_index_in_nid(int index, int nid)
+{
+ for (index = index - 1; index >= 0; index--)
+ if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+ return index;
+
+ return -1;
+}
+
+#define for_each_active_range_index_in_nid_reverse(i, nid) \
+ for (i = last_active_region_index_in_nid(nid); i != -1; \
+ i = previous_active_region_index_in_nid(i, nid))
+
u64 __init find_memory_core_early(int nid, u64 size, u64 align,
u64 goal, u64 limit)
{
int i;
/* Need to go over early_node_map to find out good range for node */
- for_each_active_range_index_in_nid(i, nid) {
+ for_each_active_range_index_in_nid_reverse(i, nid) {
u64 addr;
u64 ei_start, ei_last;
u64 final_start, final_end;
@@ -3748,34 +3907,6 @@ int __init add_from_early_node_map(struct range *range, int az,
return nr_range;
}
-#ifdef CONFIG_NO_BOOTMEM
-void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
- u64 goal, u64 limit)
-{
- void *ptr;
- u64 addr;
-
- if (limit > memblock.current_limit)
- limit = memblock.current_limit;
-
- addr = find_memory_core_early(nid, size, align, goal, limit);
-
- if (addr == MEMBLOCK_ERROR)
- return NULL;
-
- ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
- memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks.
- */
- kmemleak_alloc(ptr, size, 0, 0);
- return ptr;
-}
-#endif
-
-
void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
{
int i;
@@ -3856,7 +3987,7 @@ static void __init find_usable_zone_for_movable(void)
/*
* The zone ranges provided by the architecture do not include ZONE_MOVABLE
- * because it is sized independant of architecture. Unlike the other zones,
+ * because it is sized independent of architecture. Unlike the other zones,
* the starting point for ZONE_MOVABLE is not fixed. It may be different
* in each node depending on the size of each node and how evenly kernelcore
* is distributed. This helper function adjusts the zone ranges
@@ -4071,7 +4202,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
unsigned long usemapsize = usemap_size(zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
- zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
+ zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
+ usemapsize);
}
#else
static inline void setup_usemap(struct pglist_data *pgdat,
@@ -4191,10 +4323,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- for_each_lru(l) {
+ for_each_lru(l)
INIT_LIST_HEAD(&zone->lru[l].list);
- zone->reclaim_stat.nr_saved_scan[l] = 0;
- }
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4237,7 +4367,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
- map = alloc_bootmem_node(pgdat, size);
+ map = alloc_bootmem_node_nopanic(pgdat, size);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4809,15 +4939,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
dma_reserve = new_dma_reserve;
}
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = {
-#ifndef CONFIG_NO_BOOTMEM
- .bdata = &bootmem_node_data[0]
-#endif
- };
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_node(0, zones_size,
@@ -5011,7 +5132,7 @@ void setup_per_zone_wmarks(void)
* 1TB 101 10GB
* 10TB 320 32GB
*/
-void calculate_zone_inactive_ratio(struct zone *zone)
+static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
{
unsigned int gb, ratio;
@@ -5025,7 +5146,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
zone->inactive_ratio = ratio;
}
-static void __init setup_per_zone_inactive_ratio(void)
+static void __meminit setup_per_zone_inactive_ratio(void)
{
struct zone *zone;
@@ -5057,7 +5178,7 @@ static void __init setup_per_zone_inactive_ratio(void)
* 8192MB: 11584k
* 16384MB: 16384k
*/
-static int __init init_per_zone_wmark_min(void)
+int __meminit init_per_zone_wmark_min(void)
{
unsigned long lowmem_kbytes;
@@ -5069,6 +5190,7 @@ static int __init init_per_zone_wmark_min(void)
if (min_free_kbytes > 65536)
min_free_kbytes = 65536;
setup_per_zone_wmarks();
+ refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
setup_per_zone_inactive_ratio();
return 0;
@@ -5376,10 +5498,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
unsigned long check = pfn + iter;
- if (!pfn_valid_within(check)) {
- iter++;
+ if (!pfn_valid_within(check))
continue;
- }
+
page = pfn_to_page(check);
if (!page_count(page)) {
if (PageBuddy(page))
@@ -5420,10 +5541,8 @@ int set_migratetype_isolate(struct page *page)
struct memory_isolate_notify arg;
int notifier_ret;
int ret = -EBUSY;
- int zone_idx;
zone = page_zone(page);
- zone_idx = zone_idx(zone);
spin_lock_irqsave(&zone->lock, flags);
@@ -5627,4 +5746,5 @@ void dump_page(struct page *page)
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
+ mem_cgroup_print_bad_page(page);
}