summaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c720
1 files changed, 545 insertions, 175 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6326c71b66..a873e61e312 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
@@ -49,6 +50,7 @@
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/memory.h>
+#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/ftrace_event.h>
@@ -56,6 +58,22 @@
#include <asm/div64.h>
#include "internal.h"
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
+ * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
+ * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
+ * defined in <linux/topology.h>.
+ */
+DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
+EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+#endif
+
/*
* Array of node states.
*/
@@ -86,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
* only be modified with pm_mutex held, unless the suspend/hibernate code is
* guaranteed not to run in parallel with that modification).
*/
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&pm_mutex));
- gfp_allowed_mask = mask;
+ if (saved_gfp_mask) {
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+ }
}
-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
{
- gfp_t ret = gfp_allowed_mask;
-
WARN_ON(!mutex_is_locked(&pm_mutex));
- gfp_allowed_mask &= ~mask;
- return ret;
+ WARN_ON(saved_gfp_mask);
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~GFP_IOFS;
}
#endif /* CONFIG_PM_SLEEP */
@@ -334,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
+/* update __split_huge_page_refcount if you change this function */
static int destroy_compound_page(struct page *page, unsigned long order)
{
int i;
@@ -403,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
*
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
- unsigned long buddy_idx = page_idx ^ (1 << order);
-
- return page + (buddy_idx - page_idx);
-}
-
static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
{
- return (page_idx & ~(1 << order));
+ return page_idx ^ (1 << order);
}
/*
@@ -425,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
@@ -459,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
* order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
@@ -475,6 +491,9 @@ static inline void __free_one_page(struct page *page,
int migratetype)
{
unsigned long page_idx;
+ unsigned long combined_idx;
+ unsigned long uninitialized_var(buddy_idx);
+ struct page *buddy;
if (unlikely(PageCompound(page)))
if (unlikely(destroy_compound_page(page, order)))
@@ -488,10 +507,8 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON(bad_range(zone, page));
while (order < MAX_ORDER-1) {
- unsigned long combined_idx;
- struct page *buddy;
-
- buddy = __page_find_buddy(page, page_idx, order);
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
break;
@@ -499,14 +516,36 @@ static inline void __free_one_page(struct page *page,
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
- combined_idx = __find_combined_index(page_idx, order);
+ combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
}
set_page_order(page, order);
- list_add(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+
+ /*
+ * If this is not the largest possible page, check if the buddy
+ * of the next-highest order is free. If it is, it's possible
+ * that pages are being freed that will coalesce soon. In case,
+ * that is happening, add the free page to the tail of the list
+ * so it's less likely to be used soon and more likely to be merged
+ * as a higher order page
+ */
+ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+ struct page *higher_page, *higher_buddy;
+ combined_idx = buddy_idx & page_idx;
+ higher_page = page + (combined_idx - page_idx);
+ buddy_idx = __find_buddy_index(combined_idx, order + 1);
+ higher_buddy = page + (buddy_idx - combined_idx);
+ if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+ list_add_tail(&page->lru,
+ &zone->free_area[order].free_list[migratetype]);
+ goto out;
+ }
+ }
+
+ list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
zone->free_area[order].nr_free++;
}
@@ -551,13 +590,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
+ int to_free = count;
spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __mod_zone_page_state(zone, NR_FREE_PAGES, count);
- while (count) {
+ while (to_free) {
struct page *page;
struct list_head *list;
@@ -582,8 +621,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, zone, 0, page_private(page));
trace_mm_page_pcpu_drain(page, 0, page_private(page));
- } while (--count && --batch_free && !list_empty(list));
+ } while (--to_free && --batch_free && !list_empty(list));
}
+ __mod_zone_page_state(zone, NR_FREE_PAGES, count);
spin_unlock(&zone->lock);
}
@@ -594,25 +634,25 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
__free_one_page(page, zone, order, migratetype);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
spin_unlock(&zone->lock);
}
-static void __free_pages_ok(struct page *page, unsigned int order)
+static bool free_pages_prepare(struct page *page, unsigned int order)
{
- unsigned long flags;
int i;
int bad = 0;
- int wasMlocked = __TestClearPageMlocked(page);
trace_mm_page_free_direct(page, order);
kmemcheck_free_shadow(page, order);
- for (i = 0 ; i < (1 << order) ; ++i)
+ if (PageAnon(page))
+ page->mapping = NULL;
+ for (i = 0; i < (1 << order); i++)
bad += free_pages_check(page + i);
if (bad)
- return;
+ return false;
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -622,6 +662,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
+ return true;
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+ unsigned long flags;
+ int wasMlocked = __TestClearPageMlocked(page);
+
+ if (!free_pages_prepare(page, order))
+ return;
+
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
@@ -1037,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- free_pcppages_bulk(zone, pcp->count, pcp);
- pcp->count = 0;
+ if (pcp->count) {
+ free_pcppages_bulk(zone, pcp->count, pcp);
+ pcp->count = 0;
+ }
local_irq_restore(flags);
}
}
@@ -1107,21 +1160,9 @@ void free_hot_cold_page(struct page *page, int cold)
int migratetype;
int wasMlocked = __TestClearPageMlocked(page);
- trace_mm_page_free_direct(page, 0);
- kmemcheck_free_shadow(page, 0);
-
- if (PageAnon(page))
- page->mapping = NULL;
- if (free_pages_check(page))
+ if (!free_pages_prepare(page, 0))
return;
- if (!PageHighMem(page)) {
- debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
- debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
- }
- arch_free_page(page, 0);
- kernel_map_pages(page, 1, 0);
-
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
local_irq_save(flags);
@@ -1188,6 +1229,51 @@ void split_page(struct page *page, unsigned int order)
}
/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+ unsigned int order;
+ unsigned long watermark;
+ struct zone *zone;
+
+ BUG_ON(!PageBuddy(page));
+
+ zone = page_zone(page);
+ order = page_order(page);
+
+ /* Obey watermarks as if the page was being allocated */
+ watermark = low_wmark_pages(zone) + (1 << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return 0;
+
+ /* Remove page from free list */
+ list_del(&page->lru);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(page);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+
+ /* Split into individual pages */
+ set_page_refcounted(page);
+ split_page(page, order);
+
+ if (order >= pageblock_order - 1) {
+ struct page *endpage = page + (1 << order) - 1;
+ for (; page < endpage; page += pageblock_nr_pages)
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ }
+
+ return 1 << order;
+}
+
+/*
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
@@ -1369,24 +1455,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
- long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
int o;
+ free_pages -= (1 << order) + 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -1395,9 +1481,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
}
- return 1;
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
}
#ifdef CONFIG_NUMA
@@ -1654,7 +1759,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct page *page;
/* Acquire the OOM killer lock for the zones in zonelist */
- if (!try_set_zone_oom(zonelist, gfp_mask)) {
+ if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
schedule_timeout_uninterruptible(1);
return NULL;
}
@@ -1675,6 +1780,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
+ /* The OOM killer does not needlessly kill tasks for lowmem */
+ if (high_zoneidx < ZONE_NORMAL)
+ goto out;
/*
* GFP_THISNODE contains __GFP_NORETRY and we never hit this.
* Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -1693,6 +1801,66 @@ out:
return page;
}
+#ifdef CONFIG_COMPACTION
+/* Try memory compaction for high-order allocations before reclaim */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ int migratetype, unsigned long *did_some_progress,
+ bool sync_migration)
+{
+ struct page *page;
+
+ if (!order || compaction_deferred(preferred_zone))
+ return NULL;
+
+ current->flags |= PF_MEMALLOC;
+ *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+ nodemask, sync_migration);
+ current->flags &= ~PF_MEMALLOC;
+ if (*did_some_progress != COMPACT_SKIPPED) {
+
+ /* Page migration frees to the PCP lists but we want merging */
+ drain_pages(get_cpu());
+ put_cpu();
+
+ page = get_page_from_freelist(gfp_mask, nodemask,
+ order, zonelist, high_zoneidx,
+ alloc_flags, preferred_zone,
+ migratetype);
+ if (page) {
+ preferred_zone->compact_considered = 0;
+ preferred_zone->compact_defer_shift = 0;
+ count_vm_event(COMPACTSUCCESS);
+ return page;
+ }
+
+ /*
+ * It's bad if compaction run occurs and fails.
+ * The most likely reason is that pages exist,
+ * but not enough to satisfy watermarks.
+ */
+ count_vm_event(COMPACTFAIL);
+ defer_compaction(preferred_zone);
+
+ cond_resched();
+ }
+
+ return NULL;
+}
+#else
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ int migratetype, unsigned long *did_some_progress,
+ bool sync_migration)
+{
+ return NULL;
+}
+#endif /* CONFIG_COMPACTION */
+
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1702,33 +1870,44 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
{
struct page *page = NULL;
struct reclaim_state reclaim_state;
- struct task_struct *p = current;
+ bool drained = false;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- p->flags |= PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ current->reclaim_state = &reclaim_state;
*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
- p->reclaim_state = NULL;
+ current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
- p->flags &= ~PF_MEMALLOC;
+ current->flags &= ~PF_MEMALLOC;
cond_resched();
- if (order != 0)
- drain_all_pages();
+ if (unlikely(!(*did_some_progress)))
+ return NULL;
- if (likely(*did_some_progress))
- page = get_page_from_freelist(gfp_mask, nodemask, order,
+retry:
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags, preferred_zone,
migratetype);
+
+ /*
+ * If an allocation failed after direct reclaim, it could be because
+ * pages are pinned on the per-cpu lists. Drain them and try again
+ */
+ if (!page && !drained) {
+ drain_all_pages();
+ drained = true;
+ goto retry;
+ }
+
return page;
}
@@ -1750,7 +1929,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
preferred_zone, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
} while (!page && (gfp_mask & __GFP_NOFAIL));
return page;
@@ -1758,24 +1937,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
static inline
void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
- enum zone_type high_zoneidx)
+ enum zone_type high_zoneidx,
+ enum zone_type classzone_idx)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
- wakeup_kswapd(zone, order);
+ wakeup_kswapd(zone, order, classzone_idx);
}
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
- struct task_struct *p = current;
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
const gfp_t wait = gfp_mask & __GFP_WAIT;
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
- BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
+ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
/*
* The caller may dip into page reserves a bit more if the caller
@@ -1783,21 +1962,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
*/
- alloc_flags |= (gfp_mask & __GFP_HIGH);
+ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
if (!wait) {
- alloc_flags |= ALLOC_HARDER;
+ /*
+ * Not worth trying to allocate harder for
+ * __GFP_NOMEMALLOC even if it can't schedule.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(p)) && !in_interrupt())
+ } else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
if (!in_interrupt() &&
- ((p->flags & PF_MEMALLOC) ||
+ ((current->flags & PF_MEMALLOC) ||
unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
@@ -1816,7 +2000,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
- struct task_struct *p = current;
+ bool sync_migration = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -1841,7 +2025,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto nopage;
restart:
- wake_all_kswapd(order, zonelist, high_zoneidx);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapd(order, zonelist, high_zoneidx,
+ zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -1850,6 +2036,14 @@ restart:
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ /*
+ * Find the true preferred zone if the allocation is unconstrained by
+ * cpusets.
+ */
+ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+ first_zones_zonelist(zonelist, high_zoneidx, NULL,
+ &preferred_zone);
+
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -1872,13 +2066,27 @@ rebalance:
goto nopage;
/* Avoid recursion of direct reclaim */
- if (p->flags & PF_MEMALLOC)
+ if (current->flags & PF_MEMALLOC)
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
+ /*
+ * Try direct compaction. The first pass is asynchronous. Subsequent
+ * attempts after direct reclaim are synchronous
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, &did_some_progress,
+ sync_migration);
+ if (page)
+ goto got_pg;
+ sync_migration = true;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
@@ -1903,15 +2111,23 @@ rebalance:
if (page)
goto got_pg;
- /*
- * The OOM killer does not trigger for high-order
- * ~__GFP_NOFAIL allocations so if no progress is being
- * made, there are no other options and retrying is
- * unlikely to help.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER &&
- !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ /*
+ * The oom killer is not called for high-order
+ * allocations that may fail, so if no progress
+ * is being made, there are no other options and
+ * retrying is unlikely to help.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto nopage;
+ /*
+ * The oom killer is not called for lowmem
+ * allocations to prevent needlessly killing
+ * innocent tasks.
+ */
+ if (high_zoneidx < ZONE_NORMAL)
+ goto nopage;
+ }
goto restart;
}
@@ -1921,15 +2137,29 @@ rebalance:
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
+ } else {
+ /*
+ * High-order allocations do not necessarily loop after
+ * direct reclaim and reclaim/compaction depends on compaction
+ * being called after reclaim so call directly if necessary
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, &did_some_progress,
+ sync_migration);
+ if (page)
+ goto got_pg;
}
nopage:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
printk(KERN_WARNING "%s: page allocation failure."
" order:%d, mode:0x%x\n",
- p->comm, order, gfp_mask);
+ current->comm, order, gfp_mask);
dump_stack();
show_mem();
}
@@ -1970,10 +2200,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
+ get_mems_allowed();
/* The preferred zone is used for statistics later */
- first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
- if (!preferred_zone)
+ first_zones_zonelist(zonelist, high_zoneidx,
+ nodemask ? : &cpuset_current_mems_allowed,
+ &preferred_zone);
+ if (!preferred_zone) {
+ put_mems_allowed();
return NULL;
+ }
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1983,6 +2218,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
+ put_mems_allowed();
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
return page;
@@ -2402,9 +2638,16 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- if (s)
- return __parse_numa_zonelist_order(s);
- return 0;
+ int ret;
+
+ if (!s)
+ return 0;
+
+ ret = __parse_numa_zonelist_order(s);
+ if (ret == 0)
+ strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+ return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
@@ -2434,8 +2677,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
strncpy((char*)table->data, saved_string,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
- } else if (oldval != user_zonelist_order)
- build_all_zonelists();
+ } else if (oldval != user_zonelist_order) {
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL);
+ mutex_unlock(&zonelists_mutex);
+ }
}
out:
mutex_unlock(&zl_order_mutex);
@@ -2582,7 +2828,7 @@ static int default_zonelist_order(void)
* ZONE_DMA and ZONE_DMA32 can be very small area in the system.
* If they are really small and used heavily, the system can fall
* into OOM very easily.
- * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+ * This function detect ZONE_DMA/DMA32 size and configures zone order.
*/
/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
low_kmem_size = 0;
@@ -2594,6 +2840,15 @@ static int default_zonelist_order(void)
if (zone_type < ZONE_NORMAL)
low_kmem_size += z->present_pages;
total_size += z->present_pages;
+ } else if (zone_type == ZONE_NORMAL) {
+ /*
+ * If any node has only lowmem, then node order
+ * is preferred to allow kernel allocations
+ * locally; otherwise, they can easily infringe
+ * on other nodes when there is an abundance of
+ * lowmem available to allocate from.
+ */
+ return ZONELIST_ORDER_NODE;
}
}
}
@@ -2707,6 +2962,24 @@ static void build_zonelist_cache(pg_data_t *pgdat)
zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
}
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * Return node id of node used for "local" allocations.
+ * I.e., first node id of first zone in arg node's generic zonelist.
+ * Used for initializing percpu 'numa_mem', which is used primarily
+ * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
+ */
+int local_memory_node(int node)
+{
+ struct zone *zone;
+
+ (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+ gfp_zone(GFP_KERNEL),
+ NULL,
+ &zone);
+ return zone->node;
+}
+#endif
#else /* CONFIG_NUMA */
@@ -2776,9 +3049,16 @@ static void build_zonelist_cache(pg_data_t *pgdat)
*/
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *dummy)
+static __init_refok int __build_all_zonelists(void *data)
{
int nid;
int cpu;
@@ -2806,13 +3086,31 @@ static int __build_all_zonelists(void *dummy)
* needs the percpu allocator in order to allocate its pagesets
* (a chicken-egg dilemma).
*/
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+ /*
+ * We now know the "local memory node" for each node--
+ * i.e., the node of the first zone in the generic zonelist.
+ * Set up numa_mem percpu variable for on-line cpus. During
+ * boot, only the boot cpu should be on-line; we'll init the
+ * secondary cpus' numa_mem as they come on-line. During
+ * node/memory hotplug, we'll fixup all on-line cpus.
+ */
+ if (cpu_online(cpu))
+ set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+#endif
+ }
+
return 0;
}
-void build_all_zonelists(void)
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
+void build_all_zonelists(void *data)
{
set_zonelist_order();
@@ -2823,6 +3121,10 @@ void build_all_zonelists(void)
} else {
/* we have to stop all cpus to guarantee there is no user
of zonelist */
+#ifdef CONFIG_MEMORY_HOTPLUG
+ if (data)
+ setup_zone_pageset((struct zone *)data);
+#endif
stop_machine(__build_all_zonelists, NULL, NULL);
/* cpuset refresh routine should be here */
}
@@ -3146,31 +3448,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
pcp->batch = PAGE_SHIFT * 8;
}
+static __meminit void setup_zone_pageset(struct zone *zone)
+{
+ int cpu;
+
+ zone->pageset = alloc_percpu(struct per_cpu_pageset);
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+ setup_pageset(pcp, zone_batchsize(zone));
+
+ if (percpu_pagelist_fraction)
+ setup_pagelist_highmark(pcp,
+ (zone->present_pages /
+ percpu_pagelist_fraction));
+ }
+}
+
/*
* Allocate per cpu pagesets and initialize them.
* Before this call only boot pagesets were available.
- * Boot pagesets will no longer be used by this processorr
- * after setup_per_cpu_pageset().
*/
void __init setup_per_cpu_pageset(void)
{
struct zone *zone;
- int cpu;
-
- for_each_populated_zone(zone) {
- zone->pageset = alloc_percpu(struct per_cpu_pageset);
- for_each_possible_cpu(cpu) {
- struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
-
- setup_pageset(pcp, zone_batchsize(zone));
-
- if (percpu_pagelist_fraction)
- setup_pagelist_highmark(pcp,
- (zone->present_pages /
- percpu_pagelist_fraction));
- }
- }
+ for_each_populated_zone(zone)
+ setup_zone_pageset(zone);
}
static noinline __init_refok
@@ -3393,6 +3698,41 @@ void __init free_bootmem_with_active_regions(int nid,
}
}
+#ifdef CONFIG_HAVE_MEMBLOCK
+u64 __init find_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{
+ int i;
+
+ /* Need to go over early_node_map to find out good range for node */
+ for_each_active_range_index_in_nid(i, nid) {
+ u64 addr;
+ u64 ei_start, ei_last;
+ u64 final_start, final_end;
+
+ ei_last = early_node_map[i].end_pfn;
+ ei_last <<= PAGE_SHIFT;
+ ei_start = early_node_map[i].start_pfn;
+ ei_start <<= PAGE_SHIFT;
+
+ final_start = max(ei_start, goal);
+ final_end = min(ei_last, limit);
+
+ if (final_start >= final_end)
+ continue;
+
+ addr = memblock_find_in_range(final_start, final_end, size, align);
+
+ if (addr == MEMBLOCK_ERROR)
+ continue;
+
+ return addr;
+ }
+
+ return MEMBLOCK_ERROR;
+}
+#endif
+
int __init add_from_early_node_map(struct range *range, int az,
int nr_range, int nid)
{
@@ -3412,38 +3752,26 @@ int __init add_from_early_node_map(struct range *range, int az,
void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
u64 goal, u64 limit)
{
- int i;
void *ptr;
+ u64 addr;
- /* need to go over early_node_map to find out good range for node */
- for_each_active_range_index_in_nid(i, nid) {
- u64 addr;
- u64 ei_start, ei_last;
+ if (limit > memblock.current_limit)
+ limit = memblock.current_limit;
- ei_last = early_node_map[i].end_pfn;
- ei_last <<= PAGE_SHIFT;
- ei_start = early_node_map[i].start_pfn;
- ei_start <<= PAGE_SHIFT;
- addr = find_early_area(ei_start, ei_last,
- goal, limit, size, align);
-
- if (addr == -1ULL)
- continue;
+ addr = find_memory_core_early(nid, size, align, goal, limit);
-#if 0
- printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
- nid,
- ei_start, ei_last, goal, limit, size,
- align, addr);
-#endif
-
- ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
- reserve_early_without_check(addr, addr + size, "BOOTMEM");
- return ptr;
- }
+ if (addr == MEMBLOCK_ERROR)
+ return NULL;
- return NULL;
+ ptr = phys_to_virt(addr);
+ memset(ptr, 0, size);
+ memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+ return ptr;
}
#endif
@@ -3746,7 +4074,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
}
#else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
struct zone *zone, unsigned long zonesize) {}
#endif /* CONFIG_SPARSEMEM */
@@ -3862,8 +4190,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
- zone->prev_priority = DEF_PRIORITY;
-
zone_pcp_init(zone);
for_each_lru(l) {
INIT_LIST_HEAD(&zone->lru[l].list);
@@ -4933,9 +5259,9 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
+ printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
tablename,
- (1U << log2qty),
+ (1UL << log2qty),
ilog2(size) - PAGE_SHIFT,
size);
@@ -5032,12 +5358,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
* page allocater never alloc memory from ISOLATE block.
*/
+static int
+__count_immobile_pages(struct zone *zone, struct page *page, int count)
+{
+ unsigned long pfn, iter, found;
+ /*
+ * For avoiding noise data, lru_add_drain_all() should be called
+ * If ZONE_MOVABLE, the zone never contains immobile pages
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ return true;
+
+ if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+ return true;
+
+ pfn = page_to_pfn(page);
+ for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+ unsigned long check = pfn + iter;
+
+ if (!pfn_valid_within(check)) {
+ iter++;
+ continue;
+ }
+ page = pfn_to_page(check);
+ if (!page_count(page)) {
+ if (PageBuddy(page))
+ iter += (1 << page_order(page)) - 1;
+ continue;
+ }
+ if (!PageLRU(page))
+ found++;
+ /*
+ * If there are RECLAIMABLE pages, we need to check it.
+ * But now, memory offline itself doesn't call shrink_slab()
+ * and it still to be fixed.
+ */
+ /*
+ * If the page is not RAM, page_count()should be 0.
+ * we don't need more check. This is an _used_ not-movable page.
+ *
+ * The problematic thing here is PG_reserved pages. PG_reserved
+ * is set to both of a memory hole page and a _used_ kernel
+ * page at boot.
+ */
+ if (found > count)
+ return false;
+ }
+ return true;
+}
+
+bool is_pageblock_removable_nolock(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ return __count_immobile_pages(zone, page, 0);
+}
+
int set_migratetype_isolate(struct page *page)
{
struct zone *zone;
- struct page *curr_page;
- unsigned long flags, pfn, iter;
- unsigned long immobile = 0;
+ unsigned long flags, pfn;
struct memory_isolate_notify arg;
int notifier_ret;
int ret = -EBUSY;
@@ -5047,11 +5426,6 @@ int set_migratetype_isolate(struct page *page)
zone_idx = zone_idx(zone);
spin_lock_irqsave(&zone->lock, flags);
- if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
- zone_idx == ZONE_MOVABLE) {
- ret = 0;
- goto out;
- }
pfn = page_to_pfn(page);
arg.start_pfn = pfn;
@@ -5071,23 +5445,20 @@ int set_migratetype_isolate(struct page *page)
*/
notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
notifier_ret = notifier_to_errno(notifier_ret);
- if (notifier_ret || !arg.pages_found)
+ if (notifier_ret)
goto out;
-
- for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
- if (!pfn_valid_within(pfn))
- continue;
-
- curr_page = pfn_to_page(iter);
- if (!page_count(curr_page) || PageLRU(curr_page))
- continue;
-
- immobile++;
- }
-
- if (arg.pages_found == immobile)
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+ * We just check MOVABLE pages.
+ */
+ if (__count_immobile_pages(zone, page, arg.pages_found))
ret = 0;
+ /*
+ * immobile means "not-on-lru" paes. If immobile is larger than
+ * removable-by-driver pages reported by notifier, we'll fail.
+ */
+
out:
if (!ret) {
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
@@ -5206,7 +5577,6 @@ static struct trace_print_flags pageflag_names[] = {
{1UL << PG_swapcache, "swapcache" },
{1UL << PG_mappedtodisk, "mappedtodisk" },
{1UL << PG_reclaim, "reclaim" },
- {1UL << PG_buddy, "buddy" },
{1UL << PG_swapbacked, "swapbacked" },
{1UL << PG_unevictable, "unevictable" },
#ifdef CONFIG_MMU
@@ -5254,7 +5624,7 @@ void dump_page(struct page *page)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
- page, page_count(page), page_mapcount(page),
+ page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
}