author     Ingo Molnar <mingo@kernel.org>   2012-08-21 11:27:00 +0200
committer  Ingo Molnar <mingo@kernel.org>   2012-08-21 11:27:00 +0200
commit     bcada3d4b8c96b8792c2306f363992ca5ab9da42
tree       e420679a5db6ea4e1694eef57f9abb6acac8d4d3  /mm/page_alloc.c
parent     26198c21d1b286a084fe5d514a30bc7e6c712a34
parent     000078bc3ee69efb1124b8478c7527389a826074
Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:
* Fix include order for bison/flex-generated C files, from Ben Hutchings.
* Build fixes and documentation corrections, from David Ahern.
* Group parsing support, from Jiri Olsa.
* UI/gtk refactorings and improvements, from Namhyung Kim.
* NULL deref fix for perf script, from Namhyung Kim.
* Assorted cleanups, from Robert Richter.
* Let O= builds handle relative paths, from Steven Rostedt.
* perf script python fixes, from Feng Tang.
* Improve 'perf lock' error message when the needed tracepoints
  are not present, from David Ahern.
* Initial bash completion support, from Frederic Weisbecker.
* Allow building without libelf, from Namhyung Kim.
* Support DWARF CFI based unwind to have callchains when %bp
  based unwinding is not possible, from Jiri Olsa.
* Symbol resolution fixes: while fixing support for PPC64 files with an .opt ELF
  section was the end goal, several fixes and cleanups for code that handles all
  architectures are included, from Cody Schafer.
* Add a description for the JIT interface, from Andi Kleen.
* Assorted fixes for Documentation and for 32-bit builds, from Robert Richter.
* Add support for non-tracepoint events in perf script python, from Feng Tang.
* Cache the libtraceevent event_format associated with each evsel early, so that we
  avoid relookups, i.e. calling pevent_find_event repeatedly when processing
  tracepoint events (a rough sketch of the idea follows below).
[ This is to reduce the surface contact with libtraceevent and make clear what
  it is that the perf tools need from that lib: so far, parsing the common and
  per-event fields. ]
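[ A minimal sketch of that caching idea, for illustration only -- this is not
  code from this merge. It assumes libtraceevent's older pevent_* API from
  event-parse.h; the "sketch_evsel" struct and helper name are invented. ]

	#include "event-parse.h"  /* struct pevent, struct event_format, pevent_find_event() */

	struct sketch_evsel {
		int			id;		/* tracepoint id of the event */
		struct event_format	*tp_format;	/* cached on first lookup */
	};

	/*
	 * Resolve the event_format once and remember it, so that per-sample
	 * processing does not call pevent_find_event() (a search over all
	 * registered events) again and again.
	 */
	static struct event_format *
	sketch_evsel__tp_format(struct sketch_evsel *evsel, struct pevent *pevent)
	{
		if (!evsel->tp_format)
			evsel->tp_format = pevent_find_event(pevent, evsel->id);
		return evsel->tp_format;
	}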
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--   mm/page_alloc.c   318
1 file changed, 171 insertions, 147 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f9219683..009ac285fea 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
-#include <linux/memory.h>
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);

 int page_group_by_mobility_disabled __read_mostly;

-static void set_pageblock_migratetype(struct page *page, int migratetype)
+/*
+ * NOTE:
+ * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
+ * Instead, use {un}set_pageblock_isolate.
+ */
+void set_pageblock_migratetype(struct page *page, int migratetype)
 {

 	if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
 	return pages_moved;
 }

-static int move_freepages_block(struct zone *zone, struct page *page,
+int move_freepages_block(struct zone *zone, struct page *page,
 				int migratetype)
 {
 	unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 		to_drain = pcp->batch;
 	else
 		to_drain = pcp->count;
-	free_pcppages_bulk(zone, to_drain, pcp);
-	pcp->count -= to_drain;
+	if (to_drain > 0) {
+		free_pcppages_bulk(zone, to_drain, pcp);
+		pcp->count -= to_drain;
+	}
 	local_irq_restore(flags);
 }
 #endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
 }
 __setup("fail_page_alloc=", setup_fail_page_alloc);

-static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
 	if (order < fail_page_alloc.min_order)
-		return 0;
+		return false;
 	if (gfp_mask & __GFP_NOFAIL)
-		return 0;
+		return false;
 	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
-		return 0;
+		return false;
 	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
-		return 0;
+		return false;

 	return should_fail(&fail_page_alloc.attr, 1 << order);
 }
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);

 #else /* CONFIG_FAIL_PAGE_ALLOC */

-static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
-	return 0;
+	return false;
 }

 #endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
+	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
 	int o;

 	free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;

-	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+	if (free_pages <= min + lowmem_reserve)
 		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return true;
 }

+#ifdef CONFIG_MEMORY_ISOLATION
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+	if (unlikely(zone->nr_pageblock_isolate))
+		return zone->nr_pageblock_isolate * pageblock_nr_pages;
+	return 0;
+}
+#else
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+	return 0;
+}
+#endif
+
 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
 		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);

+	/*
+	 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
+	 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
+	 * sleep although it could do so. But this is more desirable for memory
+	 * hotplug than sleeping which can cause a livelock in the direct
+	 * reclaim path.
+	 */
+	free_pages -= nr_zone_isolate_freepages(z);
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
 								free_pages);
 }
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,

 		page = get_page_from_freelist(gfp_mask, nodemask,
 				order, zonelist, high_zoneidx,
-				alloc_flags, preferred_zone,
-				migratetype);
+				alloc_flags & ~ALLOC_NO_WATERMARKS,
+				preferred_zone, migratetype);
 		if (page) {
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 retry:
 	page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx,
-					alloc_flags, preferred_zone,
-					migratetype);
+					alloc_flags & ~ALLOC_NO_WATERMARKS,
+					preferred_zone, migratetype);

 	/*
 	 * If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 		alloc_flags |= ALLOC_HARDER;

 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
-		if (!in_interrupt() &&
-		    ((current->flags & PF_MEMALLOC) ||
-		     unlikely(test_thread_flag(TIF_MEMDIE))))
+		if (gfp_mask & __GFP_MEMALLOC)
+			alloc_flags |= ALLOC_NO_WATERMARKS;
+		else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+			alloc_flags |= ALLOC_NO_WATERMARKS;
+		else if (!in_interrupt() &&
+				((current->flags & PF_MEMALLOC) ||
+				 unlikely(test_thread_flag(TIF_MEMDIE))))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}

 	return alloc_flags;
 }

+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2340,11 +2378,27 @@ rebalance:

 	/* Allocate without watermarks if the context allows */
 	if (alloc_flags & ALLOC_NO_WATERMARKS) {
+		/*
+		 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
+		 * the allocation is high priority and these type of
+		 * allocations are system rather than user orientated
+		 */
+		zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
 		page = __alloc_pages_high_priority(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
-		if (page)
+		if (page) {
+			/*
+			 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+			 * necessary to allocate the page. The expectation is
+			 * that the caller is taking steps that will free more
+			 * memory. The caller should avoid the page being used
+			 * for !PFMEMALLOC purposes.
+			 */
+			page->pfmemalloc = true;
 			goto got_pg;
+		}
 	}

 	/* Atomic allocations - we can't balance anything */
@@ -2463,8 +2517,8 @@ nopage:
 got_pg:
 	if (kmemcheck_enabled)
 		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-	return page;

+	return page;
 }

 /*
@@ -2515,6 +2569,8 @@ retry_cpuset:
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
+	else
+		page->pfmemalloc = false;

 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);

@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 			user_zonelist_order = oldval;
 		} else if (oldval != user_zonelist_order) {
 			mutex_lock(&zonelists_mutex);
-			build_all_zonelists(NULL);
+			build_all_zonelists(NULL, NULL);
 			mutex_unlock(&zonelists_mutex);
 		}
 	}
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
 DEFINE_MUTEX(zonelists_mutex);

 /* return values int ....just for stop_machine() */
-static __init_refok int __build_all_zonelists(void *data)
+static int __build_all_zonelists(void *data)
 {
 	int nid;
 	int cpu;
+	pg_data_t *self = data;

 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
 #endif
+
+	if (self && !node_online(self->node_id)) {
+		build_zonelists(self);
+		build_zonelist_cache(self);
+	}
+
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);

@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
  * Called with zonelists_mutex held always
  * unless system_state == SYSTEM_BOOTING.
  */
-void __ref build_all_zonelists(void *data)
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 {
 	set_zonelist_order();

@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
 	} else {
 		/* we have to stop all cpus to guarantee there is no user of zonelist */
 #ifdef CONFIG_MEMORY_HOTPLUG
-		if (data)
-			setup_zone_pageset((struct zone *)data);
+		if (zone)
+			setup_zone_pageset(zone);
 #endif
-		stop_machine(__build_all_zonelists, NULL, NULL);
+		stop_machine(__build_all_zonelists, pgdat, NULL);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif

-static int zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
 {
 #ifdef CONFIG_MMU
 	int batch;
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
 		pcp->batch = PAGE_SHIFT * 8;
 }

-static void setup_zone_pageset(struct zone *zone)
+static void __meminit setup_zone_pageset(struct zone *zone)
 {
 	int cpu;

@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 	return 0;
 }

-static int __zone_pcp_update(void *data)
-{
-	struct zone *zone = data;
-	int cpu;
-	unsigned long batch = zone_batchsize(zone), flags;
-
-	for_each_possible_cpu(cpu) {
-		struct per_cpu_pageset *pset;
-		struct per_cpu_pages *pcp;
-
-		pset = per_cpu_ptr(zone->pageset, cpu);
-		pcp = &pset->pcp;
-
-		local_irq_save(flags);
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		setup_pageset(pset, batch);
-		local_irq_restore(flags);
-	}
-	return 0;
-}
-
-void zone_pcp_update(struct zone *zone)
-{
-	stop_machine(__zone_pcp_update, zone, NULL);
-}
-
 static __meminit void zone_pcp_init(struct zone *zone)
 {
 	/*
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
 					 zone_batchsize(zone));
 }

-__meminit int init_currently_empty_zone(struct zone *zone,
+int __meminit init_currently_empty_zone(struct zone *zone,
 					unsigned long zone_start_pfn,
 					unsigned long size,
 					enum memmap_context context)
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE

 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 	unsigned int order;

@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
 * include/linux/pageblock-flags.h for the values of pageblock_order based on
 * the kernel config
 */
-static inline void set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 }

@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
 * - mark all pages reserved
 * - mark all memory queues empty
 * - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
 */
 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	int ret;

 	pgdat_resize_init(pgdat);
-	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
-	pgdat->kswapd_max_order = 0;
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat_page_cgroup_init(pgdat);

 	for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,

 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+		zone->compact_cached_free_pfn = zone->zone_start_pfn +
+						zone->spanned_pages;
+		zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
+#endif
 #ifdef CONFIG_NUMA
 		zone->node = nid;
 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,

 		zone_pcp_init(zone);
 		lruvec_init(&zone->lruvec, zone);
-		zap_zone_vm_stats(zone);
-		zone->flags = 0;
 		if (!size)
 			continue;

@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 {
 	pg_data_t *pgdat = NODE_DATA(nid);

+	/* pg_data_t should be reset to zero when it's allocated */
+	WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
 	calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4750,7 +4794,7 @@ out:
 }

 /* Any regular memory on that node ? */
-static void check_for_regular_memory(pg_data_t *pgdat)
+static void __init check_for_regular_memory(pg_data_t *pgdat)
 {
 #ifdef CONFIG_HIGHMEM
 	enum zone_type zone_type;
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 }

 /*
- * This is designed as sub function...plz see page_isolation.c also.
- * set/clear page block's type to be ISOLATE.
- * page allocater never alloc memory from ISOLATE block.
+ * This function checks whether pageblock includes unmovable pages or not.
+ * If @count is not zero, it is okay to include less @count unmovable pages
+ *
+ * PageLRU check wihtout isolation or lru_lock could race so that
+ * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
+ * expect this function should be exact.
 */
-
-static int
-__count_immobile_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
 {
 	unsigned long pfn, iter, found;
 	int mt;

 	/*
 	 * For avoiding noise data, lru_add_drain_all() should be called
-	 * If ZONE_MOVABLE, the zone never contains immobile pages
+	 * If ZONE_MOVABLE, the zone never contains unmovable pages
 	 */
 	if (zone_idx(zone) == ZONE_MOVABLE)
-		return true;
+		return false;
 	mt = get_pageblock_migratetype(page);
 	if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
-		return true;
+		return false;

 	pfn = page_to_pfn(page);
 	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 			continue;

 		page = pfn_to_page(check);
-		if (!page_count(page)) {
+		/*
+		 * We can't use page_count without pin a page
+		 * because another CPU can free compound page.
+		 * This check already skips compound tails of THP
+		 * because their page->_count is zero at all time.
+		 */
+		if (!atomic_read(&page->_count)) {
 			if (PageBuddy(page))
 				iter += (1 << page_order(page)) - 1;
 			continue;
 		}
+
 		if (!PageLRU(page))
 			found++;
 		/*
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 		 * page at boot.
 		 */
 		if (found > count)
-			return false;
+			return true;
 	}
-	return true;
+	return false;
 }
 bool is_pageblock_removable_nolock(struct page *page)
 {
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 			zone->zone_start_pfn + zone->spanned_pages <= pfn)
 		return false;

-	return __count_immobile_pages(zone, page, 0);
-}
-
-int set_migratetype_isolate(struct page *page)
-{
-	struct zone *zone;
-	unsigned long flags, pfn;
-	struct memory_isolate_notify arg;
-	int notifier_ret;
-	int ret = -EBUSY;
-
-	zone = page_zone(page);
-
-	spin_lock_irqsave(&zone->lock, flags);
-
-	pfn = page_to_pfn(page);
-	arg.start_pfn = pfn;
-	arg.nr_pages = pageblock_nr_pages;
-	arg.pages_found = 0;
-
-	/*
-	 * It may be possible to isolate a pageblock even if the
-	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
-	 * notifier chain is used by balloon drivers to return the
-	 * number of pages in a range that are held by the balloon
-	 * driver to shrink memory. If all the pages are accounted for
-	 * by balloons, are free, or on the LRU, isolation can continue.
-	 * Later, for example, when memory hotplug notifier runs, these
-	 * pages reported as "can be isolated" should be isolated(freed)
-	 * by the balloon driver through the memory notifier chain.
-	 */
-	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
-	notifier_ret = notifier_to_errno(notifier_ret);
-	if (notifier_ret)
-		goto out;
-	/*
-	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
-	 * We just check MOVABLE pages.
-	 */
-	if (__count_immobile_pages(zone, page, arg.pages_found))
-		ret = 0;
-
-	/*
-	 * immobile means "not-on-lru" paes. If immobile is larger than
-	 * removable-by-driver pages reported by notifier, we'll fail.
-	 */
-
-out:
-	if (!ret) {
-		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-		move_freepages_block(zone, page, MIGRATE_ISOLATE);
-	}
-
-	spin_unlock_irqrestore(&zone->lock, flags);
-	if (!ret)
-		drain_all_pages();
-	return ret;
-}
-
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
-{
-	struct zone *zone;
-	unsigned long flags;
-	zone = page_zone(page);
-	spin_lock_irqsave(&zone->lock, flags);
-	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
-		goto out;
-	set_pageblock_migratetype(page, migratetype);
-	move_freepages_block(zone, page, migratetype);
-out:
-	spin_unlock_irqrestore(&zone->lock, flags);
+	return !has_unmovable_pages(zone, page, 0);
 }

 #ifdef CONFIG_CMA
@@ -5869,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
 }
 #endif

+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __meminit __zone_pcp_update(void *data)
+{
+	struct zone *zone = data;
+	int cpu;
+	unsigned long batch = zone_batchsize(zone), flags;
+
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_pageset *pset;
+		struct per_cpu_pages *pcp;
+
+		pset = per_cpu_ptr(zone->pageset, cpu);
+		pcp = &pset->pcp;
+
+		local_irq_save(flags);
+		if (pcp->count > 0)
+			free_pcppages_bulk(zone, pcp->count, pcp);
+		setup_pageset(pset, batch);
+		local_irq_restore(flags);
+	}
+	return 0;
+}
+
+void __meminit zone_pcp_update(struct zone *zone)
+{
+	stop_machine(__zone_pcp_update, zone, NULL);
+}
+#endif
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
+void zone_pcp_reset(struct zone *zone)
+{
+	unsigned long flags;
+
+	/* avoid races with drain_pages() */
+	local_irq_save(flags);
+	if (zone->pageset != &boot_pageset) {
+		free_percpu(zone->pageset);
+		zone->pageset = &boot_pageset;
+	}
+	local_irq_restore(flags);
+}
+
 /*
  * All pages in the range must be isolated before calling this.
  */