diff options
author | Ingo Molnar <mingo@kernel.org> | 2012-08-21 11:27:00 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2012-08-21 11:27:00 +0200 |
commit | bcada3d4b8c96b8792c2306f363992ca5ab9da42 (patch) | |
tree | e420679a5db6ea4e1694eef57f9abb6acac8d4d3 /mm/vmscan.c | |
parent | 26198c21d1b286a084fe5d514a30bc7e6c712a34 (diff) | |
parent | 000078bc3ee69efb1124b8478c7527389a826074 (diff) |
Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:
* Fix include order for bison/flex-generated C files, from Ben Hutchings
* Build fixes and documentation corrections from David Ahern
* Group parsing support, from Jiri Olsa
* UI/gtk refactorings and improvements from Namhyung Kim
* NULL deref fix for perf script, from Namhyung Kim
* Assorted cleanups from Robert Richter
* Let O= makes handle relative paths, from Steven Rostedt
* perf script python fixes, from Feng Tang.
* Improve 'perf lock' error message when the needed tracepoints
are not present, from David Ahern.
* Initial bash completion support, from Frederic Weisbecker
* Allow building without libelf, from Namhyung Kim.
* Support DWARF CFI based unwind to have callchains when %bp
based unwinding is not possible, from Jiri Olsa.
* Symbol resolution fixes, while fixing support PPC64 files with an .opt ELF
section was the end goal, several fixes for code that handles all
architectures and cleanups are included, from Cody Schafer.
* Add a description for the JIT interface, from Andi Kleen.
* Assorted fixes for Documentation and build in 32 bit, from Robert Richter
* Add support for non-tracepoint events in perf script python, from Feng Tang
* Cache the libtraceevent event_format associated to each evsel early, so that we
avoid relookups, i.e. calling pevent_find_event repeatedly when processing
tracepoint events.
[ This is to reduce the surface contact with libtraceevents and make clear what
is that the perf tools needs from that lib: so far parsing the common and per
event fields. ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 180 |
1 files changed, 165 insertions, 15 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 66e431060c0..8d01243d956 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; @@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, cond_resched(); + mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { enum page_references references; struct address_space *mapping; @@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); if (PageWriteback(page)) { - nr_writeback++; - unlock_page(page); - goto keep; + /* + * memcg doesn't have any dirty pages throttling so we + * could easily OOM just because too many pages are in + * writeback and there is nothing else to reclaim. + * + * Check __GFP_IO, certainly because a loop driver + * thread might enter reclaim, and deadlock if it waits + * on a page for which it is needed to do the write + * (loop masks off __GFP_IO|__GFP_FS for this reason); + * but more thought would probably show more reasons. + * + * Don't require __GFP_FS, since we're not going into + * the FS, just waiting on its writeback completion. + * Worryingly, ext4 gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so + * testing may_enter_fs here is liable to OOM on them. + */ + if (global_reclaim(sc) || + !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { + /* + * This is slightly racy - end_page_writeback() + * might have just cleared PageReclaim, then + * setting PageReclaim here end up interpreted + * as PageReadahead - but that does not matter + * enough to care. What we do want is for this + * page to have PageReclaim set next time memcg + * reclaim reaches the tests above, so it will + * then wait_on_page_writeback() to avoid OOM; + * and it's also appropriate in global reclaim. + */ + SetPageReclaim(page); + nr_writeback++; + goto keep_locked; + } + wait_on_page_writeback(page); } references = page_check_references(page, sc); @@ -921,6 +954,7 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); + mem_cgroup_uncharge_end(); *ret_nr_dirty += nr_dirty; *ret_nr_writeback += nr_writeback; return nr_reclaimed; @@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc) * by looking at the fraction of the pages scanned we did rotate back * onto the active list instead of evict. * - * nr[0] = anon pages to scan; nr[1] = file pages to scan + * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan + * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) @@ -2111,6 +2146,83 @@ out: return 0; } +static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +{ + struct zone *zone; + unsigned long pfmemalloc_reserve = 0; + unsigned long free_pages = 0; + int i; + bool wmark_ok; + + for (i = 0; i <= ZONE_NORMAL; i++) { + zone = &pgdat->node_zones[i]; + pfmemalloc_reserve += min_wmark_pages(zone); + free_pages += zone_page_state(zone, NR_FREE_PAGES); + } + + wmark_ok = free_pages > pfmemalloc_reserve / 2; + + /* kswapd must be awake if processes are being throttled */ + if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { + pgdat->classzone_idx = min(pgdat->classzone_idx, + (enum zone_type)ZONE_NORMAL); + wake_up_interruptible(&pgdat->kswapd_wait); + } + + return wmark_ok; +} + +/* + * Throttle direct reclaimers if backing storage is backed by the network + * and the PFMEMALLOC reserve for the preferred node is getting dangerously + * depleted. kswapd will continue to make progress and wake the processes + * when the low watermark is reached + */ +static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, + nodemask_t *nodemask) +{ + struct zone *zone; + int high_zoneidx = gfp_zone(gfp_mask); + pg_data_t *pgdat; + + /* + * Kernel threads should not be throttled as they may be indirectly + * responsible for cleaning pages necessary for reclaim to make forward + * progress. kjournald for example may enter direct reclaim while + * committing a transaction where throttling it could forcing other + * processes to block on log_wait_commit(). + */ + if (current->flags & PF_KTHREAD) + return; + + /* Check if the pfmemalloc reserves are ok */ + first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + return; + + /* Account for the throttling */ + count_vm_event(PGSCAN_DIRECT_THROTTLE); + + /* + * If the caller cannot enter the filesystem, it's possible that it + * is due to the caller holding an FS lock or performing a journal + * transaction in the case of a filesystem like ext[3|4]. In this case, + * it is not safe to block on pfmemalloc_wait as kswapd could be + * blocked waiting on the same lock. Instead, throttle for up to a + * second before continuing. + */ + if (!(gfp_mask & __GFP_FS)) { + wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, + pfmemalloc_watermark_ok(pgdat), HZ); + return; + } + + /* Throttle until kswapd wakes the process */ + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, + pfmemalloc_watermark_ok(pgdat)); +} + unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { @@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .gfp_mask = sc.gfp_mask, }; + throttle_direct_reclaim(gfp_mask, zonelist, nodemask); + + /* + * Do not enter reclaim if fatal signal is pending. 1 is returned so + * that the page allocator does not consider triggering OOM + */ + if (fatal_signal_pending(current)) + return 1; + trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, gfp_mask); @@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, return nr_reclaimed; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, @@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, return balanced_pages >= (present_pages >> 2); } -/* is kswapd sleeping prematurely? */ -static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, +/* + * Prepare kswapd for sleeping. This verifies that there are no processes + * waiting in throttle_direct_reclaim() and that watermarks have been met. + * + * Returns true if kswapd is ready to sleep + */ +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, int classzone_idx) { int i; @@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) - return true; + return false; + + /* + * There is a potential race between when kswapd checks its watermarks + * and a process gets throttled. There is also a potential race if + * processes get throttled, kswapd wakes, a large process exits therby + * balancing the zones that causes kswapd to miss a wakeup. If kswapd + * is going to sleep, no process should be sleeping on pfmemalloc_wait + * so wake them now if necessary. If necessary, processes will wake + * kswapd and get throttled again + */ + if (waitqueue_active(&pgdat->pfmemalloc_wait)) { + wake_up(&pgdat->pfmemalloc_wait); + return false; + } /* Check the watermark levels */ for (i = 0; i <= classzone_idx; i++) { @@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, * must be balanced */ if (order) - return !pgdat_balanced(pgdat, balanced, classzone_idx); + return pgdat_balanced(pgdat, balanced, classzone_idx); else - return !all_zones_ok; + return all_zones_ok; } /* @@ -2537,7 +2677,7 @@ loop_again: * consider it to be no longer congested. It's * possible there are dirty pages backed by * congested BDIs but as pressure is relieved, - * spectulatively avoid congestion waits + * speculatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); if (i <= *classzone_idx) @@ -2545,6 +2685,16 @@ loop_again: } } + + /* + * If the low watermark is met there is no need for processes + * to be throttled on pfmemalloc_wait as they should not be + * able to safely make forward progress. Wake them + */ + if (waitqueue_active(&pgdat->pfmemalloc_wait) && + pfmemalloc_watermark_ok(pgdat)) + wake_up(&pgdat->pfmemalloc_wait); + if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* @@ -2646,7 +2796,7 @@ out: } /* - * Return the order we were reclaiming at so sleeping_prematurely() + * Return the order we were reclaiming at so prepare_kswapd_sleep() * makes a decision on the order we were last reclaiming at. However, * if another caller entered the allocator slow path while kswapd * was awake, order will remain at the higher level @@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* |