diff options
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 587 |
1 files changed, 327 insertions, 260 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 915dceb487c..c391c320dba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -48,6 +48,9 @@ #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/vmscan.h> + struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -213,8 +216,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; - unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); + unsigned long max_pass; + max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); @@ -242,8 +246,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, int shrink_ret; int nr_before; - nr_before = (*shrinker->shrink)(0, gfp_mask); - shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); + nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); + shrink_ret = (*shrinker->shrink)(shrinker, this_scan, + gfp_mask); if (shrink_ret == -1) break; if (shrink_ret < nr_before) @@ -296,7 +301,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi) static void handle_write_error(struct address_space *mapping, struct page *page, int error) { - lock_page(page); + lock_page_nosync(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); @@ -396,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } + trace_mm_vmscan_writepage(page, + trace_reclaim_flags(page, sync_writeback)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -615,6 +622,24 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; } +static noinline_for_stack void free_page_list(struct list_head *free_pages) +{ + struct pagevec freed_pvec; + struct page *page, *tmp; + + pagevec_init(&freed_pvec, 1); + + list_for_each_entry_safe(page, tmp, free_pages, lru) { + list_del(&page->lru); + if (!pagevec_add(&freed_pvec, page)) { + __pagevec_free(&freed_pvec); + pagevec_reinit(&freed_pvec); + } + } + + pagevec_free(&freed_pvec); +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -623,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, enum pageout_io sync_writeback) { LIST_HEAD(ret_pages); - struct pagevec freed_pvec; + LIST_HEAD(free_pages); int pgactivate = 0; unsigned long nr_reclaimed = 0; cond_resched(); - pagevec_init(&freed_pvec, 1); while (!list_empty(page_list)) { enum page_references references; struct address_space *mapping; @@ -804,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, __clear_page_locked(page); free_it: nr_reclaimed++; - if (!pagevec_add(&freed_pvec, page)) { - __pagevec_free(&freed_pvec); - pagevec_reinit(&freed_pvec); - } + + /* + * Is there need to periodically free_page_list? It would + * appear not as the counts should be low + */ + list_add(&page->lru, &free_pages); continue; cull_mlocked: @@ -830,9 +856,10 @@ keep: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } + + free_page_list(&free_pages); + list_splice(&ret_pages, page_list); - if (pagevec_count(&freed_pvec)) - __pagevec_free(&freed_pvec); count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -914,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long *scanned, int order, int mode, int file) { unsigned long nr_taken = 0; + unsigned long nr_lumpy_taken = 0; + unsigned long nr_lumpy_dirty = 0; + unsigned long nr_lumpy_failed = 0; unsigned long scan; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { @@ -991,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); nr_taken++; + nr_lumpy_taken++; + if (PageDirty(cursor_page)) + nr_lumpy_dirty++; scan++; + } else { + if (mode == ISOLATE_BOTH && + page_count(cursor_page)) + nr_lumpy_failed++; } } } *scanned = scan; + + trace_mm_vmscan_lru_isolate(order, + nr_to_scan, scan, + nr_taken, + nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, + mode); return nr_taken; } @@ -1033,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list, ClearPageActive(page); nr_active++; } - count[lru]++; + if (count) + count[lru]++; } return nr_active; @@ -1110,177 +1154,215 @@ static int too_many_isolated(struct zone *zone, int file, } /* - * shrink_inactive_list() is a helper for shrink_zone(). It returns the number - * of reclaimed pages + * TODO: Try merging with migrations version of putback_lru_pages */ -static unsigned long shrink_inactive_list(unsigned long max_scan, - struct zone *zone, struct scan_control *sc, - int priority, int file) +static noinline_for_stack void +putback_lru_pages(struct zone *zone, struct scan_control *sc, + unsigned long nr_anon, unsigned long nr_file, + struct list_head *page_list) { - LIST_HEAD(page_list); + struct page *page; struct pagevec pvec; - unsigned long nr_scanned = 0; - unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - while (unlikely(too_many_isolated(zone, file, sc))) { - congestion_wait(BLK_RW_ASYNC, HZ/10); + pagevec_init(&pvec, 1); - /* We are about to die and free our memory. Return now. */ - if (fatal_signal_pending(current)) - return SWAP_CLUSTER_MAX; + /* + * Put back any unfreeable pages. + */ + spin_lock(&zone->lru_lock); + while (!list_empty(page_list)) { + int lru; + page = lru_to_page(page_list); + VM_BUG_ON(PageLRU(page)); + list_del(&page->lru); + if (unlikely(!page_evictable(page, NULL))) { + spin_unlock_irq(&zone->lru_lock); + putback_lru_page(page); + spin_lock_irq(&zone->lru_lock); + continue; + } + SetPageLRU(page); + lru = page_lru(page); + add_page_to_lru_list(zone, page, lru); + if (is_active_lru(lru)) { + int file = is_file_lru(lru); + reclaim_stat->recent_rotated[file]++; + } + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } } + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); +} - pagevec_init(&pvec, 1); +static noinline_for_stack void update_isolated_counts(struct zone *zone, + struct scan_control *sc, + unsigned long *nr_anon, + unsigned long *nr_file, + struct list_head *isolated_list) +{ + unsigned long nr_active; + unsigned int count[NR_LRU_LISTS] = { 0, }; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - lru_add_drain(); - spin_lock_irq(&zone->lru_lock); - do { - struct page *page; - unsigned long nr_taken; - unsigned long nr_scan; - unsigned long nr_freed; - unsigned long nr_active; - unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE; - unsigned long nr_anon; - unsigned long nr_file; + nr_active = clear_active_flags(isolated_list, count); + __count_vm_events(PGDEACTIVATE, nr_active); - if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX, - &page_list, &nr_scan, - sc->order, mode, - zone, 0, file); - zone->pages_scanned += nr_scan; - if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, - nr_scan); - else - __count_zone_vm_events(PGSCAN_DIRECT, zone, - nr_scan); - } else { - nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX, - &page_list, &nr_scan, - sc->order, mode, - zone, sc->mem_cgroup, - 0, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. - */ - } + __mod_zone_page_state(zone, NR_ACTIVE_FILE, + -count[LRU_ACTIVE_FILE]); + __mod_zone_page_state(zone, NR_INACTIVE_FILE, + -count[LRU_INACTIVE_FILE]); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, + -count[LRU_ACTIVE_ANON]); + __mod_zone_page_state(zone, NR_INACTIVE_ANON, + -count[LRU_INACTIVE_ANON]); - if (nr_taken == 0) - goto done; + *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); - nr_active = clear_active_flags(&page_list, count); - __count_vm_events(PGDEACTIVATE, nr_active); + reclaim_stat->recent_scanned[0] += *nr_anon; + reclaim_stat->recent_scanned[1] += *nr_file; +} + +/* + * Returns true if the caller should wait to clean dirty/writeback pages. + * + * If we are direct reclaiming for contiguous pages and we do not reclaim + * everything in the list, try again and wait for writeback IO to complete. + * This will stall high-order allocations noticeably. Only do that when really + * need to free the pages under high memory pressure. + */ +static inline bool should_reclaim_stall(unsigned long nr_taken, + unsigned long nr_freed, + int priority, + struct scan_control *sc) +{ + int lumpy_stall_priority; - __mod_zone_page_state(zone, NR_ACTIVE_FILE, - -count[LRU_ACTIVE_FILE]); - __mod_zone_page_state(zone, NR_INACTIVE_FILE, - -count[LRU_INACTIVE_FILE]); - __mod_zone_page_state(zone, NR_ACTIVE_ANON, - -count[LRU_ACTIVE_ANON]); - __mod_zone_page_state(zone, NR_INACTIVE_ANON, - -count[LRU_INACTIVE_ANON]); + /* kswapd should not stall on sync IO */ + if (current_is_kswapd()) + return false; - nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; - nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); + /* Only stall on lumpy reclaim */ + if (!sc->lumpy_reclaim_mode) + return false; - reclaim_stat->recent_scanned[0] += nr_anon; - reclaim_stat->recent_scanned[1] += nr_file; + /* If we have relaimed everything on the isolated list, no stall */ + if (nr_freed == nr_taken) + return false; - spin_unlock_irq(&zone->lru_lock); + /* + * For high-order allocations, there are two stall thresholds. + * High-cost allocations stall immediately where as lower + * order allocations such as stacks require the scanning + * priority to be much higher before stalling. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + lumpy_stall_priority = DEF_PRIORITY; + else + lumpy_stall_priority = DEF_PRIORITY / 3; - nr_scanned += nr_scan; - nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + return priority <= lumpy_stall_priority; +} +/* + * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * of reclaimed pages + */ +static noinline_for_stack unsigned long +shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, + struct scan_control *sc, int priority, int file) +{ + LIST_HEAD(page_list); + unsigned long nr_scanned; + unsigned long nr_reclaimed = 0; + unsigned long nr_taken; + unsigned long nr_active; + unsigned long nr_anon; + unsigned long nr_file; + + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } + + + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + + if (scanning_global_lru(sc)) { + nr_taken = isolate_pages_global(nr_to_scan, + &page_list, &nr_scanned, sc->order, + sc->lumpy_reclaim_mode ? + ISOLATE_BOTH : ISOLATE_INACTIVE, + zone, 0, file); + zone->pages_scanned += nr_scanned; + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, + nr_scanned); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, + nr_scanned); + } else { + nr_taken = mem_cgroup_isolate_pages(nr_to_scan, + &page_list, &nr_scanned, sc->order, + sc->lumpy_reclaim_mode ? + ISOLATE_BOTH : ISOLATE_INACTIVE, + zone, sc->mem_cgroup, + 0, file); /* - * If we are direct reclaiming for contiguous pages and we do - * not reclaim everything in the list, try again and wait - * for IO to complete. This will stall high-order allocations - * but that should be acceptable to the caller + * mem_cgroup_isolate_pages() keeps track of + * scanned pages on its own. */ - if (nr_freed < nr_taken && !current_is_kswapd() && - sc->lumpy_reclaim_mode) { - congestion_wait(BLK_RW_ASYNC, HZ/10); + } - /* - * The attempt at page out may have made some - * of the pages active, mark them inactive again. - */ - nr_active = clear_active_flags(&page_list, count); - count_vm_events(PGDEACTIVATE, nr_active); + if (nr_taken == 0) { + spin_unlock_irq(&zone->lru_lock); + return 0; + } - nr_freed += shrink_page_list(&page_list, sc, - PAGEOUT_IO_SYNC); - } + update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); + + spin_unlock_irq(&zone->lru_lock); - nr_reclaimed += nr_freed; + nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); - local_irq_disable(); - if (current_is_kswapd()) - __count_vm_events(KSWAPD_STEAL, nr_freed); - __count_zone_vm_events(PGSTEAL, zone, nr_freed); + /* Check if we should syncronously wait for writeback */ + if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { + congestion_wait(BLK_RW_ASYNC, HZ/10); - spin_lock(&zone->lru_lock); /* - * Put back any unfreeable pages. + * The attempt at page out may have made some + * of the pages active, mark them inactive again. */ - while (!list_empty(&page_list)) { - int lru; - page = lru_to_page(&page_list); - VM_BUG_ON(PageLRU(page)); - list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { - spin_unlock_irq(&zone->lru_lock); - putback_lru_page(page); - spin_lock_irq(&zone->lru_lock); - continue; - } - SetPageLRU(page); - lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); - if (is_active_lru(lru)) { - int file = is_file_lru(lru); - reclaim_stat->recent_rotated[file]++; - } - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + nr_active = clear_active_flags(&page_list, NULL); + count_vm_events(PGDEACTIVATE, nr_active); + + nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC); + } - } while (nr_scanned < max_scan); + local_irq_disable(); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_STEAL, nr_reclaimed); + __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); -done: - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); + putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); return nr_reclaimed; } /* - * We are about to scan this zone at a certain priority level. If that priority - * level is smaller (ie: more urgent) than the previous priority, then note - * that priority level within the zone. This is done so that when the next - * process comes in to scan this zone, it will immediately start out at this - * priority level rather than having to build up its own scanning priority. - * Here, this priority affects only the reclaim-mapped threshold. - */ -static inline void note_zone_scanning_priority(struct zone *zone, int priority) -{ - if (priority < zone->prev_priority) - zone->prev_priority = priority; -} - -/* * This moves pages from the active list to the inactive list. * * We move them the other way if the page is referenced by one or more @@ -1581,6 +1663,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, } /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; + + /* * OK, so we have swap space and a fair amount of page cache * pages. We use the recently rotated / recently scanned * ratios to determine how valuable each cache is. @@ -1591,28 +1680,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { - spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; - spin_unlock_irq(&zone->lru_lock); } if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { - spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[1] /= 2; reclaim_stat->recent_rotated[1] /= 2; - spin_unlock_irq(&zone->lru_lock); } /* - * With swappiness at 100, anonymous and file have the same priority. - * This scanning priority is essentially the inverse of IO cost. - */ - anon_prio = sc->swappiness; - file_prio = 200 - sc->swappiness; - - /* * The amount of pressure on anon vs file pages is inversely * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. @@ -1622,6 +1701,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; + spin_unlock_irq(&zone->lru_lock); fraction[0] = ap; fraction[1] = fp; @@ -1724,16 +1804,15 @@ static void shrink_zone(int priority, struct zone *zone, * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static int shrink_zones(int priority, struct zonelist *zonelist, +static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { - enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); struct zoneref *z; struct zone *zone; - int progress = 0; + bool all_unreclaimable = true; - for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, - sc->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) continue; /* @@ -1743,23 +1822,14 @@ static int shrink_zones(int priority, struct zonelist *zonelist, if (scanning_global_lru(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - note_zone_scanning_priority(zone, priority); - if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - } else { - /* - * Ignore cpuset limitation here. We just want to reduce - * # of used pages by us regardless of memory shortage. - */ - mem_cgroup_note_reclaim_priority(sc->mem_cgroup, - priority); } shrink_zone(priority, zone, sc); - progress = 1; + all_unreclaimable = false; } - return progress; + return all_unreclaimable; } /* @@ -1782,13 +1852,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int priority; - unsigned long ret = 0; + bool all_unreclaimable; unsigned long total_scanned = 0; struct reclaim_state *reclaim_state = current->reclaim_state; - unsigned long lru_pages = 0; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long writeback_threshold; get_mems_allowed(); @@ -1796,29 +1864,26 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (scanning_global_lru(sc)) count_vm_event(ALLOCSTALL); - /* - * mem_cgroup will not do shrink_slab. - */ - if (scanning_global_lru(sc)) { - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - lru_pages += zone_reclaimable_pages(zone); - } - } for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; if (!priority) disable_swap_token(); - ret = shrink_zones(priority, zonelist, sc); + all_unreclaimable = shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups */ if (scanning_global_lru(sc)) { + unsigned long lru_pages = 0; + for_each_zone_zonelist(zone, z, zonelist, + gfp_zone(sc->gfp_mask)) { + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + + lru_pages += zone_reclaimable_pages(zone); + } + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -1826,10 +1891,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } } total_scanned += sc->nr_scanned; - if (sc->nr_reclaimed >= sc->nr_to_reclaim) { - ret = sc->nr_reclaimed; + if (sc->nr_reclaimed >= sc->nr_to_reclaim) goto out; - } /* * Try to write back as many pages as we just scanned. This @@ -1849,9 +1912,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, priority < DEF_PRIORITY - 2) congestion_wait(BLK_RW_ASYNC, HZ/10); } - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (ret && scanning_global_lru(sc)) - ret = sc->nr_reclaimed; + out: /* * Now that we've scanned all the zones at this priority level, note @@ -1863,26 +1924,23 @@ out: if (priority < 0) priority = 0; - if (scanning_global_lru(sc)) { - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - zone->prev_priority = priority; - } - } else - mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); - delayacct_freepages_end(); put_mems_allowed(); - return ret; + if (sc->nr_reclaimed) + return sc->nr_reclaimed; + + /* top priority shrink_zones still had more to do? don't OOM, then */ + if (scanning_global_lru(sc) && !all_unreclaimable) + return 1; + + return 0; } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { + unsigned long nr_reclaimed; struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, @@ -1895,7 +1953,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .nodemask = nodemask, }; - return do_try_to_free_pages(zonelist, &sc); + trace_mm_vmscan_direct_reclaim_begin(order, + sc.may_writepage, + gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + + return nr_reclaimed; } #ifdef CONFIG_CGROUP_MEM_RES_CTLR @@ -1903,9 +1969,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, unsigned int swappiness, - struct zone *zone, int nid) + struct zone *zone) { struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, @@ -1913,13 +1980,13 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .order = 0, .mem_cgroup = mem, }; - nodemask_t nm = nodemask_of_node(nid); - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - sc.nodemask = &nm; - sc.nr_reclaimed = 0; - sc.nr_scanned = 0; + + trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, + sc.may_writepage, + sc.gfp_mask); + /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -1928,6 +1995,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. */ shrink_zone(0, zone, &sc); + + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + return sc.nr_reclaimed; } @@ -1937,6 +2007,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, unsigned int swappiness) { struct zonelist *zonelist; + unsigned long nr_reclaimed; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, @@ -1951,7 +2022,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists; - return do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_memcg_reclaim_begin(0, + sc.may_writepage, + sc.gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + + return nr_reclaimed; } #endif @@ -2023,22 +2103,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .order = order, .mem_cgroup = NULL, }; - /* - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to - * free_pages == high_wmark_pages(zone). - */ - int temp_priority[MAX_NR_ZONES]; - loop_again: total_scanned = 0; sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) - temp_priority[i] = DEF_PRIORITY; - for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; @@ -2098,7 +2168,6 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; - int nid, zid; if (!populated_zone(zone)) continue; @@ -2106,18 +2175,14 @@ loop_again: if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; - temp_priority[i] = priority; sc.nr_scanned = 0; - note_zone_scanning_priority(zone, priority); - nid = pgdat->node_id; - zid = zone_idx(zone); /* * Call soft limit reclaim before calling shrink_zone. * For now we ignore the return value */ - mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, - nid, zid); + mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); + /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. @@ -2181,16 +2246,6 @@ loop_again: break; } out: - /* - * Note within each zone the priority level at which this zone was - * brought into a happy state. So that the next thread which scans this - * zone will start out at that priority level. - */ - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->prev_priority = temp_priority[i]; - } if (!all_zones_ok) { cond_resched(); @@ -2294,9 +2349,10 @@ static int kswapd(void *p) * premature sleep. If not, then go fully * to sleep until explicitly woken up */ - if (!sleeping_prematurely(pgdat, order, remaining)) + if (!sleeping_prematurely(pgdat, order, remaining)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); schedule(); - else { + } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); else @@ -2316,8 +2372,10 @@ static int kswapd(void *p) * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (!ret) + if (!ret) { + trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); balance_pgdat(pgdat, order); + } } return 0; } @@ -2337,6 +2395,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; if (!waitqueue_active(&pgdat->kswapd_wait)) @@ -2585,9 +2644,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .swappiness = vm_swappiness, .order = order, }; - unsigned long slab_reclaimable; + unsigned long nr_slab_pages0, nr_slab_pages1; - disable_swap_token(); cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP @@ -2606,14 +2664,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ priority = ZONE_RECLAIM_PRIORITY; do { - note_zone_scanning_priority(zone, priority); shrink_zone(priority, zone, &sc); priority--; } while (priority >= 0 && sc.nr_reclaimed < nr_pages); } - slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (slab_reclaimable > zone->min_slab_pages) { + nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (nr_slab_pages0 > zone->min_slab_pages) { /* * shrink_slab() does not currently allow us to determine how * many pages were freed in this zone. So we take the current @@ -2624,17 +2681,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Note that shrink_slab will free memory on all zones and may * take a long time. */ - while (shrink_slab(sc.nr_scanned, gfp_mask, order) && - zone_page_state(zone, NR_SLAB_RECLAIMABLE) > - slab_reclaimable - nr_pages) - ; + for (;;) { + unsigned long lru_pages = zone_reclaimable_pages(zone); + + /* No reclaimable slab or very low memory pressure */ + if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) + break; + + /* Freed enough memory */ + nr_slab_pages1 = zone_page_state(zone, + NR_SLAB_RECLAIMABLE); + if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) + break; + } /* * Update nr_reclaimed by the number of slab pages we * reclaimed from this zone. */ - sc.nr_reclaimed += slab_reclaimable - - zone_page_state(zone, NR_SLAB_RECLAIMABLE); + nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (nr_slab_pages1 < nr_slab_pages0) + sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; } p->reclaim_state = NULL; |