Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	754
1 file changed, 417 insertions(+), 337 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3ff3311447f..c391c320dba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
@@ -73,10 +76,14 @@ struct scan_control {
 	int swappiness;
 
-	int all_unreclaimable;
-
 	int order;
 
+	/*
+	 * Intend to reclaim enough contiguous memory rather than just
+	 * enough total memory, i.e. the mode for high-order allocation.
+	 */
+	bool lumpy_reclaim_mode;
+
 	/* Which cgroup do we reclaim from */
 	struct mem_cgroup *mem_cgroup;
 
@@ -85,12 +92,6 @@ struct scan_control {
 	 * are scanned.
 	 */
 	nodemask_t	*nodemask;
-
-	/* Pluggable isolate pages callback */
-	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
-			unsigned long *scanned, int order, int mode,
-			struct zone *z, struct mem_cgroup *mem_cont,
-			int active, int file);
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -215,8 +216,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 	list_for_each_entry(shrinker, &shrinker_list, list) {
 		unsigned long long delta;
 		unsigned long total_scan;
-		unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
+		unsigned long max_pass;
 
+		max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
 		delta = (4 * scanned) / shrinker->seeks;
 		delta *= max_pass;
 		do_div(delta, lru_pages + 1);
@@ -244,8 +246,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 			int shrink_ret;
 			int nr_before;
 
-			nr_before = (*shrinker->shrink)(0, gfp_mask);
-			shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
+			nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
+			shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
+								gfp_mask);
 			if (shrink_ret == -1)
 				break;
 			if (shrink_ret < nr_before)
@@ -298,7 +301,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
 static void handle_write_error(struct address_space *mapping,
 				struct page *page, int error)
 {
-	lock_page(page);
+	lock_page_nosync(page);
 	if (page_mapping(page) == mapping)
 		mapping_set_error(mapping, error);
 	unlock_page(page);
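To see what the reworked shrink_slab() callsites above compute: each registered shrinker reports its object count (max_pass), and the number of objects it is asked to scan is proportional to LRU scan pressure, scaled by the shrinker's seek cost and cache size. A self-contained user-space sketch of that arithmetic, with invented example values (an illustration of the formula above, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* Sketch of the shrink_slab() scan-target arithmetic:
 * delta = (4 * scanned / seeks) * max_pass / (lru_pages + 1)
 * Names mirror the kernel's; the numbers below are made up. */
static uint64_t shrinker_delta(unsigned long scanned, int seeks,
			       unsigned long max_pass, unsigned long lru_pages)
{
	uint64_t delta = (4ULL * scanned) / seeks;

	delta *= max_pass;		/* bigger caches get scanned more */
	delta /= lru_pages + 1;		/* +1 avoids division by zero */
	return delta;
}

int main(void)
{
	/* e.g. 2048 LRU pages scanned, seek cost 2, a 10000-object
	 * cache, 100000 reclaimable LRU pages */
	printf("delta = %llu objects\n",
	       (unsigned long long)shrinker_delta(2048, 2, 10000, 100000));
	return 0;
}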
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 		/* synchronous write or broken a_ops? */
 		ClearPageReclaim(page);
 	}
+	trace_mm_vmscan_writepage(page,
+		trace_reclaim_flags(page, sync_writeback));
 	inc_zone_page_state(page, NR_VMSCAN_WRITE);
 	return PAGE_SUCCESS;
 }
@@ -575,7 +580,7 @@ static enum page_references page_check_references(struct page *page,
 	referenced_page = TestClearPageReferenced(page);
 
 	/* Lumpy reclaim - ignore references */
-	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+	if (sc->lumpy_reclaim_mode)
 		return PAGEREF_RECLAIM;
 
 	/*
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
 	return PAGEREF_RECLAIM;
 }
 
+static noinline_for_stack void free_page_list(struct list_head *free_pages)
+{
+	struct pagevec freed_pvec;
+	struct page *page, *tmp;
+
+	pagevec_init(&freed_pvec, 1);
+
+	list_for_each_entry_safe(page, tmp, free_pages, lru) {
+		list_del(&page->lru);
+		if (!pagevec_add(&freed_pvec, page)) {
+			__pagevec_free(&freed_pvec);
+			pagevec_reinit(&freed_pvec);
+		}
+	}
+
+	pagevec_free(&freed_pvec);
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
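The free_page_list() helper introduced above drains a private list through a pagevec, so pages are freed in batches rather than one at a time. The same accumulate/flush/drain pattern, reduced to a self-contained sketch in which a fixed-size buffer stands in for struct pagevec and flush() stands in for __pagevec_free():

#include <stdio.h>

#define PVEC_SIZE 14	/* PAGEVEC_SIZE in this era of the kernel */

struct pvec {
	int nr;
	int items[PVEC_SIZE];
};

/* Stand-in for __pagevec_free(): process everything batched so far. */
static void flush(struct pvec *pv)
{
	printf("freeing batch of %d\n", pv->nr);
	pv->nr = 0;
}

/* Returns 0 when the pagevec becomes full, like pagevec_add(). */
static int pvec_add(struct pvec *pv, int item)
{
	pv->items[pv->nr++] = item;
	return PVEC_SIZE - pv->nr;
}

int main(void)
{
	struct pvec pv = { 0 };
	int i;

	/* Mirrors free_page_list(): add, flush when full, final drain. */
	for (i = 0; i < 40; i++)
		if (!pvec_add(&pv, i))
			flush(&pv);
	if (pv.nr)
		flush(&pv);	/* pagevec_free() drains the remainder */
	return 0;
}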
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 					enum pageout_io sync_writeback)
 {
 	LIST_HEAD(ret_pages);
-	struct pagevec freed_pvec;
+	LIST_HEAD(free_pages);
 	int pgactivate = 0;
 	unsigned long nr_reclaimed = 0;
 
 	cond_resched();
 
-	pagevec_init(&freed_pvec, 1);
 	while (!list_empty(page_list)) {
 		enum page_references references;
 		struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		__clear_page_locked(page);
 free_it:
 		nr_reclaimed++;
-		if (!pagevec_add(&freed_pvec, page)) {
-			__pagevec_free(&freed_pvec);
-			pagevec_reinit(&freed_pvec);
-		}
+
+		/*
+		 * Is there a need to periodically free_page_list? It would
+		 * appear not, as the counts should be low.
+		 */
+		list_add(&page->lru, &free_pages);
 		continue;
 
 cull_mlocked:
@@ -832,18 +856,14 @@ keep:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
+
+	free_page_list(&free_pages);
+
 	list_splice(&ret_pages, page_list);
-	if (pagevec_count(&freed_pvec))
-		__pagevec_free(&freed_pvec);
 	count_vm_events(PGACTIVATE, pgactivate);
 	return nr_reclaimed;
 }
 
-/* LRU Isolation modes. */
-#define ISOLATE_INACTIVE 0	/* Isolate inactive pages. */
-#define ISOLATE_ACTIVE 1	/* Isolate active pages. */
-#define ISOLATE_BOTH 2		/* Isolate both active and inactive pages. */
-
 /*
  * Attempt to remove the specified page from its LRU.  Only take this page
  * if it is of the appropriate PageActive status.  Pages which are being
@@ -921,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		unsigned long *scanned, int order, int mode, int file)
 {
 	unsigned long nr_taken = 0;
+	unsigned long nr_lumpy_taken = 0;
+	unsigned long nr_lumpy_dirty = 0;
+	unsigned long nr_lumpy_failed = 0;
 	unsigned long scan;
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -998,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 				list_move(&cursor_page->lru, dst);
 				mem_cgroup_del_lru(cursor_page);
 				nr_taken++;
+				nr_lumpy_taken++;
+				if (PageDirty(cursor_page))
+					nr_lumpy_dirty++;
 				scan++;
+			} else {
+				if (mode == ISOLATE_BOTH &&
+						page_count(cursor_page))
+					nr_lumpy_failed++;
 			}
 		}
 	}
 
 	*scanned = scan;
+
+	trace_mm_vmscan_lru_isolate(order,
+			nr_to_scan, scan,
+			nr_taken,
+			nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
+			mode);
 	return nr_taken;
 }
 
@@ -1011,7 +1047,6 @@ static unsigned long isolate_pages_global(unsigned long nr,
 					struct list_head *dst,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
-					struct mem_cgroup *mem_cont,
 					int active, int file)
 {
 	int lru = LRU_BASE;
@@ -1041,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
 			ClearPageActive(page);
 			nr_active++;
 		}
-		count[lru]++;
+		if (count)
+			count[lru]++;
 	}
 
 	return nr_active;
@@ -1118,176 +1154,212 @@ static int too_many_isolated(struct zone *zone, int file,
 }
 
 /*
- * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
- * of reclaimed pages
+ * TODO: Try merging with migrations version of putback_lru_pages
  */
-static unsigned long shrink_inactive_list(unsigned long max_scan,
-			struct zone *zone, struct scan_control *sc,
-			int priority, int file)
+static noinline_for_stack void
+putback_lru_pages(struct zone *zone, struct scan_control *sc,
+				unsigned long nr_anon, unsigned long nr_file,
+				struct list_head *page_list)
 {
-	LIST_HEAD(page_list);
+	struct page *page;
 	struct pagevec pvec;
-	unsigned long nr_scanned = 0;
-	unsigned long nr_reclaimed = 0;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
-	int lumpy_reclaim = 0;
-
-	while (unlikely(too_many_isolated(zone, file, sc))) {
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
-		/* We are about to die and free our memory. Return now. */
-		if (fatal_signal_pending(current))
-			return SWAP_CLUSTER_MAX;
-	}
+	pagevec_init(&pvec, 1);
 
 	/*
-	 * If we need a large contiguous chunk of memory, or have
-	 * trouble getting a small set of contiguous pages, we
-	 * will reclaim both active and inactive pages.
-	 *
-	 * We use the same threshold as pageout congestion_wait below.
+	 * Put back any unfreeable pages.
 	 */
-	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-		lumpy_reclaim = 1;
-	else if (sc->order && priority < DEF_PRIORITY - 2)
-		lumpy_reclaim = 1;
+	spin_lock(&zone->lru_lock);
+	while (!list_empty(page_list)) {
+		int lru;
+		page = lru_to_page(page_list);
+		VM_BUG_ON(PageLRU(page));
+		list_del(&page->lru);
+		if (unlikely(!page_evictable(page, NULL))) {
+			spin_unlock_irq(&zone->lru_lock);
+			putback_lru_page(page);
+			spin_lock_irq(&zone->lru_lock);
+			continue;
+		}
+		SetPageLRU(page);
+		lru = page_lru(page);
+		add_page_to_lru_list(zone, page, lru);
+		if (is_active_lru(lru)) {
+			int file = is_file_lru(lru);
+			reclaim_stat->recent_rotated[file]++;
+		}
+		if (!pagevec_add(&pvec, page)) {
+			spin_unlock_irq(&zone->lru_lock);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
+	}
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
 
-	pagevec_init(&pvec, 1);
+	spin_unlock_irq(&zone->lru_lock);
+	pagevec_release(&pvec);
+}
 
-	lru_add_drain();
-	spin_lock_irq(&zone->lru_lock);
-	do {
-		struct page *page;
-		unsigned long nr_taken;
-		unsigned long nr_scan;
-		unsigned long nr_freed;
-		unsigned long nr_active;
-		unsigned int count[NR_LRU_LISTS] = { 0, };
-		int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
-		unsigned long nr_anon;
-		unsigned long nr_file;
-
-		nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
-			     &page_list, &nr_scan, sc->order, mode,
-				zone, sc->mem_cgroup, 0, file);
+static noinline_for_stack void update_isolated_counts(struct zone *zone,
+					struct scan_control *sc,
+					unsigned long *nr_anon,
+					unsigned long *nr_file,
+					struct list_head *isolated_list)
+{
+	unsigned long nr_active;
+	unsigned int count[NR_LRU_LISTS] = { 0, };
+	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-		if (scanning_global_lru(sc)) {
-			zone->pages_scanned += nr_scan;
-			if (current_is_kswapd())
-				__count_zone_vm_events(PGSCAN_KSWAPD, zone,
-						       nr_scan);
-			else
-				__count_zone_vm_events(PGSCAN_DIRECT, zone,
-						       nr_scan);
-		}
+	nr_active = clear_active_flags(isolated_list, count);
+	__count_vm_events(PGDEACTIVATE, nr_active);
+
+	__mod_zone_page_state(zone, NR_ACTIVE_FILE,
+			      -count[LRU_ACTIVE_FILE]);
+	__mod_zone_page_state(zone, NR_INACTIVE_FILE,
+			      -count[LRU_INACTIVE_FILE]);
+	__mod_zone_page_state(zone, NR_ACTIVE_ANON,
+			      -count[LRU_ACTIVE_ANON]);
+	__mod_zone_page_state(zone, NR_INACTIVE_ANON,
+			      -count[LRU_INACTIVE_ANON]);
+
+	*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+	*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
+
+	reclaim_stat->recent_scanned[0] += *nr_anon;
+	reclaim_stat->recent_scanned[1] += *nr_file;
+}
 
-		if (nr_taken == 0)
-			goto done;
+/*
+ * Returns true if the caller should wait to clean dirty/writeback pages.
+ *
+ * If we are direct reclaiming for contiguous pages and we do not reclaim
+ * everything in the list, try again and wait for writeback IO to complete.
+ * This will stall high-order allocations noticeably. Only do that when we
+ * really need to free the pages under high memory pressure.
+ */
+static inline bool should_reclaim_stall(unsigned long nr_taken,
+					unsigned long nr_freed,
+					int priority,
+					struct scan_control *sc)
+{
+	int lumpy_stall_priority;
 
-		nr_active = clear_active_flags(&page_list, count);
-		__count_vm_events(PGDEACTIVATE, nr_active);
+	/* kswapd should not stall on sync IO */
+	if (current_is_kswapd())
+		return false;
 
-		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
-						-count[LRU_ACTIVE_FILE]);
-		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
-						-count[LRU_INACTIVE_FILE]);
-		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
-						-count[LRU_ACTIVE_ANON]);
-		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
-						-count[LRU_INACTIVE_ANON]);
+	/* Only stall on lumpy reclaim */
+	if (!sc->lumpy_reclaim_mode)
+		return false;
 
-		nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-		nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-		__mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
-		__mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
+	/* If we have reclaimed everything on the isolated list, no stall */
+	if (nr_freed == nr_taken)
+		return false;
 
-		reclaim_stat->recent_scanned[0] += nr_anon;
-		reclaim_stat->recent_scanned[1] += nr_file;
+	/*
+	 * For high-order allocations, there are two stall thresholds.
+	 * High-cost allocations stall immediately, whereas lower-order
+	 * allocations such as stacks require the scanning priority to be
+	 * much higher before stalling.
	 */
+	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+		lumpy_stall_priority = DEF_PRIORITY;
+	else
+		lumpy_stall_priority = DEF_PRIORITY / 3;
 
-		spin_unlock_irq(&zone->lru_lock);
+	return priority <= lumpy_stall_priority;
+}
 
-		nr_scanned += nr_scan;
-		nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+/*
+ * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * of reclaimed pages
+ */
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
+			struct scan_control *sc, int priority, int file)
+{
+	LIST_HEAD(page_list);
+	unsigned long nr_scanned;
+	unsigned long nr_reclaimed = 0;
+	unsigned long nr_taken;
+	unsigned long nr_active;
+	unsigned long nr_anon;
+	unsigned long nr_file;
+
+	while (unlikely(too_many_isolated(zone, file, sc))) {
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+		/* We are about to die and free our memory. Return now. */
+		if (fatal_signal_pending(current))
+			return SWAP_CLUSTER_MAX;
+	}
+
+
+	lru_add_drain();
+	spin_lock_irq(&zone->lru_lock);
+	if (scanning_global_lru(sc)) {
+		nr_taken = isolate_pages_global(nr_to_scan,
+			&page_list, &nr_scanned, sc->order,
+			sc->lumpy_reclaim_mode ?
+				ISOLATE_BOTH : ISOLATE_INACTIVE,
+			zone, 0, file);
+		zone->pages_scanned += nr_scanned;
+		if (current_is_kswapd())
+			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
+					       nr_scanned);
+		else
+			__count_zone_vm_events(PGSCAN_DIRECT, zone,
					       nr_scanned);
+	} else {
+		nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
+			&page_list, &nr_scanned, sc->order,
+			sc->lumpy_reclaim_mode ?
+				ISOLATE_BOTH : ISOLATE_INACTIVE,
+			zone, sc->mem_cgroup,
+			0, file);
 		/*
-		 * If we are direct reclaiming for contiguous pages and we do
-		 * not reclaim everything in the list, try again and wait
-		 * for IO to complete. This will stall high-order allocations
-		 * but that should be acceptable to the caller
+		 * mem_cgroup_isolate_pages() keeps track of
+		 * scanned pages on its own.
 		 */
-		if (nr_freed < nr_taken && !current_is_kswapd() &&
-		    lumpy_reclaim) {
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
+	}
 
-			/*
-			 * The attempt at page out may have made some
-			 * of the pages active, mark them inactive again.
-			 */
-			nr_active = clear_active_flags(&page_list, count);
-			count_vm_events(PGDEACTIVATE, nr_active);
+	if (nr_taken == 0) {
+		spin_unlock_irq(&zone->lru_lock);
+		return 0;
+	}
 
-			nr_freed += shrink_page_list(&page_list, sc,
-						     PAGEOUT_IO_SYNC);
-		}
+	update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
 
-		nr_reclaimed += nr_freed;
+	spin_unlock_irq(&zone->lru_lock);
 
-		local_irq_disable();
-		if (current_is_kswapd())
-			__count_vm_events(KSWAPD_STEAL, nr_freed);
-		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
+	nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+	/* Check if we should synchronously wait for writeback */
+	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
-		spin_lock(&zone->lru_lock);
 		/*
-		 * Put back any unfreeable pages.
+		 * The attempt at page out may have made some
+		 * of the pages active, mark them inactive again.
 		 */
-		while (!list_empty(&page_list)) {
-			int lru;
-			page = lru_to_page(&page_list);
-			VM_BUG_ON(PageLRU(page));
-			list_del(&page->lru);
-			if (unlikely(!page_evictable(page, NULL))) {
-				spin_unlock_irq(&zone->lru_lock);
-				putback_lru_page(page);
-				spin_lock_irq(&zone->lru_lock);
-				continue;
-			}
-			SetPageLRU(page);
-			lru = page_lru(page);
-			add_page_to_lru_list(zone, page, lru);
-			if (is_active_lru(lru)) {
-				int file = is_file_lru(lru);
-				reclaim_stat->recent_rotated[file]++;
-			}
-			if (!pagevec_add(&pvec, page)) {
-				spin_unlock_irq(&zone->lru_lock);
-				__pagevec_release(&pvec);
-				spin_lock_irq(&zone->lru_lock);
-			}
-		}
-		__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
-		__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+		nr_active = clear_active_flags(&page_list, NULL);
+		count_vm_events(PGDEACTIVATE, nr_active);
 
-	} while (nr_scanned < max_scan);
+		nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
+	}
 
-done:
-	spin_unlock_irq(&zone->lru_lock);
-	pagevec_release(&pvec);
-	return nr_reclaimed;
-}
+	local_irq_disable();
+	if (current_is_kswapd())
+		__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
+	__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
 
-/*
- * We are about to scan this zone at a certain priority level.  If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone.  This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
-	if (priority < zone->prev_priority)
-		zone->prev_priority = priority;
+	putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+	return nr_reclaimed;
 }
 
 /*
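The stall heuristic factored out into should_reclaim_stall() above is easiest to check with concrete numbers: with DEF_PRIORITY at 12, an allocation above PAGE_ALLOC_COSTLY_ORDER (3) may stall at any scan priority, while a cheaper high-order allocation only stalls once priority has dropped to DEF_PRIORITY / 3 = 4. A user-space sketch of the decision (the kswapd short-circuit is omitted, since current_is_kswapd() has no user-space equivalent):

#include <stdio.h>
#include <stdbool.h>

#define DEF_PRIORITY		12
#define PAGE_ALLOC_COSTLY_ORDER	3

/* Reduced scan_control: just what the stall decision reads. */
struct sc {
	int order;
	bool lumpy_reclaim_mode;
};

static bool should_stall(unsigned long nr_taken, unsigned long nr_freed,
			 int priority, const struct sc *sc)
{
	int lumpy_stall_priority;

	if (!sc->lumpy_reclaim_mode)	/* only stall for lumpy reclaim */
		return false;
	if (nr_freed == nr_taken)	/* reclaimed everything: no stall */
		return false;

	/* Costly orders stall immediately; cheaper ones only once
	 * priority has fallen to DEF_PRIORITY / 3. */
	lumpy_stall_priority = (sc->order > PAGE_ALLOC_COSTLY_ORDER) ?
				DEF_PRIORITY : DEF_PRIORITY / 3;
	return priority <= lumpy_stall_priority;
}

int main(void)
{
	struct sc costly = { .order = 4, .lumpy_reclaim_mode = true };
	struct sc stack  = { .order = 2, .lumpy_reclaim_mode = true };
	int prio;

	for (prio = DEF_PRIORITY; prio >= 0; prio -= 4)
		printf("prio %2d: order-4 stalls %d, order-2 stalls %d\n",
		       prio, should_stall(32, 10, prio, &costly),
		       should_stall(32, 10, prio, &stack));
	return 0;
}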
@@ -1356,16 +1428,23 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
-	nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
-					ISOLATE_ACTIVE, zone,
-					sc->mem_cgroup, 1, file);
-	/*
-	 * zone->pages_scanned is used for detect zone's oom
-	 * mem_cgroup remembers nr_scan by itself.
-	 */
 	if (scanning_global_lru(sc)) {
+		nr_taken = isolate_pages_global(nr_pages, &l_hold,
+						&pgscanned, sc->order,
+						ISOLATE_ACTIVE, zone,
+						1, file);
 		zone->pages_scanned += pgscanned;
+	} else {
+		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
+						&pgscanned, sc->order,
+						ISOLATE_ACTIVE, zone,
+						sc->mem_cgroup, 1, file);
+		/*
+		 * mem_cgroup_isolate_pages() keeps track of
+		 * scanned pages on its own.
+		 */
 	}
+
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
@@ -1519,21 +1598,52 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 }
 
 /*
+ * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
+ * until we collected @swap_cluster_max pages to scan.
+ */
+static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
+				       unsigned long *nr_saved_scan)
+{
+	unsigned long nr;
+
+	*nr_saved_scan += nr_to_scan;
+	nr = *nr_saved_scan;
+
+	if (nr >= SWAP_CLUSTER_MAX)
+		*nr_saved_scan = 0;
+	else
+		nr = 0;
+
+	return nr;
+}
+
+/*
  * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned we did rotate back
 * onto the active list instead of evict.
 *
- * percent[0] specifies how much pressure to put on ram/swap backed
- * memory, while percent[1] determines pressure on the file LRUs.
+ * nr[0] = anon pages to scan; nr[1] = file pages to scan
 */
-static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
-					unsigned long *percent)
+static void get_scan_count(struct zone *zone, struct scan_control *sc,
+					unsigned long *nr, int priority)
 {
 	unsigned long anon, file, free;
 	unsigned long anon_prio, file_prio;
 	unsigned long ap, fp;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+	u64 fraction[2], denominator;
+	enum lru_list l;
+	int noswap = 0;
+
+	/* If we have no swap space, do not bother scanning anon pages. */
+	if (!sc->may_swap || (nr_swap_pages <= 0)) {
+		noswap = 1;
+		fraction[0] = 0;
+		fraction[1] = 1;
+		denominator = 1;
+		goto out;
+	}
 
 	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
 		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
@@ -1545,13 +1655,21 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 		/* If we have very few page cache pages,
 		   force-scan anon pages. */
 		if (unlikely(file + free <= high_wmark_pages(zone))) {
-			percent[0] = 100;
-			percent[1] = 0;
-			return;
+			fraction[0] = 1;
+			fraction[1] = 0;
+			denominator = 1;
+			goto out;
 		}
 	}
 
 	/*
+	 * With swappiness at 100, anonymous and file have the same priority.
+	 * This scanning priority is essentially the inverse of IO cost.
+	 */
+	anon_prio = sc->swappiness;
+	file_prio = 200 - sc->swappiness;
+
+	/*
 	 * OK, so we have swap space and a fair amount of page cache
 	 * pages.  We use the recently rotated / recently scanned
 	 * ratios to determine how valuable each cache is.
@@ -1562,28 +1680,18 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	 *
 	 * anon in [0], file in [1]
 	 */
+	spin_lock_irq(&zone->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
-		spin_lock_irq(&zone->lru_lock);
 		reclaim_stat->recent_scanned[0] /= 2;
 		reclaim_stat->recent_rotated[0] /= 2;
-		spin_unlock_irq(&zone->lru_lock);
 	}
 
 	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
-		spin_lock_irq(&zone->lru_lock);
 		reclaim_stat->recent_scanned[1] /= 2;
 		reclaim_stat->recent_rotated[1] /= 2;
-		spin_unlock_irq(&zone->lru_lock);
 	}
 
 	/*
-	 * With swappiness at 100, anonymous and file have the same priority.
-	 * This scanning priority is essentially the inverse of IO cost.
-	 */
-	anon_prio = sc->swappiness;
-	file_prio = 200 - sc->swappiness;
-
-	/*
 	 * The amount of pressure on anon vs file pages is inversely
 	 * proportional to the fraction of recently scanned pages on
 	 * each list that were recently referenced and in active use.
@@ -1593,30 +1701,39 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
 	fp /= reclaim_stat->recent_rotated[1] + 1;
+	spin_unlock_irq(&zone->lru_lock);
+
+	fraction[0] = ap;
+	fraction[1] = fp;
+	denominator = ap + fp + 1;
+out:
+	for_each_evictable_lru(l) {
+		int file = is_file_lru(l);
+		unsigned long scan;
 
-	/* Normalize to percentages */
-	percent[0] = 100 * ap / (ap + fp + 1);
-	percent[1] = 100 - percent[0];
+		scan = zone_nr_lru_pages(zone, sc, l);
+		if (priority || noswap) {
+			scan >>= priority;
+			scan = div64_u64(scan * fraction[file], denominator);
+		}
+		nr[l] = nr_scan_try_batch(scan,
+					  &reclaim_stat->nr_saved_scan[l]);
+	}
 }
 
-/*
- * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
- * until we collected @swap_cluster_max pages to scan.
- */
-static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
-					unsigned long *nr_saved_scan)
+static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
 {
-	unsigned long nr;
-
-	*nr_saved_scan += nr_to_scan;
-	nr = *nr_saved_scan;
-
-	if (nr >= SWAP_CLUSTER_MAX)
-		*nr_saved_scan = 0;
+	/*
	 * If we need a large contiguous chunk of memory, or have
	 * trouble getting a small set of contiguous pages, we
	 * will reclaim both active and inactive pages.
	 */
+	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+		sc->lumpy_reclaim_mode = 1;
+	else if (sc->order && priority < DEF_PRIORITY - 2)
+		sc->lumpy_reclaim_mode = 1;
 	else
-		nr = 0;
-
-	return nr;
+		sc->lumpy_reclaim_mode = 0;
 }
 
 /*
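To make the get_scan_count() arithmetic above concrete: each list's share is its swappiness-derived priority scaled by recently-scanned over recently-rotated, normalized by ap + fp + 1, and small results are held back by nr_scan_try_batch() until at least SWAP_CLUSTER_MAX pages are pending. A self-contained sketch with invented LRU statistics (an illustration of the formulas, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define SWAP_CLUSTER_MAX 32

/* Sketch of nr_scan_try_batch(): small targets accumulate in
 * *nr_saved_scan until at least SWAP_CLUSTER_MAX pages are pending. */
static unsigned long try_batch(unsigned long nr_to_scan,
			       unsigned long *nr_saved_scan)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;
	if (nr >= SWAP_CLUSTER_MAX)
		*nr_saved_scan = 0;
	else
		nr = 0;
	return nr;
}

int main(void)
{
	/* Default swappiness of 60; the statistics below are made up. */
	uint64_t anon_prio = 60, file_prio = 200 - 60;
	uint64_t recent_scanned[2] = { 5000, 20000 };	/* anon, file */
	uint64_t recent_rotated[2] = { 4000,  2000 };

	uint64_t ap = (anon_prio + 1) * (recent_scanned[0] + 1) /
		      (recent_rotated[0] + 1);
	uint64_t fp = (file_prio + 1) * (recent_scanned[1] + 1) /
		      (recent_rotated[1] + 1);
	uint64_t denominator = ap + fp + 1;

	/* A 100000-page file LRU at scan priority 10, as in
	 * get_scan_count(): scan >>= priority, then scale by the
	 * fraction, then batch the result. */
	unsigned long scan = (100000 >> 10) * fp / denominator;
	unsigned long saved = 0;

	printf("ap=%llu fp=%llu -> file scan %lu, batched %lu\n",
	       (unsigned long long)ap, (unsigned long long)fp,
	       scan, try_batch(scan, &saved));
	return 0;
}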
@@ -1627,33 +1744,13 @@ static void shrink_zone(int priority, struct zone *zone,
 {
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
-	unsigned long percent[2];	/* anon @ 0; file @ 1 */
 	enum lru_list l;
 	unsigned long nr_reclaimed = sc->nr_reclaimed;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
-	int noswap = 0;
-
-	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || (nr_swap_pages <= 0)) {
-		noswap = 1;
-		percent[0] = 0;
-		percent[1] = 100;
-	} else
-		get_scan_ratio(zone, sc, percent);
+	get_scan_count(zone, sc, nr, priority);
 
-	for_each_evictable_lru(l) {
-		int file = is_file_lru(l);
-		unsigned long scan;
-
-		scan = zone_nr_lru_pages(zone, sc, l);
-		if (priority || noswap) {
-			scan >>= priority;
-			scan = (scan * percent[file]) / 100;
-		}
-		nr[l] = nr_scan_try_batch(scan,
-					&reclaim_stat->nr_saved_scan[l]);
-	}
+	set_lumpy_reclaim_mode(priority, sc);
 
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
@@ -1707,16 +1804,15 @@ static void shrink_zone(int priority, struct zone *zone,
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
-	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
 	struct zoneref *z;
 	struct zone *zone;
+	bool all_unreclaimable = true;
 
-	sc->all_unreclaimable = 1;
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
-					sc->nodemask) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					gfp_zone(sc->gfp_mask), sc->nodemask) {
 		if (!populated_zone(zone))
 			continue;
 		/*
@@ -1726,23 +1822,14 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 		if (scanning_global_lru(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-			note_zone_scanning_priority(zone, priority);
-
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;	/* Let kswapd poll it */
-			sc->all_unreclaimable = 0;
-		} else {
-			/*
-			 * Ignore cpuset limitation here. We just want to reduce
-			 * # of used pages by us regardless of memory shortage.
-			 */
-			sc->all_unreclaimable = 0;
-			mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
-							priority);
 		}
 
 		shrink_zone(priority, zone, sc);
+		all_unreclaimable = false;
 	}
+
+	return all_unreclaimable;
 }
 
 /*
@@ -1765,42 +1852,38 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	int priority;
-	unsigned long ret = 0;
+	bool all_unreclaimable;
 	unsigned long total_scanned = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	unsigned long lru_pages = 0;
 	struct zoneref *z;
 	struct zone *zone;
-	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
 	unsigned long writeback_threshold;
 
+	get_mems_allowed();
 	delayacct_freepages_start();
 
 	if (scanning_global_lru(sc))
 		count_vm_event(ALLOCSTALL);
-	/*
-	 * mem_cgroup will not do shrink_slab.
-	 */
-	if (scanning_global_lru(sc)) {
-		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
-			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-				continue;
-
-			lru_pages += zone_reclaimable_pages(zone);
-		}
-	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		shrink_zones(priority, zonelist, sc);
+		all_unreclaimable = shrink_zones(priority, zonelist, sc);
 		/*
 		 * Don't shrink slabs when reclaiming memory from
 		 * over limit cgroups
 		 */
 		if (scanning_global_lru(sc)) {
+			unsigned long lru_pages = 0;
+			for_each_zone_zonelist(zone, z, zonelist,
+					gfp_zone(sc->gfp_mask)) {
+				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+					continue;
+
+				lru_pages += zone_reclaimable_pages(zone);
+			}
+
 			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
 			if (reclaim_state) {
 				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1808,10 +1891,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			}
 		}
 		total_scanned += sc->nr_scanned;
-		if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
-			ret = sc->nr_reclaimed;
+		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
 			goto out;
-		}
 
 		/*
 		 * Try to write back as many pages as we just scanned. This
@@ -1831,9 +1912,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		    priority < DEF_PRIORITY - 2)
 			congestion_wait(BLK_RW_ASYNC, HZ/10);
 	}
-	/* top priority shrink_zones still had more to do? don't OOM, then */
-	if (!sc->all_unreclaimable && scanning_global_lru(sc))
-		ret = sc->nr_reclaimed;
+
 out:
 	/*
 	 * Now that we've scanned all the zones at this priority level, note
@@ -1845,25 +1924,23 @@ out:
 	if (priority < 0)
 		priority = 0;
 
-	if (scanning_global_lru(sc)) {
-		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
-			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-				continue;
+	delayacct_freepages_end();
+	put_mems_allowed();
 
-			zone->prev_priority = priority;
-		}
-	} else
-		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
+	if (sc->nr_reclaimed)
+		return sc->nr_reclaimed;
 
-	delayacct_freepages_end();
+	/* top priority shrink_zones still had more to do? don't OOM, then */
+	if (scanning_global_lru(sc) && !all_unreclaimable)
+		return 1;
 
-	return ret;
+	return 0;
 }
 
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	unsigned long nr_reclaimed;
 	struct scan_control sc = {
 		.gfp_mask = gfp_mask,
 		.may_writepage = !laptop_mode,
@@ -1873,11 +1950,18 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
-		.isolate_pages = isolate_pages_global,
 		.nodemask = nodemask,
 	};
 
-	return do_try_to_free_pages(zonelist, &sc);
+	trace_mm_vmscan_direct_reclaim_begin(order,
+				sc.may_writepage,
+				gfp_mask);
+
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+
+	return nr_reclaimed;
 }
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1885,24 +1969,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						unsigned int swappiness,
-						struct zone *zone, int nid)
+						struct zone *zone)
 {
 	struct scan_control sc = {
+		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = !noswap,
 		.swappiness = swappiness,
 		.order = 0,
 		.mem_cgroup = mem,
-		.isolate_pages = mem_cgroup_isolate_pages,
 	};
-	nodemask_t nm  = nodemask_of_node(nid);
-
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
-	sc.nodemask = &nm;
-	sc.nr_reclaimed = 0;
-	sc.nr_scanned = 0;
+
+	trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+						      sc.may_writepage,
+						      sc.gfp_mask);
+
 	/*
 	 * NOTE: Although we can get the priority field, using it
 	 * here is not a good idea, since it limits the pages we can scan.
@@ -1911,6 +1995,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 	 * the priority and make it zero.
 	 */
 	shrink_zone(0, zone, &sc);
+
+	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+
 	return sc.nr_reclaimed;
 }
 
@@ -1920,6 +2007,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   unsigned int swappiness)
 {
 	struct zonelist *zonelist;
+	unsigned long nr_reclaimed;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -1928,14 +2016,22 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.swappiness = swappiness,
 		.order = 0,
 		.mem_cgroup = mem_cont,
-		.isolate_pages = mem_cgroup_isolate_pages,
 		.nodemask = NULL, /* we don't care about placement */
 	};
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
-	return do_try_to_free_pages(zonelist, &sc);
+
+	trace_mm_vmscan_memcg_reclaim_begin(0,
+					    sc.may_writepage,
+					    sc.gfp_mask);
+
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+
+	return nr_reclaimed;
 }
 
 #endif
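A side note on the sc.gfp_mask assignment kept in both memcg paths above: reclaim-behaviour bits come from the caller's mask, while placement bits come from GFP_HIGHUSER_MOVABLE. A toy demonstration of that mask algebra with invented flag values (the real __GFP_* constants differ; see gfp.h):

#include <stdio.h>

/* Invented stand-ins for __GFP_* bits, only to show the mask algebra. */
#define GFP_IO			0x01	/* reclaim-behaviour bit */
#define GFP_FS			0x02	/* reclaim-behaviour bit */
#define GFP_HIGHMEM		0x04	/* placement bit */
#define GFP_MOVABLE		0x08	/* placement bit */

#define GFP_RECLAIM_MASK	(GFP_IO | GFP_FS)
#define GFP_HIGHUSER_MOVABLE	(GFP_IO | GFP_FS | GFP_HIGHMEM | GFP_MOVABLE)

int main(void)
{
	unsigned int gfp_mask = GFP_IO;		/* caller: IO ok, no FS */

	/* Behaviour bits from the caller, placement bits from
	 * GFP_HIGHUSER_MOVABLE -- mirrors the sc.gfp_mask assignment. */
	unsigned int sc_gfp = (gfp_mask & GFP_RECLAIM_MASK) |
			      (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);

	printf("sc.gfp_mask = 0x%x\n", sc_gfp);	/* 0xd: IO|HIGHMEM|MOVABLE */
	return 0;
}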
@@ -2006,24 +2102,13 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
-		.isolate_pages = isolate_pages_global,
 	};
-	/*
-	 * temp_priority is used to remember the scanning priority at which
-	 * this zone was successfully refilled to
-	 * free_pages == high_wmark_pages(zone).
-	 */
-	int temp_priority[MAX_NR_ZONES];
-
 loop_again:
 	total_scanned = 0;
 	sc.nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
-	for (i = 0; i < pgdat->nr_zones; i++)
-		temp_priority[i] = DEF_PRIORITY;
-
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 		unsigned long lru_pages = 0;
@@ -2083,7 +2168,6 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
-			int nid, zid;
 
 			if (!populated_zone(zone))
 				continue;
@@ -2091,18 +2175,14 @@ loop_again:
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
-			note_zone_scanning_priority(zone, priority);
 
-			nid = pgdat->node_id;
-			zid = zone_idx(zone);
 			/*
 			 * Call soft limit reclaim before calling shrink_zone.
 			 * For now we ignore the return value
 			 */
-			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
-							nid, zid);
+			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+
 			/*
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
@@ -2166,16 +2246,6 @@ loop_again:
 			break;
 	}
 out:
-	/*
-	 * Note within each zone the priority level at which this zone was
-	 * brought into a happy state. So that the next thread which scans this
-	 * zone will start out at that priority level.
-	 */
-	for (i = 0; i < pgdat->nr_zones; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		zone->prev_priority = temp_priority[i];
-	}
 	if (!all_zones_ok) {
 		cond_resched();
@@ -2279,9 +2349,10 @@ static int kswapd(void *p)
 				 * premature sleep. If not, then go fully
 				 * to sleep until explicitly woken up
 				 */
-				if (!sleeping_prematurely(pgdat, order, remaining))
+				if (!sleeping_prematurely(pgdat, order, remaining)) {
+					trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 					schedule();
-				else {
+				} else {
 					if (remaining)
 						count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
 					else
@@ -2301,8 +2372,10 @@ static int kswapd(void *p)
 		 * We can speed up thawing tasks if we don't call balance_pgdat
 		 * after returning from the refrigerator
 		 */
-		if (!ret)
+		if (!ret) {
+			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
 			balance_pgdat(pgdat, order);
+		}
 	}
 	return 0;
 }
@@ -2322,6 +2395,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
+	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
 	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2385,7 +2459,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.hibernation_mode = 1,
 		.swappiness = vm_swappiness,
 		.order = 0,
-		.isolate_pages = isolate_pages_global,
 	};
 	struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 	struct task_struct *p = current;
@@ -2570,11 +2643,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
 		.order = order,
-		.isolate_pages = isolate_pages_global,
 	};
-	unsigned long slab_reclaimable;
+	unsigned long nr_slab_pages0, nr_slab_pages1;
 
-	disable_swap_token();
 	cond_resched();
 	/*
 	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2593,14 +2664,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 */
 		priority = ZONE_RECLAIM_PRIORITY;
 		do {
-			note_zone_scanning_priority(zone, priority);
 			shrink_zone(priority, zone, &sc);
 			priority--;
 		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
 
-	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
-	if (slab_reclaimable > zone->min_slab_pages) {
+	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+	if (nr_slab_pages0 > zone->min_slab_pages) {
 		/*
 		 * shrink_slab() does not currently allow us to determine how
 		 * many pages were freed in this zone. So we take the current
@@ -2611,17 +2681,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * Note that shrink_slab will free memory on all zones and may
 		 * take a long time.
 		 */
-		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
-			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
-				slab_reclaimable - nr_pages)
-			;
+		for (;;) {
+			unsigned long lru_pages = zone_reclaimable_pages(zone);
+
+			/* No reclaimable slab or very low memory pressure */
+			if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+				break;
+
+			/* Freed enough memory */
+			nr_slab_pages1 = zone_page_state(zone,
+							NR_SLAB_RECLAIMABLE);
+			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
+				break;
+		}
 
 		/*
 		 * Update nr_reclaimed by the number of slab pages we
 		 * reclaimed from this zone.
 		 */
-		sc.nr_reclaimed += slab_reclaimable -
-			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+		if (nr_slab_pages1 < nr_slab_pages0)
+			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
 	}
 
 	p->reclaim_state = NULL;
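The rewritten slab-reclaim loop at the end replaces an empty-bodied while with an explicit for (;;) that stops either when shrink_slab() reports nothing left to do or when enough slab pages have been freed relative to the starting count. The termination condition is easy to get backwards; a sketch of that logic with a stubbed-out shrinker (the page counts are invented):

#include <stdio.h>

/* Stub: pretend each shrink pass frees up to 20 slab pages. */
static unsigned long slab_pages = 300;

static int shrink_slab_stub(void)
{
	if (slab_pages == 0)
		return 0;		/* nothing reclaimable: stop */
	slab_pages -= (slab_pages >= 20) ? 20 : slab_pages;
	return 1;
}

int main(void)
{
	unsigned long nr_pages = 100;		/* reclaim target */
	unsigned long nr_slab_pages0 = slab_pages;

	for (;;) {
		/* No reclaimable slab or very low memory pressure */
		if (!shrink_slab_stub())
			break;
		/* Freed enough memory relative to the starting count */
		if (slab_pages + nr_pages <= nr_slab_pages0)
			break;
	}
	printf("freed %lu slab pages\n", nr_slab_pages0 - slab_pages);
	return 0;
}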