Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 397
1 file changed, 213 insertions, 184 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 196709f5ee5..88c5fed8b9a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -128,7 +128,7 @@ struct scan_control {
  * From 0 .. 100.  Higher means more swappy.
  */
 int vm_swappiness = 60;
-long vm_total_pages;	/* The total number of pages which the VM controls */
+unsigned long vm_total_pages;	/* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
 }
 #endif
 
-static int inactive_file_is_low_global(struct zone *zone)
-{
-	unsigned long active, inactive;
-
-	active = zone_page_state(zone, NR_ACTIVE_FILE);
-	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-
-	return (active > inactive);
-}
-
 /**
  * inactive_file_is_low - check if file pages need to be deactivated
  * @lruvec: LRU vector to check
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone)
  */
 static int inactive_file_is_low(struct lruvec *lruvec)
 {
-	if (!mem_cgroup_disabled())
-		return mem_cgroup_inactive_file_is_low(lruvec);
+	unsigned long inactive;
+	unsigned long active;
+
+	inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
+	active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
 
-	return inactive_file_is_low_global(lruvec_zone(lruvec));
+	return active > inactive;
 }
 
 static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc)
 	return mem_cgroup_swappiness(sc->target_mem_cgroup);
 }
 
+enum scan_balance {
+	SCAN_EQUAL,
+	SCAN_FRACT,
+	SCAN_ANON,
+	SCAN_FILE,
+};
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned.  The relative value of each set of LRU lists is determined
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc)
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 			   unsigned long *nr)
 {
-	unsigned long anon, file, free;
+	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+	u64 fraction[2];
+	u64 denominator = 0;	/* gcc */
+	struct zone *zone = lruvec_zone(lruvec);
 	unsigned long anon_prio, file_prio;
+	enum scan_balance scan_balance;
+	unsigned long anon, file, free;
+	bool force_scan = false;
 	unsigned long ap, fp;
-	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
-	u64 fraction[2], denominator;
 	enum lru_list lru;
-	int noswap = 0;
-	bool force_scan = false;
-	struct zone *zone = lruvec_zone(lruvec);
 
 	/*
 	 * If the zone or memcg is small, nr[l] can be 0.  This
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 		force_scan = true;
 
 	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || (nr_swap_pages <= 0)) {
-		noswap = 1;
-		fraction[0] = 0;
-		fraction[1] = 1;
-		denominator = 1;
+	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
+		scan_balance = SCAN_FILE;
+		goto out;
+	}
+
+	/*
+	 * Global reclaim will swap to prevent OOM even with no
+	 * swappiness, but memcg users want to use this knob to
+	 * disable swapping for individual groups completely when
+	 * using the memory controller's swap limit feature would be
+	 * too expensive.
+	 */
+	if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
+		scan_balance = SCAN_FILE;
+		goto out;
+	}
+
+	/*
+	 * Do not apply any pressure balancing cleverness when the
+	 * system is close to OOM, scan both anon and file equally
+	 * (unless the swappiness setting disagrees with swapping).
+	 */
+	if (!sc->priority && vmscan_swappiness(sc)) {
+		scan_balance = SCAN_EQUAL;
 		goto out;
 	}
 
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
 		get_lru_size(lruvec, LRU_INACTIVE_FILE);
 
+	/*
+	 * If it's foreseeable that reclaiming the file cache won't be
+	 * enough to get the zone back into a desirable shape, we have
+	 * to swap.  Better start now and leave the - probably heavily
+	 * thrashing - remaining file pages alone.
+	 */
 	if (global_reclaim(sc)) {
-		free  = zone_page_state(zone, NR_FREE_PAGES);
+		free = zone_page_state(zone, NR_FREE_PAGES);
 		if (unlikely(file + free <= high_wmark_pages(zone))) {
-			/*
-			 * If we have very few page cache pages, force-scan
-			 * anon pages.
-			 */
-			fraction[0] = 1;
-			fraction[1] = 0;
-			denominator = 1;
-			goto out;
-		} else if (!inactive_file_is_low_global(zone)) {
-			/*
-			 * There is enough inactive page cache, do not
-			 * reclaim anything from the working set right now.
-			 */
-			fraction[0] = 0;
-			fraction[1] = 1;
-			denominator = 1;
+			scan_balance = SCAN_ANON;
 			goto out;
 		}
 	}
 
 	/*
+	 * There is enough inactive page cache, do not reclaim
+	 * anything from the anonymous working set right now.
+	 */
+	if (!inactive_file_is_low(lruvec)) {
+		scan_balance = SCAN_FILE;
+		goto out;
+	}
+
+	scan_balance = SCAN_FRACT;
+
+	/*
 	 * With swappiness at 100, anonymous and file have the same priority.
 	 * This scanning priority is essentially the inverse of IO cost.
 	 */
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 out:
 	for_each_evictable_lru(lru) {
 		int file = is_file_lru(lru);
+		unsigned long size;
 		unsigned long scan;
 
-		scan = get_lru_size(lruvec, lru);
-		if (sc->priority || noswap || !vmscan_swappiness(sc)) {
-			scan >>= sc->priority;
-			if (!scan && force_scan)
-				scan = SWAP_CLUSTER_MAX;
+		size = get_lru_size(lruvec, lru);
+		scan = size >> sc->priority;
+
+		if (!scan && force_scan)
+			scan = min(size, SWAP_CLUSTER_MAX);
+
+		switch (scan_balance) {
+		case SCAN_EQUAL:
+			/* Scan lists relative to size */
+			break;
+		case SCAN_FRACT:
+			/*
+			 * Scan types proportional to swappiness and
+			 * their relative recent reclaim efficiency.
+			 */
 			scan = div64_u64(scan * fraction[file], denominator);
+			break;
+		case SCAN_FILE:
+		case SCAN_ANON:
+			/* Scan one type exclusively */
+			if ((scan_balance == SCAN_FILE) != file)
+				scan = 0;
+			break;
+		default:
+			/* Look ma, no brain */
+			BUG();
 		}
 		nr[lru] = scan;
 	}
 }
 
+/*
+ * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
+ */
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+	unsigned long nr[NR_LRU_LISTS];
+	unsigned long nr_to_scan;
+	enum lru_list lru;
+	unsigned long nr_reclaimed = 0;
+	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+	struct blk_plug plug;
+
+	get_scan_count(lruvec, sc, nr);
+
+	blk_start_plug(&plug);
+	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+					nr[LRU_INACTIVE_FILE]) {
+		for_each_evictable_lru(lru) {
+			if (nr[lru]) {
+				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
+				nr[lru] -= nr_to_scan;
+
+				nr_reclaimed += shrink_list(lru, nr_to_scan,
+							    lruvec, sc);
+			}
+		}
+		/*
+		 * On large memory systems, scan >> priority can become
+		 * really large. This is fine for the starting priority;
+		 * we want to put equal scanning pressure on each zone.
+		 * However, if the VM has a harder time of freeing pages,
+		 * with multiple processes reclaiming pages, the total
+		 * freeing target can get unreasonably large.
+		 */
+		if (nr_reclaimed >= nr_to_reclaim &&
+			sc->priority < DEF_PRIORITY)
+			break;
+	}
+	blk_finish_plug(&plug);
+	sc->nr_reclaimed += nr_reclaimed;
+
+	/*
+	 * Even if we did not try to evict anon pages at all, we want to
+	 * rebalance the anon lru active/inactive ratio.
+	 */
+	if (inactive_anon_is_low(lruvec))
+		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+				   sc, LRU_ACTIVE_ANON);
+
+	throttle_vm_writeout(sc->gfp_mask);
+}
+
 /* Use reclaim/compaction for costly allocs or under memory pressure */
 static bool in_reclaim_compaction(struct scan_control *sc)
 {
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
  * calls try_to_compact_zone() that it will have enough free pages to succeed.
  * It will give up earlier than that if there is difficulty reclaiming pages.
  */
-static inline bool should_continue_reclaim(struct lruvec *lruvec,
+static inline bool should_continue_reclaim(struct zone *zone,
 					unsigned long nr_reclaimed,
 					unsigned long nr_scanned,
 					struct scan_control *sc)
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
 	 * inactive lists are large enough, continue reclaiming
 	 */
 	pages_for_compaction = (2UL << sc->order);
-	inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
-	if (nr_swap_pages > 0)
-		inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
+	inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+	if (get_nr_swap_pages() > 0)
+		inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
 	if (sc->nr_reclaimed < pages_for_compaction &&
 			inactive_lru_pages > pages_for_compaction)
 		return true;
 
 	/* If compaction would go ahead or the allocation would succeed, stop */
-	switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
+	switch (compaction_suitable(zone, sc->order)) {
 	case COMPACT_PARTIAL:
 	case COMPACT_CONTINUE:
 		return false;
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
 	}
 }
 
-/*
- * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
- */
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
-	unsigned long nr[NR_LRU_LISTS];
-	unsigned long nr_to_scan;
-	enum lru_list lru;
 	unsigned long nr_reclaimed, nr_scanned;
-	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-	struct blk_plug plug;
-
-restart:
-	nr_reclaimed = 0;
-	nr_scanned = sc->nr_scanned;
-	get_scan_count(lruvec, sc, nr);
-
-	blk_start_plug(&plug);
-	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
-					nr[LRU_INACTIVE_FILE]) {
-		for_each_evictable_lru(lru) {
-			if (nr[lru]) {
-				nr_to_scan = min_t(unsigned long,
-						   nr[lru], SWAP_CLUSTER_MAX);
-				nr[lru] -= nr_to_scan;
-
-				nr_reclaimed += shrink_list(lru, nr_to_scan,
-							    lruvec, sc);
-			}
-		}
-		/*
-		 * On large memory systems, scan >> priority can become
-		 * really large. This is fine for the starting priority;
-		 * we want to put equal scanning pressure on each zone.
-		 * However, if the VM has a harder time of freeing pages,
-		 * with multiple processes reclaiming pages, the total
-		 * freeing target can get unreasonably large.
-		 */
-		if (nr_reclaimed >= nr_to_reclaim &&
-			sc->priority < DEF_PRIORITY)
-			break;
-	}
-	blk_finish_plug(&plug);
-	sc->nr_reclaimed += nr_reclaimed;
-
-	/*
-	 * Even if we did not try to evict anon pages at all, we want to
-	 * rebalance the anon lru active/inactive ratio.
-	 */
-	if (inactive_anon_is_low(lruvec))
-		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
-				   sc, LRU_ACTIVE_ANON);
-
-	/* reclaim/compaction might need reclaim to continue */
-	if (should_continue_reclaim(lruvec, nr_reclaimed,
-				    sc->nr_scanned - nr_scanned, sc))
-		goto restart;
+	do {
+		struct mem_cgroup *root = sc->target_mem_cgroup;
+		struct mem_cgroup_reclaim_cookie reclaim = {
+			.zone = zone,
+			.priority = sc->priority,
+		};
+		struct mem_cgroup *memcg;
 
-	throttle_vm_writeout(sc->gfp_mask);
-}
+		nr_reclaimed = sc->nr_reclaimed;
+		nr_scanned = sc->nr_scanned;
 
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
-	struct mem_cgroup *root = sc->target_mem_cgroup;
-	struct mem_cgroup_reclaim_cookie reclaim = {
-		.zone = zone,
-		.priority = sc->priority,
-	};
-	struct mem_cgroup *memcg;
+		memcg = mem_cgroup_iter(root, NULL, &reclaim);
+		do {
+			struct lruvec *lruvec;
 
-	memcg = mem_cgroup_iter(root, NULL, &reclaim);
-	do {
-		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-		shrink_lruvec(lruvec, sc);
+			shrink_lruvec(lruvec, sc);
 
-		/*
-		 * Limit reclaim has historically picked one memcg and
-		 * scanned it with decreasing priority levels until
-		 * nr_to_reclaim had been reclaimed.  This priority
-		 * cycle is thus over after a single memcg.
-		 *
-		 * Direct reclaim and kswapd, on the other hand, have
-		 * to scan all memory cgroups to fulfill the overall
-		 * scan target for the zone.
-		 */
-		if (!global_reclaim(sc)) {
-			mem_cgroup_iter_break(root, memcg);
-			break;
-		}
-		memcg = mem_cgroup_iter(root, memcg, &reclaim);
-	} while (memcg);
+			/*
+			 * Direct reclaim and kswapd have to scan all memory
+			 * cgroups to fulfill the overall scan target for the
+			 * zone.
+			 *
+			 * Limit reclaim, on the other hand, only cares about
+			 * nr_to_reclaim pages to be reclaimed and it will
+			 * retry with decreasing priority if one round over the
+			 * whole hierarchy is not sufficient.
+			 */
+			if (!global_reclaim(sc) &&
+					sc->nr_reclaimed >= sc->nr_to_reclaim) {
+				mem_cgroup_iter_break(root, memcg);
+				break;
+			}
+			memcg = mem_cgroup_iter(root, memcg, &reclaim);
+		} while (memcg);
+	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+					 sc->nr_scanned - nr_scanned, sc));
 }
 
 /* Returns true if compaction should go ahead for a high-order request */
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 	 * a reasonable chance of completing and allocating the page
 	 */
 	balance_gap = min(low_wmark_pages(zone),
-		(zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
			KSWAPD_ZONE_BALANCE_GAP_RATIO);
 	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
 	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			goto out;
 
 		/*
+		 * If we're getting trouble reclaiming, start doing
+		 * writepage even in laptop mode.
+		 */
+		if (sc->priority < DEF_PRIORITY - 2)
+			sc->may_writepage = 1;
+
+		/*
 		 * Try to write back as many pages as we just scanned. This
 		 * tends to cause slow streaming writers to write data to the
 		 * disk smoothly, at the dirtying rate, which is nice. But
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 {
 	unsigned long nr_reclaimed;
 	struct scan_control sc = {
-		.gfp_mask = gfp_mask,
+		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
 		.may_writepage = !laptop_mode,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.may_unmap = 1,
@@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order,
  */
 static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 {
-	unsigned long present_pages = 0;
+	unsigned long managed_pages = 0;
 	unsigned long balanced_pages = 0;
 	int i;
 
@@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 		if (!populated_zone(zone))
 			continue;
 
-		present_pages += zone->present_pages;
+		managed_pages += zone->managed_pages;
 
 		/*
 		 * A special case here:
@@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 		 * they must be considered balanced here as well!
 		 */
 		if (zone->all_unreclaimable) {
-			balanced_pages += zone->present_pages;
+			balanced_pages += zone->managed_pages;
 			continue;
 		}
 
 		if (zone_balanced(zone, order, 0, i))
-			balanced_pages += zone->present_pages;
+			balanced_pages += zone->managed_pages;
 		else if (!order)
 			return false;
 	}
 
 	if (order)
-		return balanced_pages >= (present_pages >> 2);
+		return balanced_pages >= (managed_pages >> 2);
 	else
 		return true;
 }
@@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
							int *classzone_idx)
 {
-	struct zone *unbalanced_zone;
+	bool pgdat_is_balanced = false;
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 	unsigned long total_scanned;
@@ -2595,9 +2647,6 @@ loop_again:
 
 	do {
 		unsigned long lru_pages = 0;
-		int has_under_min_watermark_zone = 0;
-
-		unbalanced_zone = NULL;
 
 		/*
 		 * Scan in the highmem->dma direction for the highest
@@ -2638,8 +2687,11 @@ loop_again:
				zone_clear_flag(zone, ZONE_CONGESTED);
			}
		}
-		if (i < 0)
+
+		if (i < 0) {
+			pgdat_is_balanced = true;
 			goto out;
+		}
 
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
@@ -2689,7 +2741,7 @@ loop_again:
			 * of the zone, whichever is smaller.
			 */
			balance_gap = min(low_wmark_pages(zone),
-				(zone->present_pages +
+				(zone->managed_pages +
					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
				KSWAPD_ZONE_BALANCE_GAP_RATIO);
			/*
@@ -2720,12 +2772,10 @@ loop_again:
			}

			/*
-			 * If we've done a decent amount of scanning and
-			 * the reclaim ratio is low, start doing writepage
-			 * even in laptop mode
+			 * If we're getting trouble reclaiming, start doing
+			 * writepage even in laptop mode.
			 */
-			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
+			if (sc.priority < DEF_PRIORITY - 2)
				sc.may_writepage = 1;

			if (zone->all_unreclaimable) {
@@ -2734,17 +2784,7 @@ loop_again:
				continue;
			}

-			if (!zone_balanced(zone, testorder, 0, end_zone)) {
-				unbalanced_zone = zone;
-				/*
-				 * We are still under min water mark. This
-				 * means that we have a GFP_ATOMIC allocation
-				 * failure risk. Hurry up!
-				 */
-				if (!zone_watermark_ok_safe(zone, order,
-					    min_wmark_pages(zone), end_zone, 0))
-					has_under_min_watermark_zone = 1;
-			} else {
+			if (zone_balanced(zone, testorder, 0, end_zone))
				/*
				 * If a zone reaches its high watermark,
				 * consider it to be no longer congested. It's
@@ -2753,8 +2793,6 @@ loop_again:
				 * speculatively avoid congestion waits
				 */
				zone_clear_flag(zone, ZONE_CONGESTED);
-			}
-		}

		/*
@@ -2766,17 +2804,9 @@ loop_again:
				pfmemalloc_watermark_ok(pgdat))
			wake_up(&pgdat->pfmemalloc_wait);

-		if (pgdat_balanced(pgdat, order, *classzone_idx))
+		if (pgdat_balanced(pgdat, order, *classzone_idx)) {
+			pgdat_is_balanced = true;
			break;		/* kswapd: all done */
-		/*
-		 * OK, kswapd is getting into trouble. Take a nap, then take
-		 * another pass across the zones.
-		 */
-		if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
-			if (has_under_min_watermark_zone)
-				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
-			else if (unbalanced_zone)
-				wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
		}

		/*
@@ -2788,9 +2818,9 @@ loop_again:
		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
			break;
	} while (--sc.priority >= 0);
-out:
-	if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
+out:
+	if (!pgdat_is_balanced) {
		cond_resched();
		try_to_freeze();

@@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void)
	nr = global_page_state(NR_ACTIVE_FILE) +
	     global_page_state(NR_INACTIVE_FILE);

-	if (nr_swap_pages > 0)
+	if (get_nr_swap_pages() > 0)
		nr += global_page_state(NR_ACTIVE_ANON) +
		      global_page_state(NR_INACTIVE_ANON);

@@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
	     zone_page_state(zone, NR_INACTIVE_FILE);

-	if (nr_swap_pages > 0)
+	if (get_nr_swap_pages() > 0)
		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
		      zone_page_state(zone, NR_INACTIVE_ANON);

@@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
		.may_swap = 1,
-		.nr_to_reclaim = max_t(unsigned long, nr_pages,
-				       SWAP_CLUSTER_MAX),
-		.gfp_mask = gfp_mask,
+		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.order = order,
		.priority = ZONE_RECLAIM_PRIORITY,
	};
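
The get_scan_count() hunks above replace the old noswap/fraction special cases with an explicit scan_balance classification (SCAN_FILE, SCAN_EQUAL, SCAN_ANON, SCAN_FRACT) that is resolved before any proportional math is done. What follows is a minimal userspace sketch of that decision order only; struct reclaim_state, classify() and the sample values in main() are invented stand-ins for the kernel's scan_control/lruvec state and are not part of the patch.

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative sketch, not kernel code: it mirrors the order in which the
 * patched get_scan_count() picks a scan_balance value for a reclaim pass.
 */
enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

struct reclaim_state {			/* hypothetical stand-in for sc/lruvec state */
	bool may_swap;			/* sc->may_swap */
	long nr_swap_pages;		/* get_nr_swap_pages() */
	bool global_reclaim;		/* global_reclaim(sc) */
	int swappiness;			/* vmscan_swappiness(sc) */
	int priority;			/* sc->priority, 0 == near-OOM pass */
	unsigned long file;		/* file LRU pages */
	unsigned long free;		/* NR_FREE_PAGES */
	unsigned long high_wmark;	/* high_wmark_pages(zone) */
	bool inactive_file_low;		/* inactive_file_is_low(lruvec) */
};

static enum scan_balance classify(const struct reclaim_state *s)
{
	/* No swap space: only the file LRUs are worth scanning. */
	if (!s->may_swap || s->nr_swap_pages <= 0)
		return SCAN_FILE;

	/* Memcg (limit) reclaim honours swappiness == 0 as "never swap". */
	if (!s->global_reclaim && !s->swappiness)
		return SCAN_FILE;

	/* Close to OOM: scan anon and file equally, if swapping is allowed. */
	if (!s->priority && s->swappiness)
		return SCAN_EQUAL;

	/* File cache alone cannot restore the watermark: force anon scanning. */
	if (s->global_reclaim && s->file + s->free <= s->high_wmark)
		return SCAN_ANON;

	/* Plenty of inactive file cache: leave the anon working set alone. */
	if (!s->inactive_file_low)
		return SCAN_FILE;

	/* Otherwise split pressure by swappiness and recent reclaim efficiency. */
	return SCAN_FRACT;
}

int main(void)
{
	struct reclaim_state s = {
		.may_swap = true, .nr_swap_pages = 1L << 18,
		.global_reclaim = true, .swappiness = 60, .priority = 10,
		.file = 4096, .free = 1024, .high_wmark = 8192,
		.inactive_file_low = false,
	};

	/* file + free <= high_wmark here, so this prints SCAN_ANON's value (2). */
	printf("scan_balance = %d\n", classify(&s));
	return 0;
}

Only the SCAN_FRACT case goes on to use the fraction[]/denominator weights derived from swappiness and the recent rotation statistics; the other three cases skip the proportional calculation entirely, which is what the switch at the end of the patched get_scan_count() then applies per LRU list.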