diff options
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 388 |
1 files changed, 279 insertions, 109 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index f6b435c8007..7ef69124fa3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -42,6 +42,7 @@ #include <linux/delayacct.h> #include <linux/sysctl.h> #include <linux/oom.h> +#include <linux/prefetch.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -94,8 +95,6 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; - int swappiness; - int order; /* @@ -106,6 +105,7 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; + struct memcg_scanrecord *memcg_record; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -172,7 +172,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, struct scan_control *sc, enum lru_list lru) { if (!scanning_global_lru(sc)) - return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); + return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, + zone_to_nid(zone), zone_idx(zone), BIT(lru)); return zone_page_state(zone, NR_LRU_BASE + lru); } @@ -201,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker) } EXPORT_SYMBOL(unregister_shrinker); +static inline int do_shrinker_shrink(struct shrinker *shrinker, + struct shrink_control *sc, + unsigned long nr_to_scan) +{ + sc->nr_to_scan = nr_to_scan; + return (*shrinker->shrink)(shrinker, sc); +} + #define SHRINK_BATCH 128 /* * Call the shrink functions to age shrinkable caches @@ -221,67 +230,114 @@ EXPORT_SYMBOL(unregister_shrinker); * * Returns the number of slab objects which we shrunk. 
*/ -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages) +unsigned long shrink_slab(struct shrink_control *shrink, + unsigned long nr_pages_scanned, + unsigned long lru_pages) { struct shrinker *shrinker; unsigned long ret = 0; - if (scanned == 0) - scanned = SWAP_CLUSTER_MAX; + if (nr_pages_scanned == 0) + nr_pages_scanned = SWAP_CLUSTER_MAX; - if (!down_read_trylock(&shrinker_rwsem)) - return 1; /* Assume we'll be able to shrink next time */ + if (!down_read_trylock(&shrinker_rwsem)) { + /* Assume we'll be able to shrink next time */ + ret = 1; + goto out; + } list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; unsigned long max_pass; + int shrink_ret = 0; + long nr; + long new_nr; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; - max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); - delta = (4 * scanned) / shrinker->seeks; + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + do { + nr = shrinker->nr; + } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + + total_scan = nr; + max_pass = do_shrinker_shrink(shrinker, shrink, 0); + delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); - shrinker->nr += delta; - if (shrinker->nr < 0) { + total_scan += delta; + if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to " "delete nr=%ld\n", - shrinker->shrink, shrinker->nr); - shrinker->nr = max_pass; + shrinker->shrink, total_scan); + total_scan = max_pass; } /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. 
This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * max_pass. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < max_pass / 4) + total_scan = min(total_scan, max_pass / 2); + + /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of * freeable entries. */ - if (shrinker->nr > max_pass * 2) - shrinker->nr = max_pass * 2; + if (total_scan > max_pass * 2) + total_scan = max_pass * 2; - total_scan = shrinker->nr; - shrinker->nr = 0; + trace_mm_shrink_slab_start(shrinker, shrink, nr, + nr_pages_scanned, lru_pages, + max_pass, delta, total_scan); - while (total_scan >= SHRINK_BATCH) { - long this_scan = SHRINK_BATCH; - int shrink_ret; + while (total_scan >= batch_size) { int nr_before; - nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); - shrink_ret = (*shrinker->shrink)(shrinker, this_scan, - gfp_mask); + nr_before = do_shrinker_shrink(shrinker, shrink, 0); + shrink_ret = do_shrinker_shrink(shrinker, shrink, + batch_size); if (shrink_ret == -1) break; if (shrink_ret < nr_before) ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, this_scan); - total_scan -= this_scan; + count_vm_events(SLABS_SCANNED, batch_size); + total_scan -= batch_size; cond_resched(); } - shrinker->nr += total_scan; + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. 
+ */ + do { + nr = shrinker->nr; + new_nr = total_scan + nr; + if (total_scan <= 0) + break; + } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + + trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } up_read(&shrinker_rwsem); +out: + cond_resched(); return ret; } @@ -937,7 +993,7 @@ keep_lumpy: * back off and wait for congestion to clear because further reclaim * will encounter the same problem */ - if (nr_dirty == nr_congested && nr_dirty != 0) + if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) zone_set_flag(zone, ZONE_CONGESTED); free_page_list(&free_pages); @@ -1109,8 +1165,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_lumpy_dirty++; scan++; } else { - /* the page is freed already. */ - if (!page_count(cursor_page)) + /* + * Check if the page is freed already. + * + * We can't use page_count() as that + * requires compound_head and we don't + * have a pin on the page here. If a + * page is tail, we may or may not + * have isolated the head, so assume + * it's not free, it'd be tricky to + * track the head status without a + * page pin. 
+ */ + if (!PageTail(cursor_page) && + !atomic_read(&cursor_page->_count)) continue; break; } @@ -1201,13 +1269,16 @@ int isolate_lru_page(struct page *page) { int ret = -EBUSY; + VM_BUG_ON(!page_count(page)); + if (PageLRU(page)) { struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && get_page_unless_zero(page)) { + if (PageLRU(page)) { int lru = page_lru(page); ret = 0; + get_page(page); ClearPageLRU(page); del_page_from_lru_list(zone, page, lru); @@ -1278,6 +1349,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1321,6 +1394,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; + if (!scanning_global_lru(sc)) { + sc->memcg_record->nr_scanned[0] += *nr_anon; + sc->memcg_record->nr_scanned[1] += *nr_file; + } } /* @@ -1434,6 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, nr_reclaimed += shrink_page_list(&page_list, zone, sc); } + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_freed[file] += nr_reclaimed; + local_irq_disable(); if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); @@ -1533,6 +1613,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[file] += nr_taken; + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) @@ -1584,6 +1666,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * get_scan_ratio. 
*/ reclaim_stat->recent_rotated[file] += nr_rotated; + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); @@ -1699,24 +1783,11 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } -/* - * Smallish @nr_to_scan's are deposited in @nr_saved_scan, - * until we collected @swap_cluster_max pages to scan. - */ -static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan) +static int vmscan_swappiness(struct scan_control *sc) { - unsigned long nr; - - *nr_saved_scan += nr_to_scan; - nr = *nr_saved_scan; - - if (nr >= SWAP_CLUSTER_MAX) - *nr_saved_scan = 0; - else - nr = 0; - - return nr; + if (scanning_global_lru(sc)) + return vm_swappiness; + return mem_cgroup_swappiness(sc->mem_cgroup); } /* @@ -1737,6 +1808,23 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, u64 fraction[2], denominator; enum lru_list l; int noswap = 0; + int force_scan = 0; + unsigned long nr_force_scan[2]; + + + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + + if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { + /* kswapd does zone balancing and need to scan this zone */ + if (scanning_global_lru(sc) && current_is_kswapd()) + force_scan = 1; + /* memcg may have small limit and need to avoid priority drop */ + if (!scanning_global_lru(sc)) + force_scan = 1; + } /* If we have no swap space, do not bother scanning anon pages. 
*/ if (!sc->may_swap || (nr_swap_pages <= 0)) { @@ -1744,14 +1832,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 0; fraction[1] = 1; denominator = 1; + nr_force_scan[0] = 0; + nr_force_scan[1] = SWAP_CLUSTER_MAX; goto out; } - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); - if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, @@ -1760,6 +1845,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 1; fraction[1] = 0; denominator = 1; + nr_force_scan[0] = SWAP_CLUSTER_MAX; + nr_force_scan[1] = 0; goto out; } } @@ -1768,8 +1855,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = sc->swappiness; - file_prio = 200 - sc->swappiness; + anon_prio = vmscan_swappiness(sc); + file_prio = 200 - vmscan_swappiness(sc); /* * OK, so we have swap space and a fair amount of page cache @@ -1808,6 +1895,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = ap; fraction[1] = fp; denominator = ap + fp + 1; + if (force_scan) { + unsigned long scan = SWAP_CLUSTER_MAX; + nr_force_scan[0] = div64_u64(scan * ap, denominator); + nr_force_scan[1] = div64_u64(scan * fp, denominator); + } out: for_each_evictable_lru(l) { int file = is_file_lru(l); @@ -1818,8 +1910,19 @@ out: scan >>= priority; scan = div64_u64(scan * fraction[file], denominator); } - nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l]); + + /* + * If zone is small or memcg is small, nr[l] can be 0. + * This results no-scan on this priority and priority drop down. 
+ For global direct reclaim, it can visit the next zone and tends + not to have problems. For global kswapd, it's for zone + balancing and it needs to scan a small amount. When using + memcg, priority drop can cause big latency. So, it's better + to scan a small amount. See force_scan above. + */ + if (!scan && force_scan) + scan = nr_force_scan[file]; + nr[l] = scan; } } @@ -1964,6 +2067,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, { struct zoneref *z; struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -1978,6 +2083,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist, continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. 
+ */ + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; + /* need some check for avoid more shrink_zone() */ } shrink_zone(priority, zone, sc); @@ -2026,7 +2144,8 @@ static bool all_unreclaimable(struct zonelist *zonelist, * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - struct scan_control *sc) + struct scan_control *sc, + struct shrink_control *shrink) { int priority; unsigned long total_scanned = 0; @@ -2044,7 +2163,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; if (!priority) - disable_swap_token(); + disable_swap_token(sc->mem_cgroup); shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from @@ -2060,7 +2179,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, lru_pages += zone_reclaimable_pages(zone); } - shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); + shrink_slab(shrink, sc->nr_scanned, lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -2127,17 +2246,19 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, .nodemask = nodemask, }; + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, + }; trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); @@ -2147,19 +2268,23 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR 
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - unsigned int swappiness, - struct zone *zone) + gfp_t gfp_mask, bool noswap, + struct zone *zone, + struct memcg_scanrecord *rec, + unsigned long *scanned) { struct scan_control sc = { + .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swappiness = swappiness, .order = 0, .mem_cgroup = mem, + .memcg_record = rec, }; + unsigned long start, end; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2167,6 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, sc.may_writepage, sc.gfp_mask); + start = sched_clock(); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -2175,6 +2301,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. 
*/ shrink_zone(0, zone, &sc); + end = sched_clock(); + + if (rec) + rec->elapsed += end - start; + *scanned = sc.nr_scanned; trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2184,30 +2315,46 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, bool noswap, - unsigned int swappiness) + struct memcg_scanrecord *rec) { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long start, end; + int nid; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, + .memcg_record = rec, .nodemask = NULL, /* we don't care the placement */ + .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + }; + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, }; - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | - (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - zonelist = NODE_DATA(numa_node_id())->node_zonelists; + start = sched_clock(); + /* + * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't + * take care of from where we get pages. So the node where we start the + * scan does not need to be the current node. 
+ */ + nid = mem_cgroup_select_victim_node(mem_cont); + + zonelist = NODE_DATA(nid)->node_zonelists; trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, sc.gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + end = sched_clock(); + if (rec) + rec->elapsed += end - start; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -2240,7 +2387,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, for (i = 0; i <= classzone_idx; i++) present_pages += pgdat->node_zones[i].present_pages; - return balanced_pages > (present_pages >> 2); + /* A special case here: if zone has no page, we think it's balanced */ + return balanced_pages >= (present_pages >> 2); } /* is kswapd sleeping prematurely? */ @@ -2256,7 +2404,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, return true; /* Check the watermark levels */ - for (i = 0; i < pgdat->nr_zones; i++) { + for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; if (!populated_zone(zone)) @@ -2274,7 +2422,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, } if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), - classzone_idx, 0)) + i, 0)) all_zones_ok = false; else balanced += zone->present_pages; @@ -2286,7 +2434,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, * must be balanced */ if (order) - return pgdat_balanced(pgdat, balanced, classzone_idx); + return !pgdat_balanced(pgdat, balanced, classzone_idx); else return !all_zones_ok; } @@ -2322,6 +2470,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int end_zone = 0; /* Inclusive. 
0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, @@ -2331,10 +2481,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * we want to put equal scanning pressure on each zone. */ .nr_to_reclaim = ULONG_MAX, - .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, }; + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, + }; loop_again: total_scanned = 0; sc.nr_reclaimed = 0; @@ -2347,7 +2499,7 @@ loop_again: /* The swap token gets in the way of swapout... */ if (!priority) - disable_swap_token(); + disable_swap_token(NULL); all_zones_ok = 1; balanced = 0; @@ -2376,7 +2528,6 @@ loop_again: if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; - *classzone_idx = i; break; } } @@ -2411,11 +2562,15 @@ loop_again: sc.nr_scanned = 0; + nr_soft_scanned = 0; /* * Call soft limit reclaim before calling shrink_zone. 
- * For now we ignore the return value */ - mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + order, sc.gfp_mask, + &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + total_scanned += nr_soft_scanned; /* * We put equal pressure on every zone, unless @@ -2431,19 +2586,18 @@ loop_again: KSWAPD_ZONE_BALANCE_GAP_RATIO); if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + balance_gap, - end_zone, 0)) + end_zone, 0)) { shrink_zone(priority, zone, &sc); - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, - lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_scanned += sc.nr_scanned; - if (zone->all_unreclaimable) - continue; - if (nr_slab == 0 && - !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); + sc.nr_reclaimed += reclaim_state->reclaimed_slab; + total_scanned += sc.nr_scanned; + + if (nr_slab == 0 && !zone_reclaimable(zone)) + zone->all_unreclaimable = 1; + } + /* * If we've done a decent amount of scanning and * the reclaim ratio is low, start doing writepage @@ -2453,6 +2607,12 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + if (zone->all_unreclaimable) { + if (end_zone && end_zone == i) + end_zone--; + continue; + } + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; @@ -2631,8 +2791,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ static int kswapd(void *p) { - unsigned long order; - int classzone_idx; + unsigned long order, new_order; + int classzone_idx, new_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2662,17 +2822,23 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - order = 0; - 
classzone_idx = MAX_NR_ZONES - 1; + order = new_order = 0; + classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; for ( ; ; ) { - unsigned long new_order; - int new_classzone_idx; int ret; - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = MAX_NR_ZONES - 1; + /* + * If the last balance_pgdat was unsuccessful it's unlikely a + * new request of a similar or harder type will succeed soon + * so consider going to sleep on the basis we reclaimed at + */ + if (classzone_idx >= new_classzone_idx && order == new_order) { + new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; + } + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' @@ -2685,7 +2851,7 @@ static int kswapd(void *p) order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = MAX_NR_ZONES - 1; + pgdat->classzone_idx = pgdat->nr_zones - 1; } ret = try_to_freeze(); @@ -2784,10 +2950,12 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .may_writepage = 1, .nr_to_reclaim = nr_to_reclaim, .hibernation_mode = 1, - .swappiness = vm_swappiness, .order = 0, }; - struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, + }; + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; @@ -2796,7 +2964,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -2968,9 +3136,11 @@ static 
int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .nr_to_reclaim = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, - .swappiness = vm_swappiness, .order = order, }; + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, + }; unsigned long nr_slab_pages0, nr_slab_pages1; cond_resched(); @@ -3012,7 +3182,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) unsigned long lru_pages = zone_reclaimable_pages(zone); /* No reclaimable slab or very low memory pressure */ - if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) + if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) break; /* Freed enough memory */ |