summaryrefslogtreecommitdiffstats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c388
1 files changed, 279 insertions, 109 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f6b435c8007..7ef69124fa3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -42,6 +42,7 @@
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
+#include <linux/prefetch.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -94,8 +95,6 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
int may_swap;
- int swappiness;
-
int order;
/*
@@ -106,6 +105,7 @@ struct scan_control {
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
+ struct memcg_scanrecord *memcg_record;
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -172,7 +172,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
struct scan_control *sc, enum lru_list lru)
{
if (!scanning_global_lru(sc))
- return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
+ return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
+ zone_to_nid(zone), zone_idx(zone), BIT(lru));
return zone_page_state(zone, NR_LRU_BASE + lru);
}
@@ -201,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(unregister_shrinker);
+static inline int do_shrinker_shrink(struct shrinker *shrinker,
+ struct shrink_control *sc,
+ unsigned long nr_to_scan)
+{
+ sc->nr_to_scan = nr_to_scan;
+ return (*shrinker->shrink)(shrinker, sc);
+}
+
#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
@@ -221,67 +230,114 @@ EXPORT_SYMBOL(unregister_shrinker);
*
* Returns the number of slab objects which we shrunk.
*/
-unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+unsigned long shrink_slab(struct shrink_control *shrink,
+ unsigned long nr_pages_scanned,
+ unsigned long lru_pages)
{
struct shrinker *shrinker;
unsigned long ret = 0;
- if (scanned == 0)
- scanned = SWAP_CLUSTER_MAX;
+ if (nr_pages_scanned == 0)
+ nr_pages_scanned = SWAP_CLUSTER_MAX;
- if (!down_read_trylock(&shrinker_rwsem))
- return 1; /* Assume we'll be able to shrink next time */
+ if (!down_read_trylock(&shrinker_rwsem)) {
+ /* Assume we'll be able to shrink next time */
+ ret = 1;
+ goto out;
+ }
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
unsigned long max_pass;
+ int shrink_ret = 0;
+ long nr;
+ long new_nr;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
- max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
- delta = (4 * scanned) / shrinker->seeks;
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ do {
+ nr = shrinker->nr;
+ } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+ total_scan = nr;
+ max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
- shrinker->nr += delta;
- if (shrinker->nr < 0) {
+ total_scan += delta;
+ if (total_scan < 0) {
printk(KERN_ERR "shrink_slab: %pF negative objects to "
"delete nr=%ld\n",
- shrinker->shrink, shrinker->nr);
- shrinker->nr = max_pass;
+ shrinker->shrink, total_scan);
+ total_scan = max_pass;
}
/*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * max_pass. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < max_pass / 4)
+ total_scan = min(total_scan, max_pass / 2);
+
+ /*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
- if (shrinker->nr > max_pass * 2)
- shrinker->nr = max_pass * 2;
+ if (total_scan > max_pass * 2)
+ total_scan = max_pass * 2;
- total_scan = shrinker->nr;
- shrinker->nr = 0;
+ trace_mm_shrink_slab_start(shrinker, shrink, nr,
+ nr_pages_scanned, lru_pages,
+ max_pass, delta, total_scan);
- while (total_scan >= SHRINK_BATCH) {
- long this_scan = SHRINK_BATCH;
- int shrink_ret;
+ while (total_scan >= batch_size) {
int nr_before;
- nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
- shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
- gfp_mask);
+ nr_before = do_shrinker_shrink(shrinker, shrink, 0);
+ shrink_ret = do_shrinker_shrink(shrinker, shrink,
+ batch_size);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
- count_vm_events(SLABS_SCANNED, this_scan);
- total_scan -= this_scan;
+ count_vm_events(SLABS_SCANNED, batch_size);
+ total_scan -= batch_size;
cond_resched();
}
- shrinker->nr += total_scan;
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ do {
+ nr = shrinker->nr;
+ new_nr = total_scan + nr;
+ if (total_scan <= 0)
+ break;
+ } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+ trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
}
up_read(&shrinker_rwsem);
+out:
+ cond_resched();
return ret;
}
@@ -937,7 +993,7 @@ keep_lumpy:
* back off and wait for congestion to clear because further reclaim
* will encounter the same problem
*/
- if (nr_dirty == nr_congested && nr_dirty != 0)
+ if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
zone_set_flag(zone, ZONE_CONGESTED);
free_page_list(&free_pages);
@@ -1109,8 +1165,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
nr_lumpy_dirty++;
scan++;
} else {
- /* the page is freed already. */
- if (!page_count(cursor_page))
+ /*
+ * Check if the page is freed already.
+ *
+ * We can't use page_count() as that
+ * requires compound_head and we don't
+ * have a pin on the page here. If a
+ * page is tail, we may or may not
+ * have isolated the head, so assume
+ * it's not free, it'd be tricky to
+ * track the head status without a
+ * page pin.
+ */
+ if (!PageTail(cursor_page) &&
+ !atomic_read(&cursor_page->_count))
continue;
break;
}
@@ -1201,13 +1269,16 @@ int isolate_lru_page(struct page *page)
{
int ret = -EBUSY;
+ VM_BUG_ON(!page_count(page));
+
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page) && get_page_unless_zero(page)) {
+ if (PageLRU(page)) {
int lru = page_lru(page);
ret = 0;
+ get_page(page);
ClearPageLRU(page);
del_page_from_lru_list(zone, page, lru);
@@ -1278,6 +1349,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
reclaim_stat->recent_rotated[file] += numpages;
+ if (!scanning_global_lru(sc))
+ sc->memcg_record->nr_rotated[file] += numpages;
}
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
@@ -1321,6 +1394,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
reclaim_stat->recent_scanned[0] += *nr_anon;
reclaim_stat->recent_scanned[1] += *nr_file;
+ if (!scanning_global_lru(sc)) {
+ sc->memcg_record->nr_scanned[0] += *nr_anon;
+ sc->memcg_record->nr_scanned[1] += *nr_file;
+ }
}
/*
@@ -1434,6 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
nr_reclaimed += shrink_page_list(&page_list, zone, sc);
}
+ if (!scanning_global_lru(sc))
+ sc->memcg_record->nr_freed[file] += nr_reclaimed;
+
local_irq_disable();
if (current_is_kswapd())
__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1533,6 +1613,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
}
reclaim_stat->recent_scanned[file] += nr_taken;
+ if (!scanning_global_lru(sc))
+ sc->memcg_record->nr_scanned[file] += nr_taken;
__count_zone_vm_events(PGREFILL, zone, pgscanned);
if (file)
@@ -1584,6 +1666,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* get_scan_ratio.
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
+ if (!scanning_global_lru(sc))
+ sc->memcg_record->nr_rotated[file] += nr_rotated;
move_active_pages_to_lru(zone, &l_active,
LRU_ACTIVE + file * LRU_FILE);
@@ -1699,24 +1783,11 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
}
-/*
- * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
- * until we collected @swap_cluster_max pages to scan.
- */
-static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
- unsigned long *nr_saved_scan)
+static int vmscan_swappiness(struct scan_control *sc)
{
- unsigned long nr;
-
- *nr_saved_scan += nr_to_scan;
- nr = *nr_saved_scan;
-
- if (nr >= SWAP_CLUSTER_MAX)
- *nr_saved_scan = 0;
- else
- nr = 0;
-
- return nr;
+ if (scanning_global_lru(sc))
+ return vm_swappiness;
+ return mem_cgroup_swappiness(sc->mem_cgroup);
}
/*
@@ -1737,6 +1808,23 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
u64 fraction[2], denominator;
enum lru_list l;
int noswap = 0;
+ int force_scan = 0;
+ unsigned long nr_force_scan[2];
+
+
+ anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+ file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
+ if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
+ /* kswapd does zone balancing and need to scan this zone */
+ if (scanning_global_lru(sc) && current_is_kswapd())
+ force_scan = 1;
+ /* memcg may have small limit and need to avoid priority drop */
+ if (!scanning_global_lru(sc))
+ force_scan = 1;
+ }
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1744,14 +1832,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = 0;
fraction[1] = 1;
denominator = 1;
+ nr_force_scan[0] = 0;
+ nr_force_scan[1] = SWAP_CLUSTER_MAX;
goto out;
}
- anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
- file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
if (scanning_global_lru(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
@@ -1760,6 +1845,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
+ nr_force_scan[0] = SWAP_CLUSTER_MAX;
+ nr_force_scan[1] = 0;
goto out;
}
}
@@ -1768,8 +1855,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
- anon_prio = sc->swappiness;
- file_prio = 200 - sc->swappiness;
+ anon_prio = vmscan_swappiness(sc);
+ file_prio = 200 - vmscan_swappiness(sc);
/*
* OK, so we have swap space and a fair amount of page cache
@@ -1808,6 +1895,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = ap;
fraction[1] = fp;
denominator = ap + fp + 1;
+ if (force_scan) {
+ unsigned long scan = SWAP_CLUSTER_MAX;
+ nr_force_scan[0] = div64_u64(scan * ap, denominator);
+ nr_force_scan[1] = div64_u64(scan * fp, denominator);
+ }
out:
for_each_evictable_lru(l) {
int file = is_file_lru(l);
@@ -1818,8 +1910,19 @@ out:
scan >>= priority;
scan = div64_u64(scan * fraction[file], denominator);
}
- nr[l] = nr_scan_try_batch(scan,
- &reclaim_stat->nr_saved_scan[l]);
+
+ /*
+ * If zone is small or memcg is small, nr[l] can be 0.
+ * This results no-scan on this priority and priority drop down.
+ * For global direct reclaim, it can visit next zone and tend
+ * not to have problems. For global kswapd, it's for zone
+ * balancing and it need to scan a small amounts. When using
+ * memcg, priority drop can cause big latency. So, it's better
+ * to scan small amount. See may_noscan above.
+ */
+ if (!scan && force_scan)
+ scan = nr_force_scan[file];
+ nr[l] = scan;
}
}
@@ -1964,6 +2067,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
{
struct zoneref *z;
struct zone *zone;
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1978,6 +2083,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ /*
+ * This steals pages from memory cgroups over softlimit
+ * and returns the number of reclaimed pages and
+ * scanned pages. This works for global memory pressure
+ * and balancing, not for a memcg's limit.
+ */
+ nr_soft_scanned = 0;
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ sc->order, sc->gfp_mask,
+ &nr_soft_scanned);
+ sc->nr_reclaimed += nr_soft_reclaimed;
+ sc->nr_scanned += nr_soft_scanned;
+ /* need some check for avoid more shrink_zone() */
}
shrink_zone(priority, zone, sc);
@@ -2026,7 +2144,8 @@ static bool all_unreclaimable(struct zonelist *zonelist,
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
- struct scan_control *sc)
+ struct scan_control *sc,
+ struct shrink_control *shrink)
{
int priority;
unsigned long total_scanned = 0;
@@ -2044,7 +2163,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc->nr_scanned = 0;
if (!priority)
- disable_swap_token();
+ disable_swap_token(sc->mem_cgroup);
shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
@@ -2060,7 +2179,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
lru_pages += zone_reclaimable_pages(zone);
}
- shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+ shrink_slab(shrink, sc->nr_scanned, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -2127,17 +2246,19 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_unmap = 1,
.may_swap = 1,
- .swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
.nodemask = nodemask,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
@@ -2147,19 +2268,23 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
- gfp_t gfp_mask, bool noswap,
- unsigned int swappiness,
- struct zone *zone)
+ gfp_t gfp_mask, bool noswap,
+ struct zone *zone,
+ struct memcg_scanrecord *rec,
+ unsigned long *scanned)
{
struct scan_control sc = {
+ .nr_scanned = 0,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
- .swappiness = swappiness,
.order = 0,
.mem_cgroup = mem,
+ .memcg_record = rec,
};
+ unsigned long start, end;
+
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2167,6 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
sc.may_writepage,
sc.gfp_mask);
+ start = sched_clock();
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
@@ -2175,6 +2301,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
* the priority and make it zero.
*/
shrink_zone(0, zone, &sc);
+ end = sched_clock();
+
+ if (rec)
+ rec->elapsed += end - start;
+ *scanned = sc.nr_scanned;
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2184,30 +2315,46 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask,
bool noswap,
- unsigned int swappiness)
+ struct memcg_scanrecord *rec)
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ unsigned long start, end;
+ int nid;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
- .swappiness = swappiness,
.order = 0,
.mem_cgroup = mem_cont,
+ .memcg_record = rec,
.nodemask = NULL, /* we don't care the placement */
+ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+ };
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
};
- sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
- (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+ start = sched_clock();
+ /*
+ * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+ * take care of from where we get pages. So the node where we start the
+ * scan does not need to be the current node.
+ */
+ nid = mem_cgroup_select_victim_node(mem_cont);
+
+ zonelist = NODE_DATA(nid)->node_zonelists;
trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
sc.gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+ end = sched_clock();
+ if (rec)
+ rec->elapsed += end - start;
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -2240,7 +2387,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
for (i = 0; i <= classzone_idx; i++)
present_pages += pgdat->node_zones[i].present_pages;
- return balanced_pages > (present_pages >> 2);
+ /* A special case here: if zone has no page, we think it's balanced */
+ return balanced_pages >= (present_pages >> 2);
}
/* is kswapd sleeping prematurely? */
@@ -2256,7 +2404,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
return true;
/* Check the watermark levels */
- for (i = 0; i < pgdat->nr_zones; i++) {
+ for (i = 0; i <= classzone_idx; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
@@ -2274,7 +2422,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
}
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
- classzone_idx, 0))
+ i, 0))
all_zones_ok = false;
else
balanced += zone->present_pages;
@@ -2286,7 +2434,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
* must be balanced
*/
if (order)
- return pgdat_balanced(pgdat, balanced, classzone_idx);
+ return !pgdat_balanced(pgdat, balanced, classzone_idx);
else
return !all_zones_ok;
}
@@ -2322,6 +2470,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
struct reclaim_state *reclaim_state = current->reclaim_state;
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_unmap = 1,
@@ -2331,10 +2481,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* we want to put equal scanning pressure on each zone.
*/
.nr_to_reclaim = ULONG_MAX,
- .swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
loop_again:
total_scanned = 0;
sc.nr_reclaimed = 0;
@@ -2347,7 +2499,7 @@ loop_again:
/* The swap token gets in the way of swapout... */
if (!priority)
- disable_swap_token();
+ disable_swap_token(NULL);
all_zones_ok = 1;
balanced = 0;
@@ -2376,7 +2528,6 @@ loop_again:
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
- *classzone_idx = i;
break;
}
}
@@ -2411,11 +2562,15 @@ loop_again:
sc.nr_scanned = 0;
+ nr_soft_scanned = 0;
/*
* Call soft limit reclaim before calling shrink_zone.
- * For now we ignore the return value
*/
- mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ order, sc.gfp_mask,
+ &nr_soft_scanned);
+ sc.nr_reclaimed += nr_soft_reclaimed;
+ total_scanned += nr_soft_scanned;
/*
* We put equal pressure on every zone, unless
@@ -2431,19 +2586,18 @@ loop_again:
KSWAPD_ZONE_BALANCE_GAP_RATIO);
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone) + balance_gap,
- end_zone, 0))
+ end_zone, 0)) {
shrink_zone(priority, zone, &sc);
- reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_scanned += sc.nr_scanned;
- if (zone->all_unreclaimable)
- continue;
- if (nr_slab == 0 &&
- !zone_reclaimable(zone))
- zone->all_unreclaimable = 1;
+ reclaim_state->reclaimed_slab = 0;
+ nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ total_scanned += sc.nr_scanned;
+
+ if (nr_slab == 0 && !zone_reclaimable(zone))
+ zone->all_unreclaimable = 1;
+ }
+
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
@@ -2453,6 +2607,12 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
+ if (zone->all_unreclaimable) {
+ if (end_zone && end_zone == i)
+ end_zone--;
+ continue;
+ }
+
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
@@ -2631,8 +2791,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
static int kswapd(void *p)
{
- unsigned long order;
- int classzone_idx;
+ unsigned long order, new_order;
+ int classzone_idx, new_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -2662,17 +2822,23 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- order = 0;
- classzone_idx = MAX_NR_ZONES - 1;
+ order = new_order = 0;
+ classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
for ( ; ; ) {
- unsigned long new_order;
- int new_classzone_idx;
int ret;
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ /*
+ * If the last balance_pgdat was unsuccessful it's unlikely a
+ * new request of a similar or harder type will succeed soon
+ * so consider going to sleep on the basis we reclaimed at
+ */
+ if (classzone_idx >= new_classzone_idx && order == new_order) {
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
+ }
+
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
@@ -2685,7 +2851,7 @@ static int kswapd(void *p)
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
}
ret = try_to_freeze();
@@ -2784,10 +2950,12 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.may_writepage = 1,
.nr_to_reclaim = nr_to_reclaim,
.hibernation_mode = 1,
- .swappiness = vm_swappiness,
.order = 0,
};
- struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
@@ -2796,7 +2964,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
@@ -2968,9 +3136,11 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.nr_to_reclaim = max_t(unsigned long, nr_pages,
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
- .swappiness = vm_swappiness,
.order = order,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
unsigned long nr_slab_pages0, nr_slab_pages1;
cond_resched();
@@ -3012,7 +3182,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
unsigned long lru_pages = zone_reclaimable_pages(zone);
/* No reclaimable slab or very low memory pressure */
- if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+ if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
break;
/* Freed enough memory */