summary refs log tree commit diff stats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- mm/vmscan.c 236
1 files changed, 163 insertions, 73 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3f56c8deb3c..e01ded36544 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -11,6 +11,8 @@
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
@@ -43,6 +45,7 @@
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
+#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -83,6 +86,9 @@ struct scan_control {
/* Scan (total_size >> priority) pages at once */
int priority;
+ /* anon vs. file LRUs scanning "ratio" */
+ int swappiness;
+
/*
* The memory cgroup that hit its limit and as a result is the
* primary target of this reclaim invocation.
@@ -324,7 +330,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
else
new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
- trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+ trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
return freed;
}
@@ -477,7 +483,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
if (page_has_private(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
- printk("%s: orphaned page\n", __func__);
+ pr_info("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
}
}
@@ -1121,7 +1127,7 @@ keep:
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
- free_hot_cold_page_list(&free_pages, 1);
+ free_hot_cold_page_list(&free_pages, true);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
@@ -1439,6 +1445,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
}
/*
+ * If a kernel thread (such as nfsd for loop-back mounts) services
+ * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
+ * In that case we should only throttle if the backing device it is
+ * writing to is congested. In other cases it is safe to throttle.
+ */
+static int current_may_throttle(void)
+{
+ return !(current->flags & PF_LESS_THROTTLE) ||
+ current->backing_dev_info == NULL ||
+ bdi_write_congested(current->backing_dev_info);
+}
+
+/*
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
*/
@@ -1519,7 +1538,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&page_list, 1);
+ free_hot_cold_page_list(&page_list, true);
/*
* If reclaim is isolating dirty pages under writeback, it implies
@@ -1554,19 +1573,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not keeping up. In this case, flag
* the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
- * pages from reclaim context. It will forcibly stall in the
- * next check.
+ * pages from reclaim context.
*/
if (nr_unqueued_dirty == nr_taken)
zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
/*
- * In addition, if kswapd scans pages marked marked for
- * immediate reclaim and under writeback (nr_immediate), it
- * implies that pages are cycling through the LRU faster than
+ * If kswapd scans pages marked for immediate
+ * reclaim and under writeback (nr_immediate), it implies
+ * that pages are cycling through the LRU faster than
* they are written so also forcibly stall.
*/
- if (nr_unqueued_dirty == nr_taken || nr_immediate)
+ if (nr_immediate && current_may_throttle())
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
@@ -1575,7 +1593,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* is congested. Allow kswapd to continue until it starts encountering
* unqueued dirty pages or cycling through the LRU too quickly.
*/
- if (!sc->hibernation_mode && !current_is_kswapd())
+ if (!sc->hibernation_mode && !current_is_kswapd() &&
+ current_may_throttle())
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@ -1740,7 +1759,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&l_hold, 1);
+ free_hot_cold_page_list(&l_hold, true);
}
#ifdef CONFIG_SWAP
@@ -1830,13 +1849,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}
-static int vmscan_swappiness(struct scan_control *sc)
-{
- if (global_reclaim(sc))
- return vm_swappiness;
- return mem_cgroup_swappiness(sc->target_mem_cgroup);
-}
-
enum scan_balance {
SCAN_EQUAL,
SCAN_FRACT,
@@ -1866,6 +1878,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
+ bool some_scanned;
+ int pass;
/*
* If the zone or memcg is small, nr[l] can be 0. This
@@ -1895,7 +1909,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* using the memory controller's swap limit feature would be
* too expensive.
*/
- if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
+ if (!global_reclaim(sc) && !sc->swappiness) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -1905,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* system is close to OOM, scan both anon and file equally
* (unless the swappiness setting disagrees with swapping).
*/
- if (!sc->priority && vmscan_swappiness(sc)) {
+ if (!sc->priority && sc->swappiness) {
scan_balance = SCAN_EQUAL;
goto out;
}
@@ -1916,6 +1930,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
get_lru_size(lruvec, LRU_INACTIVE_FILE);
/*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (global_reclaim(sc)) {
+ unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
+
+ if (unlikely(file + free <= high_wmark_pages(zone))) {
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+ }
+
+ /*
* There is enough inactive page cache, do not reclaim
* anything from the anonymous working set right now.
*/
@@ -1930,7 +1962,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
- anon_prio = vmscan_swappiness(sc);
+ anon_prio = sc->swappiness;
file_prio = 200 - anon_prio;
/*
@@ -1971,39 +2003,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
- unsigned long size;
- unsigned long scan;
+ some_scanned = false;
+ /* Only use force_scan on second pass. */
+ for (pass = 0; !some_scanned && pass < 2; pass++) {
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
+ unsigned long size;
+ unsigned long scan;
- size = get_lru_size(lruvec, lru);
- scan = size >> sc->priority;
+ size = get_lru_size(lruvec, lru);
+ scan = size >> sc->priority;
- if (!scan && force_scan)
- scan = min(size, SWAP_CLUSTER_MAX);
+ if (!scan && pass && force_scan)
+ scan = min(size, SWAP_CLUSTER_MAX);
- switch (scan_balance) {
- case SCAN_EQUAL:
- /* Scan lists relative to size */
- break;
- case SCAN_FRACT:
+ switch (scan_balance) {
+ case SCAN_EQUAL:
+ /* Scan lists relative to size */
+ break;
+ case SCAN_FRACT:
+ /*
+ * Scan types proportional to swappiness and
+ * their relative recent reclaim efficiency.
+ */
+ scan = div64_u64(scan * fraction[file],
+ denominator);
+ break;
+ case SCAN_FILE:
+ case SCAN_ANON:
+ /* Scan one type exclusively */
+ if ((scan_balance == SCAN_FILE) != file)
+ scan = 0;
+ break;
+ default:
+ /* Look ma, no brain */
+ BUG();
+ }
+ nr[lru] = scan;
/*
- * Scan types proportional to swappiness and
- * their relative recent reclaim efficiency.
+ * Skip the second pass and don't force_scan,
+ * if we found something to scan.
*/
- scan = div64_u64(scan * fraction[file], denominator);
- break;
- case SCAN_FILE:
- case SCAN_ANON:
- /* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file)
- scan = 0;
- break;
- default:
- /* Look ma, no brain */
- BUG();
+ some_scanned |= !!scan;
}
- nr[lru] = scan;
}
}
@@ -2019,13 +2061,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
- bool scan_adjusted = false;
+ bool scan_adjusted;
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
+ /*
+ * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+ * event that can occur when there is little memory pressure e.g.
+ * multiple streaming readers/writers. Hence, we do not abort scanning
+ * when the requested number of pages are reclaimed when scanning at
+ * DEF_PRIORITY on the assumption that the fact we are direct
+ * reclaiming implies that kswapd is not keeping up and it is best to
+ * do a batch of work at once. For memcg reclaim one check is made to
+ * abort proportional reclaim if either the file or anon lru has already
+ * dropped to zero at the first pass.
+ */
+ scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
+
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
@@ -2046,17 +2102,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
continue;
/*
- * For global direct reclaim, reclaim only the number of pages
- * requested. Less care is taken to scan proportionally as it
- * is more important to minimise direct reclaim stall latency
- * than it is to properly age the LRU lists.
- */
- if (global_reclaim(sc) && !current_is_kswapd())
- break;
-
- /*
* For kswapd and memcg, reclaim at least the number of pages
- * requested. Ensure that the anon and file LRUs shrink
+ * requested. Ensure that the anon and file LRUs are scanned
* proportionally what was requested by get_scan_count(). We
* stop reclaiming one LRU and reduce the amount scanning
* proportional to the original scan target.
@@ -2064,6 +2111,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+ /*
+ * It's just vindictive to attack the larger once the smaller
+ * has gone to zero. And given the way we stop scanning the
+ * smaller below, this makes sure that we only make one nudge
+ * towards proportionality once we've got nr_to_reclaim.
+ */
+ if (!nr_file || !nr_anon)
+ break;
+
if (nr_file > nr_anon) {
unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
targets[LRU_ACTIVE_ANON] + 1;
@@ -2206,6 +2262,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ sc->swappiness = mem_cgroup_swappiness(memcg);
shrink_lruvec(lruvec, sc);
/*
@@ -2250,9 +2307,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
* there is a buffer of free pages available to give compaction
* a reasonable chance of completing and allocating the page
*/
- balance_gap = min(low_wmark_pages(zone),
- (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
- KSWAPD_ZONE_BALANCE_GAP_RATIO);
+ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+ zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
@@ -2507,10 +2563,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
+ if (!populated_zone(zone))
+ continue;
+
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
+ /* If there are no reserves (unexpected config) then do not throttle */
+ if (!pfmemalloc_reserve)
+ return true;
+
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */
@@ -2535,9 +2598,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
+ struct zoneref *z;
struct zone *zone;
- int high_zoneidx = gfp_zone(gfp_mask);
- pg_data_t *pgdat;
+ pg_data_t *pgdat = NULL;
/*
* Kernel threads should not be throttled as they may be indirectly
@@ -2556,10 +2619,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (fatal_signal_pending(current))
goto out;
- /* Check if the pfmemalloc reserves are ok */
- first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
- pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
+ /*
+ * Check if the pfmemalloc reserves are ok by finding the first node
+ * with a usable ZONE_NORMAL or lower zone. The expectation is that
+ * GFP_KERNEL will be required for allocating network buffers when
+ * swapping over the network so ZONE_HIGHMEM is unusable.
+ *
+ * Throttling is based on the first usable node and throttled processes
+ * wait on a queue until kswapd makes progress and wakes them. There
+ * is an affinity then between processes waking up and where reclaim
+ * progress has been made assuming the process wakes on the same node.
+ * More importantly, processes running on remote nodes will not compete
+ * for remote pfmemalloc reserves and processes on different nodes
+ * should make reasonable progress.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_mask, nodemask) {
+ if (zone_idx(zone) > ZONE_NORMAL)
+ continue;
+
+ /* Throttle based on the first usable node */
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ goto out;
+ break;
+ }
+
+ /* If no zone was usable by the allocation flags then do not throttle */
+ if (!pgdat)
goto out;
/* Account for the throttling */
@@ -2642,6 +2729,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.may_swap = !noswap,
.order = 0,
.priority = 0,
+ .swappiness = mem_cgroup_swappiness(memcg),
.target_mem_cgroup = memcg,
};
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
@@ -2873,9 +2961,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
* high wmark plus a "gap" where the gap is either the low
* watermark or 1% of the zone, whichever is smaller.
*/
- balance_gap = min(low_wmark_pages(zone),
- (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
- KSWAPD_ZONE_BALANCE_GAP_RATIO);
+ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+ zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
/*
* If there is no low memory pressure or the zone is balanced then no
@@ -3284,7 +3371,10 @@ static int kswapd(void *p)
}
}
+ tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+
return 0;
}
@@ -3404,7 +3494,7 @@ int kswapd_run(int nid)
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold lock_memory_hotplug().
+ * hold mem_hotplug_begin/end().
*/
void kswapd_stop(int nid)
{