summaryrefslogtreecommitdiffstats
path: root/mm/page-writeback.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--mm/page-writeback.c294
1 files changed, 225 insertions, 69 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f0c895c71f..63807583d8e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,8 +36,11 @@
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
+#include <linux/mm_inline.h>
#include <trace/events/writeback.h>
+#include "internal.h"
+
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
*/
@@ -241,9 +244,6 @@ static unsigned long global_dirtyable_memory(void)
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
- /* Subtract min_free_kbytes */
- x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
-
return x + 1; /* Ensure that we never return 0 */
}
@@ -585,6 +585,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
}
/*
+ * setpoint - dirty 3
+ * f(dirty) := 1.0 + (----------------)
+ * limit - setpoint
+ *
+ * it's a 3rd order polynomial that subjects to
+ *
+ * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
+ * (2) f(setpoint) = 1.0 => the balance point
+ * (3) f(limit) = 0 => the hard limit
+ * (4) df/dx <= 0 => negative feedback control
+ * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
+ * => fast response on large errors; small oscillation near setpoint
+ */
+static inline long long pos_ratio_polynom(unsigned long setpoint,
+ unsigned long dirty,
+ unsigned long limit)
+{
+ long long pos_ratio;
+ long x;
+
+ x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
+ limit - setpoint + 1);
+ pos_ratio = x;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
+
+ return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
+}
+
+/*
* Dirty position control.
*
* (o) global/bdi setpoints
@@ -682,26 +713,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
/*
* global setpoint
*
- * setpoint - dirty 3
- * f(dirty) := 1.0 + (----------------)
- * limit - setpoint
+ * See comment for pos_ratio_polynom().
+ */
+ setpoint = (freerun + limit) / 2;
+ pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
+
+ /*
+ * The strictlimit feature is a tool preventing mistrusted filesystems
+ * from growing a large number of dirty pages before throttling. For
+ * such filesystems balance_dirty_pages always checks bdi counters
+ * against bdi limits. Even if global "nr_dirty" is under "freerun".
+ * This is especially important for fuse which sets bdi->max_ratio to
+ * 1% by default. Without strictlimit feature, fuse writeback may
+ * consume arbitrary amount of RAM because it is accounted in
+ * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
*
- * it's a 3rd order polynomial that subjects to
+ * Here, in bdi_position_ratio(), we calculate pos_ratio based on
+ * two values: bdi_dirty and bdi_thresh. Let's consider an example:
+ * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
+ * limits are set by default to 10% and 20% (background and throttle).
+ * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
+ * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
+ * about ~6K pages (as the average of background and throttle bdi
+ * limits). The 3rd order polynomial will provide positive feedback if
+ * bdi_dirty is under bdi_setpoint and vice versa.
*
- * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
- * (2) f(setpoint) = 1.0 => the balance point
- * (3) f(limit) = 0 => the hard limit
- * (4) df/dx <= 0 => negative feedback control
- * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
- * => fast response on large errors; small oscillation near setpoint
+ * Note, that we cannot use global counters in these calculations
+ * because we want to throttle process writing to a strictlimit BDI
+ * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
+ * in the example above).
*/
- setpoint = (freerun + limit) / 2;
- x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
- limit - setpoint + 1);
- pos_ratio = x;
- pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
- pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
- pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
+ if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ long long bdi_pos_ratio;
+ unsigned long bdi_bg_thresh;
+
+ if (bdi_dirty < 8)
+ return min_t(long long, pos_ratio * 2,
+ 2 << RATELIMIT_CALC_SHIFT);
+
+ if (bdi_dirty >= bdi_thresh)
+ return 0;
+
+ bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
+ bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
+ bdi_bg_thresh);
+
+ if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
+ return 0;
+
+ bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
+ bdi_thresh);
+
+ /*
+ * Typically, for strictlimit case, bdi_setpoint << setpoint
+ * and pos_ratio >> bdi_pos_ratio. In the other words global
+ * state ("dirty") is not limiting factor and we have to
+ * make decision based on bdi counters. But there is an
+ * important case when global pos_ratio should get precedence:
+ * global limits are exceeded (e.g. due to activities on other
+ * BDIs) while given strictlimit BDI is below limit.
+ *
+ * "pos_ratio * bdi_pos_ratio" would work for the case above,
+ * but it would look too non-natural for the case of all
+ * activity in the system coming from a single strictlimit BDI
+ * with bdi->max_ratio == 100%.
+ *
+ * Note that min() below somewhat changes the dynamics of the
+ * control system. Normally, pos_ratio value can be well over 3
+ * (when globally we are at freerun and bdi is well below bdi
+ * setpoint). Now the maximum pos_ratio in the same situation
+ * is 2. We might want to tweak this if we observe the control
+ * system is too slow to adapt.
+ */
+ return min(pos_ratio, bdi_pos_ratio);
+ }
/*
* We have computed basic pos_ratio above based on global situation. If
@@ -994,6 +1079,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
* keep that period small to reduce time lags).
*/
step = 0;
+
+ /*
+ * For strictlimit case, calculations above were based on bdi counters
+ * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+ * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
+ * Hence, to calculate "step" properly, we have to use bdi_dirty as
+ * "dirty" and bdi_setpoint as "setpoint".
+ *
+ * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
+ * it's possible that bdi_thresh is close to zero due to inactivity
+ * of backing device (see the implementation of bdi_dirty_limit()).
+ */
+ if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ dirty = bdi_dirty;
+ if (bdi_dirty < 8)
+ setpoint = bdi_dirty + 1;
+ else
+ setpoint = (bdi_thresh +
+ bdi_dirty_limit(bdi, bg_thresh)) / 2;
+ }
+
if (dirty < setpoint) {
x = min(bdi->balanced_dirty_ratelimit,
min(balanced_dirty_ratelimit, task_ratelimit));
@@ -1104,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
return 1;
}
-static long bdi_max_pause(struct backing_dev_info *bdi,
- unsigned long bdi_dirty)
+static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
+ unsigned long bdi_dirty)
{
- long bw = bdi->avg_write_bandwidth;
- long t;
+ unsigned long bw = bdi->avg_write_bandwidth;
+ unsigned long t;
/*
* Limit pause time for small memory systems. If sleeping for too long
@@ -1120,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi,
t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
t++;
- return min_t(long, t, MAX_PAUSE);
+ return min_t(unsigned long, t, MAX_PAUSE);
}
static long bdi_min_pause(struct backing_dev_info *bdi,
@@ -1198,6 +1304,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}
+static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
+ unsigned long dirty_thresh,
+ unsigned long background_thresh,
+ unsigned long *bdi_dirty,
+ unsigned long *bdi_thresh,
+ unsigned long *bdi_bg_thresh)
+{
+ unsigned long bdi_reclaimable;
+
+ /*
+ * bdi_thresh is not treated as some limiting factor as
+ * dirty_thresh, due to reasons
+ * - in JBOD setup, bdi_thresh can fluctuate a lot
+ * - in a system with HDD and USB key, the USB key may somehow
+ * go into state (bdi_dirty >> bdi_thresh) either because
+ * bdi_dirty starts high, or because bdi_thresh drops low.
+ * In this case we don't want to hard throttle the USB key
+ * dirtiers for 100 seconds until bdi_dirty drops under
+ * bdi_thresh. Instead the auxiliary bdi control line in
+ * bdi_position_ratio() will let the dirtier task progress
+ * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+ */
+ *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+
+ if (bdi_bg_thresh)
+ *bdi_bg_thresh = div_u64((u64)*bdi_thresh *
+ background_thresh,
+ dirty_thresh);
+
+ /*
+ * In order to avoid the stacked BDI deadlock we need
+ * to ensure we accurately count the 'dirty' pages when
+ * the threshold is low.
+ *
+ * Otherwise it would be possible to get thresh+n pages
+ * reported dirty, even though there are thresh-m pages
+ * actually dirty; with m+n sitting in the percpu
+ * deltas.
+ */
+ if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
+ bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+ *bdi_dirty = bdi_reclaimable +
+ bdi_stat_sum(bdi, BDI_WRITEBACK);
+ } else {
+ bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+ *bdi_dirty = bdi_reclaimable +
+ bdi_stat(bdi, BDI_WRITEBACK);
+ }
+}
+
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
@@ -1209,13 +1365,9 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long pages_dirtied)
{
unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
- unsigned long bdi_reclaimable;
unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
- unsigned long bdi_dirty;
- unsigned long freerun;
unsigned long background_thresh;
unsigned long dirty_thresh;
- unsigned long bdi_thresh;
long period;
long pause;
long max_pause;
@@ -1226,10 +1378,16 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long dirty_ratelimit;
unsigned long pos_ratio;
struct backing_dev_info *bdi = mapping->backing_dev_info;
+ bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
for (;;) {
unsigned long now = jiffies;
+ unsigned long uninitialized_var(bdi_thresh);
+ unsigned long thresh;
+ unsigned long uninitialized_var(bdi_dirty);
+ unsigned long dirty;
+ unsigned long bg_thresh;
/*
* Unstable writes are a feature of certain networked
@@ -1243,61 +1401,44 @@ static void balance_dirty_pages(struct address_space *mapping,
global_dirty_limits(&background_thresh, &dirty_thresh);
+ if (unlikely(strictlimit)) {
+ bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
+ &bdi_dirty, &bdi_thresh, &bg_thresh);
+
+ dirty = bdi_dirty;
+ thresh = bdi_thresh;
+ } else {
+ dirty = nr_dirty;
+ thresh = dirty_thresh;
+ bg_thresh = background_thresh;
+ }
+
/*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
- * when the bdi limits are ramping up.
+ * when the bdi limits are ramping up in case of !strictlimit.
+ *
+ * In strictlimit case make decision based on the bdi counters
+ * and limits. Small writeouts when the bdi limits are ramping
+ * up are the price we consciously pay for strictlimit-ing.
*/
- freerun = dirty_freerun_ceiling(dirty_thresh,
- background_thresh);
- if (nr_dirty <= freerun) {
+ if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
current->dirty_paused_when = now;
current->nr_dirtied = 0;
current->nr_dirtied_pause =
- dirty_poll_interval(nr_dirty, dirty_thresh);
+ dirty_poll_interval(dirty, thresh);
break;
}
if (unlikely(!writeback_in_progress(bdi)))
bdi_start_background_writeback(bdi);
- /*
- * bdi_thresh is not treated as some limiting factor as
- * dirty_thresh, due to reasons
- * - in JBOD setup, bdi_thresh can fluctuate a lot
- * - in a system with HDD and USB key, the USB key may somehow
- * go into state (bdi_dirty >> bdi_thresh) either because
- * bdi_dirty starts high, or because bdi_thresh drops low.
- * In this case we don't want to hard throttle the USB key
- * dirtiers for 100 seconds until bdi_dirty drops under
- * bdi_thresh. Instead the auxiliary bdi control line in
- * bdi_position_ratio() will let the dirtier task progress
- * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
- */
- bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-
- /*
- * In order to avoid the stacked BDI deadlock we need
- * to ensure we accurately count the 'dirty' pages when
- * the threshold is low.
- *
- * Otherwise it would be possible to get thresh+n pages
- * reported dirty, even though there are thresh-m pages
- * actually dirty; with m+n sitting in the percpu
- * deltas.
- */
- if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
- bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- bdi_dirty = bdi_reclaimable +
- bdi_stat_sum(bdi, BDI_WRITEBACK);
- } else {
- bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_dirty = bdi_reclaimable +
- bdi_stat(bdi, BDI_WRITEBACK);
- }
+ if (!strictlimit)
+ bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
+ &bdi_dirty, &bdi_thresh, NULL);
dirty_exceeded = (bdi_dirty > bdi_thresh) &&
- (nr_dirty > dirty_thresh);
+ ((nr_dirty > dirty_thresh) || strictlimit);
if (dirty_exceeded && !bdi->dirty_exceeded)
bdi->dirty_exceeded = 1;
@@ -2002,11 +2143,17 @@ EXPORT_SYMBOL(account_page_dirtied);
/*
* Helper function for set_page_writeback family.
+ *
+ * The caller must hold mem_cgroup_begin/end_update_page_stat() lock
+ * while calling this function.
+ * See test_set_page_writeback for example.
+ *
* NOTE: Unlike account_page_dirtied this does not rely on being atomic
* wrt interrupts.
*/
void account_page_writeback(struct page *page)
{
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
EXPORT_SYMBOL(account_page_writeback);
@@ -2223,7 +2370,10 @@ int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret;
+ bool locked;
+ unsigned long memcg_flags;
+ mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2244,9 +2394,11 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
+ mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
return ret;
}
@@ -2254,7 +2406,10 @@ int test_set_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret;
+ bool locked;
+ unsigned long memcg_flags;
+ mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2281,6 +2436,7 @@ int test_set_page_writeback(struct page *page)
}
if (!ret)
account_page_writeback(page);
+ mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
return ret;
}