summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWu Fengguang <fengguang.wu@intel.com>2011-06-11 19:25:42 -0600
committerWu Fengguang <fengguang.wu@intel.com>2011-12-18 14:20:27 +0800
commit83712358ba0a1497ce59a4f84ce4dd0f803fe6fc (patch)
treed17ab27a7bff50616e3b63ad137c004d9ccfbcb0
parent32c7f202a4801252a0f3578807b75a961f792870 (diff)
writeback: dirty ratelimit - think time compensation
Compensate the task's think time when computing the final pause time, so that ->dirty_ratelimit can be executed accurately. think time := time spend outside of balance_dirty_pages() In the rare case that the task slept longer than the 200ms period time (result in negative pause time), the sleep time will be compensated in the following periods, too, if it's less than 1 second. Accumulated errors are carefully avoided as long as the max pause area is not hitted. Pseudo code: period = pages_dirtied / task_ratelimit; think = jiffies - dirty_paused_when; pause = period - think; 1) normal case: period > think pause = period - think dirty_paused_when = jiffies + pause nr_dirtied = 0 period time |===============================>| think time pause time |===============>|==============>| ------|----------------|---------------|------------------------ dirty_paused_when jiffies 2) no pause case: period <= think don't pause; reduce future pause time by: dirty_paused_when += period nr_dirtied = 0 period time |===============================>| think time |===================================================>| ------|--------------------------------+-------------------|---- dirty_paused_when jiffies Acked-by: Jan Kara <jack@suse.cz> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
-rw-r--r--include/linux/sched.h1
-rw-r--r--include/trace/events/writeback.h14
-rw-r--r--kernel/fork.c1
-rw-r--r--mm/page-writeback.c36
4 files changed, 45 insertions, 7 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c4f3e9b9bc..984c3b29597 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1527,6 +1527,7 @@ struct task_struct {
*/
int nr_dirtied;
int nr_dirtied_pause;
+ unsigned long dirty_paused_when; /* start of a write-and-pause period */
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 99d1d0decf8..8588a891802 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages,
unsigned long dirty_ratelimit,
unsigned long task_ratelimit,
unsigned long dirtied,
+ unsigned long period,
long pause,
unsigned long start_time),
TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
dirty_ratelimit, task_ratelimit,
- dirtied, pause, start_time),
+ dirtied, period, pause, start_time),
TP_STRUCT__entry(
__array( char, bdi, 32)
@@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages,
__field(unsigned int, dirtied_pause)
__field(unsigned long, paused)
__field( long, pause)
+ __field(unsigned long, period)
+ __field( long, think)
),
TP_fast_assign(
@@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages,
__entry->task_ratelimit = KBps(task_ratelimit);
__entry->dirtied = dirtied;
__entry->dirtied_pause = current->nr_dirtied_pause;
+ __entry->think = current->dirty_paused_when == 0 ? 0 :
+ (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
+ __entry->period = period * 1000 / HZ;
__entry->pause = pause * 1000 / HZ;
__entry->paused = (jiffies - start_time) * 1000 / HZ;
),
@@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages,
"bdi_setpoint=%lu bdi_dirty=%lu "
"dirty_ratelimit=%lu task_ratelimit=%lu "
"dirtied=%u dirtied_pause=%u "
- "paused=%lu pause=%ld",
+ "paused=%lu pause=%ld period=%lu think=%ld",
__entry->bdi,
__entry->limit,
__entry->setpoint,
@@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages,
__entry->dirtied,
__entry->dirtied_pause,
__entry->paused, /* ms */
- __entry->pause /* ms */
+ __entry->pause, /* ms */
+ __entry->period, /* ms */
+ __entry->think /* ms */
)
);
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d08..f8668cf6a32 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+ p->dirty_paused_when = 0;
/*
* Ok, make it visible to the rest of the system.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 96b3e7aa705..49193215582 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1016,6 +1016,7 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
+ long period;
long pause = 0;
long uninitialized_var(max_pause);
bool dirty_exceeded = false;
@@ -1026,6 +1027,8 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long start_time = jiffies;
for (;;) {
+ unsigned long now = jiffies;
+
/*
* Unstable writes are a feature of certain networked
* filesystems (i.e. NFS) in which data may have been
@@ -1045,8 +1048,11 @@ static void balance_dirty_pages(struct address_space *mapping,
*/
freerun = dirty_freerun_ceiling(dirty_thresh,
background_thresh);
- if (nr_dirty <= freerun)
+ if (nr_dirty <= freerun) {
+ current->dirty_paused_when = now;
+ current->nr_dirtied = 0;
break;
+ }
if (unlikely(!writeback_in_progress(bdi)))
bdi_start_background_writeback(bdi);
@@ -1104,10 +1110,21 @@ static void balance_dirty_pages(struct address_space *mapping,
task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
RATELIMIT_CALC_SHIFT;
if (unlikely(task_ratelimit == 0)) {
+ period = max_pause;
pause = max_pause;
goto pause;
}
- pause = HZ * pages_dirtied / task_ratelimit;
+ period = HZ * pages_dirtied / task_ratelimit;
+ pause = period;
+ if (current->dirty_paused_when)
+ pause -= now - current->dirty_paused_when;
+ /*
+ * For less than 1s think time (ext3/4 may block the dirtier
+ * for up to 800ms from time to time on 1-HDD; so does xfs,
+ * however at much less frequency), try to compensate it in
+ * future periods by updating the virtual time; otherwise just
+ * do a reset, as it may be a light dirtier.
+ */
if (unlikely(pause <= 0)) {
trace_balance_dirty_pages(bdi,
dirty_thresh,
@@ -1118,8 +1135,16 @@ static void balance_dirty_pages(struct address_space *mapping,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
+ period,
pause,
start_time);
+ if (pause < -HZ) {
+ current->dirty_paused_when = now;
+ current->nr_dirtied = 0;
+ } else if (period) {
+ current->dirty_paused_when += period;
+ current->nr_dirtied = 0;
+ }
pause = 1; /* avoid resetting nr_dirtied_pause below */
break;
}
@@ -1135,11 +1160,15 @@ pause:
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
+ period,
pause,
start_time);
__set_current_state(TASK_KILLABLE);
io_schedule_timeout(pause);
+ current->dirty_paused_when = now + pause;
+ current->nr_dirtied = 0;
+
/*
* This is typically equal to (nr_dirty < dirty_thresh) and can
* also keep "1000+ dd on a slow USB stick" under control.
@@ -1167,11 +1196,10 @@ pause:
if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
- current->nr_dirtied = 0;
if (pause == 0) { /* in freerun area */
current->nr_dirtied_pause =
dirty_poll_interval(nr_dirty, dirty_thresh);
- } else if (pause <= max_pause / 4 &&
+ } else if (period <= max_pause / 4 &&
pages_dirtied >= current->nr_dirtied_pause) {
current->nr_dirtied_pause = clamp_val(
dirty_ratelimit * (max_pause / 2) / HZ,