From c8e28ce049faa53a470c132893abbc9f2bde9420 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sun, 23 Jan 2011 10:07:47 -0600
Subject: writeback: account per-bdi accumulated dirtied pages

Introduce the BDI_DIRTIED counter. It will be used for estimating the
bdi's dirty bandwidth.
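
(The hunk that actually bumps the new counter is not part of this
path-limited view; presumably it lives in the page dirtying path, along
the lines of the sketch below, where __inc_bdi_stat() is the existing
per-bdi counter helper.)

    /* e.g. in account_page_dirtied(), next to the BDI_RECLAIMABLE update */
    __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);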

CC: Jan Kara <jack@suse.cz>
CC: Michael Rubin <mrubin@google.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/backing-dev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/backing-dev.h')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 3b2f9cb8298..9ca241a70c4 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -40,6 +40,7 @@ typedef int (congested_fn)(void *, int);
 enum bdi_stat_item {
 	BDI_RECLAIMABLE,
 	BDI_WRITEBACK,
+	BDI_DIRTIED,
 	BDI_WRITTEN,
 	NR_BDI_STAT_ITEMS
 };
-- 
cgit v1.2.3-70-g09d2


From be3ffa276446e1b691a2bf84e7621e5a6fb49db9 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sun, 12 Jun 2011 10:51:31 -0600
Subject: writeback: dirty rate control

It's all about bdi->dirty_ratelimit, which aims to be (write_bw / N)
when there are N dd tasks.

On write() syscall, use bdi->dirty_ratelimit
============================================

    balance_dirty_pages(pages_dirtied)
    {
        task_ratelimit = bdi->dirty_ratelimit * bdi_position_ratio();
        pause = pages_dirtied / task_ratelimit;
        sleep(pause);
    }

On every 200ms, update bdi->dirty_ratelimit
===========================================

    bdi_update_dirty_ratelimit()
    {
        task_ratelimit = bdi->dirty_ratelimit * bdi_position_ratio();
        balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate;
        bdi->dirty_ratelimit = balanced_dirty_ratelimit
    }

Estimation of balanced bdi->dirty_ratelimit
===========================================

balanced task_ratelimit
-----------------------

balance_dirty_pages() needs to throttle tasks dirtying pages such that
the total amount of dirty pages stays below the specified dirty limit in
order to avoid memory deadlocks. Furthermore we desire fairness in that
tasks get throttled proportionally to the amount of pages they dirty.

IOW we want to throttle tasks such that the dirty rate matches the
writeout bandwidth; this yields a stable amount of dirty pages:

        dirty_rate == write_bw                                          (1)

The fairness requirement gives us:

        task_ratelimit = balanced_dirty_ratelimit
                       == write_bw / N                                  (2)

where N is the number of dd tasks.  We don't know N beforehand, but can
still estimate balanced_dirty_ratelimit within 200ms.

Start by throttling each dd task at rate

        task_ratelimit = task_ratelimit_0                               (3)
                         (any non-zero initial value is OK)

After 200ms, we measure

        dirty_rate = # of pages dirtied by all dd's / 200ms
        write_bw   = # of pages written to the disk / 200ms

For the aggressive dd dirtiers, the equality holds

        dirty_rate == N * task_rate
                   == N * task_ratelimit_0                              (4)
Or
        task_ratelimit_0 == dirty_rate / N                              (5)

Now we conclude that the balanced task ratelimit can be estimated by

                                                      write_bw
        balanced_dirty_ratelimit = task_ratelimit_0 * ----------        (6)
                                                      dirty_rate

Because substituting (5) (which follows from (4)) into (6) yields the
desired balanced rate (2), and hence the stable state (1):

                                                       write_bw
        balanced_dirty_ratelimit == (dirty_rate / N) * ----------
                                                       dirty_rate
                                 == write_bw / N
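
As a quick sanity check with purely illustrative numbers: suppose N = 2
dd tasks, task_ratelimit_0 = 16 MB/s, and the disk sustains
write_bw = 16 MB/s. Over the 200ms window we would measure

        dirty_rate = 2 * 16 MB/s = 32 MB/s

and (6) then yields

        balanced_dirty_ratelimit = 16 MB/s * 16/32 = 8 MB/s
                                 == write_bw / N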

Then, using the balanced task ratelimit, we can compute task pause times as:

        task_pause = task->nr_dirtied / task_ratelimit
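
For example (again with illustrative numbers): at task_ratelimit =
2048 pages/s (8 MB/s with 4KB pages), a task that has dirtied 64 pages
would pause for

        task_pause = 64 / 2048 pages/s = ~31ms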

task_ratelimit with position control
------------------------------------

However, while the above gives us means of matching the dirty rate to
the writeout bandwidth, it at best provides us with a stable dirty page
count (assuming a static system). In order to control the dirty page
count such that it is high enough to provide performance but does not
exceed the specified limit, we need another control.

The dirty position control works by extending (2) to

        task_ratelimit = balanced_dirty_ratelimit * pos_ratio           (7)

where pos_ratio is a negative feedback function subject to

1) f(setpoint) = 1.0
2) df/dx < 0

That is, if the dirty pages are ABOVE the setpoint, we throttle each
task a bit more HEAVILY than balanced_dirty_ratelimit, so that dirty
pages are created more slowly than they are cleaned and thus DROP back
to the setpoint (and vice versa).
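
A minimal user-space sketch of one such feedback function, assuming a
simple linear curve clamped to [0, 2.0] (the in-kernel
bdi_position_ratio() uses a considerably more elaborate control curve;
this only illustrates properties 1) and 2) above):

    #include <stdio.h>

    #define RATELIMIT_CALC_SHIFT    10      /* fixed point: 1.0 == 1 << 10 */

    /* returns 1.0 at the setpoint and decreases as dirty grows */
    unsigned long pos_ratio(unsigned long setpoint, unsigned long limit,
                            unsigned long dirty)
    {
            long x = ((long)setpoint - (long)dirty) *
                     (1 << RATELIMIT_CALC_SHIFT) /
                     ((long)limit - (long)setpoint);
            long ratio = (1 << RATELIMIT_CALC_SHIFT) + x;

            if (ratio < 0)
                    ratio = 0;
            if (ratio > (2 << RATELIMIT_CALC_SHIFT))
                    ratio = 2 << RATELIMIT_CALC_SHIFT;
            return ratio;
    }

    int main(void)
    {
            unsigned long setpoint = 1000, limit = 2000;

            /* prints 1536 1024 512, i.e. 1.5, 1.0 and 0.5 in fixed point */
            printf("%lu %lu %lu\n",
                   pos_ratio(setpoint, limit, 500),
                   pos_ratio(setpoint, limit, 1000),
                   pos_ratio(setpoint, limit, 1500));
            return 0;
    }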

Based on (7) and the assumption that both dirty_ratelimit and pos_ratio
remained CONSTANT over the past 200ms, we get

        task_ratelimit_0 = balanced_dirty_ratelimit * pos_ratio         (8)

Putting (8) into (6), we get the formula used in
bdi_update_dirty_ratelimit():

                                                write_bw
        balanced_dirty_ratelimit *= pos_ratio * ----------              (9)
                                                dirty_rate

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/backing-dev.h |  7 ++++
 mm/backing-dev.c            |  1 +
 mm/page-writeback.c         | 83 +++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 89 insertions(+), 2 deletions(-)

(limited to 'include/linux/backing-dev.h')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 9ca241a70c4..dff0ff78e87 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -75,10 +75,17 @@ struct backing_dev_info {
 	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
 
 	unsigned long bw_time_stamp;	/* last time write bw is updated */
+	unsigned long dirtied_stamp;
 	unsigned long written_stamp;	/* pages written at bw_time_stamp */
 	unsigned long write_bandwidth;	/* the estimated write bandwidth */
 	unsigned long avg_write_bandwidth; /* further smoothed write bw */
 
+	/*
+	 * The base dirty throttle rate, re-calculated on every 200ms.
+	 * All the bdi tasks' dirty rate will be curbed under it.
+	 */
+	unsigned long dirty_ratelimit;
+
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index fea7e6efd1d..ba20f94cde9 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -686,6 +686,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->bw_time_stamp = jiffies;
 	bdi->written_stamp = 0;
 
+	bdi->dirty_ratelimit = INIT_BW;
 	bdi->write_bandwidth = INIT_BW;
 	bdi->avg_write_bandwidth = INIT_BW;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4b954c9fe84..1721b6523c0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -777,6 +777,79 @@ static void global_update_bandwidth(unsigned long thresh,
 	spin_unlock(&dirty_lock);
 }
 
+/*
+ * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+ *
+ * Normal bdi tasks will be curbed at or below it in long term.
+ * Obviously it should be around (write_bw / N) when there are N dd tasks.
+ */
+static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
+				       unsigned long thresh,
+				       unsigned long bg_thresh,
+				       unsigned long dirty,
+				       unsigned long bdi_thresh,
+				       unsigned long bdi_dirty,
+				       unsigned long dirtied,
+				       unsigned long elapsed)
+{
+	unsigned long write_bw = bdi->avg_write_bandwidth;
+	unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+	unsigned long dirty_rate;
+	unsigned long task_ratelimit;
+	unsigned long balanced_dirty_ratelimit;
+	unsigned long pos_ratio;
+
+	/*
+	 * The dirty rate will match the writeout rate in long term, except
+	 * when dirty pages are truncated by userspace or re-dirtied by FS.
+	 */
+	dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+
+	pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
+				       bdi_thresh, bdi_dirty);
+	/*
+	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
+	 */
+	task_ratelimit = (u64)dirty_ratelimit *
+					pos_ratio >> RATELIMIT_CALC_SHIFT;
+	task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
+
+	/*
+	 * A linear estimation of the "balanced" throttle rate. The theory is,
+	 * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
+	 * dirty_rate will be measured to be (N * task_ratelimit). So the below
+	 * formula will yield the balanced rate limit (write_bw / N).
+	 *
+	 * Note that the expanded form is not a pure rate feedback:
+	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate)		     (1)
+	 * but also takes pos_ratio into account:
+	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
+	 *
+	 * (1) is not realistic because pos_ratio also takes part in balancing
+	 * the dirty rate.  Consider the state
+	 *	pos_ratio = 0.5						     (3)
+	 *	rate = 2 * (write_bw / N)				     (4)
+	 * If (1) is used, it will stuck in that state! Because each dd will
+	 * be throttled at
+	 *	task_ratelimit = pos_ratio * rate = (write_bw / N)	     (5)
+	 * yielding
+	 *	dirty_rate = N * task_ratelimit = write_bw		     (6)
+	 * put (6) into (1) we get
+	 *	rate_(i+1) = rate_(i)					     (7)
+	 *
+	 * So we end up using (2) to always keep
+	 *	rate_(i+1) ~= (write_bw / N)				     (8)
+	 * regardless of the value of pos_ratio. As long as (8) is satisfied,
+	 * pos_ratio is able to drive itself to 1.0, which is not only where
+	 * the dirty count meet the setpoint, but also where the slope of
+	 * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
+	 */
+	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
+					   dirty_rate | 1);
+
+	bdi->dirty_ratelimit = max(balanced_dirty_ratelimit, 1UL);
+}
+
 void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 			    unsigned long thresh,
 			    unsigned long bg_thresh,
@@ -787,6 +860,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 {
 	unsigned long now = jiffies;
 	unsigned long elapsed = now - bdi->bw_time_stamp;
+	unsigned long dirtied;
 	unsigned long written;
 
 	/*
@@ -795,6 +869,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 	if (elapsed < BANDWIDTH_INTERVAL)
 		return;
 
+	dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
 	written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
 
 	/*
@@ -804,12 +879,16 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 	if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
 		goto snapshot;
 
-	if (thresh)
+	if (thresh) {
 		global_update_bandwidth(thresh, dirty, now);
-
+		bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
+					   bdi_thresh, bdi_dirty,
+					   dirtied, elapsed);
+	}
 	bdi_update_write_bandwidth(bdi, elapsed, written);
 
 snapshot:
+	bdi->dirtied_stamp = dirtied;
 	bdi->written_stamp = written;
 	bdi->bw_time_stamp = now;
 }
-- 
cgit v1.2.3-70-g09d2


From 7381131cbcf7e15d201a0ffd782a4698efe4e740 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 26 Aug 2011 15:53:24 -0600
Subject: writeback: stabilize bdi->dirty_ratelimit

There are some imperfections in balanced_dirty_ratelimit.

1) large fluctuations

The dirty_rate used for computing balanced_dirty_ratelimit is merely
averaged over the past 200ms (very short compared to the 3s estimation
period for write_bw), which makes for a rather dispersed distribution
of balanced_dirty_ratelimit.

It's pretty hard to average out the singular points by increasing the
estimation period. Considering that averaging would introduce very
undesirable time lags, I gave that approach up entirely. (BTW, the 3s
write_bw averaging time lag is much more acceptable because its impact
is one-way and therefore won't lead to oscillations.)

A more practical way is filtering: most singular
balanced_dirty_ratelimit points can be filtered out by remembering some
prev_balanced_rate and prev_prev_balanced_rate. However, the more
reliable way is to guard balanced_dirty_ratelimit with task_ratelimit.

2) due to truncates and fs redirties, the (write_bw <=> dirty_rate)
match could become unbalanced, which may lead to large systematic
errors in balanced_dirty_ratelimit. Truncates, due to their possibly
bumpy nature, can hardly be compensated for smoothly. So let's face it:
when some over-estimated balanced_dirty_ratelimit pushes
dirty_ratelimit high, dirty pages will go above the setpoint, and
task_ratelimit will in turn drop below dirty_ratelimit. So if we
consider both balanced_dirty_ratelimit and task_ratelimit and update
dirty_ratelimit only when they are on the same side of dirty_ratelimit,
the systematic errors in balanced_dirty_ratelimit won't be able to drag
dirty_ratelimit far away.

The balanced_dirty_ratelimit estimation may also be inaccurate near
@limit or @freerun; however, that is less of an issue.

3) since we ultimately want to

- keep the fluctuations of task ratelimit as small as possible
- keep the dirty pages around the setpoint for as long as possible

the update policy used for (2) also serves the above goals nicely:
if for some reason the dirty pages are high (task_ratelimit < dirty_ratelimit),
and dirty_ratelimit is low (dirty_ratelimit < balanced_dirty_ratelimit),
there is no point in raising dirty_ratelimit in a hurry only to hurt
both of the above goals.

So, we make use of task_ratelimit to limit the update of dirty_ratelimit
in two ways:

1) avoid changing the dirty rate when it works against the position
   control target (the adjusted rate would slow down the progress of
   dirty pages going back to the setpoint).

2) limit the step size. task_ratelimit changes its value step by step,
   leaving a consistent trace compared to the randomly jumping
   balanced_dirty_ratelimit. task_ratelimit also has the nice property
   of smaller errors in the stable state and typically larger errors
   when there are big errors in rate.  So it's a pretty good limiting
   factor for the step size of dirty_ratelimit.

Note that bdi->dirty_ratelimit is always tracking balanced_dirty_ratelimit.
task_ratelimit is merely used as a limiting factor.
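
A hedged user-space sketch of this update policy (the authoritative
version is the bdi_update_dirty_ratelimit() hunk below; the standalone
signature, the dirty_below_setpoint flag and the explicit shift guard
here are only for illustration):

    /*
     * Track balanced_rate in small, task_ratelimit-guarded steps.
     * prev_balanced_rate is the balanced rate from the previous 200ms
     * period (bdi->balanced_dirty_ratelimit in the real code).
     */
    unsigned long update_dirty_ratelimit(unsigned long dirty_ratelimit,
                                         unsigned long balanced_rate,
                                         unsigned long prev_balanced_rate,
                                         unsigned long task_ratelimit,
                                         int dirty_below_setpoint)
    {
            unsigned long step = 0;
            unsigned long shift;
            unsigned long x;

            if (dirty_below_setpoint) {
                    /* step up only if rate and position targets agree */
                    x = balanced_rate < prev_balanced_rate ?
                                balanced_rate : prev_balanced_rate;
                    if (task_ratelimit < x)
                            x = task_ratelimit;
                    if (dirty_ratelimit < x)
                            step = x - dirty_ratelimit;
            } else {
                    /* step down only if rate and position targets agree */
                    x = balanced_rate > prev_balanced_rate ?
                                balanced_rate : prev_balanced_rate;
                    if (task_ratelimit > x)
                            x = task_ratelimit;
                    if (dirty_ratelimit > x)
                            step = dirty_ratelimit - x;
            }

            /*
             * Don't pursue 100% rate matching: slow the tracking down as
             * the rate gets close to the target (shift amount guarded so
             * the sketch stays well-defined), and take at most 1/8 of the
             * remaining gap per update to avoid overshooting.
             */
            shift = dirty_ratelimit / (2 * step + 1);
            step = shift < 8 * sizeof(step) ? step >> shift : 0;
            step = (step + 7) / 8;

            if (dirty_ratelimit < balanced_rate)
                    dirty_ratelimit += step;
            else
                    dirty_ratelimit -= step;

            return dirty_ratelimit ? dirty_ratelimit : 1;
    }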

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/backing-dev.h |  3 ++
 mm/backing-dev.c            |  1 +
 mm/page-writeback.c         | 71 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'include/linux/backing-dev.h')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index dff0ff78e87..c3b92010d89 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -83,8 +83,11 @@ struct backing_dev_info {
 	/*
 	 * The base dirty throttle rate, re-calculated on every 200ms.
 	 * All the bdi tasks' dirty rate will be curbed under it.
+	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
+	 * in small steps and is much more smooth/stable than the latter.
 	 */
 	unsigned long dirty_ratelimit;
+	unsigned long balanced_dirty_ratelimit;
 
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ba20f94cde9..5dcaa3c756d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -686,6 +686,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->bw_time_stamp = jiffies;
 	bdi->written_stamp = 0;
 
+	bdi->balanced_dirty_ratelimit = INIT_BW;
 	bdi->dirty_ratelimit = INIT_BW;
 	bdi->write_bandwidth = INIT_BW;
 	bdi->avg_write_bandwidth = INIT_BW;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1721b6523c0..d4a6e91bd9e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -792,12 +792,17 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 				       unsigned long dirtied,
 				       unsigned long elapsed)
 {
+	unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
+	unsigned long limit = hard_dirty_limit(thresh);
+	unsigned long setpoint = (freerun + limit) / 2;
 	unsigned long write_bw = bdi->avg_write_bandwidth;
 	unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
 	unsigned long dirty_rate;
 	unsigned long task_ratelimit;
 	unsigned long balanced_dirty_ratelimit;
 	unsigned long pos_ratio;
+	unsigned long step;
+	unsigned long x;
 
 	/*
 	 * The dirty rate will match the writeout rate in long term, except
@@ -847,7 +852,71 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
 					   dirty_rate | 1);
 
-	bdi->dirty_ratelimit = max(balanced_dirty_ratelimit, 1UL);
+	/*
+	 * We could safely do this and return immediately:
+	 *
+	 *	bdi->dirty_ratelimit = balanced_dirty_ratelimit;
+	 *
+	 * However to get a more stable dirty_ratelimit, the below elaborated
+	 * code makes use of task_ratelimit to filter out singular points and
+	 * limit the step size.
+	 *
+	 * The below code essentially only uses the relative value of
+	 *
+	 *	task_ratelimit - dirty_ratelimit
+	 *	= (pos_ratio - 1) * dirty_ratelimit
+	 *
+	 * which reflects the direction and size of dirty position error.
+	 */
+
+	/*
+	 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
+	 * task_ratelimit is on the same side of dirty_ratelimit, too.
+	 * For example, when
+	 * - dirty_ratelimit > balanced_dirty_ratelimit
+	 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
+	 * lowering dirty_ratelimit will help meet both the position and rate
+	 * control targets. Otherwise, don't update dirty_ratelimit if it will
+	 * only help meet the rate target. After all, what the users ultimately
+	 * feel and care are stable dirty rate and small position error.
+	 *
+	 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
+	 * and filter out the singular points of balanced_dirty_ratelimit. Which
+	 * keeps jumping around randomly and can even leap far away at times
+	 * due to the small 200ms estimation period of dirty_rate (we want to
+	 * keep that period small to reduce time lags).
+	 */
+	step = 0;
+	if (dirty < setpoint) {
+		x = min(bdi->balanced_dirty_ratelimit,
+			 min(balanced_dirty_ratelimit, task_ratelimit));
+		if (dirty_ratelimit < x)
+			step = x - dirty_ratelimit;
+	} else {
+		x = max(bdi->balanced_dirty_ratelimit,
+			 max(balanced_dirty_ratelimit, task_ratelimit));
+		if (dirty_ratelimit > x)
+			step = dirty_ratelimit - x;
+	}
+
+	/*
+	 * Don't pursue 100% rate matching. It's impossible since the balanced
+	 * rate itself is constantly fluctuating. So decrease the track speed
+	 * when it gets close to the target. Helps eliminate pointless tremors.
+	 */
+	step >>= dirty_ratelimit / (2 * step + 1);
+	/*
+	 * Limit the tracking speed to avoid overshooting.
+	 */
+	step = (step + 7) / 8;
+
+	if (dirty_ratelimit < balanced_dirty_ratelimit)
+		dirty_ratelimit += step;
+	else
+		dirty_ratelimit -= step;
+
+	bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+	bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
 }
 
 void __bdi_update_bandwidth(struct backing_dev_info *bdi,
-- 
cgit v1.2.3-70-g09d2


From 0e175a1835ffc979e55787774e58ec79e41957d7 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Fri, 7 Oct 2011 21:54:10 -0600
Subject: writeback: Add a 'reason' to wb_writeback_work

This creates a new 'reason' field in the wb_writeback_work
structure, which unambiguously identifies who initiated the
writeback activity.  A 'wb_reason' enumeration has been added
to writeback.h to enumerate the possible reasons.

The 'writeback_work_class' tracepoint event class and the
'writeback_queue_io' tracepoint are updated to include the
symbolic 'reason' in all trace events.

And the 'writeback_inodes_sbXXX' family of routines has had
a wb_reason parameter added, so callers can specify why
writeback is being started.
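
Concretely, callers in the hunks below now pass the reason explicitly,
for example in the sync path:

    /* fs/sync.c (see the hunk below) */
    writeback_inodes_sb(sb, WB_REASON_SYNC);

and the writeback tracepoints then report it symbolically, e.g. as
"reason=sync".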

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/btrfs/extent-tree.c           |  3 ++-
 fs/buffer.c                      |  2 +-
 fs/ext4/inode.c                  |  2 +-
 fs/fs-writeback.c                | 49 +++++++++++++++++++++++++++++-----------
 fs/quota/quota.c                 |  2 +-
 fs/sync.c                        |  4 ++--
 fs/ubifs/budget.c                |  2 +-
 include/linux/backing-dev.h      |  3 ++-
 include/linux/writeback.h        | 32 +++++++++++++++++++++-----
 include/trace/events/writeback.h | 14 ++++++++----
 mm/backing-dev.c                 |  3 ++-
 mm/page-writeback.c              |  3 ++-
 mm/vmscan.c                      |  3 ++-
 13 files changed, 88 insertions(+), 34 deletions(-)

(limited to 'include/linux/backing-dev.h')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462..c9ee0e18bbd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3340,7 +3340,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		smp_mb();
 		nr_pages = min_t(unsigned long, nr_pages,
 		       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
-		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
+		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
+						WB_REASON_FS_FREE_SPACE);
 
 		spin_lock(&space_info->lock);
 		if (reserved > space_info->bytes_reserved)
diff --git a/fs/buffer.c b/fs/buffer.c
index 1a80b048ade..f5dcee6c4cf 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -285,7 +285,7 @@ static void free_more_memory(void)
 	struct zone *zone;
 	int nid;
 
-	wakeup_flusher_threads(1024);
+	wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
 	yield();
 
 	for_each_online_node(nid) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986e2388f03..7fa73a3b212 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2241,7 +2241,7 @@ static int ext4_nonda_switch(struct super_block *sb)
 	 * start pushing delalloc when 1/2 of free blocks are dirty.
 	 */
 	if (free_blocks < 2 * dirty_blocks)
-		writeback_inodes_sb_if_idle(sb);
+		writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
 
 	return 0;
 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c5102969360..73c3992b2bb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,11 +41,23 @@ struct wb_writeback_work {
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
+	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
 	struct completion *done;	/* set if the caller waits */
 };
 
+const char *wb_reason_name[] = {
+	[WB_REASON_BACKGROUND]		= "background",
+	[WB_REASON_TRY_TO_FREE_PAGES]	= "try_to_free_pages",
+	[WB_REASON_SYNC]		= "sync",
+	[WB_REASON_PERIODIC]		= "periodic",
+	[WB_REASON_LAPTOP_TIMER]	= "laptop_timer",
+	[WB_REASON_FREE_MORE_MEM]	= "free_more_memory",
+	[WB_REASON_FS_FREE_SPACE]	= "fs_free_space",
+	[WB_REASON_FORKER_THREAD]	= "forker_thread"
+};
+
 /*
  * Include the creation of the trace points after defining the
  * wb_writeback_work structure so that the definition remains local to this
@@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		      bool range_cyclic)
+		      bool range_cyclic, enum wb_reason reason)
 {
 	struct wb_writeback_work *work;
 
@@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	work->sync_mode	= WB_SYNC_NONE;
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
+	work->reason	= reason;
 
 	bdi_queue_work(bdi, work);
 }
@@ -150,9 +163,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
  *   completion. Caller need not hold sb s_umount semaphore.
  *
  */
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+			enum wb_reason reason)
 {
-	__bdi_start_writeback(bdi, nr_pages, true);
+	__bdi_start_writeback(bdi, nr_pages, true, reason);
 }
 
 /**
@@ -641,12 +655,14 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
 	return wrote;
 }
 
-long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+				enum wb_reason reason)
 {
 	struct wb_writeback_work work = {
 		.nr_pages	= nr_pages,
 		.sync_mode	= WB_SYNC_NONE,
 		.range_cyclic	= 1,
+		.reason		= reason,
 	};
 
 	spin_lock(&wb->list_lock);
@@ -825,6 +841,7 @@ static long wb_check_background_flush(struct bdi_writeback *wb)
 			.sync_mode	= WB_SYNC_NONE,
 			.for_background	= 1,
 			.range_cyclic	= 1,
+			.reason		= WB_REASON_BACKGROUND,
 		};
 
 		return wb_writeback(wb, &work);
@@ -858,6 +875,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 			.sync_mode	= WB_SYNC_NONE,
 			.for_kupdate	= 1,
 			.range_cyclic	= 1,
+			.reason		= WB_REASON_PERIODIC,
 		};
 
 		return wb_writeback(wb, &work);
@@ -976,7 +994,7 @@ int bdi_writeback_thread(void *data)
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
  */
-void wakeup_flusher_threads(long nr_pages)
+void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 {
 	struct backing_dev_info *bdi;
 
@@ -989,7 +1007,7 @@ void wakeup_flusher_threads(long nr_pages)
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false);
+		__bdi_start_writeback(bdi, nr_pages, false, reason);
 	}
 	rcu_read_unlock();
 }
@@ -1210,7 +1228,9 @@ static void wait_sb_inodes(struct super_block *sb)
  * on how many (if any) will be written, and this function does not wait
  * for IO completion of submitted IO.
  */
-void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
+void writeback_inodes_sb_nr(struct super_block *sb,
+			    unsigned long nr,
+			    enum wb_reason reason)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
@@ -1219,6 +1239,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
 		.tagged_writepages	= 1,
 		.done			= &done,
 		.nr_pages		= nr,
+		.reason			= reason,
 	};
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1235,9 +1256,9 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
  * on how many (if any) will be written, and this function does not wait
  * for IO completion of submitted IO.
  */
-void writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
 {
-	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
+	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1248,11 +1269,11 @@ EXPORT_SYMBOL(writeback_inodes_sb);
  * Invoke writeback_inodes_sb if no writeback is currently underway.
  * Returns 1 if writeback was started, 0 if not.
  */
-int writeback_inodes_sb_if_idle(struct super_block *sb)
+int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
 {
 	if (!writeback_in_progress(sb->s_bdi)) {
 		down_read(&sb->s_umount);
-		writeback_inodes_sb(sb);
+		writeback_inodes_sb(sb, reason);
 		up_read(&sb->s_umount);
 		return 1;
 	} else
@@ -1269,11 +1290,12 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
  * Returns 1 if writeback was started, 0 if not.
  */
 int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
-				   unsigned long nr)
+				   unsigned long nr,
+				   enum wb_reason reason)
 {
 	if (!writeback_in_progress(sb->s_bdi)) {
 		down_read(&sb->s_umount);
-		writeback_inodes_sb_nr(sb, nr);
+		writeback_inodes_sb_nr(sb, nr, reason);
 		up_read(&sb->s_umount);
 		return 1;
 	} else
@@ -1297,6 +1319,7 @@ void sync_inodes_sb(struct super_block *sb)
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
 		.done		= &done,
+		.reason		= WB_REASON_SYNC,
 	};
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 10b6be3ca28..4bae57fc603 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 		/* caller already holds s_umount */
 		if (sb->s_flags & MS_RDONLY)
 			return -EROFS;
-		writeback_inodes_sb(sb);
+		writeback_inodes_sb(sb, WB_REASON_SYNC);
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/fs/sync.c b/fs/sync.c
index c98a7477edf..101b8ef901d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	if (wait)
 		sync_inodes_sb(sb);
 	else
-		writeback_inodes_sb(sb);
+		writeback_inodes_sb(sb, WB_REASON_SYNC);
 
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
@@ -98,7 +98,7 @@ static void sync_filesystems(int wait)
  */
 SYSCALL_DEFINE0(sync)
 {
-	wakeup_flusher_threads(0);
+	wakeup_flusher_threads(0, WB_REASON_SYNC);
 	sync_filesystems(0);
 	sync_filesystems(1);
 	if (unlikely(laptop_mode))
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 315de66e52b..bc4f94b2870 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -63,7 +63,7 @@
 static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
 	down_read(&c->vfs_sb->s_umount);
-	writeback_inodes_sb(c->vfs_sb);
+	writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
 	up_read(&c->vfs_sb->s_umount);
 }
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c3b92010d89..b1038bd686a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -118,7 +118,8 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+			enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
 int bdi_writeback_thread(void *data);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index ddb4652cb33..a378c295851 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -38,6 +38,23 @@ enum writeback_sync_modes {
 	WB_SYNC_ALL,	/* Wait on every mapping */
 };
 
+/*
+ * why some writeback work was initiated
+ */
+enum wb_reason {
+	WB_REASON_BACKGROUND,
+	WB_REASON_TRY_TO_FREE_PAGES,
+	WB_REASON_SYNC,
+	WB_REASON_PERIODIC,
+	WB_REASON_LAPTOP_TIMER,
+	WB_REASON_FREE_MORE_MEM,
+	WB_REASON_FS_FREE_SPACE,
+	WB_REASON_FORKER_THREAD,
+
+	WB_REASON_MAX,
+};
+extern const char *wb_reason_name[];
+
 /*
  * A control structure which tells the writeback code what to do.  These are
  * always on the stack, and hence need no locking.  They are always initialised
@@ -69,14 +86,17 @@ struct writeback_control {
  */	
 struct bdi_writeback;
 int inode_wait(void *);
-void writeback_inodes_sb(struct super_block *);
-void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
-int writeback_inodes_sb_if_idle(struct super_block *);
-int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
+void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
+void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
+							enum wb_reason reason);
+int writeback_inodes_sb_if_idle(struct super_block *, enum wb_reason reason);
+int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr,
+							enum wb_reason reason);
 void sync_inodes_sb(struct super_block *);
-long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages);
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+				enum wb_reason reason);
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
-void wakeup_flusher_threads(long nr_pages);
+void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 1261db3916c..b99caa8b780 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -34,6 +34,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		__field(int, for_kupdate)
 		__field(int, range_cyclic)
 		__field(int, for_background)
+		__field(int, reason)
 	),
 	TP_fast_assign(
 		strncpy(__entry->name, dev_name(bdi->dev), 32);
@@ -43,16 +44,18 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		__entry->for_kupdate = work->for_kupdate;
 		__entry->range_cyclic = work->range_cyclic;
 		__entry->for_background	= work->for_background;
+		__entry->reason = work->reason;
 	),
 	TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
-		  "kupdate=%d range_cyclic=%d background=%d",
+		  "kupdate=%d range_cyclic=%d background=%d reason=%s",
 		  __entry->name,
 		  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
 		  __entry->nr_pages,
 		  __entry->sync_mode,
 		  __entry->for_kupdate,
 		  __entry->range_cyclic,
-		  __entry->for_background
+		  __entry->for_background,
+		  wb_reason_name[__entry->reason]
 	)
 );
 #define DEFINE_WRITEBACK_WORK_EVENT(name) \
@@ -165,6 +168,7 @@ TRACE_EVENT(writeback_queue_io,
 		__field(unsigned long,	older)
 		__field(long,		age)
 		__field(int,		moved)
+		__field(int,		reason)
 	),
 	TP_fast_assign(
 		unsigned long *older_than_this = work->older_than_this;
@@ -173,12 +177,14 @@ TRACE_EVENT(writeback_queue_io,
 		__entry->age	= older_than_this ?
 				  (jiffies - *older_than_this) * 1000 / HZ : -1;
 		__entry->moved	= moved;
+		__entry->reason	= work->reason;
 	),
-	TP_printk("bdi %s: older=%lu age=%ld enqueue=%d",
+	TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s",
 		__entry->name,
 		__entry->older,	/* older_than_this in jiffies */
 		__entry->age,	/* older_than_this in relative milliseconds */
-		__entry->moved)
+		__entry->moved,
+		wb_reason_name[__entry->reason])
 );
 
 TRACE_EVENT(global_dirty_state,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5dcaa3c756d..dd8916feb05 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -476,7 +476,8 @@ static int bdi_forker_thread(void *ptr)
 				 * the bdi from the thread. Hopefully 1024 is
 				 * large enough for efficient IO.
 				 */
-				writeback_inodes_wb(&bdi->wb, 1024);
+				writeback_inodes_wb(&bdi->wb, 1024,
+						    WB_REASON_FORKER_THREAD);
 			} else {
 				/*
 				 * The spinlock makes sure we do not lose
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 45d36f7dc16..650846b6158 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1301,7 +1301,8 @@ void laptop_mode_timer_fn(unsigned long data)
 	 * threshold
 	 */
 	if (bdi_has_dirty_io(&q->backing_dev_info))
-		bdi_start_writeback(&q->backing_dev_info, nr_pages);
+		bdi_start_writeback(&q->backing_dev_info, nr_pages,
+					WB_REASON_LAPTOP_TIMER);
 }
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b55699cd906..c735bd770d3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2181,7 +2181,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		 */
 		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
 		if (total_scanned > writeback_threshold) {
-			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+						WB_REASON_TRY_TO_FREE_PAGES);
 			sc->may_writepage = 1;
 		}
 
-- 
cgit v1.2.3-70-g09d2