From c1955ce32fdb0877b7a1b22feb2669358f65be76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 19 Jun 2010 23:08:06 +0200 Subject: writeback: remove wb_list The wb_list member of struct backing_dev_info always has exactly one element. Just use the direct bdi->wb pointer instead and simplify some code. Also remove bdi_task_init, which is now trivial, to prepare for the next patch. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d5be1693ac9..d67989b8ba4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -73,9 +73,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * If the default thread isn't there, make sure we add it. When * it gets created and wakes up, we'll run this work. */ - if (unlikely(list_empty_careful(&bdi->wb_list))) + if (unlikely(!bdi->wb.task)) { wake_up_process(default_backing_dev_info.wb.task); - else { + } else { struct bdi_writeback *wb = &bdi->wb; if (wb->task) -- cgit v1.2.3-70-g09d2 From 082439004b31adc146e96e5f1c574dd2b57dcd93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 19 Jun 2010 23:08:22 +0200 Subject: writeback: merge bdi_writeback_task and bdi_start_fn Move all code for the writeback thread into fs/fs-writeback.c instead of splitting it over two functions in two files. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 35 ++++++++++++++++++++++++++++++++++- include/linux/backing-dev.h | 2 +- mm/backing-dev.c | 44 +------------------------------------------- 3 files changed, 36 insertions(+), 45 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d67989b8ba4..c8471b3ddcc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -775,12 +775,36 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Handle writeback of dirty data for the device backed by this bdi. Also * wakes up periodically and does kupdated style flushing. */ -int bdi_writeback_task(struct bdi_writeback *wb) +int bdi_writeback_thread(void *data) { + struct bdi_writeback *wb = data; + struct backing_dev_info *bdi = wb->bdi; unsigned long last_active = jiffies; unsigned long wait_jiffies = -1UL; long pages_written; + /* + * Add us to the active bdi_list + */ + spin_lock_bh(&bdi_lock); + list_add_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); + + /* + * Clear pending bit and wakeup anybody waiting to tear us down + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); + while (!kthread_should_stop()) { pages_written = wb_do_writeback(wb, 0); @@ -813,9 +837,18 @@ int bdi_writeback_task(struct bdi_writeback *wb) try_to_freeze(); } + wb->task = NULL; + + /* + * Flush any work that raced with us exiting. No new work + * will be added, since this bdi isn't discoverable anymore. + */ + if (!list_empty(&bdi->work_list)) + wb_do_writeback(wb, 1); return 0; } + /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 50f14614616..e536f3a74e6 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -102,7 +102,7 @@ void bdi_unregister(struct backing_dev_info *bdi); int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); void bdi_start_background_writeback(struct backing_dev_info *bdi); -int bdi_writeback_task(struct bdi_writeback *wb); +int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_arm_supers_timer(void); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6c2a09c8922..bceac647e4d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -260,48 +260,6 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_more_io); } -static int bdi_start_fn(void *ptr) -{ - struct bdi_writeback *wb = ptr; - struct backing_dev_info *bdi = wb->bdi; - int ret; - - /* - * Add us to the active bdi_list - */ - spin_lock_bh(&bdi_lock); - list_add_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - - current->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - /* - * Clear pending bit and wakeup anybody waiting to tear us down - */ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); - - ret = bdi_writeback_task(wb); - - wb->task = NULL; - - /* - * Flush any work that raced with us exiting. No new work - * will be added, since this bdi isn't discoverable anymore. - */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); - - return ret; -} - int bdi_has_dirty_io(struct backing_dev_info *bdi) { return wb_has_dirty_io(&bdi->wb); @@ -425,7 +383,7 @@ static int bdi_forker_task(void *ptr) spin_unlock_bh(&bdi_lock); wb = &bdi->wb; - wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", + wb->task = kthread_run(bdi_writeback_thread, wb, "flush-%s", dev_name(bdi->dev)); /* * If task creation fails, then readd the bdi to -- cgit v1.2.3-70-g09d2 From 455b2864686d3591b3b2f39eb46290c95f76471f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 7 Jul 2010 13:24:06 +1000 Subject: writeback: Initial tracing support Trace queue/sched/exec parts of the writeback loop. This provides insight into when and why flusher threads are scheduled to run. e.g a sync invocation leaves traces like: sync-[...]: writeback_queue: bdi 8:0: sb_dev 8:1 nr_pages=7712 sync_mode=0 kupdate=0 range_cyclic=0 background=0 flush-8:0-[...]: writeback_exec: bdi 8:0: sb_dev 8:1 nr_pages=7712 sync_mode=0 kupdate=0 range_cyclic=0 background=0 This also lays the foundation for adding more writeback tracing to provide deeper insight into the whole writeback path. The original tracing code is from Jens Axboe, though this version is a rewrite as a result of the code being traced changing significantly. 
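The header added below is built from the ftrace event-class macros. As a quick orientation for readers who have not used them, here is a minimal, hypothetical sketch of the pattern (the names example_class and example_event are illustrative only, not part of this patch):

DECLARE_EVENT_CLASS(example_class,
        TP_PROTO(struct backing_dev_info *bdi),
        TP_ARGS(bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)         /* record layout: one 32-byte name */
        ),
        TP_fast_assign(
                strncpy(__entry->name, dev_name(bdi->dev), 32); /* fill the record */
        ),
        TP_printk("bdi %s", __entry->name)      /* how the record is rendered */
);

/* Stamp out a tracepoint that shares the class layout; this generates a
 * callable trace_example_event(bdi) hook for the instrumented sites. */
DEFINE_EVENT(example_class, example_event,
        TP_PROTO(struct backing_dev_info *bdi),
        TP_ARGS(bdi));

The class defines the record layout and formatting once; each DEFINE_EVENT then costs only a table entry, which is why the header below groups the writeback events into writeback_work_class and writeback_class.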
Signed-off-by: Dave Chinner Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 38 +++++++++++++---- include/trace/events/writeback.h | 91 ++++++++++++++++++++++++++++++++++++++++ mm/backing-dev.c | 3 ++ 3 files changed, 124 insertions(+), 8 deletions(-) create mode 100644 include/trace/events/writeback.h (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c8471b3ddcc..73acab4dc2b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -26,15 +26,9 @@ #include #include #include +#include #include "internal.h" -#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) - -/* - * We don't actually have pdflush, but this one is exported though /proc... - */ -int nr_pdflush_threads; - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -50,6 +44,21 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure so that the definition remains local to this + * file. + */ +#define CREATE_TRACE_POINTS +#include + +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) + +/* + * We don't actually have pdflush, but this one is exported though /proc... + */ +int nr_pdflush_threads; + /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -65,6 +74,8 @@ int writeback_in_progress(struct backing_dev_info *bdi) static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { + trace_writeback_queue(bdi, work); + spin_lock(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); spin_unlock(&bdi->wb_lock); @@ -74,6 +85,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * it gets created and wakes up, we'll run this work. 
*/ if (unlikely(!bdi->wb.task)) { + trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } else { struct bdi_writeback *wb = &bdi->wb; @@ -95,8 +107,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) + if (bdi->wb.task) { + trace_writeback_nowork(bdi); wake_up_process(bdi->wb.task); + } return; } @@ -751,6 +765,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->sync_mode = WB_SYNC_ALL; + trace_writeback_exec(bdi, work); + wrote += wb_writeback(wb, work); /* @@ -805,9 +821,13 @@ int bdi_writeback_thread(void *data) smp_mb__after_clear_bit(); wake_up_bit(&bdi->state, BDI_pending); + trace_writeback_thread_start(bdi); + while (!kthread_should_stop()) { pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + if (pages_written) last_active = jiffies; else if (wait_jiffies != -1UL) { @@ -845,6 +865,8 @@ int bdi_writeback_thread(void *data) */ if (!list_empty(&bdi->work_list)) wb_do_writeback(wb, 1); + + trace_writeback_thread_stop(bdi); return 0; } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h new file mode 100644 index 00000000000..562fcae10d9 --- /dev/null +++ b/include/trace/events/writeback.h @@ -0,0 +1,91 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM writeback + +#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WRITEBACK_H + +#include +#include + +struct wb_writeback_work; + +DECLARE_EVENT_CLASS(writeback_work_class, + TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), + TP_ARGS(bdi, work), + TP_STRUCT__entry( + __array(char, name, 32) + __field(long, nr_pages) + __field(dev_t, sb_dev) + __field(int, sync_mode) + __field(int, for_kupdate) + __field(int, range_cyclic) + __field(int, for_background) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->nr_pages = work->nr_pages; + __entry->sb_dev = work->sb ? 
work->sb->s_dev : 0; + __entry->sync_mode = work->sync_mode; + __entry->for_kupdate = work->for_kupdate; + __entry->range_cyclic = work->range_cyclic; + __entry->for_background = work->for_background; + ), + TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " + "kupdate=%d range_cyclic=%d background=%d", + __entry->name, + MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), + __entry->nr_pages, + __entry->sync_mode, + __entry->for_kupdate, + __entry->range_cyclic, + __entry->for_background + ) +); +#define DEFINE_WRITEBACK_WORK_EVENT(name) \ +DEFINE_EVENT(writeback_work_class, name, \ + TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ + TP_ARGS(bdi, work)) +DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); +DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); +DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); + +TRACE_EVENT(writeback_pages_written, + TP_PROTO(long pages_written), + TP_ARGS(pages_written), + TP_STRUCT__entry( + __field(long, pages) + ), + TP_fast_assign( + __entry->pages = pages_written; + ), + TP_printk("%ld", __entry->pages) +); + +DECLARE_EVENT_CLASS(writeback_class, + TP_PROTO(struct backing_dev_info *bdi), + TP_ARGS(bdi), + TP_STRUCT__entry( + __array(char, name, 32) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + ), + TP_printk("bdi %s", + __entry->name + ) +); +#define DEFINE_WRITEBACK_EVENT(name) \ +DEFINE_EVENT(writeback_class, name, \ + TP_PROTO(struct backing_dev_info *bdi), \ + TP_ARGS(bdi)) + +DEFINE_WRITEBACK_EVENT(writeback_nowork); +DEFINE_WRITEBACK_EVENT(writeback_bdi_register); +DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); +DEFINE_WRITEBACK_EVENT(writeback_thread_start); +DEFINE_WRITEBACK_EVENT(writeback_thread_stop); + +#endif /* _TRACE_WRITEBACK_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/mm/backing-dev.c b/mm/backing-dev.c index bceac647e4d..ac78a333618 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -10,6 +10,7 @@ #include #include #include +#include <trace/events/writeback.h> static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); @@ -518,6 +519,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); + trace_writeback_bdi_register(bdi); exit: return ret; } @@ -578,6 +580,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); if (!bdi_cap_flush_forker(bdi)) -- cgit v1.2.3-70-g09d2 From 028c2dd184c097809986684f2f0627eea5529fea Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 7 Jul 2010 13:24:07 +1000 Subject: writeback: Add tracing to balance_dirty_pages Tracing high level background writeback events is good, but it doesn't give the entire picture. Add visibility into write throttling to catch IO dispatched by foreground throttling of processes dirtying lots of pages.
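The three wbc_balance_dirty_* points added below bracket the foreground throttling loop in balance_dirty_pages(). A heavily condensed sketch of that loop, with declarations, limit calculations and several exit conditions omitted (see the hunks below for the real placement):

for (;;) {
        /* ... compute background_thresh, dirty_thresh, bdi_thresh ... */
        trace_wbc_balance_dirty_start(&wbc, bdi);
        if (bdi_nr_reclaimable > bdi_thresh) {
                /* Dirtier is over its limit: do foreground writeout. */
                writeback_inodes_wb(&bdi->wb, &wbc);
                pages_written += write_chunk - wbc.nr_to_write;
                trace_wbc_balance_dirty_written(&wbc, bdi);
        }
        if (pages_written >= write_chunk)
                break;                  /* wrote our share; stop throttling */
        /* Otherwise sleep and let the disk catch up. */
        trace_wbc_balance_dirty_wait(&wbc, bdi);
        __set_current_state(TASK_INTERRUPTIBLE);
        io_schedule_timeout(pause);
}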
Signed-off-by: Dave Chinner Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 5 ++++ include/trace/events/writeback.h | 64 ++++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 4 +++ 3 files changed, 73 insertions(+) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 73acab4dc2b..bf10cbf379d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -656,10 +656,14 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.more_io = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; + + trace_wbc_writeback_start(&wbc, wb->bdi); if (work->sb) __writeback_inodes_sb(work->sb, wb, &wbc); else writeback_inodes_wb(wb, &wbc); + trace_wbc_writeback_written(&wbc, wb->bdi); + work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; @@ -687,6 +691,7 @@ static long wb_writeback(struct bdi_writeback *wb, if (!list_empty(&wb->b_more_io)) { inode = list_entry(wb->b_more_io.prev, struct inode, i_list); + trace_wbc_writeback_wait(&wbc, wb->bdi); inode_wait_for_writeback(inode); } spin_unlock(&inode_lock); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 562fcae10d9..0be26acae06 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -85,6 +85,70 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); DEFINE_WRITEBACK_EVENT(writeback_thread_start); DEFINE_WRITEBACK_EVENT(writeback_thread_stop); +DECLARE_EVENT_CLASS(wbc_class, + TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), + TP_ARGS(wbc, bdi), + TP_STRUCT__entry( + __array(char, name, 32) + __field(long, nr_to_write) + __field(long, pages_skipped) + __field(int, sync_mode) + __field(int, nonblocking) + __field(int, encountered_congestion) + __field(int, for_kupdate) + __field(int, for_background) + __field(int, for_reclaim) + __field(int, range_cyclic) + __field(int, more_io) + __field(unsigned long, older_than_this) + __field(long, range_start) + __field(long, range_end) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->nr_to_write = wbc->nr_to_write; + __entry->pages_skipped = wbc->pages_skipped; + __entry->sync_mode = wbc->sync_mode; + __entry->for_kupdate = wbc->for_kupdate; + __entry->for_background = wbc->for_background; + __entry->for_reclaim = wbc->for_reclaim; + __entry->range_cyclic = wbc->range_cyclic; + __entry->more_io = wbc->more_io; + __entry->older_than_this = wbc->older_than_this ? 
+ *wbc->older_than_this : 0; + __entry->range_start = (long)wbc->range_start; + __entry->range_end = (long)wbc->range_end; + ), + + TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " + "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " + "start=0x%lx end=0x%lx", + __entry->name, + __entry->nr_to_write, + __entry->pages_skipped, + __entry->sync_mode, + __entry->for_kupdate, + __entry->for_background, + __entry->for_reclaim, + __entry->range_cyclic, + __entry->more_io, + __entry->older_than_this, + __entry->range_start, + __entry->range_end) +) + +#define DEFINE_WBC_EVENT(name) \ +DEFINE_EVENT(wbc_class, name, \ + TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ + TP_ARGS(wbc, bdi)) +DEFINE_WBC_EVENT(wbc_writeback_start); +DEFINE_WBC_EVENT(wbc_writeback_written); +DEFINE_WBC_EVENT(wbc_writeback_wait); +DEFINE_WBC_EVENT(wbc_balance_dirty_start); +DEFINE_WBC_EVENT(wbc_balance_dirty_written); +DEFINE_WBC_EVENT(wbc_balance_dirty_wait); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 37498ef6154..d556cd829af 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -34,6 +34,7 @@ #include #include #include +#include <trace/events/writeback.h> /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited @@ -535,11 +536,13 @@ static void balance_dirty_pages(struct address_space *mapping, * threshold otherwise wait until the disk writes catch * up. */ + trace_wbc_balance_dirty_start(&wbc, bdi); if (bdi_nr_reclaimable > bdi_thresh) { writeback_inodes_wb(&bdi->wb, &wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + trace_wbc_balance_dirty_written(&wbc, bdi); } /* @@ -565,6 +568,7 @@ static void balance_dirty_pages(struct address_space *mapping, if (pages_written >= write_chunk) break; /* We've done our duty */ + trace_wbc_balance_dirty_wait(&wbc, bdi); __set_current_state(TASK_INTERRUPTIBLE); io_schedule_timeout(pause); -- cgit v1.2.3-70-g09d2 From 08852b6d6c40f387f2b75e199e2ca1df68970f4c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 3 Aug 2010 12:51:16 +0200 Subject: writeback: remove wb in get_next_work_item Commit 83ba7b07 cleaned up the writeback code, so get_next_work_item() no longer uses its wb argument. Remove the unnecessary argument. CC: Christoph Hellwig Signed-off-by: Minchan Kim Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index bf10cbf379d..261570deb22 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -704,7 +704,7 @@ static long wb_writeback(struct bdi_writeback *wb, * Return the next wb_writeback_work struct that hasn't been processed yet. */ static struct wb_writeback_work * -get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) +get_next_work_item(struct backing_dev_info *bdi) { struct wb_writeback_work *work = NULL; @@ -762,7 +762,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) struct wb_writeback_work *work; long wrote = 0; - while ((work = get_next_work_item(bdi, wb)) != NULL) { + while ((work = get_next_work_item(bdi)) != NULL) { /* * Override sync mode, in case we must wait for completion * because this thread is exiting now.
-- cgit v1.2.3-70-g09d2 From 6f904ff0e39ea88f81eb77e8dfb4e1238492f0a8 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:11 +0300 Subject: writeback: harmonize writeback threads naming The write-back code mixes words "thread" and "task" for the same things. This is not a big deal, but still an inconsistency. hch: a convention I tend to use and I've seen in various places is to always use _task for the storage of the task_struct pointer, and thread everywhere else. This especially helps with having foo_thread for the actual thread and foo_task for a global variable keeping the task_struct pointer This patch renames: * 'bdi_add_default_flusher_task()' -> 'bdi_add_default_flusher_thread()' * 'bdi_forker_task()' -> 'bdi_forker_thread()' because bdi threads are 'bdi_writeback_thread()', so these names are more consistent. This patch also amends comments and makes them refer to the forker and bdi threads as "thread", not "task". Also, while at it, make the 'bdi_add_default_flusher_thread()' declaration use 'static void' instead of 'void static', making checkpatch.pl happy. Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- include/linux/backing-dev.h | 2 +- mm/backing-dev.c | 26 +++++++++++++------------- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 261570deb22..002be0ff2ab 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -840,7 +840,7 @@ int bdi_writeback_thread(void *data) /* * Longest period of inactivity that we tolerate. If we - * see dirty data again later, the task will get + * see dirty data again later, the thread will get * recreated automatically. */ max_idle = max(5UL * 60 * HZ, wait_jiffies); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e536f3a74e6..f0936f5f85d 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -50,7 +50,7 @@ struct bdi_writeback { unsigned long last_old_flush; /* last old data flush */ - struct task_struct *task; /* writeback task */ + struct task_struct *task; /* writeback thread */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ac78a333618..4e9ed2a8521 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -50,7 +50,7 @@ static struct timer_list sync_supers_timer; static int bdi_sync_supers(void *); static void sync_supers_timer_fn(unsigned long); -static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); +static void bdi_add_default_flusher_thread(struct backing_dev_info *bdi); #ifdef CONFIG_DEBUG_FS #include @@ -279,10 +279,10 @@ static void bdi_flush_io(struct backing_dev_info *bdi) } /* - * kupdated() used to do this. We cannot do it from the bdi_forker_task() + * kupdated() used to do this. We cannot do it from the bdi_forker_thread() * or we risk deadlocking on ->s_umount. The longer term solution would be * to implement sync_supers_bdi() or similar and simply do it from the - * bdi writeback tasks individually. + * bdi writeback thread individually.
*/ static int bdi_sync_supers(void *unused) { @@ -318,7 +318,7 @@ static void sync_supers_timer_fn(unsigned long unused) bdi_arm_supers_timer(); } -static int bdi_forker_task(void *ptr) +static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; @@ -354,7 +354,7 @@ static int bdi_forker_thread(void *ptr) !bdi_has_dirty_io(bdi)) continue; - bdi_add_default_flusher_task(bdi); + bdi_add_default_flusher_thread(bdi); } set_current_state(TASK_INTERRUPTIBLE); @@ -376,7 +376,7 @@ static int bdi_forker_thread(void *ptr) /* * This is our real job - check for pending entries in - * bdi_pending_list, and create the tasks that got added + * bdi_pending_list, and create the threads that got added */ bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, bdi_list); @@ -387,7 +387,7 @@ static int bdi_forker_thread(void *ptr) wb->task = kthread_run(bdi_writeback_thread, wb, "flush-%s", dev_name(bdi->dev)); /* - * If task creation fails, then readd the bdi to + * If thread creation fails, then readd the bdi to * the pending list and force writeout of the bdi * from this forker thread. That will free some memory * and we can try again. @@ -430,10 +430,10 @@ static void bdi_add_to_pending(struct rcu_head *head) } /* - * Add the default flusher task that gets created for any bdi + * Add the default flusher thread that gets created for any bdi * that has dirty data pending writeout */ -void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) +static void bdi_add_default_flusher_thread(struct backing_dev_info *bdi) { if (!bdi_cap_writeback_dirty(bdi)) return; @@ -445,10 +445,10 @@ void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) /* - * Check with the helper whether to proceed adding a task. Will only + * Check with the helper whether to proceed adding a thread. Will only * abort if we two or more simultanous calls to - * bdi_add_default_flusher_task() occured, further additions will block - * waiting for previous additions to finish. + * bdi_add_default_flusher_thread() occured, further additions will + * block waiting for previous additions to finish. */ if (!test_and_set_bit(BDI_pending, &bdi->state)) { list_del_rcu(&bdi->bdi_list); @@ -506,7 +506,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, if (bdi_cap_flush_forker(bdi)) { struct bdi_writeback *wb = &bdi->wb; - wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", + wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", dev_name(dev)); if (IS_ERR(wb->task)) { wb->task = NULL; -- cgit v1.2.3-70-g09d2 From 297252c81de8043ca6c36e5984c24fdb5aab9013 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:15 +0300 Subject: writeback: do not lose wake-ups in bdi threads Currently, bdi threads ('bdi_writeback_thread()') can lose wake-ups. For example, the wake-up is lost if 'bdi_queue_work()' executes after the bdi thread has finished 'wb_do_writeback()' but before it has called 'schedule_timeout_interruptible()'. To fix this issue, we have to check whether we have work to process after we have changed the task state to 'TASK_INTERRUPTIBLE'. This patch also cleans up the handling of the cases where 'dirty_writeback_interval' is zero or non-zero. Additionally, it removes an unneeded 'list_empty_careful()' call.
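The fix is the canonical kernel sleep/wake-up pattern: publish the sleeping state first, then re-check the wake-up condition, so a concurrent 'wake_up_process()' can never be missed. In sketch form, mirroring the hunk below with the loop context abbreviated:

while (!kthread_should_stop()) {
        pages_written = wb_do_writeback(wb, 0);

        set_current_state(TASK_INTERRUPTIBLE);  /* before the check! */
        if (!list_empty(&bdi->work_list)) {
                /* Work was queued after wb_do_writeback() - don't sleep. */
                __set_current_state(TASK_RUNNING);
                continue;
        }

        /* A wake_up_process() issued after set_current_state() simply puts
         * us back to TASK_RUNNING, so schedule() returns promptly. */
        if (dirty_writeback_interval)
                schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
        else
                schedule();

        try_to_freeze();
}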
Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 002be0ff2ab..05444eaa3f3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -848,17 +848,18 @@ int bdi_writeback_thread(void *data) break; } - if (dirty_writeback_interval) { - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout_interruptible(wait_jiffies); - } else { - set_current_state(TASK_INTERRUPTIBLE); - if (list_empty_careful(&wb->bdi->work_list) && - !kthread_should_stop()) - schedule(); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&bdi->work_list)) { __set_current_state(TASK_RUNNING); + continue; } + if (dirty_writeback_interval) { + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout(wait_jiffies); + } else + schedule(); + try_to_freeze(); } -- cgit v1.2.3-70-g09d2 From 78c40cb6581a74adc48821f3de6b864a54d4c34d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:17 +0300 Subject: writeback: do not remove bdi from bdi_list The forker thread removes bdis from 'bdi_list' before forking the bdi thread. But this is wrong for at least 2 reasons. Reason #1: if we temporarily remove a bdi from the list, we may miss work which would otherwise be given to us. Reason #2: this is racy; indeed, 'bdi_wb_shutdown()' expects that bdis are always in the 'bdi_list' (see 'bdi_remove_from_list()'), and when it races with the forker thread, it can shut down the bdi thread at the same time as the forker creates it. This patch makes sure the forker thread never removes bdis from 'bdi_list' (which was suggested by Christoph Hellwig). In order to make sure that we do not race with 'bdi_wb_shutdown()', we have to hold the 'bdi_lock' while walking the 'bdi_list' and setting the 'BDI_pending' flag. NOTE! The error path is interesting. Currently, when we fail to create a bdi thread, we move the bdi to the tail of 'bdi_list'. But if we never remove the bdi from the list, we cannot move it to the tail either, because then we can mess up the RCU readers which walk the list. And also, we'll have the race described above in "Reason #2". But I do not think that adding to the tail is important, so I just do not do that.
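In sketch form, the fixed forker loop marks a bdi busy in place instead of unlinking it, so 'bdi_wb_shutdown()' can always find it on 'bdi_list' (simplified from the hunk below):

spin_lock_bh(&bdi_lock);
list_for_each_entry(bdi, &bdi_list, bdi_list) {
        if (!bdi_cap_writeback_dirty(bdi))
                continue;
        if (bdi->wb.task || !bdi_has_dirty_io(bdi))
                continue;       /* has a thread already, or nothing to do */
        /*
         * Leave the bdi linked; the pending bit is what tells
         * bdi_wb_shutdown() to wait for us instead of racing.
         */
        set_bit(BDI_pending, &bdi->state);
        fork = true;
        break;
}
spin_unlock_bh(&bdi_lock);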
Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 7 ------- mm/backing-dev.c | 31 ++++++++++--------------------- 2 files changed, 10 insertions(+), 28 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 05444eaa3f3..57fbfd0ebc5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -804,13 +804,6 @@ int bdi_writeback_thread(void *data) unsigned long wait_jiffies = -1UL; long pages_written; - /* - * Add us to the active bdi_list - */ - spin_lock_bh(&bdi_lock); - list_add_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - current->flags |= PF_FLUSHER | PF_SWAPWRITE; set_freezable(); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dbc66815a0f..672c17bb32d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -331,7 +331,7 @@ static int bdi_forker_thread(void *ptr) for (;;) { bool fork = false; struct task_struct *task; - struct backing_dev_info *bdi, *tmp; + struct backing_dev_info *bdi; /* * Temporary measure, we want to make sure we don't see @@ -347,7 +347,7 @@ static int bdi_forker_thread(void *ptr) * Check if any existing bdi's have dirty data without * a thread registered. If so, set that up. */ - list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { + list_for_each_entry(bdi, &bdi_list, bdi_list) { if (!bdi_cap_writeback_dirty(bdi)) continue; if (bdi->wb.task) @@ -359,8 +359,13 @@ static int bdi_forker_thread(void *ptr) WARN(!test_bit(BDI_registered, &bdi->state), "bdi %p/%s is not registered!\n", bdi, bdi->name); - list_del_rcu(&bdi->bdi_list); fork = true; + + /* + * Set the pending bit - if someone will try to + * unregister this bdi - it'll wait on this bit. + */ + set_bit(BDI_pending, &bdi->state); break; } spin_unlock_bh(&bdi_lock); @@ -383,29 +388,13 @@ static int bdi_forker_thread(void *ptr) __set_current_state(TASK_RUNNING); - /* - * Set the pending bit - if someone will try to unregister this - * bdi - it'll wait on this bit. - */ - set_bit(BDI_pending, &bdi->state); - - /* Make sure no one uses the picked bdi */ - synchronize_rcu(); - task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s", dev_name(bdi->dev)); if (IS_ERR(task)) { /* - * If thread creation fails, then readd the bdi back to - * the list and force writeout of the bdi from this - * forker thread. That will free some memory and we can - * try again. Add it to the tail so we get a chance to - * flush other bdi's to free memory. + * If thread creation fails, force writeout of the bdi + * from the thread. */ - spin_lock_bh(&bdi_lock); - list_add_tail_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - bdi_flush_io(bdi); } else bdi->wb.task = task; -- cgit v1.2.3-70-g09d2 From ecd584030da67ede1bf17955746a6ce834d9fc6b Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:18 +0300 Subject: writeback: move last_active to bdi Currently bdi threads use local variable 'last_active' which stores last time when the bdi thread did some useful work. Move this local variable to 'struct bdi_writeback'. This is just a preparation for the further patches which will make the forker thread decide when bdi threads should be killed. 
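A sketch of the resulting layout and the kind of check it enables; 'wb_is_idle()' here is a hypothetical helper, not part of the patch (the actual kill logic lands two patches later in this series):

struct bdi_writeback {
        struct backing_dev_info *bdi;   /* our parent bdi */
        unsigned long last_old_flush;   /* last old data flush */
        unsigned long last_active;      /* last time bdi thread was active */
        struct task_struct *task;       /* writeback thread */
        /* ... */
};

/* Hypothetical forker-side test; 5 minutes matches the old local logic. */
static bool wb_is_idle(const struct bdi_writeback *wb)
{
        return wb->task &&
               time_after(jiffies, wb->last_active + 5UL * 60 * HZ);
}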
Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 +++--- include/linux/backing-dev.h | 13 +++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 57fbfd0ebc5..9f5cab75c15 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -800,12 +800,12 @@ int bdi_writeback_thread(void *data) { struct bdi_writeback *wb = data; struct backing_dev_info *bdi = wb->bdi; - unsigned long last_active = jiffies; unsigned long wait_jiffies = -1UL; long pages_written; current->flags |= PF_FLUSHER | PF_SWAPWRITE; set_freezable(); + wb->last_active = jiffies; /* * Our parent may run at a different priority, just set us to normal @@ -827,7 +827,7 @@ int bdi_writeback_thread(void *data) trace_writeback_pages_written(pages_written); if (pages_written) - last_active = jiffies; + wb->last_active = jiffies; else if (wait_jiffies != -1UL) { unsigned long max_idle; @@ -837,7 +837,7 @@ int bdi_writeback_thread(void *data) * recreated automatically. */ max_idle = max(5UL * 60 * HZ, wait_jiffies); - if (time_after(jiffies, max_idle + last_active)) + if (time_after(jiffies, max_idle + wb->last_active)) break; } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 95ecb2bebca..71b6223e0a7 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -45,15 +45,16 @@ enum bdi_stat_item { #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) struct bdi_writeback { - struct backing_dev_info *bdi; /* our parent bdi */ + struct backing_dev_info *bdi; /* our parent bdi */ unsigned int nr; - unsigned long last_old_flush; /* last old data flush */ + unsigned long last_old_flush; /* last old data flush */ + unsigned long last_active; /* last time bdi thread was active */ - struct task_struct *task; /* writeback thread */ - struct list_head b_dirty; /* dirty inodes */ - struct list_head b_io; /* parked for writeback */ - struct list_head b_more_io; /* parked for more writeback */ + struct task_struct *task; /* writeback thread */ + struct list_head b_dirty; /* dirty inodes */ + struct list_head b_io; /* parked for writeback */ + struct list_head b_more_io; /* parked for more writeback */ }; struct backing_dev_info { -- cgit v1.2.3-70-g09d2 From fff5b85aa4225a7be157f208277a055822039a9e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:20 +0300 Subject: writeback: move bdi threads exiting logic to the forker thread Currently, bdi threads can decide to exit if there were no useful activities for 5 minutes. However, this causes nasty races: we can easily oops in the 'bdi_queue_work()' if the bdi thread decides to exit while we are waking it up. And even if we do not oops, but the bdi thread exits immediately after we wake it up, we'd lose the wake-up event and have an unnecessary delay (up to 5 secs) in the bdi work processing. This patch makes the forker thread the central place which not only creates bdi threads, but also kills them if they have been inactive long enough. This is better design-wise. Another reason why this change was done is to prepare for further changes which will prevent the bdi threads from waking up every 5 sec and wasting power. Indeed, when the task does not wake up periodically anymore, it won't be able to exit either. This patch also moves the 'wake_up_bit()' call from the bdi thread to the forker thread as well.
So now the forker thread sets the BDI_pending bit, then forks the task or kills it, then clears the bit and wakes up the waiting process. The only process which may wait on the bit is 'bdi_wb_shutdown()'. This function was changed as well - now it first removes the bdi from the 'bdi_list', then waits on the 'BDI_pending' bit. Once it wakes up, it is guaranteed that the forker thread won't race with it, because the bdi is not visible. Note, the forker thread sets the 'BDI_pending' bit under the 'bdi->wb_lock' which is essential for proper serialization. And additionally, when we change 'bdi->wb.task', we now take the 'bdi->wb_lock', to make sure that we do not lose wake-ups which we otherwise would when raced with, say, 'bdi_queue_work()'. Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 54 ++++++++++--------------------------------- mm/backing-dev.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 70 insertions(+), 53 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9f5cab75c15..905f3ea3848 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -78,21 +78,17 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - spin_unlock(&bdi->wb_lock); - - /* - * If the default thread isn't there, make sure we add it. When - * it gets created and wakes up, we'll run this work. - */ - if (unlikely(!bdi->wb.task)) { + if (bdi->wb.task) { + wake_up_process(bdi->wb.task); + } else { + /* + * The bdi thread isn't there, wake up the forker thread which + * will create and run it. + */ trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); - } else { - struct bdi_writeback *wb = &bdi->wb; - - if (wb->task) - wake_up_process(wb->task); } + spin_unlock(&bdi->wb_lock); } static void @@ -800,7 +796,6 @@ int bdi_writeback_thread(void *data) { struct bdi_writeback *wb = data; struct backing_dev_info *bdi = wb->bdi; - unsigned long wait_jiffies = -1UL; long pages_written; current->flags |= PF_FLUSHER | PF_SWAPWRITE; @@ -812,13 +807,6 @@ int bdi_writeback_thread(void *data) */ set_user_nice(current, 0); - /* - * Clear pending bit and wakeup anybody waiting to tear us down - */ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); - trace_writeback_thread_start(bdi); while (!kthread_should_stop()) { @@ -828,18 +816,6 @@ int bdi_writeback_thread(void *data) if (pages_written) wb->last_active = jiffies; - else if (wait_jiffies != -1UL) { - unsigned long max_idle; - - /* - * Longest period of inactivity that we tolerate. If we - * see dirty data again later, the thread will get - * recreated automatically. - */ - max_idle = max(5UL * 60 * HZ, wait_jiffies); - if (time_after(jiffies, max_idle + wb->last_active)) - break; - } set_current_state(TASK_INTERRUPTIBLE); if (!list_empty(&bdi->work_list)) { @@ -847,21 +823,15 @@ int bdi_writeback_thread(void *data) continue; } - if (dirty_writeback_interval) { - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout(wait_jiffies); - } else + if (dirty_writeback_interval) + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); + else schedule(); try_to_freeze(); } - wb->task = NULL; - - /* - * Flush any work that raced with us exiting. No new work - * will be added, since this bdi isn't discoverable anymore.
- */ + /* Flush any work that raced with us exiting */ if (!list_empty(&bdi->work_list)) wb_do_writeback(wb, 1); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e104e32c2ee..9c1c199f88c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -316,6 +316,18 @@ static void sync_supers_timer_fn(unsigned long unused) bdi_arm_supers_timer(); } +/* + * Calculate the longest interval (jiffies) bdi threads are allowed to be + * inactive. + */ +static unsigned long bdi_longest_inactive(void) +{ + unsigned long interval; + + interval = msecs_to_jiffies(dirty_writeback_interval * 10); + return max(5UL * 60 * HZ, interval); +} + static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; @@ -329,11 +341,12 @@ static int bdi_forker_thread(void *ptr) set_user_nice(current, 0); for (;;) { - struct task_struct *task; + struct task_struct *task = NULL; struct backing_dev_info *bdi; enum { NO_ACTION, /* Nothing to do */ FORK_THREAD, /* Fork bdi thread */ + KILL_THREAD, /* Kill inactive bdi thread */ } action = NO_ACTION; /* @@ -346,10 +359,6 @@ static int bdi_forker_thread(void *ptr) spin_lock_bh(&bdi_lock); set_current_state(TASK_INTERRUPTIBLE); - /* - * Check if any existing bdi's have dirty data without - * a thread registered. If so, set that up. - */ list_for_each_entry(bdi, &bdi_list, bdi_list) { bool have_dirty_io; @@ -376,6 +385,25 @@ static int bdi_forker_thread(void *ptr) action = FORK_THREAD; break; } + + spin_lock(&bdi->wb_lock); + /* + * If there is no work to do and the bdi thread was + * inactive long enough - kill it. The wb_lock is taken + * to make sure no-one adds more work to this bdi and + * wakes the bdi thread up. + */ + if (bdi->wb.task && !have_dirty_io && + time_after(jiffies, bdi->wb.last_active + + bdi_longest_inactive())) { + task = bdi->wb.task; + bdi->wb.task = NULL; + spin_unlock(&bdi->wb_lock); + set_bit(BDI_pending, &bdi->state); + action = KILL_THREAD; + break; + } + spin_unlock(&bdi->wb_lock); } spin_unlock_bh(&bdi_lock); @@ -394,8 +422,20 @@ static int bdi_forker_thread(void *ptr) * the bdi from the thread. */ bdi_flush_io(bdi); - } else + } else { + /* + * The spinlock makes sure we do not lose + * wake-ups when racing with 'bdi_queue_work()'. + */ + spin_lock(&bdi->wb_lock); bdi->wb.task = task; + spin_unlock(&bdi->wb_lock); + } + break; + + case KILL_THREAD: + __set_current_state(TASK_RUNNING); + kthread_stop(task); break; case NO_ACTION: @@ -407,6 +447,13 @@ static int bdi_forker_thread(void *ptr) /* Back to the main loop */ continue; } + + /* + * Clear pending bit and wakeup anybody waiting to tear us down. + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); } return 0; @@ -490,15 +537,15 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) return; /* - * If setup is pending, wait for that to complete first + * Make sure nobody finds us on the bdi_list anymore */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + bdi_remove_from_list(bdi); /* - * Make sure nobody finds us on the bdi_list anymore + * If setup is pending, wait for that to complete first */ - bdi_remove_from_list(bdi); + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); /* * Finally, kill the kernel thread. 
We don't need to be RCU -- cgit v1.2.3-70-g09d2 From 253c34e9b10c30d3064be654b5b78fbc1a8b1896 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:21 +0300 Subject: writeback: prevent unnecessary bdi threads wakeups Finally, we can get rid of unnecessary wake-ups in bdi threads, which are very bad for battery-driven devices. There are two types of activities bdi threads do: 1. process bdi works from the 'bdi->work_list' 2. periodic write-back So there are 2 sources of wake-up events for bdi threads: 1. 'bdi_queue_work()' - submits bdi works 2. '__mark_inode_dirty()' - adds dirty I/O to bdi's The former already has bdi wake-up code. The latter does not, and this patch adds it. '__mark_inode_dirty()' is a hot-path function, but this patch adds another 'spin_lock(&bdi->wb_lock)' there. However, it is taken only in rare cases when the bdi has no dirty inodes. So adding this spinlock should be fine and should not affect performance. This patch makes sure bdi threads and the forker thread do not wake up if there is nothing to do. The forker thread will nevertheless wake up at least every 5 minutes to check whether it has to kill a bdi thread. This can also be optimized, but is not worth it. This patch also tidies up the warning about an unregistered bdi, and turns it from an ugly crocodile to a simple 'WARN()' statement. Signed-off-by: Artem Bityutskiy Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 59 ++++++++++++++++++++++++++++++++++++++++++++----------- mm/backing-dev.c | 13 +++++++++--- 2 files changed, 58 insertions(+), 14 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 905f3ea3848..55f6e46e06f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -823,10 +823,16 @@ int bdi_writeback_thread(void *data) continue; } - if (dirty_writeback_interval) + if (wb_has_dirty_io(wb) && dirty_writeback_interval) schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else + else { + /* + * We have nothing to do, so can go sleep without any + * timeout and save power. When a work is queued or + * something is made dirty - we will be woken up. + */ schedule(); + } try_to_freeze(); } @@ -862,6 +868,26 @@ void wakeup_flusher_threads(long nr_pages) rcu_read_unlock(); } +/* + * This function is used when the first inode for this bdi is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. + */ +static void wakeup_bdi_thread(struct backing_dev_info *bdi) +{ + spin_lock(&bdi->wb_lock); + if (bdi->wb.task) + wake_up_process(bdi->wb.task); + else + /* + * When bdi tasks are inactive for long time, they are killed. + * In this case we have to wake-up the forker thread which + * should create and run the bdi thread. + */ + wake_up_process(default_backing_dev_info.wb.task); + spin_unlock(&bdi->wb_lock); +} + static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -914,6 +940,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; + struct backing_dev_info *bdi = NULL; + bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -967,22 +995,31 @@ void __mark_inode_dirty(struct inode *inode, int flags) * reposition it (that would break b_dirty time-ordering).
*/ if (!was_dirty) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - struct backing_dev_info *bdi = wb->bdi; - - if (bdi_cap_writeback_dirty(bdi) && - !test_bit(BDI_registered, &bdi->state)) { - WARN_ON(1); - printk(KERN_ERR "bdi-%s not registered\n", - bdi->name); + bdi = inode_to_bdi(inode); + + if (bdi_cap_writeback_dirty(bdi)) { + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi-%s not registered\n", bdi->name); + + /* + * If this is the first dirty inode for this + * bdi, we have to wake-up the corresponding + * bdi thread to make sure background + * write-back happens later. + */ + if (!wb_has_dirty_io(&bdi->wb)) + wakeup_bdi = true; } inode->dirtied_when = jiffies; - list_move(&inode->i_list, &wb->b_dirty); + list_move(&inode->i_list, &bdi->wb.b_dirty); } } out: spin_unlock(&inode_lock); + + if (wakeup_bdi) + wakeup_bdi_thread(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 9c1c199f88c..a9a08d88a74 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -439,10 +439,17 @@ static int bdi_forker_thread(void *ptr) break; case NO_ACTION: - if (dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); + if (!wb_has_dirty_io(me) || !dirty_writeback_interval) + /* + * There are no dirty data. The only thing we + * should now care about is checking for + * inactive bdi threads and killing them. Thus, + * let's sleep for longer time, save energy and + * be friendly for battery-driven devices. + */ + schedule_timeout(bdi_longest_inactive()); else - schedule(); + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); try_to_freeze(); /* Back to the main loop */ continue; -- cgit v1.2.3-70-g09d2 From 6467716a37673e8d47b4984eb19839bdad0a8353 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:22 +0300 Subject: writeback: optimize periodic bdi thread wakeups When the first inode for a bdi is marked dirty, we wake up the bdi thread which should take care of the periodic background write-out. However, the write-out will actually start only 'dirty_writeback_interval' centisecs later, so we can delay the wake-up. This change was requested by Nick Piggin who pointed out that if we delay the wake-up, we weed out 2 unnecessary context switches, which matters because '__mark_inode_dirty()' is a hot-path function. This patch introduces a new function - 'bdi_wakeup_thread_delayed()', which sets up a timer to wake-up the bdi thread and returns. So the wake-up is delayed. We also delete the timer in bdi threads just before writing-back. And synchronously delete it when unregistering the bdi. At the unregister point the bdi does not have any users, so no one can arm it again. Since now we take 'bdi->wb_lock' in the timer, which can execute in softirq context, we have to use 'spin_lock_bh()' for 'bdi->wb_lock'. This patch makes this change as well. This patch also moves the 'bdi_wb_init()' function down in the file to avoid forward-declaration of 'bdi_wakeup_thread_delayed()'.
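The mechanism is a plain kernel timer (2.6.35-era API) armed when the first inode is dirtied; a condensed sketch of the two sides, taken from the hunks below:

/* Arm a one-shot wake-up dirty_writeback_interval centisecs from now. */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
        unsigned long timeout;

        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
        mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
}

/* Timer handler; runs in softirq context, hence the _bh locking. */
static void wakeup_timer_fn(unsigned long data)
{
        struct backing_dev_info *bdi = (struct backing_dev_info *)data;

        spin_lock_bh(&bdi->wb_lock);
        if (bdi->wb.task)
                wake_up_process(bdi->wb.task);
        else    /* thread was killed for inactivity: let the forker recreate it */
                wake_up_process(default_backing_dev_info.wb.task);
        spin_unlock_bh(&bdi->wb_lock);
}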
Signed-off-by: Artem Bityutskiy Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 36 +++++++--------------- include/linux/backing-dev.h | 2 ++ mm/backing-dev.c | 73 +++++++++++++++++++++++++++++++++++---------- 3 files changed, 70 insertions(+), 41 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 55f6e46e06f..bfa2df2c7ce 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -76,7 +76,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, { trace_writeback_queue(bdi, work); - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); if (bdi->wb.task) { wake_up_process(bdi->wb.task); @@ -88,7 +88,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); } static void @@ -704,13 +704,13 @@ get_next_work_item(struct backing_dev_info *bdi) { struct wb_writeback_work *work = NULL; - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); if (!list_empty(&bdi->work_list)) { work = list_entry(bdi->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); return work; } @@ -810,6 +810,12 @@ int bdi_writeback_thread(void *data) trace_writeback_thread_start(bdi); while (!kthread_should_stop()) { + /* + * Remove own delayed wake-up timer, since we are already awake + * and we'll take care of the preriodic write-back. + */ + del_timer(&wb->wakeup_timer); + pages_written = wb_do_writeback(wb, 0); trace_writeback_pages_written(pages_written); @@ -868,26 +874,6 @@ void wakeup_flusher_threads(long nr_pages) rcu_read_unlock(); } -/* - * This function is used when the first inode for this bdi is marked dirty. It - * wakes-up the corresponding bdi thread which should then take care of the - * periodic background write-out of dirty inodes. - */ -static void wakeup_bdi_thread(struct backing_dev_info *bdi) -{ - spin_lock(&bdi->wb_lock); - if (bdi->wb.task) - wake_up_process(bdi->wb.task); - else - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. 
- */ - wake_up_process(default_backing_dev_info.wb.task); - spin_unlock(&bdi->wb_lock); -} - static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -1019,7 +1005,7 @@ out: spin_unlock(&inode_lock); if (wakeup_bdi) - wakeup_bdi_thread(bdi); + bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 71b6223e0a7..7628219e538 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -52,6 +52,7 @@ struct bdi_writeback { unsigned long last_active; /* last time bdi thread was active */ struct task_struct *task; /* writeback thread */ + struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -105,6 +106,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi); int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_arm_supers_timer(void); +void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); extern spinlock_t bdi_lock; extern struct list_head bdi_list; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a9a08d88a74..cfff7225138 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -248,17 +248,6 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) -{ - memset(wb, 0, sizeof(*wb)); - - wb->bdi = bdi; - wb->last_old_flush = jiffies; - INIT_LIST_HEAD(&wb->b_dirty); - INIT_LIST_HEAD(&wb->b_io); - INIT_LIST_HEAD(&wb->b_more_io); -} - int bdi_has_dirty_io(struct backing_dev_info *bdi) { return wb_has_dirty_io(&bdi->wb); @@ -316,6 +305,43 @@ static void sync_supers_timer_fn(unsigned long unused) bdi_arm_supers_timer(); } +static void wakeup_timer_fn(unsigned long data) +{ + struct backing_dev_info *bdi = (struct backing_dev_info *)data; + + spin_lock_bh(&bdi->wb_lock); + if (bdi->wb.task) { + wake_up_process(bdi->wb.task); + } else { + /* + * When bdi tasks are inactive for long time, they are killed. + * In this case we have to wake-up the forker thread which + * should create and run the bdi thread. + */ + wake_up_process(default_backing_dev_info.wb.task); + } + spin_unlock_bh(&bdi->wb_lock); +} + +/* + * This function is used when the first inode for this bdi is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + */ +void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); +} + /* * Calculate the longest interval (jiffies) bdi threads are allowed to be * inactive. 
@@ -353,8 +379,10 @@ static int bdi_forker_thread(void *ptr) * Temporary measure, we want to make sure we don't see * dirty data on the default backing_dev_info */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { + del_timer(&me->wakeup_timer); wb_do_writeback(me, 0); + } spin_lock_bh(&bdi_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -386,7 +414,7 @@ static int bdi_forker_thread(void *ptr) break; } - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); /* * If there is no work to do and the bdi thread was * inactive long enough - kill it. The wb_lock is taken @@ -403,7 +431,7 @@ static int bdi_forker_thread(void *ptr) action = KILL_THREAD; break; } - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); } spin_unlock_bh(&bdi_lock); @@ -427,9 +455,9 @@ static int bdi_forker_thread(void *ptr) * The spinlock makes sure we do not lose * wake-ups when racing with 'bdi_queue_work()'. */ - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); bdi->wb.task = task; - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); } break; @@ -586,6 +614,7 @@ void bdi_unregister(struct backing_dev_info *bdi) if (bdi->dev) { trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); + del_timer_sync(&bdi->wb.wakeup_timer); if (!bdi_cap_flush_forker(bdi)) bdi_wb_shutdown(bdi); @@ -596,6 +625,18 @@ void bdi_unregister(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_unregister); +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); +} + int bdi_init(struct backing_dev_info *bdi) { int i, err; -- cgit v1.2.3-70-g09d2 From a4ffdde6e56fdf8c34ddadc2674d6eb978083369 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Jun 2010 17:38:30 -0400 Subject: simplify checks for I_CLEAR/I_FREEING add I_CLEAR instead of replacing I_FREEING with it. I_CLEAR is equivalent to I_FREEING for almost all code looking at either; it's there to keep track of having called clear_inode() exactly once per inode lifetime, at some point after having set I_FREEING. I_CLEAR and I_FREEING never get set at the same time with the current code, so we can switch to setting i_flags to I_FREEING | I_CLEAR instead of I_CLEAR without loss of information. As the result of such change, checks become simpler and the amount of code that needs to know about I_CLEAR shrinks a lot. 
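The practical effect on call sites is mechanical; a before/after sketch (illustrative, not a specific hunk from this patch):

/* Before: clear_inode() replaced I_FREEING with I_CLEAR, so covering
 * the whole tear-down window required testing both bits. */
if (inode->i_state & (I_FREEING | I_CLEAR))
        return;

/* After: clear_inode() sets i_state = I_FREEING | I_CLEAR, keeping
 * I_FREEING, so testing I_FREEING alone covers the same window. */
if (inode->i_state & I_FREEING)
        return;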
Signed-off-by: Al Viro --- fs/btrfs/inode.c | 2 +- fs/drop_caches.c | 2 +- fs/fs-writeback.c | 8 ++++---- fs/gfs2/inode.c | 2 +- fs/inode.c | 16 ++++++++-------- fs/nilfs2/gcdat.c | 2 +- fs/notify/inode_mark.c | 6 +++--- fs/notify/inotify/inotify.c | 7 +++---- fs/quota/dquot.c | 2 +- fs/xfs/linux-2.6/xfs_iops.c | 4 ++-- include/linux/fs.h | 4 ++-- 11 files changed, 27 insertions(+), 28 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7f9e0536db1..95eac011696 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3860,7 +3860,7 @@ again: p = &parent->rb_right; else { WARN_ON(!(entry->vfs_inode.i_state & - (I_WILL_FREE | I_FREEING | I_CLEAR))); + (I_WILL_FREE | I_FREEING))); rb_erase(parent, &root->inode_tree); RB_CLEAR_NODE(parent); spin_unlock(&root->inode_lock); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 83c4f600786..2195c213ab2 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) continue; if (inode->i_mapping->nrpages == 0) continue; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d5be1693ac9..7608880b5c5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -352,7 +352,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode_lock); inode->i_state &= ~I_SYNC; - if (!(inode->i_state & (I_FREEING | I_CLEAR))) { + if (!(inode->i_state & I_FREEING)) { if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { /* * More pages get dirtied by a fast dirtier. @@ -499,7 +499,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, if (inode_dirtied_after(inode, wbc->wb_start)) return 1; - BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); + BUG_ON(inode->i_state & I_FREEING); __iget(inode); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); @@ -935,7 +935,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (hlist_unhashed(&inode->i_hash)) goto out; } - if (inode->i_state & (I_FREEING|I_CLEAR)) + if (inode->i_state & I_FREEING) goto out; /* @@ -1001,7 +1001,7 @@ static void wait_sb_inodes(struct super_block *sb) list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct address_space *mapping; - if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) continue; mapping = inode->i_mapping; if (mapping->nrpages == 0) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6c023a3b5d2..08140f185a3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -84,7 +84,7 @@ static int iget_skip_test(struct inode *inode, void *opaque) struct gfs2_skip_data *data = opaque; if (ip->i_no_addr == data->no_addr) { - if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ + if (inode->i_state & (I_FREEING|I_WILL_FREE)){ data->skipped = 1; return 0; } diff --git a/fs/inode.c b/fs/inode.c index 722860b323a..71fe079ca32 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -317,7 +317,7 @@ void clear_inode(struct inode *inode) bd_forget(inode); if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); - inode->i_state = I_CLEAR; + inode->i_state = I_FREEING | I_CLEAR; } EXPORT_SYMBOL(clear_inode); @@ -553,7 +553,7 @@ repeat: continue; if (!test(inode, data)) continue; - if (inode->i_state & 
(I_FREEING|I_CLEAR|I_WILL_FREE)) { + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } @@ -578,7 +578,7 @@ repeat: continue; if (inode->i_sb != sb) continue; - if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } @@ -840,7 +840,7 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode_lock); - if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))) + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) __iget(inode); else /* @@ -1089,7 +1089,7 @@ int insert_inode_locked(struct inode *inode) continue; if (old->i_sb != sb) continue; - if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) + if (old->i_state & (I_FREEING|I_WILL_FREE)) continue; break; } @@ -1128,7 +1128,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, continue; if (!test(old, data)) continue; - if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) + if (old->i_state & (I_FREEING|I_WILL_FREE)) continue; break; } @@ -1218,7 +1218,7 @@ void generic_delete_inode(struct inode *inode) hlist_del_init(&inode->i_hash); spin_unlock(&inode_lock); wake_up_inode(inode); - BUG_ON(inode->i_state != I_CLEAR); + BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); destroy_inode(inode); } EXPORT_SYMBOL(generic_delete_inode); @@ -1322,7 +1322,7 @@ static inline void iput_final(struct inode *inode) void iput(struct inode *inode) { if (inode) { - BUG_ON(inode->i_state == I_CLEAR); + BUG_ON(inode->i_state & I_CLEAR); if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) iput_final(inode); diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c index dd5f7e0a95f..84a45d1d546 100644 --- a/fs/nilfs2/gcdat.c +++ b/fs/nilfs2/gcdat.c @@ -78,7 +78,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs) struct inode *gcdat = nilfs->ns_gc_dat; struct nilfs_inode_info *gii = NILFS_I(gcdat); - gcdat->i_state = I_CLEAR; + gcdat->i_state = I_FREEING | I_CLEAR; gii->i_flags = 0; nilfs_palloc_clear_cache(gcdat); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 0399bcbe09c..152b83ec005 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -369,11 +369,11 @@ void fsnotify_unmount_inodes(struct list_head *list) struct inode *need_iput_tmp; /* - * We cannot __iget() an inode in state I_CLEAR, I_FREEING, + * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ - if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) continue; /* @@ -397,7 +397,7 @@ void fsnotify_unmount_inodes(struct list_head *list) /* In case the dropping of a reference would nuke next_i. */ if ((&next_i->i_sb_list != list) && atomic_read(&next_i->i_count) && - !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { + !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { __iget(next_i); need_iput = next_i; } diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index 27b75ebc746..cf6b0429a25 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c @@ -377,11 +377,11 @@ void inotify_unmount_inodes(struct list_head *list) struct list_head *watches; /* - * We cannot __iget() an inode in state I_CLEAR, I_FREEING, + * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. 
*/ - if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) continue; /* @@ -403,8 +403,7 @@ void inotify_unmount_inodes(struct list_head *list) /* In case the dropping of a reference would nuke next_i. */ if ((&next_i->i_sb_list != list) && atomic_read(&next_i->i_count) && - !(next_i->i_state & (I_CLEAR | I_FREEING | - I_WILL_FREE))) { + !(next_i->i_state & (I_FREEING|I_WILL_FREE))) { __iget(next_i); need_iput = next_i; } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 437d2ca2de9..5cec3e2348f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -885,7 +885,7 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) continue; #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 62dd349face..68be25dcd30 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -80,7 +80,7 @@ xfs_mark_inode_dirty_sync( { struct inode *inode = VFS_I(ip); - if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR))) + if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) mark_inode_dirty_sync(inode); } @@ -90,7 +90,7 @@ xfs_mark_inode_dirty( { struct inode *inode = VFS_I(ip); - if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR))) + if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) mark_inode_dirty(inode); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 5547b1b027d..218693d8d44 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1616,8 +1616,8 @@ struct super_operations { * I_FREEING Set when inode is about to be freed but still has dirty * pages or buffers attached or the inode itself is still * dirty. - * I_CLEAR Set by clear_inode(). In this state the inode is clean - * and can be destroyed. + * I_CLEAR Added by clear_inode(). In this state the inode is clean + * and can be destroyed. Inode keeps I_FREEING. * * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are * prohibited for many purposes. iget() must wait for -- cgit v1.2.3-70-g09d2 From 7624ee72aa09334af072853457a5d46d9901c3f8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 9 Aug 2010 17:20:03 -0700 Subject: mm: avoid resetting wb_start after each writeback round WB_SYNC_NONE writeback is done in rounds of 1024 pages so that we don't write out some huge inode for too long while starving writeout of other inodes. To avoid livelocks, we record the time we started writeback in wbc->wb_start and do not write out inodes which were dirtied after this time. But currently, writeback_inodes_wb() resets wb_start each time it is called, thus effectively invalidating this logic and making any WB_SYNC_NONE writeback prone to livelocks. This patch makes sure wb_start is set only once when we start writeback.
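
The intended discipline can be sketched independently of the kernel types. In this toy model only wb_start and dirtied_when correspond to real fields; the helpers are hypothetical:

/*
 * wb_start is stamped once per writeback pass and reused by every
 * 1024-page round, so inodes dirtied after the pass began are skipped
 * consistently instead of being picked up by a later round.
 */
struct toy_wbc {
        unsigned long wb_start;     /* jiffies when this pass began */
};

struct toy_inode {
        unsigned long dirtied_when; /* jiffies when the inode was dirtied */
};

/* the kernel uses time_after(); this open-codes its wraparound-safe test */
static int dirtied_after_start(const struct toy_inode *inode,
                               const struct toy_wbc *wbc)
{
        return (long)(inode->dirtied_when - wbc->wb_start) > 0;
}

static void toy_wb_writeback(struct toy_wbc *wbc, unsigned long now_jiffies)
{
        wbc->wb_start = now_jiffies; /* stamped once, before the round loop */
        /*
         * for (;;) { write up to 1024 pages, skipping inodes for which
         * dirtied_after_start() is true; wb_start is never reset here }
         */
}

The bug was precisely that the stamp lived inside the per-round function, so each round moved the cutoff forward and newly dirtied inodes could keep the pass alive indefinitely.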
Signed-off-by: Jan Kara Reviewed-by: Wu Fengguang Cc: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d5be1693ac9..30ac305e829 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -530,7 +530,8 @@ void writeback_inodes_wb(struct bdi_writeback *wb, { int ret = 0; - wbc->wb_start = jiffies; /* livelock avoidance */ + if (!wbc->wb_start) + wbc->wb_start = jiffies; /* livelock avoidance */ spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -559,7 +560,6 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - wbc->wb_start = jiffies; /* livelock avoidance */ spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -625,6 +625,7 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.range_end = LLONG_MAX; } + wbc.wb_start = jiffies; /* livelock avoidance */ for (;;) { /* * Stop writeback when nr_pages has been consumed -- cgit v1.2.3-70-g09d2 From 16c4042f08919f447d6b2a55679546c9b97c7264 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 11 Aug 2010 14:17:39 -0700 Subject: writeback: avoid unnecessary calculation of bdi dirty thresholds Split get_dirty_limits() into global_dirty_limits()+bdi_dirty_limit(), so that the latter can be avoided when under global dirty background threshold (which is the normal state for most systems). Signed-off-by: Wu Fengguang Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 5 ++-- mm/backing-dev.c | 3 +- mm/page-writeback.c | 75 ++++++++++++++++++++++++----------------------- 4 files changed, 44 insertions(+), 41 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2f76c4a081a..fca43d4d7bf 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -590,7 +590,7 @@ static inline bool over_bground_thresh(void) { unsigned long background_thresh, dirty_thresh; - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + global_dirty_limits(&background_thresh, &dirty_thresh); return (global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) >= background_thresh); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index c24eca71e80..72a5d647a5f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -124,8 +124,9 @@ struct ctl_table; int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, - unsigned long *pbdi_dirty, struct backing_dev_info *bdi); +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); +unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, + unsigned long dirty); void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 08d357522e7..eaa4a5bbe06 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -81,7 +81,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_more_io++; spin_unlock(&inode_lock); - get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + 
global_dirty_limits(&background_thresh, &dirty_thresh); + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2cf69a5e46e..1ea13ef350a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -267,10 +267,11 @@ static inline void task_dirties_fraction(struct task_struct *tsk, * * dirty -= (dirty/8) * p_{t} */ -static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) +static unsigned long task_dirty_limit(struct task_struct *tsk, + unsigned long bdi_dirty) { long numerator, denominator; - unsigned long dirty = *pdirty; + unsigned long dirty = bdi_dirty; u64 inv = dirty >> 3; task_dirties_fraction(tsk, &numerator, &denominator); @@ -278,10 +279,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) do_div(inv, denominator); dirty -= inv; - if (dirty < *pdirty/2) - dirty = *pdirty/2; - *pdirty = dirty; + return max(dirty, bdi_dirty/2); } /* @@ -391,9 +390,7 @@ unsigned long determine_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } -void -get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, - unsigned long *pbdi_dirty, struct backing_dev_info *bdi) +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { unsigned long background; unsigned long dirty; @@ -425,26 +422,28 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, } *pbackground = background; *pdirty = dirty; +} - if (bdi) { - u64 bdi_dirty; - long numerator, denominator; +unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, + unsigned long dirty) +{ + u64 bdi_dirty; + long numerator, denominator; - /* - * Calculate this BDI's share of the dirty ratio. - */ - bdi_writeout_fraction(bdi, &numerator, &denominator); + /* + * Calculate this BDI's share of the dirty ratio. + */ + bdi_writeout_fraction(bdi, &numerator, &denominator); - bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - bdi_dirty *= numerator; - do_div(bdi_dirty, denominator); - bdi_dirty += (dirty * bdi->min_ratio) / 100; - if (bdi_dirty > (dirty * bdi->max_ratio) / 100) - bdi_dirty = dirty * bdi->max_ratio / 100; + bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; + bdi_dirty *= numerator; + do_div(bdi_dirty, denominator); - *pbdi_dirty = bdi_dirty; - task_dirty_limit(current, pbdi_dirty); - } + bdi_dirty += (dirty * bdi->min_ratio) / 100; + if (bdi_dirty > (dirty * bdi->max_ratio) / 100) + bdi_dirty = dirty * bdi->max_ratio / 100; + + return bdi_dirty; } /* @@ -475,13 +474,24 @@ static void balance_dirty_pages(struct address_space *mapping, .range_cyclic = 1, }; - get_dirty_limits(&background_thresh, &dirty_thresh, - &bdi_thresh, bdi); - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); + global_dirty_limits(&background_thresh, &dirty_thresh); + + /* + * Throttle it only when the background writeback cannot + * catch-up. This avoids (excessively) small writeouts + * when the bdi limits are ramping up. 
+ */ + if (nr_reclaimable + nr_writeback < + (background_thresh + dirty_thresh) / 2) + break; + + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + bdi_thresh = task_dirty_limit(current, bdi_thresh); + /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when @@ -513,15 +523,6 @@ static void balance_dirty_pages(struct address_space *mapping, if (!dirty_exceeded) break; - /* - * Throttle it only when the background writeback cannot - * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up. - */ - if (nr_reclaimable + nr_writeback < - (background_thresh + dirty_thresh) / 2) - break; - if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; @@ -634,7 +635,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) unsigned long dirty_thresh; for ( ; ; ) { - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + global_dirty_limits(&background_thresh, &dirty_thresh); /* * Boost the allowable dirty threshold a bit for page -- cgit v1.2.3-70-g09d2 From 23539afc71937dbaca7de2229669f4475ff4ea7b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 11 Aug 2010 14:17:41 -0700 Subject: writeback: don't redirty tail an inode with dirty pages Avoid delaying writeback for an expired inode with lots of dirty pages, but no active dirtier at the moment. Previously we only did that for the kupdate case. Any filesystem that does delayed allocation or unwritten extent conversion after IO completion will cause this - for example, XFS. Signed-off-by: Wu Fengguang Acked-by: Jan Kara Cc: Dave Chinner Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fca43d4d7bf..1ce364bbb00 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -363,18 +363,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { - if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { - /* - * More pages get dirtied by a fast dirtier. - */ - goto select_queue; - } else if (inode->i_state & I_DIRTY) { - /* - * At least XFS will redirty the inode during the - * writeback (delalloc) and on io completion (isize). - */ - redirty_tail(inode); - } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. Redirty @@ -396,7 +385,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * soon as the queue becomes uncongested. */ inode->i_state |= I_DIRTY_PAGES; -select_queue: if (wbc->nr_to_write <= 0) { /* * slice used up: queue for next turn @@ -419,6 +407,14 @@ select_queue: inode->i_state |= I_DIRTY_PAGES; redirty_tail(inode); } + } else if (inode->i_state & I_DIRTY) { + /* + * Filesystems can dirty the inode during writeback + * operations, such as delayed allocation during + * submission or metadata updates after data IO + * completion.
+ */ + redirty_tail(inode); } else if (atomic_read(&inode->i_count)) { /* * The inode is clean, inuse -- cgit v1.2.3-70-g09d2 From 4ea879b96d437693485d21f4b7e1eb72f7615fc2 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 11 Aug 2010 14:17:42 -0700 Subject: writeback: fix queue_io() ordering This was not a bug so far, since b_io is empty for kupdate writeback. The next patch will do requeue_io() for non-kupdate writeback, so let's fix the ordering now. Signed-off-by: Wu Fengguang Cc: Dave Chinner Cc: Martin Bligh Cc: Michael Rubin Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1ce364bbb00..863bfb0eb49 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -249,10 +249,18 @@ static void move_expired_inodes(struct list_head *delaying_queue, /* * Queue all expired dirty inodes for io, eldest first. + * Before + * newly dirtied b_dirty b_io b_more_io + * =============> gf edc BA + * After + * newly dirtied b_dirty b_io b_more_io + * =============> g fBAedc + * | + * +--> dequeue for IO */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - list_splice_init(&wb->b_more_io, wb->b_io.prev); + list_splice_init(&wb->b_more_io, &wb->b_io); move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } -- cgit v1.2.3-70-g09d2 From a50aeb40144982eb766053309b6fc33e14ca46f0 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 11 Aug 2010 14:17:43 -0700 Subject: writeback: merge for_kupdate and !for_kupdate cases Unify the logic for kupdate and non-kupdate cases. There won't be starvation because the inodes requeued into b_more_io will later be spliced _after_ the remaining inodes in b_io, hence won't stand in the way of other inodes in the next run. It avoids unnecessary redirty_tail() calls, and hence unnecessary updates of dirtied_when. The timestamp update is undesirable because it could later delay the inode's periodic writeback, or may exclude the inode from the data integrity sync operation (which checks the timestamp to avoid extra work and livelock). === How the redirty_tail() comes about: It is a long story. This redirty_tail() was introduced with wbc.more_io. The initial patch for more_io actually did not have the redirty_tail(), and when it was merged, several 100% iowait bug reports arose: reiserfs: http://lkml.org/lkml/2007/10/23/93 jfs: commit 29a424f28390752a4ca2349633aaacc6be494db5 JFS: clear PAGECACHE_TAG_DIRTY for no-write pages ext2: http://www.spinics.net/linux/lists/linux-ext4/msg04762.html They are all old bugs hidden in various filesystems that became "visible" with the more_io patch. At the time, the ext2 bug was thought to be "trivial", so it was not fixed. Instead, the following updated more_io patch with redirty_tail() was merged: http://www.spinics.net/linux/lists/linux-ext4/msg04507.html This will in general prevent 100% iowait on ext2 and possibly other unknown FS bugs.
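
The decision that survives the merge is small enough to sketch on its own. Toy code with hypothetical names; only the nr_to_write rule mirrors the diff below:

/*
 * When an inode still has dirty pages after a writeback pass, the
 * reason decides its fate: an exhausted write slice means "come back
 * next round" via b_more_io (cheap, leaves dirtied_when intact), while
 * anything else means "back off" via redirty_tail() so we don't spin
 * at 100% iowait retrying an inode that cannot make progress.
 */
enum toy_requeue_action { TOY_REQUEUE_IO, TOY_REDIRTY_TAIL };

static enum toy_requeue_action toy_choose_requeue(long nr_to_write)
{
        if (nr_to_write <= 0)
                return TOY_REQUEUE_IO;   /* slice used up: next turn */
        return TOY_REDIRTY_TAIL;         /* blocked otherwise: retry later */
}

Note that the requeue_io() path leaves dirtied_when untouched, which is exactly the property the message above relies on for periodic and sync writeback.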
Signed-off-by: Wu Fengguang Cc: Dave Chinner Cc: Martin Bligh Cc: Michael Rubin Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 43 ++++++++++--------------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 863bfb0eb49..8a5807d2fb9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -374,45 +374,22 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() - * sometimes bales out without doing anything. Redirty - * the inode; Move it from b_io onto b_more_io/b_dirty. + * sometimes bales out without doing anything. */ - /* - * akpm: if the caller was the kupdate function we put - * this inode at the head of b_dirty so it gets first - * consideration. Otherwise, move it to the tail, for - * the reasons described there. I'm not really sure - * how much sense this makes. Presumably I had a good - * reasons for doing it this way, and I'd rather not - * muck with it at present. - */ - if (wbc->for_kupdate) { + inode->i_state |= I_DIRTY_PAGES; + if (wbc->nr_to_write <= 0) { /* - * For the kupdate function we move the inode - * to b_more_io so it will get more writeout as - * soon as the queue becomes uncongested. + * slice used up: queue for next turn */ - inode->i_state |= I_DIRTY_PAGES; - if (wbc->nr_to_write <= 0) { - /* - * slice used up: queue for next turn - */ - requeue_io(inode); - } else { - /* - * somehow blocked: retry later - */ - redirty_tail(inode); - } + requeue_io(inode); } else { /* - * Otherwise fully redirty the inode so that - * other inodes on this superblock will get some - * writeout. Otherwise heavy writing to one - * file would indefinitely suspend writeout of - * all the other files. + * Writeback blocked by something other than + * congestion. Delay the inode for some time to + * avoid spinning on the CPU (100% iowait) + * retrying writeback of the dirty page/inode + * that cannot be performed immediately. */ - inode->i_state |= I_DIRTY_PAGES; redirty_tail(inode); } } else if (inode->i_state & I_DIRTY) { -- cgit v1.2.3-70-g09d2 From 81d73a32d775ae9674ea6edf0b5b721fc3bc57d9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Aug 2010 14:17:44 -0700 Subject: mm: fix writeback_in_progress() Commit 83ba7b071f3 ("writeback: simplify the write back thread queue") broke writeback_in_progress(), as in that commit we started to remove work items from the list at the moment we start working on them and not at the moment they are finished. Thus if the flusher thread was doing some work but there was no other work queued, writeback_in_progress() returned false. This could in particular cause unnecessary queueing of background writeback from balance_dirty_pages() or writeout work from writeback_sb_if_idle(). This patch fixes the problem by introducing a bit in the bdi state which indicates that the flusher thread is processing some work and uses this bit for the writeback_in_progress() test. NOTE: Both callsites of writeback_in_progress() (namely, writeback_inodes_sb_if_idle() and balance_dirty_pages()) would actually need different information than what writeback_in_progress() provides. They would need to know whether *the kind of writeback they are going to submit* is already queued.
But this information isn't that simple to provide, so let's fix writeback_in_progress() for the time being. Signed-off-by: Jan Kara Cc: Christoph Hellwig Cc: Wu Fengguang Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 4 +++- include/linux/backing-dev.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8a5807d2fb9..7d9d06ba184 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -68,7 +68,7 @@ int nr_pdflush_threads; */ int writeback_in_progress(struct backing_dev_info *bdi) { - return !list_empty(&bdi->work_list); + return test_bit(BDI_writeback_running, &bdi->state); } static void bdi_queue_work(struct backing_dev_info *bdi, @@ -740,6 +740,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) struct wb_writeback_work *work; long wrote = 0; + set_bit(BDI_writeback_running, &wb->bdi->state); while ((work = get_next_work_item(bdi)) != NULL) { /* * Override sync mode, in case we must wait for completion @@ -766,6 +767,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); + clear_bit(BDI_writeback_running, &wb->bdi->state); return wrote; } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 7628219e538..35b00746c71 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -31,6 +31,7 @@ enum bdi_state { BDI_async_congested, /* The async (write) queue is getting full */ BDI_sync_congested, /* The sync queue is getting full */ BDI_registered, /* bdi_register() was done */ + BDI_writeback_running, /* Writeback is in progress */ BDI_unused, /* Available bits start here */ }; -- cgit v1.2.3-70-g09d2
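
The repaired contract can be sketched with toy types; the real code uses atomic set_bit()/clear_bit() on bdi->state and a real work list rather than a counter, so this is illustrative only:

/*
 * Work items now disappear from the list when processing *starts*, so
 * "list non-empty" no longer means "writeback running". A running bit
 * that is set for the whole work loop does.
 */
#define TOY_BDI_WRITEBACK_RUNNING (1UL << 0) /* placeholder bit value */

struct toy_bdi {
        unsigned long state;
        int pending_work; /* stands in for the real work_list */
};

static int toy_writeback_in_progress(const struct toy_bdi *bdi)
{
        /* after the fix: true while the flusher is actively working */
        return bdi->state & TOY_BDI_WRITEBACK_RUNNING;
}

static long toy_wb_do_writeback(struct toy_bdi *bdi)
{
        long wrote = 0;

        bdi->state |= TOY_BDI_WRITEBACK_RUNNING;  /* covers the whole loop */
        while (bdi->pending_work > 0) {
                bdi->pending_work--;              /* dequeue, then process */
                wrote++;                          /* ... do the writeback ... */
        }
        bdi->state &= ~TOY_BDI_WRITEBACK_RUNNING;
        return wrote;
}

Setting the bit before the first dequeue and clearing it only after the periodic-flush check mirrors the placement in the diff above, so callers never observe a window where work is in flight but the predicate reports idle.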