From 573412b29586e58477adb70e022193a337763319 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Sun, 6 Dec 2009 11:48:52 +0100 Subject: cfq-iosched: reduce write depth only if sync was delayed The introduction of ramp-up formula for async queue depths has slowed down dirty page reclaim, by reducing async write performance. This patch makes sure the formula kicks in only when sync request was recently delayed. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cfb0b2f5f63..5009af490a0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -283,7 +283,7 @@ struct cfq_data { */ struct cfq_queue oom_cfqq; - unsigned long last_end_sync_rq; + unsigned long last_delayed_sync; /* List of cfq groups being managed on this device*/ struct hlist_head cfqg_list; @@ -2264,7 +2264,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) * based on the last sync IO we serviced */ if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { - unsigned long last_sync = jiffies - cfqd->last_end_sync_rq; + unsigned long last_sync = jiffies - cfqd->last_delayed_sync; unsigned int depth; depth = last_sync / cfqd->cfq_slice[1]; @@ -3273,7 +3273,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) if (sync) { RQ_CIC(rq)->last_end_request = now; - cfqd->last_end_sync_rq = now; + if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) + cfqd->last_delayed_sync = now; } /* @@ -3711,7 +3712,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_latency = 1; cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; - cfqd->last_end_sync_rq = jiffies; + cfqd->last_delayed_sync = jiffies - HZ; INIT_RCU_HEAD(&cfqd->rcu); return cfqd; } -- cgit v1.2.3-70-g09d2 From b9d8f4c73b1af4cfd53f819bf84c2bce31232275 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Tue, 8 Dec 2009 08:54:17 +0100 Subject: cfq: Optimization for close cooperating queue searching It doesn't make any sense to try to find out a close cooperating queue if current cfqq is the only one in the group. Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5009af490a0..b19cd684bf1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1749,6 +1749,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (CFQQ_SEEKY(cur_cfqq)) return NULL; + /* + * Don't search priority tree if it's the only queue in the group. + */ + if (cur_cfqq->cfqg->nr_cfqq == 1) + return NULL; + /* * We should notice if some of the queues are cooperating, eg * working closely on the same area of the disk. In that case, -- cgit v1.2.3-70-g09d2 From c244bb50a9baa2ec47a458bbafb36b5e559ed5fa Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 8 Dec 2009 17:52:57 -0500 Subject: cfq-iosched: Get rid of cfqq wait_busy_done flag o Get rid of wait_busy_done flag. This flag only tells we were doing wait busy on a queue and that queue got request so expire it. That information can easily be obtained by (cfq_cfqq_wait_busy() && queue_is_not_empty). So remove this flag and keep code simple. 
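To make the ramp-up formula from the first patch above concrete, here is a small standalone C model (userspace, millisecond timestamps instead of jiffies; the function and parameter names are illustrative stand-ins, not kernel symbols). The allowed async dispatch depth grows with the time elapsed since a sync request was last delayed, so background writeback is only throttled while sync I/O is actually suffering:

#include <stdio.h>

/* sketch of the depth limit derived from the time since the last delayed sync */
static unsigned int async_depth_limit(unsigned long now_ms,
                                      unsigned long last_delayed_sync_ms,
                                      unsigned int sync_slice_ms,
                                      unsigned int max_depth)
{
        unsigned long since_delay = now_ms - last_delayed_sync_ms;
        unsigned int depth = since_delay / sync_slice_ms;

        if (depth == 0)
                depth = 1;              /* always allow minimal progress */
        if (depth > max_depth)
                depth = max_depth;      /* fully ramped up again */
        return depth;
}

int main(void)
{
        /* sync was delayed 250ms ago, sync slice is 100ms, device depth 32 */
        printf("allowed async depth: %u\n",
               async_depth_limit(1250, 1000, 100, 32));
        return 0;
}

With the change above, the "last delayed sync" timestamp only advances when a sync request completes later than its fifo deadline, so uncontended streaming writes no longer keep resetting the ramp-up.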
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b19cd684bf1..f41fdb5f3e0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -319,7 +319,6 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ - CFQ_CFQQ_FLAG_wait_busy_done, /* Got new request. Expire the queue */ }; #define CFQ_CFQQ_FNS(name) \ @@ -348,7 +347,6 @@ CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); -CFQ_CFQQ_FNS(wait_busy_done); #undef CFQ_CFQQ_FNS #ifdef CONFIG_DEBUG_CFQ_IOSCHED @@ -1574,7 +1572,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_wait_busy(cfqq); - cfq_clear_cfqq_wait_busy_done(cfqq); /* * store what was left of this slice, if the queue idled/timed out @@ -2134,11 +2131,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) if (!cfqd->rq_queued) return NULL; + + /* + * We were waiting for group to get backlogged. Expire the queue + */ + if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) + goto expire; + /* * The active queue has run out of time, expire it and select new. */ - if ((cfq_slice_used(cfqq) || cfq_cfqq_wait_busy_done(cfqq)) - && !cfq_cfqq_must_dispatch(cfqq)) + if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) goto expire; /* @@ -3171,10 +3174,6 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (cfqq == cfqd->active_queue) { - if (cfq_cfqq_wait_busy(cfqq)) { - cfq_clear_cfqq_wait_busy(cfqq); - cfq_mark_cfqq_wait_busy_done(cfqq); - } /* * Remember that we saw a request from this process, but * don't start queuing just yet. Otherwise we risk seeing lots -- cgit v1.2.3-70-g09d2 From 7667aa0630407bc07dc38dcc79d29cc0a65553c1 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 8 Dec 2009 17:52:58 -0500 Subject: cfq-iosched: Take care of corner cases of group losing share due to deletion If there is a sequential reader running in a group, we wait for next request to come in that group after slice expiry and once new request is in, we expire the queue. Otherwise we delete the group from service tree and group looses its fair share. So far I was marking a queue as wait_busy if it had consumed its slice and it was last queue in the group. But this condition did not cover following two cases. 1.If a request completed and slice has not expired yet. Next request comes in and is dispatched to disk. Now select_queue() hits and slice has expired. This group will be deleted. Because request is still in the disk, this queue will never get a chance to wait_busy. 2.If request completed and slice has not expired yet. Before next request comes in (delay due to think time), select_queue() hits and expires the queue hence group. This queue never got a chance to wait busy. Gui was hitting the boundary condition 1 and not getting fairness numbers proportional to weight. This patch puts the checks for above two conditions and improves the fairness numbers for sequential workload on rotational media. Check in select_queue() takes care of case 1 and additional check in should_wait_busy() takes care of case 2. 
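Both corner cases boil down to the same question: should the last queue in its group be held (wait_busy) rather than expired? The kernel splits the check between select_queue() and the completion path; a simplified, self-contained model of the completion-side decision, using millisecond timestamps and made-up structure names rather than the kernel's cfq_queue and cfq_io_context, could look like this:

#include <stdbool.h>

struct queue_state {
        int           queues_in_group;     /* queues backlogged in this group */
        bool          slice_used;          /* time slice already consumed?    */
        unsigned long slice_end_ms;        /* when the slice runs out         */
        unsigned long mean_think_time_ms;  /* 0 when no valid samples         */
};

static bool should_wait_busy(const struct queue_state *q, unsigned long now_ms)
{
        long left = (long)(q->slice_end_ms - now_ms);

        /* other queues keep the group backlogged, nothing to wait for */
        if (q->queues_in_group > 1)
                return false;

        /* case 1 analogue: slice already consumed when the request completed */
        if (q->slice_used || left <= 0)
                return true;

        /* case 2 analogue: slice will run out before the next request arrives */
        if (q->mean_think_time_ms && (unsigned long)left < q->mean_think_time_ms)
                return true;

        /* clock granularity: treat a slice with one tick left as expired */
        return left <= 1;
}

int main(void)
{
        struct queue_state q = { 1, false, 1100, 8 };  /* 100ms left, 8ms think time */
        return should_wait_busy(&q, 1000) ? 0 : 1;     /* here: no need to wait */
}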
Reported-by: Gui Jianfeng Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f41fdb5f3e0..98b15b98b85 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2141,8 +2141,22 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) /* * The active queue has run out of time, expire it and select new. */ - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) - goto expire; + if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { + /* + * If slice had not expired at the completion of last request + * we might not have turned on wait_busy flag. Don't expire + * the queue yet. Allow the group to get backlogged. + * + * The very fact that we have used the slice, that means we + * have been idling all along on this queue and it should be + * ok to wait for this request to complete. + */ + if (cfqq->cfqg->nr_cfqq == 1 && cfqq->dispatched + && cfq_should_idle(cfqd, cfqq)) + goto keep_queue; + else + goto expire; + } /* * The active queue has requests and isn't expired, allow it to @@ -3256,6 +3270,35 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) cfqd->hw_tag = 0; } +static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + struct cfq_io_context *cic = cfqd->active_cic; + + /* If there are other queues in the group, don't wait */ + if (cfqq->cfqg->nr_cfqq > 1) + return false; + + if (cfq_slice_used(cfqq)) + return true; + + /* if slice left is less than think time, wait busy */ + if (cic && sample_valid(cic->ttime_samples) + && (cfqq->slice_end - jiffies < cic->ttime_mean)) + return true; + + /* + * If think times is less than a jiffy than ttime_mean=0 and above + * will not be true. It might happen that slice has not expired yet + * but will expire soon (4-5 ns) during select_queue(). To cover the + * case where think time is less than a jiffy, mark the queue wait + * busy if only 1 jiffy is left in the slice. + */ + if (cfqq->slice_end - jiffies == 1) + return true; + + return false; +} + static void cfq_completed_request(struct request_queue *q, struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); @@ -3295,11 +3338,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } /* - * If this queue consumed its slice and this is last queue - * in the group, wait for next request before we expire - * the queue + * Should we wait for next request to come in before we expire + * the queue. */ - if (cfq_slice_used(cfqq) && cfqq->cfqg->nr_cfqq == 1) { + if (cfq_should_wait_busy(cfqd, cfqq)) { cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; cfq_mark_cfqq_wait_busy(cfqq); } -- cgit v1.2.3-70-g09d2 From edc71131c4dc6cc73e2a24aa0a7a79cfce738f12 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Wed, 9 Dec 2009 20:56:04 +0100 Subject: cfq-iosched: commenting non-obvious initialization Added a comment to explain the initialization of last_delayed_sync. 
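The initialization being documented interacts with the ramp-up sketch shown earlier: pretending that the last sync delay happened a full second ago lets a freshly created scheduler instance compute a deep async limit right away instead of starting throttled. A tiny standalone check (millisecond model, assuming a 100ms sync slice) illustrates the effect:

#include <assert.h>

int main(void)
{
        unsigned long now = 5000;                      /* arbitrary clock value     */
        unsigned long last_delayed_sync = now - 1000;  /* analogue of jiffies - HZ  */
        unsigned int sync_slice = 100;                 /* analogue of cfq_slice[1]  */

        /* same formula as in the ramp-up model above */
        unsigned int depth = (now - last_delayed_sync) / sync_slice;
        assert(depth == 10);   /* deep enough that async writes start unthrottled */
        return 0;
}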
Signed-off-by: Corrado Zoccolo Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 98b15b98b85..69ecee7f4ad 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3759,6 +3759,10 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_latency = 1; cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; + /* + * we optimistically start assuming sync ops weren't delayed in last + * second, in order to have larger depth for async operations. + */ cfqd->last_delayed_sync = jiffies - HZ; INIT_RCU_HEAD(&cfqd->rcu); return cfqd; -- cgit v1.2.3-70-g09d2 From 554554f60ad619e1efab01897208bc320b81d9da Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Thu, 10 Dec 2009 09:38:39 +0100 Subject: cfq: Remove wait_request flag when idle time is being deleted Remove wait_request flag when idle time is being deleted, otherwise it'll hit this path every time when a request is enqueued. Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 69ecee7f4ad..96f59ae5b6e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3202,6 +3202,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { del_timer(&cfqd->idle_slice_timer); + cfq_clear_cfqq_wait_request(cfqq); __blk_run_queue(cfqd->queue); } else cfq_mark_cfqq_must_dispatch(cfqq); -- cgit v1.2.3-70-g09d2 From 82bbbf28db4beefcd8b897800153e21378270cd1 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 10 Dec 2009 19:25:41 +0100 Subject: Fix a CFQ crash in "for-2.6.33" branch of block tree I think my previous patch introduced a bug which can lead to CFQ hitting BUG_ON(). The offending commit in for-2.6.33 branch is. commit 7667aa0630407bc07dc38dcc79d29cc0a65553c1 Author: Vivek Goyal Date: Tue Dec 8 17:52:58 2009 -0500 cfq-iosched: Take care of corner cases of group losing share due to deletion While doing some stress testing on my box, I enountered following. login: [ 3165.148841] BUG: scheduling while atomic: swapper/0/0x10000100 [ 3165.149821] Modules linked in: cfq_iosched dm_multipath qla2xxx igb scsi_transport_fc dm_snapshot [last unloaded: scsi_wait_scan] [ 3165.149821] Pid: 0, comm: swapper Not tainted 2.6.32-block-for-33-merged-new #3 [ 3165.149821] Call Trace: [ 3165.149821] [] __schedule_bug+0x5c/0x60 [ 3165.149821] [] ? __wake_up+0x44/0x4d [ 3165.149821] [] schedule+0xe3/0x7bc [ 3165.149821] [] ? cpumask_next+0x1d/0x1f [ 3165.149821] [] ? cfq_dispatch_requests+0x6ba/0x93e [cfq_iosched] [ 3165.149821] [] __cond_resched+0x2a/0x35 [ 3165.149821] [] ? cfq_dispatch_requests+0x6ba/0x93e [cfq_iosched] [ 3165.149821] [] _cond_resched+0x2c/0x37 [ 3165.149821] [] is_valid_bugaddr+0x16/0x2f [ 3165.149821] [] report_bug+0x18/0xac [ 3165.149821] [] die+0x39/0x63 [ 3165.149821] [] do_trap+0x11a/0x129 [ 3165.149821] [] do_invalid_op+0x96/0x9f [ 3165.149821] [] ? cfq_dispatch_requests+0x6ba/0x93e [cfq_iosched] [ 3165.149821] [] ? enqueue_task+0x5c/0x67 [ 3165.149821] [] ? task_rq_unlock+0x11/0x13 [ 3165.149821] [] ? try_to_wake_up+0x292/0x2a4 [ 3165.149821] [] invalid_op+0x15/0x20 [ 3165.149821] [] ? cfq_dispatch_requests+0x6ba/0x93e [cfq_iosched] [ 3165.149821] [] ? virt_to_head_page+0xe/0x2f [ 3165.149821] [] blk_peek_request+0x191/0x1a7 [ 3165.149821] [] ? 
kobject_get+0x1a/0x21 [ 3165.149821] [] scsi_request_fn+0x82/0x3df [ 3165.149821] [] ? bio_fs_destructor+0x15/0x17 [ 3165.149821] [] ? virt_to_head_page+0xe/0x2f [ 3165.149821] [] __blk_run_queue+0x42/0x71 [ 3165.149821] [] blk_run_queue+0x26/0x3a [ 3165.149821] [] scsi_run_queue+0x2de/0x375 [ 3165.149821] [] ? put_device+0x17/0x19 [ 3165.149821] [] scsi_next_command+0x3b/0x4b [ 3165.149821] [] scsi_io_completion+0x1c9/0x3f5 [ 3165.149821] [] scsi_finish_command+0xb5/0xbe I think I have hit following BUG_ON() in cfq_dispatch_request(). BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); Please find attached the patch to fix it. I have done some stress testing with it and have not seen it happening again. o We should wait on a queue even after slice expiry only if it is empty. If queue is not empty then continue to expire it. o If we decide to keep the queue then make cfqq=NULL. Otherwise select_queue() will return a valid cfqq and cfq_dispatch_request() can hit following BUG_ON(). BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)) Reviewed-by: Jeff Moyer Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 96f59ae5b6e..f3f62394b98 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2151,10 +2151,11 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * have been idling all along on this queue and it should be * ok to wait for this request to complete. */ - if (cfqq->cfqg->nr_cfqq == 1 && cfqq->dispatched - && cfq_should_idle(cfqd, cfqq)) + if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) + && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { + cfqq = NULL; goto keep_queue; - else + } else goto expire; } -- cgit v1.2.3-70-g09d2 From 66ae291978177d5c012015f12b8fbc76dc7d0965 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Tue, 15 Dec 2009 10:08:45 +0100 Subject: cfq: set workload as expired if it doesn't have any slice left When a group is resumed, if it doesn't have workload slice left, we should set workload_expires as expired. Otherwise, we might start from where we left in previous group by error. Thanks the idea from Corrado. Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f3f62394b98..e2f80463ed0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2113,7 +2113,9 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd) cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; cfqd->serving_type = cfqg->saved_workload; cfqd->serving_prio = cfqg->saved_serving_prio; - } + } else + cfqd->workload_expires = jiffies - 1; + choose_service_tree(cfqd, cfqg); } -- cgit v1.2.3-70-g09d2 From b568be627a7270eba575bc4406a606e1545f91bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Dec 2009 09:16:41 +0100 Subject: block: temporarily disable discard granularity Commit 86b37281411cf1e9bc0a6b5406c45edb7bd9ea5d adds a check for misaligned stacking offsets, but it's buggy since the defaults are 0. Hence all dm devices that pass in a non-zero starting offset will be marked as misaligned amd dm will complain. A real fix is coming, in the mean time disable the discard granularity check so that users don't worry about dm reporting about misaligned devices. 
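Why the disabled check misfires is visible with two lines of arithmetic: discard_granularity defaults to 0, so the (granularity - 1) mask underflows to all ones and any non-zero partition offset then compares unequal to discard_alignment. A standalone demonstration with made-up values (not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned int granularity = 0;        /* default when a device sets nothing */
        unsigned int discard_alignment = 0;
        unsigned long long offset = 1048576; /* partition starting 1MiB in         */

        /* the disabled test: (offset & (granularity - 1)) != discard_alignment */
        unsigned long long masked = offset & (unsigned long long)(granularity - 1u);

        printf("masked offset = %llu -> %s\n", masked,
               masked != discard_alignment ? "falsely flagged as misaligned" : "ok");
        return 0;
}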
Signed-off-by: Jens Axboe --- block/blk-settings.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index dd1f1e0e196..6ae118d6e19 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -554,11 +554,18 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ret = -1; } + /* + * Temporarily disable discard granularity. It's currently buggy + * since we default to 0 for discard_granularity, hence this + * "failure" will always trigger for non-zero offsets. + */ +#if 0 if (offset && (offset & (b->discard_granularity - 1)) != b->discard_alignment) { t->discard_misaligned = 1; ret = -1; } +#endif /* If top has no alignment offset, inherit from bottom */ if (!t->alignment_offset) -- cgit v1.2.3-70-g09d2 From 1db32c40600437c5e049796bd32f49f61244c6ef Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 16 Dec 2009 17:52:57 -0500 Subject: cfq-iosched: Remove the check for same cfq group from allow_merge o allow_merge() already checks if submitting task is pointing to same cfqq as rq has been queued in. If everything is fine, we should not be having a task in one cgroup and having a pointer to cfqq in other cgroup. Well I guess in some situations it can happen and that is, when a random IO queue has been moved into root cgroup for group_isolation=0. In this case, tasks's cgroup/group is different from where actually cfqq is, but this is intentional and in this case merging should be allowed. The second situation is where due to close cooperator patches, multiple processes can be sharing a cfqq. If everything implemented right, we should not end up in a situation where tasks from different processes in different groups are sharing the same cfqq as we allow merging of cooperating queues only if they are in same group. Signed-off-by: Vivek Goyal Reviewed-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e2f80463ed0..a0e5347767d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1513,9 +1513,6 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, struct cfq_io_context *cic; struct cfq_queue *cfqq; - /* Deny merge if bio and rq don't belong to same cfq group */ - if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0)) - return false; /* * Disallow merge of a sync bio into an async request. */ -- cgit v1.2.3-70-g09d2 From fb104db41e6e006c85ce1097f372cd1e10c1755c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 16 Dec 2009 17:52:58 -0500 Subject: cfq-iosched: Get rid of nr_groups o Currently code does not seem to be using cfqd->nr_groups. Get rid of it. 
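Returning to the allow_merge() change a little earlier: the group comparison is redundant because the surviving check is already against the submitting task's own queue. A hedged sketch of that remaining logic, with hypothetical types rather than the kernel's structures, might look like this:

#include <stdbool.h>
#include <stddef.h>

/* hypothetical stand-ins for the queue lookups, not the kernel structures */
struct io_queue { int id; bool sync; };

static bool allow_merge(const struct io_queue *rq_queue,
                        const struct io_queue *task_queue,
                        bool bio_is_sync)
{
        /* never merge a sync bio into an async request */
        if (bio_is_sync && !rq_queue->sync)
                return false;

        /* merge only when the submitting task maps to the request's queue;
         * sharing a queue across groups is intentional (group_isolation=0,
         * cooperating queues in one group), so no separate group test is
         * needed here */
        return task_queue != NULL && task_queue->id == rq_queue->id;
}

int main(void)
{
        struct io_queue rq_q   = { 7, true };
        struct io_queue task_q = { 7, true };

        return allow_merge(&rq_q, &task_q, true) ? 0 : 1;  /* same queue: merge ok */
}

Queue sharing across groups only happens in the two intentional situations the commit message describes, which is exactly why no separate group test remains in the final comparison.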
Signed-off-by: Vivek Goyal Reviewed-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a0e5347767d..d9bfa09e68c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -208,8 +208,6 @@ struct cfq_data { /* Root service tree for cfq_groups */ struct cfq_rb_root grp_service_tree; struct cfq_group root_group; - /* Number of active cfq groups on group service tree */ - int nr_groups; /* * The priority currently being served @@ -842,7 +840,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) __cfq_group_service_tree_add(st, cfqg); cfqg->on_st = true; - cfqd->nr_groups++; st->total_weight += cfqg->weight; } @@ -863,7 +860,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); cfqg->on_st = false; - cfqd->nr_groups--; st->total_weight -= cfqg->weight; if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); -- cgit v1.2.3-70-g09d2 From 65b32a573eefa1cdd3cbe5ea59326308e6c3b9ad Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 16 Dec 2009 17:52:59 -0500 Subject: cfq-iosched: Remove prio_change logic for workload selection o CFQ now internally divides cfq queues into three workload categories: sync-idle, sync-noidle and async. Which workload to run depends primarily on the rb_key offset across the three service trees, which is a combination of multiple things including what time the queue got queued on the service tree. There is one exception though. That is, if we switched the prio class, say we served some RT tasks and again started serving BE class, then within the BE class we always started with the sync-noidle workload irrespective of rb_key offset in the service trees. This can provide better latencies for the sync-noidle workload in the presence of RT tasks. o This patch gets rid of that exception, and which workload to run within a class now always depends on the lowest rb_key across the service trees. The reason is that we now have multiple BE class groups, and if we always switch to the sync-noidle workload within a group, we can potentially starve a sync-idle workload within that group. The same is true for the async workload, which will be in the root group. Also, the workload switching within a group would become very unpredictable, as it would depend on whether some RT workload was running in the system or not. 
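The simplified selection the patch ends up with can be modelled in a few lines: pick whichever of the three service trees holds the queue with the smallest key, with no special case for a priority-class switch. Plain C with illustrative names only (the kernel's cfq_choose_wl() additionally handles jiffy wraparound via time_before()):

#include <stdbool.h>

enum wl_type { SYNC_NOIDLE = 0, SYNC = 1, ASYNC = 2, WL_COUNT = 3 };

struct service_tree {
        int           count;       /* queues queued on this tree   */
        unsigned long lowest_key;  /* rb_key of its leftmost queue */
};

static enum wl_type choose_workload(const struct service_tree st[WL_COUNT])
{
        enum wl_type best = SYNC_NOIDLE;
        bool key_valid = false;
        unsigned long lowest = 0;

        for (int i = 0; i < WL_COUNT; i++) {
                if (!st[i].count)
                        continue;
                if (!key_valid || st[i].lowest_key < lowest) {
                        lowest = st[i].lowest_key;
                        best = (enum wl_type)i;
                        key_valid = true;
                }
        }
        return best;
}

int main(void)
{
        struct service_tree st[WL_COUNT] = {
                { 2, 300 },   /* sync-noidle */
                { 1, 120 },   /* sync (idle) */
                { 4, 500 },   /* async       */
        };
        return choose_workload(st) == SYNC ? 0 : 1;  /* lowest key wins */
}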
Signed-off-by: Vivek Goyal Reviewed-by: Gui Jianfeng Acked-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 48 ++++++++++++------------------------------------ 1 file changed, 12 insertions(+), 36 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d9bfa09e68c..8df4fe58f4e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -292,8 +292,7 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, enum wl_prio_t prio, - enum wl_type_t type, - struct cfq_data *cfqd) + enum wl_type_t type) { if (!cfqg) return NULL; @@ -1146,7 +1145,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, #endif service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), - cfqq_type(cfqq), cfqd); + cfqq_type(cfqq)); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; parent = rb_last(&service_tree->rb); @@ -1609,7 +1608,7 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { struct cfq_rb_root *service_tree = service_tree_for(cfqd->serving_group, cfqd->serving_prio, - cfqd->serving_type, cfqd); + cfqd->serving_type); if (!cfqd->rq_queued) return NULL; @@ -1956,8 +1955,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, - struct cfq_group *cfqg, enum wl_prio_t prio, - bool prio_changed) + struct cfq_group *cfqg, enum wl_prio_t prio) { struct cfq_queue *queue; int i; @@ -1965,24 +1963,9 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, unsigned long lowest_key = 0; enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; - if (prio_changed) { - /* - * When priorities switched, we prefer starting - * from SYNC_NOIDLE (first choice), or just SYNC - * over ASYNC - */ - if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) - return cur_best; - cur_best = SYNC_WORKLOAD; - if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) - return cur_best; - - return ASYNC_WORKLOAD; - } - - for (i = 0; i < 3; ++i) { - /* otherwise, select the one with lowest rb_key */ - queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd)); + for (i = 0; i <= SYNC_WORKLOAD; ++i) { + /* select the one with lowest rb_key */ + queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); if (queue && (!key_valid || time_before(queue->rb_key, lowest_key))) { lowest_key = queue->rb_key; @@ -1996,8 +1979,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) { - enum wl_prio_t previous_prio = cfqd->serving_prio; - bool prio_changed; unsigned slice; unsigned count; struct cfq_rb_root *st; @@ -2025,24 +2006,19 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload * expiration time */ - prio_changed = (cfqd->serving_prio != previous_prio); - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, - cfqd); + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); count = st->count; /* - * If priority didn't change, check workload expiration, - * and that we still have other queues ready + * check workload expiration, and that we still have other queues ready */ - if (!prio_changed && count && - !time_after(jiffies, cfqd->workload_expires)) + if (count && !time_after(jiffies, cfqd->workload_expires)) return; /* otherwise select new workload type */ 
cfqd->serving_type = - cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed); - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, - cfqd); + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); count = st->count; /* -- cgit v1.2.3-70-g09d2 From 9504e0864b58b4a304820dcf3755f1da80d5e72f Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 21 Dec 2009 15:55:51 +0100 Subject: block: Fix topology stacking for data and discard alignment The stacking code incorrectly scaled up the data offset in some cases causing misaligned devices to report alignment. Rewrite the stacking algorithm to remedy this and apply the same alignment principles to the discard handling. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 87 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 6ae118d6e19..e14fcbcedbf 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -517,9 +517,8 @@ static unsigned int lcm(unsigned int a, unsigned int b) int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { - int ret; - - ret = 0; + sector_t alignment; + unsigned int top, bottom, granularity; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -537,6 +536,19 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); + granularity = max(b->physical_block_size, b->io_min); + alignment = b->alignment_offset - (offset & (granularity - 1)); + + if (t->alignment_offset != alignment) { + + top = max(t->physical_block_size, t->io_min) + + t->alignment_offset; + bottom = granularity + alignment; + + if (max(top, bottom) & (min(top, bottom) - 1)) + t->misaligned = 1; + } + t->logical_block_size = max(t->logical_block_size, b->logical_block_size); @@ -544,54 +556,55 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->physical_block_size); t->io_min = max(t->io_min, b->io_min); + t->io_opt = lcm(t->io_opt, b->io_opt); + t->no_cluster |= b->no_cluster; t->discard_zeroes_data &= b->discard_zeroes_data; - /* Bottom device offset aligned? */ - if (offset && - (offset & (b->physical_block_size - 1)) != b->alignment_offset) { + if (t->physical_block_size & (t->logical_block_size - 1)) { + t->physical_block_size = t->logical_block_size; t->misaligned = 1; - ret = -1; } - /* - * Temporarily disable discard granularity. It's currently buggy - * since we default to 0 for discard_granularity, hence this - * "failure" will always trigger for non-zero offsets. 
- */ -#if 0 - if (offset && - (offset & (b->discard_granularity - 1)) != b->discard_alignment) { - t->discard_misaligned = 1; - ret = -1; + if (t->io_min & (t->physical_block_size - 1)) { + t->io_min = t->physical_block_size; + t->misaligned = 1; } -#endif - /* If top has no alignment offset, inherit from bottom */ - if (!t->alignment_offset) - t->alignment_offset = - b->alignment_offset & (b->physical_block_size - 1); + if (t->io_opt & (t->physical_block_size - 1)) { + t->io_opt = 0; + t->misaligned = 1; + } - if (!t->discard_alignment) - t->discard_alignment = - b->discard_alignment & (b->discard_granularity - 1); + t->alignment_offset = lcm(t->alignment_offset, alignment) + & (max(t->physical_block_size, t->io_min) - 1); - /* Top device aligned on logical block boundary? */ - if (t->alignment_offset & (t->logical_block_size - 1)) { + if (t->alignment_offset & (t->logical_block_size - 1)) t->misaligned = 1; - ret = -1; - } - /* Find lcm() of optimal I/O size and granularity */ - t->io_opt = lcm(t->io_opt, b->io_opt); - t->discard_granularity = lcm(t->discard_granularity, - b->discard_granularity); + /* Discard alignment and granularity */ + if (b->discard_granularity) { + + alignment = b->discard_alignment - + (offset & (b->discard_granularity - 1)); + + if (t->discard_granularity != 0 && + t->discard_alignment != alignment) { + top = t->discard_granularity + t->discard_alignment; + bottom = b->discard_granularity + alignment; - /* Verify that optimal I/O size is a multiple of io_min */ - if (t->io_min && t->io_opt % t->io_min) - ret = -1; + /* Verify that top and bottom intervals line up */ + if (max(top, bottom) & (min(top, bottom) - 1)) + t->discard_misaligned = 1; + } + + t->discard_granularity = max(t->discard_granularity, + b->discard_granularity); + t->discard_alignment = lcm(t->discard_alignment, alignment) & + (t->discard_granularity - 1); + } - return ret; + return t->misaligned ? -1 : 0; } EXPORT_SYMBOL(blk_stack_limits); -- cgit v1.2.3-70-g09d2 From 2f7a2d89a8b5915d89ad9ebeb0065db7d5831cea Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 28 Dec 2009 13:18:44 +0100 Subject: cfq-iosched: don't regard requests with long distance as close seek_mean could be very big sometimes, using it as close criteria is meaningless as this doen't improve any performance. So if it's big, let's fallback to default value. Reviewed-by: Corrado Zoccolo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8df4fe58f4e..918c7fd9aeb 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1667,13 +1667,17 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, #define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq) + struct request *rq, bool for_preempt) { sector_t sdist = cfqq->seek_mean; if (!sample_valid(cfqq->seek_samples)) sdist = CFQQ_SEEK_THR; + /* if seek_mean is big, using it as close criteria is meaningless */ + if (sdist > CFQQ_SEEK_THR && !for_preempt) + sdist = CFQQ_SEEK_THR; + return cfq_dist_from_last(cfqd, rq) <= sdist; } @@ -1701,7 +1705,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * will contain the closest sector. 
*/ __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) return __cfqq; if (blk_rq_pos(__cfqq->next_rq) < sector) @@ -1712,7 +1716,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, return NULL; __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) return __cfqq; return NULL; @@ -3112,7 +3116,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq)) + if (cfq_rq_close(cfqd, cfqq, rq, true)) return true; return false; -- cgit v1.2.3-70-g09d2 From 81744ee44ab2845c16ffd7d6f762f7b4a49a4750 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 29 Dec 2009 08:35:35 +0100 Subject: block: Fix incorrect alignment offset reporting and update documentation queue_sector_alignment_offset returned the wrong value which caused partitions to report an incorrect alignment_offset. Since offset alignment calculation is needed several places it has been split into a separate helper function. The topology stacking function has been updated accordingly. Furthermore, comments have been added to clarify how the stacking function works. Signed-off-by: Martin K. Petersen Tested-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-settings.c | 44 +++++++++++++++++++++++++++++++++----------- include/linux/blkdev.h | 11 +++++++++-- 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index e14fcbcedbf..d52d4adc440 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -505,20 +505,30 @@ static unsigned int lcm(unsigned int a, unsigned int b) /** * blk_stack_limits - adjust queue_limits for stacked devices - * @t: the stacking driver limits (top) - * @b: the underlying queue limits (bottom) + * @t: the stacking driver limits (top device) + * @b: the underlying queue limits (bottom, component device) * @offset: offset to beginning of data within component device * * Description: - * Merges two queue_limit structs. Returns 0 if alignment didn't - * change. Returns -1 if adding the bottom device caused - * misalignment. + * This function is used by stacking drivers like MD and DM to ensure + * that all component devices have compatible block sizes and + * alignments. The stacking driver must provide a queue_limits + * struct (top) and then iteratively call the stacking function for + * all component (bottom) devices. The stacking function will + * attempt to combine the values and ensure proper alignment. + * + * Returns 0 if the top and bottom queue_limits are compatible. The + * top device's block sizes and alignment offsets may be adjusted to + * ensure alignment with the bottom device. If no compatible sizes + * and alignments exist, -1 is returned and the resulting top + * queue_limits will have the misaligned flag set to indicate that + * the alignment_offset is undefined. 
*/ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { sector_t alignment; - unsigned int top, bottom, granularity; + unsigned int top, bottom; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -536,15 +546,18 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); - granularity = max(b->physical_block_size, b->io_min); - alignment = b->alignment_offset - (offset & (granularity - 1)); + alignment = queue_limit_alignment_offset(b, offset); + /* Bottom device has different alignment. Check that it is + * compatible with the current top alignment. + */ if (t->alignment_offset != alignment) { top = max(t->physical_block_size, t->io_min) + t->alignment_offset; - bottom = granularity + alignment; + bottom = max(b->physical_block_size, b->io_min) + alignment; + /* Verify that top and bottom intervals line up */ if (max(top, bottom) & (min(top, bottom) - 1)) t->misaligned = 1; } @@ -561,32 +574,39 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->no_cluster |= b->no_cluster; t->discard_zeroes_data &= b->discard_zeroes_data; + /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; t->misaligned = 1; } + /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; t->misaligned = 1; } + /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; t->misaligned = 1; } + /* Find lowest common alignment_offset */ t->alignment_offset = lcm(t->alignment_offset, alignment) & (max(t->physical_block_size, t->io_min) - 1); + /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) t->misaligned = 1; /* Discard alignment and granularity */ if (b->discard_granularity) { + unsigned int granularity = b->discard_granularity; + offset &= granularity - 1; - alignment = b->discard_alignment - - (offset & (b->discard_granularity - 1)); + alignment = (granularity + b->discard_alignment - offset) + & (granularity - 1); if (t->discard_granularity != 0 && t->discard_alignment != alignment) { @@ -598,6 +618,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_misaligned = 1; } + t->max_discard_sectors = min_not_zero(t->max_discard_sectors, + b->max_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); t->discard_alignment = lcm(t->discard_alignment, alignment) & diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 784a919aa0d..59b832be304 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1116,11 +1116,18 @@ static inline int queue_alignment_offset(struct request_queue *q) return q->limits.alignment_offset; } +static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t offset) +{ + unsigned int granularity = max(lim->physical_block_size, lim->io_min); + + offset &= granularity - 1; + return (granularity + lim->alignment_offset - offset) & (granularity - 1); +} + static inline int queue_sector_alignment_offset(struct request_queue *q, sector_t sector) { - return ((sector << 9) - q->limits.alignment_offset) - & (q->limits.io_min - 1); + 
return queue_limit_alignment_offset(&q->limits, sector << 9); } static inline int bdev_alignment_offset(struct block_device *bdev) -- cgit v1.2.3-70-g09d2 From e79e95db5cffb2e01170d510686489c40937faa1 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 29 Dec 2009 08:53:54 +0100 Subject: block: Honor the gfp_mask for alloc_page() in blkdev_issue_discard() Signed-off-by: OGAWA Hirofumi Signed-off-by: Jens Axboe --- block/blk-barrier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8873b9b439f..8618d8996fe 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -402,7 +402,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, * our current implementations need. If we'll ever need * more the interface will need revisiting. */ - page = alloc_page(GFP_KERNEL | __GFP_ZERO); + page = alloc_page(gfp_mask | __GFP_ZERO); if (!page) goto out_free_bio; if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) -- cgit v1.2.3-70-g09d2 From fe0b393f2c0a0d23a9bc9ed7dc51a1ee511098bd Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:47 -0500 Subject: block: Correct handling of bottom device misaligment The top device misalignment flag would not be set if the added bottom device was already misaligned as opposed to causing a stacking failure. Also massage the reporting so that an error is only returned if adding the bottom device caused the misalignment. I.e. don't return an error if the top is already flagged as misaligned. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index d52d4adc440..127f8255185 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -528,7 +528,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { sector_t alignment; - unsigned int top, bottom; + unsigned int top, bottom, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -546,6 +546,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); + t->misaligned |= b->misaligned; + alignment = queue_limit_alignment_offset(b, offset); /* Bottom device has different alignment. Check that it is @@ -558,8 +560,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, bottom = max(b->physical_block_size, b->io_min) + alignment; /* Verify that top and bottom intervals line up */ - if (max(top, bottom) & (min(top, bottom) - 1)) + if (max(top, bottom) & (min(top, bottom) - 1)) { t->misaligned = 1; + ret = -1; + } } t->logical_block_size = max(t->logical_block_size, @@ -578,18 +582,21 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; t->misaligned = 1; + ret = -1; } /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; t->misaligned = 1; + ret = -1; } /* Optimal I/O a multiple of the physical block size? 
*/ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; t->misaligned = 1; + ret = -1; } /* Find lowest common alignment_offset */ @@ -597,8 +604,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, & (max(t->physical_block_size, t->io_min) - 1); /* Verify that new alignment_offset is on a logical block boundary */ - if (t->alignment_offset & (t->logical_block_size - 1)) + if (t->alignment_offset & (t->logical_block_size - 1)) { t->misaligned = 1; + ret = -1; + } /* Discard alignment and granularity */ if (b->discard_granularity) { @@ -626,7 +635,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, (t->discard_granularity - 1); } - return t->misaligned ? -1 : 0; + return ret; } EXPORT_SYMBOL(blk_stack_limits); -- cgit v1.2.3-70-g09d2 From dd3d145d49c5816b79acc6761ebbd842bc50b0ee Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:48 -0500 Subject: block: Fix discard alignment calculation and printing Discard alignment reporting for partitions was incorrect. Update to match the algorithm used elsewhere. The alignment can be negative (misaligned). Fix format string accordingly. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- include/linux/blkdev.h | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index b11a4ad7d57..d13ba76a169 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -867,7 +867,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue)); + return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9b98173a818..a41bcc8e140 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1148,8 +1148,11 @@ static inline int queue_discard_alignment(struct request_queue *q) static inline int queue_sector_discard_alignment(struct request_queue *q, sector_t sector) { - return ((sector << 9) - q->limits.discard_alignment) - & (q->limits.discard_granularity - 1); + struct queue_limits *lim = &q->limits; + unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); + + return (lim->discard_granularity + lim->discard_alignment - alignment) + & (lim->discard_granularity - 1); } static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) -- cgit v1.2.3-70-g09d2 From 17be8c245054b9c7786545af3ba3ca4e54cd4ad9 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:49 -0500 Subject: block: bdev_stack_limits wrapper DM does not want to know about partition offsets. Add a partition-aware wrapper that DM can use when stacking block devices. Signed-off-by: Martin K. 
Petersen Acked-by: Mike Snitzer Reviewed-by: Alasdair G Kergon Signed-off-by: Jens Axboe --- block/blk-settings.c | 22 ++++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 24 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 127f8255185..5eeb9e0d256 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -639,6 +639,28 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, } EXPORT_SYMBOL(blk_stack_limits); +/** + * bdev_stack_limits - adjust queue limits for stacked drivers + * @t: the stacking driver limits (top device) + * @bdev: the component block_device (bottom) + * @start: first data sector within component device + * + * Description: + * Merges queue limits for a top device and a block_device. Returns + * 0 if alignment didn't change. Returns -1 if adding the bottom + * device caused misalignment. + */ +int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, + sector_t start) +{ + struct request_queue *bq = bdev_get_queue(bdev); + + start += get_start_sect(bdev); + + return blk_stack_limits(t, &bq->limits, start << 9); +} +EXPORT_SYMBOL(bdev_stack_limits); + /** * disk_stack_limits - adjust queue limits for stacked drivers * @disk: MD/DM gendisk (top) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a41bcc8e140..5c8018977ef 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -938,6 +938,8 @@ extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_set_default_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); +extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, + sector_t offset); extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); -- cgit v1.2.3-70-g09d2 From ce289321b7dc1eb108e3df0dec872b7429ef49f7 Mon Sep 17 00:00:00 2001 From: Kirill Afonshin Date: Fri, 8 Jan 2010 22:09:59 +0300 Subject: block: removed unused as_io_context It isn't used anymore, since AS was deleted. Signed-off-by: Jens Axboe --- block/blk-ioc.c | 5 ----- include/linux/iocontext.h | 27 --------------------------- 2 files changed, 32 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index cbdabb0dd6d..98e6bf61b0a 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -39,8 +39,6 @@ int put_io_context(struct io_context *ioc) if (atomic_long_dec_and_test(&ioc->refcount)) { rcu_read_lock(); - if (ioc->aic && ioc->aic->dtor) - ioc->aic->dtor(ioc->aic); cfq_dtor(ioc); rcu_read_unlock(); @@ -76,8 +74,6 @@ void exit_io_context(struct task_struct *task) task_unlock(task); if (atomic_dec_and_test(&ioc->nr_tasks)) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); cfq_exit(ioc); } @@ -97,7 +93,6 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) ret->ioprio = 0; ret->last_waited = jiffies; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index a6323599630..78ef023227d 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -4,32 +4,6 @@ #include #include -/* - * This is the per-process anticipatory I/O scheduler state. 
- */ -struct as_io_context { - spinlock_t lock; - - void (*dtor)(struct as_io_context *aic); /* destructor */ - void (*exit)(struct as_io_context *aic); /* called on task exit */ - - unsigned long state; - atomic_t nr_queued; /* queued reads & sync writes */ - atomic_t nr_dispatched; /* number of requests gone to the drivers */ - - /* IO History tracking */ - /* Thinktime */ - unsigned long last_end_request; - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; - /* Layout pattern */ - unsigned int seek_samples; - sector_t last_request_pos; - u64 seek_total; - sector_t seek_mean; -}; - struct cfq_queue; struct cfq_io_context { void *key; @@ -78,7 +52,6 @@ struct io_context { unsigned long last_waited; /* Time last woken after wait for request */ int nr_batch_requests; /* Number of requests left in the batch */ - struct as_io_context *aic; struct radix_tree_root radix_root; struct hlist_head cic_list; void *ioc_data; -- cgit v1.2.3-70-g09d2 From 875feb63b9567442be73efbcc9a8470e376d6423 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Wed, 6 Jan 2010 18:58:20 -0800 Subject: cfq-iosched: Respect ioprio_class when preempting In cfq_should_preempt(), we currently allow some cases where a non-RT request can preempt an ongoing RT cfqq timeslice. This should not happen. Examples include: o A sync_noidle wl type non-RT request pre-empting a sync_noidle wl type cfqq on which we are idling. o Once we have per-cgroup async queues, a non-RT sync request pre-empting a RT async cfqq. Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 918c7fd9aeb..ee130f14d1f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3076,6 +3076,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return true; + /* + * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. + */ + if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) + return false; + /* * if the new request is sync, but the currently running queue is * not, let the sync request have priority. -- cgit v1.2.3-70-g09d2
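As a closing illustration of the last patch, the preemption ordering it enforces can be written as a small predicate: idle-class queues may always be preempted, while a realtime queue's timeslice must never be cut short by a non-RT request. Hypothetical enum and function names, not the kernel's cfq_should_preempt():

#include <stdbool.h>

/* illustrative classes mirroring RT/BE/IDLE, not the kernel's ioprio definitions */
enum prio_class { CLASS_RT, CLASS_BE, CLASS_IDLE };

static bool may_preempt(enum prio_class running, enum prio_class incoming)
{
        /* an idle-class queue never holds off anybody */
        if (running == CLASS_IDLE)
                return true;

        /* the fix: a non-RT request must not cut an RT queue's timeslice short */
        if (running == CLASS_RT && incoming != CLASS_RT)
                return false;

        /* the remaining sync/async and request-proximity checks would follow here */
        return true;
}

int main(void)
{
        /* a BE request arriving while an RT queue owns the slice: no preemption */
        return may_preempt(CLASS_RT, CLASS_BE) ? 1 : 0;
}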