1 files changed, 65 insertions, 74 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0f5879c42dc..ad69ef657e8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -33,28 +33,6 @@ static LIST_HEAD(all_q_list);
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
 
-static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
-					   unsigned int cpu)
-{
-	return per_cpu_ptr(q->queue_ctx, cpu);
-}
-
-/*
- * This assumes per-cpu software queueing queues. They could be per-node
- * as well, for instance. For now this is hardcoded as-is. Note that we don't
- * care about preemption, since we know the ctx's are persistent. This does
- * mean that we can't rely on ctx always matching the currently running CPU.
- */
-static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
-{
-	return __blk_mq_get_ctx(q, get_cpu());
-}
-
-static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
-{
-	put_cpu();
-}
-
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
@@ -104,8 +82,10 @@ static int blk_mq_queue_enter(struct request_queue *q)
 
 	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
 	smp_wmb();
-	/* we have problems to freeze the queue if it's initializing */
-	if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
+
+	/* we have problems freezing the queue if it's initializing */
+	if (!blk_queue_dying(q) &&
+	    (!blk_queue_bypass(q) || !blk_queue_init_done(q)))
 		return 0;
 
 	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
@@ -129,7 +109,7 @@ static void blk_mq_queue_exit(struct request_queue *q)
 	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
 }
 
-static void __blk_mq_drain_queue(struct request_queue *q)
+void blk_mq_drain_queue(struct request_queue *q)
 {
 	while (true) {
 		s64 count;
@@ -140,7 +120,7 @@ static void __blk_mq_drain_queue(struct request_queue *q)
 
 		if (count == 0)
 			break;
-		blk_mq_run_queues(q, false);
+		blk_mq_start_hw_queues(q);
 		msleep(10);
 	}
 }
@@ -159,12 +139,7 @@ static void blk_mq_freeze_queue(struct request_queue *q)
 	spin_unlock_irq(q->queue_lock);
 
 	if (drain)
-		__blk_mq_drain_queue(q);
-}
-
-void blk_mq_drain_queue(struct request_queue *q)
-{
-	__blk_mq_drain_queue(q);
+		blk_mq_drain_queue(q);
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
@@ -205,6 +180,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
+	rq->start_time = jiffies;
 #ifdef CONFIG_BLK_CGROUP
 	rq->rl = NULL;
 	set_start_time_ns(rq);
@@ -224,6 +200,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	rq->sense = NULL;
 
 	INIT_LIST_HEAD(&rq->timeout_list);
+	rq->timeout = 0;
+
 	rq->end_io = NULL;
 	rq->end_io_data = NULL;
 	rq->next_rq = NULL;
@@ -232,24 +210,23 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 }
 
 static struct request *
-__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
-		struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved)
+__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
 {
 	struct request *rq;
 	unsigned int tag;
 
-	tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
+	tag = blk_mq_get_tag(data);
 	if (tag != BLK_MQ_TAG_FAIL) {
-		rq = hctx->tags->rqs[tag];
+		rq = data->hctx->tags->rqs[tag];
 
 		rq->cmd_flags = 0;
-		if (blk_mq_tag_busy(hctx)) {
+		if (blk_mq_tag_busy(data->hctx)) {
 			rq->cmd_flags = REQ_MQ_INFLIGHT;
-			atomic_inc(&hctx->nr_active);
+			atomic_inc(&data->hctx->nr_active);
 		}
 
 		rq->tag = tag;
-		blk_mq_rq_ctx_init(q, ctx, rq, rw);
+		blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
 		return rq;
 	}
 
@@ -262,22 +239,27 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_hw_ctx *hctx;
 	struct request *rq;
+	struct blk_mq_alloc_data alloc_data;
 
 	if (blk_mq_queue_enter(q))
 		return NULL;
 
 	ctx = blk_mq_get_ctx(q);
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
+			reserved, ctx, hctx);
 
-	rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT,
-				    reserved);
+	rq = __blk_mq_alloc_request(&alloc_data, rw);
 	if (!rq && (gfp & __GFP_WAIT)) {
 		__blk_mq_run_hw_queue(hctx);
 		blk_mq_put_ctx(ctx);
 
 		ctx = blk_mq_get_ctx(q);
 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
-		rq =  __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved);
+		blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
+				hctx);
+		rq =  __blk_mq_alloc_request(&alloc_data, rw);
+		ctx = alloc_data.ctx;
 	}
 	blk_mq_put_ctx(ctx);
 	return rq;
@@ -424,16 +406,7 @@ static void blk_mq_start_request(struct request *rq, bool last)
 	if (unlikely(blk_bidi_rq(rq)))
 		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 
-	/*
-	 * Just mark start time and set the started bit. Due to memory
-	 * ordering, we know we'll see the correct deadline as long as
-	 * REQ_ATOMIC_STARTED is seen. Use the default queue timeout,
-	 * unless one has been set in the request.
-	 */
-	if (!rq->timeout)
-		rq->deadline = jiffies + q->rq_timeout;
-	else
-		rq->deadline = jiffies + rq->timeout;
+	blk_add_timer(rq);
 
 	/*
 	 * Mark us as started and clear complete. Complete might have been
@@ -547,15 +520,20 @@ void blk_mq_kick_requeue_list(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
-struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static inline bool is_flush_request(struct request *rq, unsigned int tag)
 {
-	struct request_queue *q = hctx->queue;
+	return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
+			rq->q->flush_rq->tag == tag);
+}
+
+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
+{
+	struct request *rq = tags->rqs[tag];
 
-	if ((q->flush_rq->cmd_flags & REQ_FLUSH_SEQ) &&
-	    q->flush_rq->tag == tag)
-		return q->flush_rq;
+	if (!is_flush_request(rq, tag))
+		return rq;
 
-	return hctx->tags->rqs[tag];
+	return rq->q->flush_rq;
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
@@ -584,7 +562,7 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
 		if (tag >= hctx->tags->nr_tags)
 			break;
 
-		rq = blk_mq_tag_to_rq(hctx, tag++);
+		rq = blk_mq_tag_to_rq(hctx->tags, tag++);
 		if (rq->q != hctx->queue)
 			continue;
 		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
@@ -900,7 +878,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 
 	preempt_disable();
-	__blk_mq_run_hw_queue(hctx);
+	blk_mq_run_hw_queue(hctx, false);
 	preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
@@ -980,11 +958,6 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 		list_add_tail(&rq->queuelist, &ctx->rq_list);
 
 	blk_mq_hctx_mark_pending(hctx, ctx);
-
-	/*
-	 * We do this early, to ensure we are on the right CPU.
-	 */
-	blk_add_timer(rq);
 }
 
 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
@@ -1113,10 +1086,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
 	init_request_from_bio(rq, bio);
 
-	if (blk_do_io_stat(rq)) {
-		rq->start_time = jiffies;
+	if (blk_do_io_stat(rq))
 		blk_account_io_start(rq, 1);
-	}
 }
 
 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
@@ -1158,6 +1129,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	struct blk_mq_ctx *ctx;
 	struct request *rq;
 	int rw = bio_data_dir(bio);
+	struct blk_mq_alloc_data alloc_data;
 
 	if (unlikely(blk_mq_queue_enter(q))) {
 		bio_endio(bio, -EIO);
@@ -1171,7 +1143,9 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 		rw |= REQ_SYNC;
 
 	trace_block_getrq(q, bio, rw);
-	rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false);
+	blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
+			hctx);
+	rq = __blk_mq_alloc_request(&alloc_data, rw);
 	if (unlikely(!rq)) {
 		__blk_mq_run_hw_queue(hctx);
 		blk_mq_put_ctx(ctx);
@@ -1179,8 +1153,11 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 
 		ctx = blk_mq_get_ctx(q);
 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
-		rq = __blk_mq_alloc_request(q, hctx, ctx, rw,
-					    __GFP_WAIT|GFP_ATOMIC, false);
+		blk_mq_set_alloc_data(&alloc_data, q,
+				__GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
+		rq = __blk_mq_alloc_request(&alloc_data, rw);
+		ctx = alloc_data.ctx;
+		hctx = alloc_data.hctx;
 	}
 
 	hctx->queued++;
@@ -1223,7 +1200,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 		blk_mq_bio_to_request(rq, bio);
 		blk_mq_start_request(rq, true);
-		blk_add_timer(rq);
 
 		/*
 		 * For OK queue, we are done. For error, kill it. Any other
@@ -1288,6 +1264,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 
 	rq = blk_mq_map_request(q, bio, &data);
+	if (unlikely(!rq))
+		return;
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
@@ -1562,6 +1540,8 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 		if (i == nr_queue)
 			break;
 
+		blk_mq_tag_idle(hctx);
+
 		if (set->ops->exit_hctx)
 			set->ops->exit_hctx(hctx, i);
 
@@ -1779,7 +1759,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
 	struct blk_mq_hw_ctx **hctxs;
-	struct blk_mq_ctx *ctx;
+	struct blk_mq_ctx __percpu *ctx;
 	struct request_queue *q;
 	unsigned int *map;
 	int i;
@@ -1970,13 +1950,19 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+/*
+ * Alloc a tag set to be associated with one or more request queues.
+ * May fail with EINVAL for various error conditions. May adjust the
+ * requested depth down, if if it too large. In that case, the set
+ * value will be stored in set->queue_depth.
+ */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
 	int i;
 
 	if (!set->nr_hw_queues)
 		return -EINVAL;
-	if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
+	if (!set->queue_depth)
 		return -EINVAL;
 	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
 		return -EINVAL;
@@ -1984,6 +1970,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
 		return -EINVAL;
 
+	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
+		pr_info("blk-mq: reduced tag depth to %u\n",
+			BLK_MQ_MAX_DEPTH);
+		set->queue_depth = BLK_MQ_MAX_DEPTH;
+	}
 
 	set->tags = kmalloc_node(set->nr_hw_queues *
 				 sizeof(struct blk_mq_tags *),