From f0276924fa35a3607920a58cf5d878212824b951 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 31 Dec 2013 11:38:50 +0800 Subject: blk-mq: Don't reserve a tag for flush request Reserving a tag (request) for flush to avoid dead lock is a overkill. A tag is valuable resource. We can track the number of flush requests and disallow having too many pending flush requests allocated. With this patch, blk_mq_alloc_request_pinned() could do a busy nop (but not a dead loop) if too many pending requests are allocated and new flush request is allocated. But this should not be a problem, too many pending flush requests are very rare case. I verified this can fix the deadlock caused by too many pending flush requests. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-flush.c | 8 +++++--- block/blk-mq.c | 46 ++++++++++++++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 9288aaf35c2..9143e85226c 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -284,9 +284,8 @@ static void mq_flush_work(struct work_struct *work) q = container_of(work, struct request_queue, mq_flush_work); - /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, - __GFP_WAIT|GFP_ATOMIC, true); + __GFP_WAIT|GFP_ATOMIC, false); rq->cmd_type = REQ_TYPE_FS; rq->end_io = flush_end_io; @@ -408,8 +407,11 @@ void blk_insert_flush(struct request *rq) /* * @policy now records what operations need to be done. Adjust * REQ_FLUSH and FUA for the driver. + * We keep REQ_FLUSH for mq to track flush requests. For !FUA, + * we never dispatch the request directly. */ - rq->cmd_flags &= ~REQ_FLUSH; + if (rq->cmd_flags & REQ_FUA) + rq->cmd_flags &= ~REQ_FLUSH; if (!(fflags & REQ_FUA)) rq->cmd_flags &= ~REQ_FUA; diff --git a/block/blk-mq.c b/block/blk-mq.c index 57039fcd9c9..9072d0ab184 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -194,9 +194,27 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, } static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - gfp_t gfp, bool reserved) + gfp_t gfp, bool reserved, + int rw) { - return blk_mq_alloc_rq(hctx, gfp, reserved); + struct request *req; + bool is_flush = false; + /* + * flush need allocate a request, leave at least one request for + * non-flush IO to avoid deadlock + */ + if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) { + if (atomic_inc_return(&hctx->pending_flush) >= + hctx->queue_depth - hctx->reserved_tags - 1) { + atomic_dec(&hctx->pending_flush); + return NULL; + } + is_flush = true; + } + req = blk_mq_alloc_rq(hctx, gfp, reserved); + if (!req && is_flush) + atomic_dec(&hctx->pending_flush); + return req; } static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, @@ -209,7 +227,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw); if (rq) { blk_mq_rq_ctx_init(q, ctx, rq, rw); break; @@ -272,6 +290,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; + if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + atomic_dec(&hctx->pending_flush); + blk_mq_rq_init(hctx, rq); blk_mq_put_tag(hctx->tags, tag); @@ -900,14 +921,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) hctx = q->mq_ops->map_queue(q, ctx->cpu); trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, bio->bi_rw); if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, rw); + blk_mq_rq_ctx_init(q, ctx, rq, bio->bi_rw); else { blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, - false); + rq = blk_mq_alloc_request_pinned(q, bio->bi_rw, + __GFP_WAIT|GFP_ATOMIC, false); ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); } @@ -1184,7 +1205,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->queue_num = i; hctx->flags = reg->flags; hctx->queue_depth = reg->queue_depth; + hctx->reserved_tags = reg->reserved_tags; hctx->cmd_size = reg->cmd_size; + atomic_set(&hctx->pending_flush, 0); blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1309,15 +1332,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, reg->queue_depth = BLK_MQ_MAX_DEPTH; } - /* - * Set aside a tag for flush requests. It will only be used while - * another flush request is in progress but outside the driver. - * - * TODO: only allocate if flushes are supported - */ - reg->queue_depth++; - reg->reserved_tags++; - if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) return ERR_PTR(-EINVAL); -- cgit v1.2.3-70-g09d2 From 556ee818c06f37b2e583af0363e6b16d0e0270de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 29 Jan 2014 14:56:16 -0700 Subject: block: __elv_next_request() shouldn't call into the elevator if bypassing request_queue bypassing is used to suppress higher-level function of a request_queue so that they can be switched, reconfigured and shut down. A request_queue does the followings while bypassing. * bypasses elevator and io_cq association and queues requests directly to the FIFO dispatch queue. * bypasses block cgroup request_list lookup and always uses the root request_list. Once confirmed to be bypassing, specific elevator and block cgroup policy implementations can assume that nothing is in flight for them and perform various operations which would be dangerous otherwise. Such confirmation is acheived by short-circuiting all new requests directly to the dispatch queue and waiting for all the requests which were issued before to finish. Unfortunately, while the request allocating and draining sides were properly handled, we forgot to actually plug the request dispatch path. Even after bypassing mode is confirmed, if the attached driver tries to fetch a request and the dispatch queue is empty, __elv_next_request() would invoke the current elevator's elevator_dispatch_fn() callback. As all in-flight requests were drained, the elevator wouldn't contain any request but once bypass is confirmed we don't even know whether the elevator is even there. It might be in the process of being switched and half torn down. Frank Mayhar reports that this actually happened while switching elevators, leading to an oops. Let's fix it by making __elv_next_request() avoid invoking the elevator_dispatch_fn() callback if the queue is bypassing. It already avoids invoking the callback if the queue is dying. As a dying queue is guaranteed to be bypassing, we can simply replace blk_queue_dying() check with blk_queue_bypass(). Reported-by: Frank Mayhar References: http://lkml.kernel.org/g/1390319905.20232.38.camel@bobble.lax.corp.google.com Cc: stable@vger.kernel.org Tested-by: Frank Mayhar Signed-off-by: Jens Axboe --- block/blk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index c90e1d8f7a2..d23b415b8a2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -113,7 +113,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) q->flush_queue_delayed = 1; return NULL; } - if (unlikely(blk_queue_dying(q)) || + if (unlikely(blk_queue_bypass(q)) || !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) return NULL; } -- cgit v1.2.3-70-g09d2 From 72a0a36e2854a6eadb4cf2561858f613f9cd4639 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Feb 2014 10:22:36 -0800 Subject: blk-mq: support at_head inserations for blk_execute_rq This is neede for proper SG_IO operation as well as various uses of blk_execute_rq from the SCSI midlayer. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-exec.c | 2 +- block/blk-mq.c | 17 ++++++++++------- include/linux/blk-mq.h | 3 ++- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-exec.c b/block/blk-exec.c index bbfc072a79c..c68613bb4c7 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -65,7 +65,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be resued after dying flag is set */ if (q->mq_ops) { - blk_mq_insert_request(q, rq, true); + blk_mq_insert_request(q, rq, at_head, true); return; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 9072d0ab184..c9306e3403f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -714,13 +714,16 @@ static void blk_mq_work_fn(struct work_struct *work) } static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq) + struct request *rq, bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; trace_block_rq_insert(hctx->queue, rq); - list_add_tail(&rq->queuelist, &ctx->rq_list); + if (at_head) + list_add(&rq->queuelist, &ctx->rq_list); + else + list_add_tail(&rq->queuelist, &ctx->rq_list); blk_mq_hctx_mark_pending(hctx, ctx); /* @@ -730,7 +733,7 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, } void blk_mq_insert_request(struct request_queue *q, struct request *rq, - bool run_queue) + bool at_head, bool run_queue) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx, *current_ctx; @@ -749,7 +752,7 @@ void blk_mq_insert_request(struct request_queue *q, struct request *rq, rq->mq_ctx = ctx; } spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, at_head); spin_unlock(&ctx->lock); blk_mq_put_ctx(current_ctx); @@ -781,7 +784,7 @@ void blk_mq_run_request(struct request *rq, bool run_queue, bool async) /* ctx->cpu might be offline */ spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); spin_unlock(&ctx->lock); blk_mq_put_ctx(current_ctx); @@ -819,7 +822,7 @@ static void blk_mq_insert_requests(struct request_queue *q, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); rq->mq_ctx = ctx; - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); } spin_unlock(&ctx->lock); @@ -971,7 +974,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) __blk_mq_free_request(hctx, ctx, rq); else { blk_mq_bio_to_request(rq, bio); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); } spin_unlock(&ctx->lock); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1e8f16f65af..b7638be5859 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -122,7 +122,8 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -void blk_mq_insert_request(struct request_queue *, struct request *, bool); +void blk_mq_insert_request(struct request_queue *, struct request *, + bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -- cgit v1.2.3-70-g09d2 From 6f5ba581c0d3ba0a76fe138123c1c2817ffcbeb1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Feb 2014 10:22:37 -0800 Subject: blk-mq: divert __blk_put_request for MQ ops __blk_put_request needs to call into the blk-mq code just like blk_put_request. As we don't have the queue lock in this case both end up calling the same function. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index c00e0bdeab4..06636f3ad42 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1278,6 +1278,11 @@ void __blk_put_request(struct request_queue *q, struct request *req) if (unlikely(!q)) return; + if (q->mq_ops) { + blk_mq_free_request(req); + return; + } + blk_pm_put_request(req); elv_completed_request(q, req); -- cgit v1.2.3-70-g09d2 From 4f7f418c4835d3ce1b66d00502df41f324d13ec0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Feb 2014 10:22:38 -0800 Subject: blk-mq: handle dma_drain_size Make blk-mq handle the dma_drain_size field the same way as the old request path. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c9306e3403f..a99bea45545 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -582,6 +582,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) list_del_init(&rq->queuelist); blk_mq_start_request(rq); + if (q->dma_drain_size && blk_rq_bytes(rq)) { + /* + * make sure space for the drain appears we + * know we can do this because max_hw_segments + * has been adjusted to be one fewer than the + * device can handle + */ + rq->nr_phys_segments++; + } + /* * Last request in the series. Flag it as such, this * enables drivers to know when IO should be kicked off, -- cgit v1.2.3-70-g09d2 From 1be036e9464032362def6b3c13f57bfceefe2dab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Feb 2014 10:22:39 -0800 Subject: blk-mq: initialize sg_reserved_size To behave the same way as the old request path. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a99bea45545..f1e63c2ece7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1387,6 +1387,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, q->mq_ops = reg->ops; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; + q->sg_reserved_size = INT_MAX; + blk_queue_make_request(q, blk_mq_make_request); blk_queue_rq_timed_out(q, reg->ops->timeout); if (reg->timeout) -- cgit v1.2.3-70-g09d2 From 14ec77f352cb00ab8425ec2af03bd7e529eefe24 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Fri, 7 Feb 2014 13:45:39 -0700 Subject: blk-mq: Add bio_integrity setup to blk_mq_make_request This patch adds the missing bio_integrity_enabled() + bio_integrity_prep() setup into blk_mq_make_request() in order to use DIF protection with scsi-mq. Cc: Martin K. Petersen Signed-off-by: Nicholas Bellinger Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index f1e63c2ece7..cee96234bf5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -922,6 +922,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return; + } + if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) return; -- cgit v1.2.3-70-g09d2 From 5cb8850c9c4a7605f74f5c9c7ecadd0b02e87a25 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2014 13:53:46 -0700 Subject: block: Explicitly handle discard/write same segments Immutable biovecs changed the way biovecs are interpreted - drivers no longer use bi_vcnt, they have to go by bi_iter.bi_size (to allow for using part of an existing segment without modifying it). This breaks with discards and write_same bios, since for those bi_size has nothing to do with segments in the biovec. So for now, we need a fairly gross hack - we fortunately know that there will never be more than one segment for the entire request, so we can special case discard/write_same. Signed-off-by: Kent Overstreet Tested-by: Hugh Dickins Signed-off-by: Jens Axboe --- block/blk-merge.c | 91 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 29 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 8f8adaa9546..6c583f9c5b6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -21,6 +21,16 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (!bio) return 0; + /* + * This should probably be returning 0, but blk_add_request_payload() + * (Christoph!!!!) + */ + if (bio->bi_rw & REQ_DISCARD) + return 1; + + if (bio->bi_rw & REQ_WRITE_SAME) + return 1; + fbio = bio; cluster = blk_queue_cluster(q); seg_size = 0; @@ -161,30 +171,60 @@ new_segment: *bvprv = *bvec; } -/* - * map a request to scatterlist, return number of sg entries setup. Caller - * must make sure sg can hold rq->nr_phys_segments entries - */ -int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) +static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist, + struct scatterlist **sg) { struct bio_vec bvec, bvprv = { NULL }; - struct req_iterator iter; - struct scatterlist *sg; + struct bvec_iter iter; int nsegs, cluster; nsegs = 0; cluster = blk_queue_cluster(q); - /* - * for each bio in rq - */ - sg = NULL; - rq_for_each_segment(bvec, rq, iter) { - __blk_segment_map_sg(q, &bvec, sglist, &bvprv, &sg, - &nsegs, &cluster); - } /* segments in rq */ + if (bio->bi_rw & REQ_DISCARD) { + /* + * This is a hack - drivers should be neither modifying the + * biovec, nor relying on bi_vcnt - but because of + * blk_add_request_payload(), a discard bio may or may not have + * a payload we need to set up here (thank you Christoph) and + * bi_vcnt is really the only way of telling if we need to. + */ + + if (bio->bi_vcnt) + goto single_segment; + + return 0; + } + + if (bio->bi_rw & REQ_WRITE_SAME) { +single_segment: + *sg = sglist; + bvec = bio_iovec(bio); + sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); + return 1; + } + + for_each_bio(bio) + bio_for_each_segment(bvec, bio, iter) + __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, + &nsegs, &cluster); + return nsegs; +} + +/* + * map a request to scatterlist, return number of sg entries setup. Caller + * must make sure sg can hold rq->nr_phys_segments entries + */ +int blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist) +{ + struct scatterlist *sg = NULL; + int nsegs = 0; + + if (rq->bio) + nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); if (unlikely(rq->cmd_flags & REQ_COPY_USER) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { @@ -230,20 +270,13 @@ EXPORT_SYMBOL(blk_rq_map_sg); int blk_bio_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist) { - struct bio_vec bvec, bvprv = { NULL }; - struct scatterlist *sg; - int nsegs, cluster; - struct bvec_iter iter; - - nsegs = 0; - cluster = blk_queue_cluster(q); - - sg = NULL; - bio_for_each_segment(bvec, bio, iter) { - __blk_segment_map_sg(q, &bvec, sglist, &bvprv, &sg, - &nsegs, &cluster); - } /* segments in bio */ + struct scatterlist *sg = NULL; + int nsegs; + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); + bio->bi_next = next; if (sg) sg_mark_end(sg); -- cgit v1.2.3-70-g09d2 From 30a91cb4ef385fe1b260df204ef314d86fff2850 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Feb 2014 03:24:38 -0800 Subject: blk-mq: rework I/O completions Rework I/O completions to work more like the old code path. blk_mq_end_io now stays out of the business of deferring completions to others CPUs and calling blk_mark_rq_complete. The latter is very important to allow completing requests that have timed out and thus are already marked completed, the former allows using the IPI callout even for driver specific completions instead of having to reimplement them. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 52 ++++++++++++++++++++++++++++++-------------------- block/blk-mq.h | 3 +-- block/blk-timeout.c | 2 +- include/linux/blk-mq.h | 4 ++++ 4 files changed, 37 insertions(+), 24 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index cee96234bf5..14c8f35946e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -326,7 +326,7 @@ static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) bio_endio(bio, error); } -void blk_mq_complete_request(struct request *rq, int error) +void blk_mq_end_io(struct request *rq, int error) { struct bio *bio = rq->bio; unsigned int bytes = 0; @@ -351,46 +351,53 @@ void blk_mq_complete_request(struct request *rq, int error) else blk_mq_free_request(rq); } +EXPORT_SYMBOL(blk_mq_end_io); -void __blk_mq_end_io(struct request *rq, int error) -{ - if (!blk_mark_rq_complete(rq)) - blk_mq_complete_request(rq, error); -} - -static void blk_mq_end_io_remote(void *data) +static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; - __blk_mq_end_io(rq, rq->errors); + rq->q->softirq_done_fn(rq); } -/* - * End IO on this request on a multiqueue enabled driver. We'll either do - * it directly inline, or punt to a local IPI handler on the matching - * remote CPU. - */ -void blk_mq_end_io(struct request *rq, int error) +void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; int cpu; - if (!ctx->ipi_redirect) - return __blk_mq_end_io(rq, error); + if (!ctx->ipi_redirect) { + rq->q->softirq_done_fn(rq); + return; + } cpu = get_cpu(); if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { - rq->errors = error; - rq->csd.func = blk_mq_end_io_remote; + rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; __smp_call_function_single(ctx->cpu, &rq->csd, 0); } else { - __blk_mq_end_io(rq, error); + rq->q->softirq_done_fn(rq); } put_cpu(); } -EXPORT_SYMBOL(blk_mq_end_io); + +/** + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed + * + * Description: + * Ends all I/O on a request. It does not handle partial completions. + * The actual completion happens out-of-order, through a IPI handler. + **/ +void blk_mq_complete_request(struct request *rq) +{ + if (unlikely(blk_should_fake_timeout(rq->q))) + return; + if (!blk_mark_rq_complete(rq)) + __blk_mq_complete_request(rq); +} +EXPORT_SYMBOL(blk_mq_complete_request); static void blk_mq_start_request(struct request *rq) { @@ -1399,6 +1406,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, if (reg->timeout) blk_queue_rq_timeout(q, reg->timeout); + if (reg->ops->complete) + blk_queue_softirq_done(q, reg->ops->complete); + blk_mq_init_flush(q); blk_mq_init_cpu_queues(q, reg->nr_hw_queues); diff --git a/block/blk-mq.h b/block/blk-mq.h index 5c3917984b0..f29b645f0e1 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -22,8 +22,7 @@ struct blk_mq_ctx { struct kobject kobj; }; -void __blk_mq_end_io(struct request *rq, int error); -void blk_mq_complete_request(struct request *rq, int error); +void __blk_mq_complete_request(struct request *rq); void blk_mq_run_request(struct request *rq, bool run_queue, bool async); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_init_flush(struct request_queue *q); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index bba81c9348e..d96f7061c6f 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -91,7 +91,7 @@ static void blk_rq_timed_out(struct request *req) case BLK_EH_HANDLED: /* Can we use req->errors here? */ if (q->mq_ops) - blk_mq_complete_request(req, req->errors); + __blk_mq_complete_request(req); else __blk_complete_request(req); break; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b7638be5859..468be242db9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -86,6 +86,8 @@ struct blk_mq_ops { */ rq_timed_out_fn *timeout; + softirq_done_fn *complete; + /* * Override for hctx allocations (should probably go) */ @@ -137,6 +139,8 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); void blk_mq_end_io(struct request *rq, int error); +void blk_mq_complete_request(struct request *rq); + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); -- cgit v1.2.3-70-g09d2 From 18741986a4b1dc4b1f171634c4191abc3b0fa023 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Feb 2014 09:29:00 -0700 Subject: blk-mq: rework flush sequencing logic Witch to using a preallocated flush_rq for blk-mq similar to what's done with the old request path. This allows us to set up the request properly with a tag from the actually allowed range and ->rq_disk as needed by some drivers. To make life easier we also switch to dynamic allocation of ->flush_rq for the old path. This effectively reverts most of "blk-mq: fix for flush deadlock" and "blk-mq: Don't reserve a tag for flush request" Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 15 +++++-- block/blk-flush.c | 105 ++++++++++++++++++------------------------------- block/blk-mq.c | 54 +++++++++---------------- block/blk-mq.h | 1 + block/blk-sysfs.c | 2 + include/linux/blk-mq.h | 5 +-- include/linux/blkdev.h | 11 ++---- 7 files changed, 76 insertions(+), 117 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 06636f3ad42..853f9274920 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -693,11 +693,20 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) if (!uninit_q) return NULL; + uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); + if (!uninit_q->flush_rq) + goto out_cleanup_queue; + q = blk_init_allocated_queue(uninit_q, rfn, lock); if (!q) - blk_cleanup_queue(uninit_q); - + goto out_free_flush_rq; return q; + +out_free_flush_rq: + kfree(uninit_q->flush_rq); +out_cleanup_queue: + blk_cleanup_queue(uninit_q); + return NULL; } EXPORT_SYMBOL(blk_init_queue_node); @@ -1127,7 +1136,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask, false); + return blk_mq_alloc_request(q, rw, gfp_mask); else return blk_old_get_request(q, rw, gfp_mask); } diff --git a/block/blk-flush.c b/block/blk-flush.c index 9143e85226c..66e2b697f5d 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,20 +130,26 @@ static void blk_flush_restore_request(struct request *rq) blk_clear_rq_complete(rq); } -static void mq_flush_data_run(struct work_struct *work) +static void mq_flush_run(struct work_struct *work) { struct request *rq; - rq = container_of(work, struct request, mq_flush_data); + rq = container_of(work, struct request, mq_flush_work); memset(&rq->csd, 0, sizeof(rq->csd)); blk_mq_run_request(rq, true, false); } -static void blk_mq_flush_data_insert(struct request *rq) +static bool blk_flush_queue_rq(struct request *rq) { - INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); - kblockd_schedule_work(rq->q, &rq->mq_flush_data); + if (rq->q->mq_ops) { + INIT_WORK(&rq->mq_flush_work, mq_flush_run); + kblockd_schedule_work(rq->q, &rq->mq_flush_work); + return false; + } else { + list_add_tail(&rq->queuelist, &rq->q->queue_head); + return true; + } } /** @@ -187,12 +193,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &q->flush_data_in_flight); - if (q->mq_ops) - blk_mq_flush_data_insert(rq); - else { - list_add(&rq->queuelist, &q->queue_head); - queued = true; - } + queued = blk_flush_queue_rq(rq); break; case REQ_FSEQ_DONE: @@ -216,9 +217,6 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, } kicked = blk_kick_flush(q); - /* blk_mq_run_flush will run queue */ - if (q->mq_ops) - return queued; return kicked | queued; } @@ -230,10 +228,9 @@ static void flush_end_io(struct request *flush_rq, int error) struct request *rq, *n; unsigned long flags = 0; - if (q->mq_ops) { - blk_mq_free_request(flush_rq); + if (q->mq_ops) spin_lock_irqsave(&q->mq_flush_lock, flags); - } + running = &q->flush_queue[q->flush_running_idx]; BUG_ON(q->flush_pending_idx == q->flush_running_idx); @@ -263,48 +260,14 @@ static void flush_end_io(struct request *flush_rq, int error) * kblockd. */ if (queued || q->flush_queue_delayed) { - if (!q->mq_ops) - blk_run_queue_async(q); - else - /* - * This can be optimized to only run queues with requests - * queued if necessary. - */ - blk_mq_run_queues(q, true); + WARN_ON(q->mq_ops); + blk_run_queue_async(q); } q->flush_queue_delayed = 0; if (q->mq_ops) spin_unlock_irqrestore(&q->mq_flush_lock, flags); } -static void mq_flush_work(struct work_struct *work) -{ - struct request_queue *q; - struct request *rq; - - q = container_of(work, struct request_queue, mq_flush_work); - - rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, - __GFP_WAIT|GFP_ATOMIC, false); - rq->cmd_type = REQ_TYPE_FS; - rq->end_io = flush_end_io; - - blk_mq_run_request(rq, true, false); -} - -/* - * We can't directly use q->flush_rq, because it doesn't have tag and is not in - * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, - * so offload the work to workqueue. - * - * Note: we assume a flush request finished in any hardware queue will flush - * the whole disk cache. - */ -static void mq_run_flush(struct request_queue *q) -{ - kblockd_schedule_work(q, &q->mq_flush_work); -} - /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked @@ -339,19 +302,31 @@ static bool blk_kick_flush(struct request_queue *q) * different from running_idx, which means flush is in flight. */ q->flush_pending_idx ^= 1; + if (q->mq_ops) { - mq_run_flush(q); - return true; + struct blk_mq_ctx *ctx = first_rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + + blk_mq_rq_init(hctx, q->flush_rq); + q->flush_rq->mq_ctx = ctx; + + /* + * Reuse the tag value from the fist waiting request, + * with blk-mq the tag is generated during request + * allocation and drivers can rely on it being inside + * the range they asked for. + */ + q->flush_rq->tag = first_rq->tag; + } else { + blk_rq_init(q, q->flush_rq); } - blk_rq_init(q, &q->flush_rq); - q->flush_rq.cmd_type = REQ_TYPE_FS; - q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; - q->flush_rq.rq_disk = first_rq->rq_disk; - q->flush_rq.end_io = flush_end_io; + q->flush_rq->cmd_type = REQ_TYPE_FS; + q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + q->flush_rq->rq_disk = first_rq->rq_disk; + q->flush_rq->end_io = flush_end_io; - list_add_tail(&q->flush_rq.queuelist, &q->queue_head); - return true; + return blk_flush_queue_rq(q->flush_rq); } static void flush_data_end_io(struct request *rq, int error) @@ -407,11 +382,8 @@ void blk_insert_flush(struct request *rq) /* * @policy now records what operations need to be done. Adjust * REQ_FLUSH and FUA for the driver. - * We keep REQ_FLUSH for mq to track flush requests. For !FUA, - * we never dispatch the request directly. */ - if (rq->cmd_flags & REQ_FUA) - rq->cmd_flags &= ~REQ_FLUSH; + rq->cmd_flags &= ~REQ_FLUSH; if (!(fflags & REQ_FUA)) rq->cmd_flags &= ~REQ_FUA; @@ -560,5 +532,4 @@ EXPORT_SYMBOL(blkdev_issue_flush); void blk_mq_init_flush(struct request_queue *q) { spin_lock_init(&q->mq_flush_lock); - INIT_WORK(&q->mq_flush_work, mq_flush_work); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 14c8f35946e..a59b0565e94 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -194,27 +194,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, } static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - gfp_t gfp, bool reserved, - int rw) + gfp_t gfp, bool reserved) { - struct request *req; - bool is_flush = false; - /* - * flush need allocate a request, leave at least one request for - * non-flush IO to avoid deadlock - */ - if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) { - if (atomic_inc_return(&hctx->pending_flush) >= - hctx->queue_depth - hctx->reserved_tags - 1) { - atomic_dec(&hctx->pending_flush); - return NULL; - } - is_flush = true; - } - req = blk_mq_alloc_rq(hctx, gfp, reserved); - if (!req && is_flush) - atomic_dec(&hctx->pending_flush); - return req; + return blk_mq_alloc_rq(hctx, gfp, reserved); } static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, @@ -227,7 +209,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw); + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); if (rq) { blk_mq_rq_ctx_init(q, ctx, rq, rw); break; @@ -244,15 +226,14 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, return rq; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, - gfp_t gfp, bool reserved) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) { struct request *rq; if (blk_mq_queue_enter(q)) return NULL; - rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); + rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); if (rq) blk_mq_put_ctx(rq->mq_ctx); return rq; @@ -276,7 +257,7 @@ EXPORT_SYMBOL(blk_mq_alloc_reserved_request); /* * Re-init and set pdu, if we have it */ -static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) +void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_rq_init(hctx->queue, rq); @@ -290,9 +271,6 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; - if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - atomic_dec(&hctx->pending_flush); - blk_mq_rq_init(hctx, rq); blk_mq_put_tag(hctx->tags, tag); @@ -946,14 +924,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) hctx = q->mq_ops->map_queue(q, ctx->cpu); trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, bio->bi_rw); + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, bio->bi_rw); + blk_mq_rq_ctx_init(q, ctx, rq, rw); else { blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, bio->bi_rw, - __GFP_WAIT|GFP_ATOMIC, false); + rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, + false); ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); } @@ -1230,9 +1208,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->queue_num = i; hctx->flags = reg->flags; hctx->queue_depth = reg->queue_depth; - hctx->reserved_tags = reg->reserved_tags; hctx->cmd_size = reg->cmd_size; - atomic_set(&hctx->pending_flush, 0); blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1412,9 +1388,14 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, blk_mq_init_flush(q); blk_mq_init_cpu_queues(q, reg->nr_hw_queues); - if (blk_mq_init_hw_queues(q, reg, driver_data)) + q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, + cache_line_size()), GFP_KERNEL); + if (!q->flush_rq) goto err_hw; + if (blk_mq_init_hw_queues(q, reg, driver_data)) + goto err_flush_rq; + blk_mq_map_swqueue(q); mutex_lock(&all_q_mutex); @@ -1422,6 +1403,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, mutex_unlock(&all_q_mutex); return q; + +err_flush_rq: + kfree(q->flush_rq); err_hw: kfree(q->mq_map); err_map: diff --git a/block/blk-mq.h b/block/blk-mq.h index f29b645f0e1..ed0035cd458 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -28,6 +28,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_init_flush(struct request_queue *q); void blk_mq_drain_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); +void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); /* * CPU hotplug helpers diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8095c4a21fc..7500f876dae 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -549,6 +549,8 @@ static void blk_release_queue(struct kobject *kobj) if (q->mq_ops) blk_mq_free_queue(q); + kfree(q->flush_rq); + blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 468be242db9..18ba8a627f4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -36,15 +36,12 @@ struct blk_mq_hw_ctx { struct list_head page_list; struct blk_mq_tags *tags; - atomic_t pending_flush; - unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 10 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; unsigned int queue_depth; - unsigned int reserved_tags; unsigned int numa_node; unsigned int cmd_size; /* per-request extra data */ @@ -129,7 +126,7 @@ void blk_mq_insert_request(struct request_queue *, struct request *, void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0375654adb2..b2d25ecbcbc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -101,7 +101,7 @@ struct request { }; union { struct call_single_data csd; - struct work_struct mq_flush_data; + struct work_struct mq_flush_work; }; struct request_queue *q; @@ -451,13 +451,8 @@ struct request_queue { unsigned long flush_pending_since; struct list_head flush_queue[2]; struct list_head flush_data_in_flight; - union { - struct request flush_rq; - struct { - spinlock_t mq_flush_lock; - struct work_struct mq_flush_work; - }; - }; + struct request *flush_rq; + spinlock_t mq_flush_lock; struct mutex sysfs_lock; -- cgit v1.2.3-70-g09d2 From 11c94444074f40b479a05f6657d935204e992f2e Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 10 Feb 2014 10:39:18 -0700 Subject: block: Fix type mismatch in ssize_t_blk_mq_tag_sysfs_show cppcheck detected following format string mismatch. [blk-mq-tag.c:201]: (warning) %u in format string (no. 1) requires 'unsigned int' but the argument type is 'int'. Change "cpu" from int to unsigned int, because the cpu never become minus value. Signed-off-by: Masanari Iida Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d64a02fb1f7..4025050320b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -182,7 +182,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) { char *orig_page = page; - int cpu; + unsigned int cpu; if (!tags) return 0; -- cgit v1.2.3-70-g09d2 From 1e93b8c274268038c93763dca65a73b42a081e10 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Feb 2014 08:27:13 -0800 Subject: blk-mq: dont assume rq->errors is set when returning an error from ->queue_rq rq->errors never has been part of the communication protocol between drivers and the block stack and most drivers will not have initialized it. Return -EIO to upper layers when the driver returns BLK_MQ_RQ_QUEUE_ERROR unconditionally. If a driver want to return a different error it can easily do so by returning success after calling blk_mq_end_io itself. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a59b0565e94..0480710a8b4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -605,8 +605,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) break; default: pr_err("blk-mq: bad return on queue: %d\n", ret); - rq->errors = -EIO; case BLK_MQ_RQ_QUEUE_ERROR: + rq->errors = -EIO; blk_mq_end_io(rq, rq->errors); break; } -- cgit v1.2.3-70-g09d2 From 49f5baa5109897b8cee491e8a7c4d74052b6bc1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Feb 2014 08:27:14 -0800 Subject: blk-mq: pair blk_mq_start_request / blk_mq_requeue_request Make sure we have a proper pairing between starting and requeueing requests. Move the dma drain and REQ_END setup into blk_mq_start_request, and make sure blk_mq_requeue_request properly undoes them, giving us a pair of function to prepare and unprepare a request without leaving side effects. Together this ensures we always clean up properly after BLK_MQ_RQ_QUEUE_BUSY returns from ->queue_rq. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 0480710a8b4..1fa9dd153fd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -377,7 +377,7 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -static void blk_mq_start_request(struct request *rq) +static void blk_mq_start_request(struct request *rq, bool last) { struct request_queue *q = rq->q; @@ -390,6 +390,25 @@ static void blk_mq_start_request(struct request *rq) */ rq->deadline = jiffies + q->rq_timeout; set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + + if (q->dma_drain_size && blk_rq_bytes(rq)) { + /* + * Make sure space for the drain appears. We know we can do + * this because max_hw_segments has been adjusted to be one + * fewer than the device can handle. + */ + rq->nr_phys_segments++; + } + + /* + * Flag the last request in the series so that drivers know when IO + * should be kicked off, if they don't do it on a per-request basis. + * + * Note: the flag isn't the only condition drivers should do kick off. + * If drive is busy, the last request might not have the bit set. + */ + if (last) + rq->cmd_flags |= REQ_END; } static void blk_mq_requeue_request(struct request *rq) @@ -398,6 +417,11 @@ static void blk_mq_requeue_request(struct request *rq) trace_block_rq_requeue(q, rq); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + + rq->cmd_flags &= ~REQ_END; + + if (q->dma_drain_size && blk_rq_bytes(rq)) + rq->nr_phys_segments--; } struct blk_mq_timeout_data { @@ -565,29 +589,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_start_request(rq); - if (q->dma_drain_size && blk_rq_bytes(rq)) { - /* - * make sure space for the drain appears we - * know we can do this because max_hw_segments - * has been adjusted to be one fewer than the - * device can handle - */ - rq->nr_phys_segments++; - } - - /* - * Last request in the series. Flag it as such, this - * enables drivers to know when IO should be kicked off, - * if they don't do it on a per-request basis. - * - * Note: the flag isn't the only condition drivers - * should do kick off. If drive is busy, the last - * request might not have the bit set. - */ - if (list_empty(&rq_list)) - rq->cmd_flags |= REQ_END; + blk_mq_start_request(rq, list_empty(&rq_list)); ret = q->mq_ops->queue_rq(hctx, rq); switch (ret) { -- cgit v1.2.3-70-g09d2 From c8123f8c9cb517403b51aa41c3c46ff5e10b2c17 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Feb 2014 09:34:01 -0700 Subject: block: add cond_resched() to potentially long running ioctl discard loop When mkfs issues a full device discard and the device only supports discards of a smallish size, we can loop in blkdev_issue_discard() for a long time. If preempt isn't enabled, this can turn into a softlock situation and the kernel will start complaining. Add an explicit cond_resched() at the end of the loop to avoid that. Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-lib.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 2da76c999ef..97a733cf3d5 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -119,6 +119,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, atomic_inc(&bb.done); submit_bio(type, bio); + + /* + * We can loop for a long time in here, if someone does + * full device discards (like mkfs). Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. + */ + cond_resched(); } blk_finish_plug(&plug); -- cgit v1.2.3-70-g09d2