Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 248
1 file changed, 184 insertions, 64 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 68929bad9a6..9ee3b87c449 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -107,11 +107,7 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
         wake_up_all(&q->mq_freeze_wq);
 }
 
-/*
- * Guarantee no request is in use, so we can change any data structure of
- * the queue afterward.
- */
-void blk_mq_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue_start(struct request_queue *q)
 {
         bool freeze;
 
@@ -123,10 +119,25 @@ void blk_mq_freeze_queue(struct request_queue *q)
                 percpu_ref_kill(&q->mq_usage_counter);
                 blk_mq_run_queues(q, false);
         }
+}
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
+
+static void blk_mq_freeze_queue_wait(struct request_queue *q)
+{
         wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 }
 
-static void blk_mq_unfreeze_queue(struct request_queue *q)
+/*
+ * Guarantee no request is in use, so we can change any data structure of
+ * the queue afterward.
+ */
+void blk_mq_freeze_queue(struct request_queue *q)
+{
+        blk_mq_freeze_queue_start(q);
+        blk_mq_freeze_queue_wait(q);
+}
+
+void blk_mq_unfreeze_queue(struct request_queue *q)
 {
         bool wake;
 
@@ -139,6 +150,24 @@ static void blk_mq_unfreeze_queue(struct request_queue *q)
                 wake_up_all(&q->mq_freeze_wq);
         }
 }
+EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
+
+void blk_mq_wake_waiters(struct request_queue *q)
+{
+        struct blk_mq_hw_ctx *hctx;
+        unsigned int i;
+
+        queue_for_each_hw_ctx(q, hctx, i)
+                if (blk_mq_hw_queue_mapped(hctx))
+                        blk_mq_tag_wakeup_all(hctx->tags, true);
+
+        /*
+         * If we are called because the queue has now been marked as
+         * dying, we need to ensure that processes currently waiting on
+         * the queue are notified as well.
+         */
+        wake_up_all(&q->mq_freeze_wq);
+}
 
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 {
@@ -248,8 +277,10 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
                 ctx = alloc_data.ctx;
         }
         blk_mq_put_ctx(ctx);
-        if (!rq)
+        if (!rq) {
+                blk_mq_queue_exit(q);
                 return ERR_PTR(-EWOULDBLOCK);
+        }
         return rq;
 }
 EXPORT_SYMBOL(blk_mq_alloc_request);
@@ -269,17 +300,25 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
         blk_mq_queue_exit(q);
 }
 
-void blk_mq_free_request(struct request *rq)
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
         struct blk_mq_ctx *ctx = rq->mq_ctx;
-        struct blk_mq_hw_ctx *hctx;
-        struct request_queue *q = rq->q;
 
         ctx->rq_completed[rq_is_sync(rq)]++;
-
-        hctx = q->mq_ops->map_queue(q, ctx->cpu);
         __blk_mq_free_request(hctx, ctx, rq);
+
+}
+EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+
+void blk_mq_free_request(struct request *rq)
+{
+        struct blk_mq_hw_ctx *hctx;
+        struct request_queue *q = rq->q;
+
+        hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+        blk_mq_free_hctx_request(hctx, rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, int error)
 {
@@ -365,6 +404,12 @@ void blk_mq_complete_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
+int blk_mq_request_started(struct request *rq)
+{
+        return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+}
+EXPORT_SYMBOL_GPL(blk_mq_request_started);
+
 void blk_mq_start_request(struct request *rq)
 {
         struct request_queue *q = rq->q;
@@ -482,12 +527,38 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
 }
 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
+void blk_mq_cancel_requeue_work(struct request_queue *q)
+{
+        cancel_work_sync(&q->requeue_work);
+}
+EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);
+
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
         kblockd_schedule_work(&q->requeue_work);
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
+void blk_mq_abort_requeue_list(struct request_queue *q)
+{
+        unsigned long flags;
+        LIST_HEAD(rq_list);
+
+        spin_lock_irqsave(&q->requeue_lock, flags);
+        list_splice_init(&q->requeue_list, &rq_list);
+        spin_unlock_irqrestore(&q->requeue_lock, flags);
+
+        while (!list_empty(&rq_list)) {
+                struct request *rq;
+
+                rq = list_first_entry(&rq_list, struct request, queuelist);
+                list_del_init(&rq->queuelist);
+                rq->errors = -EIO;
+                blk_mq_end_request(rq, rq->errors);
+        }
+}
+EXPORT_SYMBOL(blk_mq_abort_requeue_list);
+
 static inline bool is_flush_request(struct request *rq,
                                     struct blk_flush_queue *fq, unsigned int tag)
 {
@@ -548,13 +619,24 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
                 break;
         }
 }
-
+
 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                 struct request *rq, void *priv, bool reserved)
 {
         struct blk_mq_timeout_data *data = priv;
 
-        if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+        if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+                /*
+                 * If a request wasn't started before the queue was
+                 * marked dying, kill it here or it'll go unnoticed.
+                 */
+                if (unlikely(blk_queue_dying(rq->q))) {
+                        rq->errors = -EIO;
+                        blk_mq_complete_request(rq);
+                }
+                return;
+        }
+        if (rq->cmd_flags & REQ_NO_TIMEOUT)
                 return;
 
         if (time_after_eq(jiffies, rq->deadline)) {
@@ -581,7 +663,7 @@ static void blk_mq_rq_timer(unsigned long priv)
                  * If not software queues are currently mapped to this
                  * hardware queue, there's nothing to check
                  */
-                if (!hctx->nr_ctx || !hctx->tags)
+                if (!blk_mq_hw_queue_mapped(hctx))
                         continue;
 
                 blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
@@ -680,6 +762,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         struct request_queue *q = hctx->queue;
         struct request *rq;
         LIST_HEAD(rq_list);
+        LIST_HEAD(driver_list);
+        struct list_head *dptr;
         int queued;
 
         WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
@@ -706,16 +790,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         }
 
         /*
+         * Start off with dptr being NULL, so we start the first request
+         * immediately, even if we have more pending.
+         */
+        dptr = NULL;
+
+        /*
          * Now process all the entries, sending them to the driver.
          */
         queued = 0;
         while (!list_empty(&rq_list)) {
+                struct blk_mq_queue_data bd;
                 int ret;
 
                 rq = list_first_entry(&rq_list, struct request, queuelist);
                 list_del_init(&rq->queuelist);
 
-                ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
+                bd.rq = rq;
+                bd.list = dptr;
+                bd.last = list_empty(&rq_list);
+
+                ret = q->mq_ops->queue_rq(hctx, &bd);
                 switch (ret) {
                 case BLK_MQ_RQ_QUEUE_OK:
                         queued++;
@@ -734,6 +829,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
                 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
                         break;
+
+                /*
+                 * We've done the first request. If we have more than 1
+                 * left in the list, set dptr to defer issue.
+                 */
+                if (!dptr && rq_list.next != rq_list.prev)
+                        dptr = &driver_list;
         }
 
         if (!queued)
@@ -760,10 +862,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
-        int cpu = hctx->next_cpu;
+        if (hctx->queue->nr_hw_queues == 1)
+                return WORK_CPU_UNBOUND;
 
         if (--hctx->next_cpu_batch <= 0) {
-                int next_cpu;
+                int cpu = hctx->next_cpu, next_cpu;
 
                 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
                 if (next_cpu >= nr_cpu_ids)
@@ -771,26 +874,32 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
                 hctx->next_cpu = next_cpu;
                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+                return cpu;
         }
 
-        return cpu;
+        return hctx->next_cpu;
 }
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+            !blk_mq_hw_queue_mapped(hctx)))
                 return;
 
-        if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
-                __blk_mq_run_hw_queue(hctx);
-        else if (hctx->queue->nr_hw_queues == 1)
-                kblockd_schedule_delayed_work(&hctx->run_work, 0);
-        else {
-                unsigned int cpu;
+        if (!async) {
+                int cpu = get_cpu();
+                if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+                        __blk_mq_run_hw_queue(hctx);
+                        put_cpu();
+                        return;
+                }
 
-                cpu = blk_mq_hctx_next_cpu(hctx);
-                kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
+                put_cpu();
         }
+
+        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                        &hctx->run_work, 0);
 }
 
 void blk_mq_run_queues(struct request_queue *q, bool async)
@@ -804,9 +913,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
                     test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                         continue;
 
-                preempt_disable();
                 blk_mq_run_hw_queue(hctx, async);
-                preempt_enable();
         }
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -833,9 +940,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 
-        preempt_disable();
         blk_mq_run_hw_queue(hctx, false);
-        preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -860,9 +965,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
                         continue;
 
                 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-                preempt_disable();
                 blk_mq_run_hw_queue(hctx, async);
-                preempt_enable();
         }
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -888,16 +991,11 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
-        unsigned long tmo = msecs_to_jiffies(msecs);
-
-        if (hctx->queue->nr_hw_queues == 1)
-                kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
-        else {
-                unsigned int cpu;
+        if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
+                return;
 
-                cpu = blk_mq_hctx_next_cpu(hctx);
-                kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
-        }
+        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                        &hctx->delay_work, msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
@@ -1152,7 +1250,17 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
                 goto run_queue;
         }
 
-        if (is_sync) {
+        /*
+         * If the driver supports defer issued based on 'last', then
+         * queue it up like normal since we can potentially save some
+         * CPU this way.
+         */
+        if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+                struct blk_mq_queue_data bd = {
+                        .rq = rq,
+                        .list = NULL,
+                        .last = 1
+                };
                 int ret;
 
                 blk_mq_bio_to_request(rq, bio);
@@ -1162,7 +1270,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
                  * error (busy), just add it to our list as we previously
                  * would have done
                  */
-                ret = q->mq_ops->queue_rq(data.hctx, rq, true);
+                ret = q->mq_ops->queue_rq(data.hctx, &bd);
                 if (ret == BLK_MQ_RQ_QUEUE_OK)
                         goto done;
                 else {
@@ -1533,10 +1641,8 @@ static void blk_mq_free_hw_queues(struct request_queue *q,
         struct blk_mq_hw_ctx *hctx;
         unsigned int i;
 
-        queue_for_each_hw_ctx(q, hctx, i) {
+        queue_for_each_hw_ctx(q, hctx, i)
                 free_cpumask_var(hctx->cpumask);
-                kfree(hctx);
-        }
 }
 
 static int blk_mq_init_hctx(struct request_queue *q,
@@ -1557,7 +1663,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
         hctx->queue = q;
         hctx->queue_num = hctx_idx;
         hctx->flags = set->flags;
-        hctx->cmd_size = set->cmd_size;
 
         blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
                                         blk_mq_hctx_notify, hctx);
@@ -1774,16 +1879,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
         if (!ctx)
                 return ERR_PTR(-ENOMEM);
 
-        /*
-         * If a crashdump is active, then we are potentially in a very
-         * memory constrained environment. Limit us to 1 queue and
-         * 64 tags to prevent using too much memory.
-         */
-        if (is_kdump_kernel()) {
-                set->nr_hw_queues = 1;
-                set->queue_depth = min(64U, set->queue_depth);
-        }
-
         hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
                         set->numa_node);
 
@@ -1905,11 +2000,9 @@ void blk_mq_free_queue(struct request_queue *q)
 
         percpu_ref_exit(&q->mq_usage_counter);
 
-        free_percpu(q->queue_ctx);
         kfree(q->queue_hw_ctx);
         kfree(q->mq_map);
 
-        q->queue_ctx = NULL;
         q->queue_hw_ctx = NULL;
         q->mq_map = NULL;
 
@@ -1921,7 +2014,7 @@ void blk_mq_free_queue(struct request_queue *q)
 /* Basically redo blk_mq_init_queue with queue frozen */
 static void blk_mq_queue_reinit(struct request_queue *q)
 {
-        blk_mq_freeze_queue(q);
+        WARN_ON_ONCE(!q->mq_freeze_depth);
 
         blk_mq_sysfs_unregister(q);
 
@@ -1936,8 +2029,6 @@ static void blk_mq_queue_reinit(struct request_queue *q)
         blk_mq_map_swqueue(q);
 
         blk_mq_sysfs_register(q);
-
-        blk_mq_unfreeze_queue(q);
 }
 
 static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
@@ -1956,8 +2047,25 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
                 return NOTIFY_OK;
 
         mutex_lock(&all_q_mutex);
+
+        /*
+         * We need to freeze and reinit all existing queues. Freezing
+         * involves synchronous wait for an RCU grace period and doing it
+         * one by one may take a long time. Start freezing all queues in
+         * one swoop and then wait for the completions so that freezing can
+         * take place in parallel.
+         */
+        list_for_each_entry(q, &all_q_list, all_q_node)
+                blk_mq_freeze_queue_start(q);
+        list_for_each_entry(q, &all_q_list, all_q_node)
+                blk_mq_freeze_queue_wait(q);
+
         list_for_each_entry(q, &all_q_list, all_q_node)
                 blk_mq_queue_reinit(q);
+
+        list_for_each_entry(q, &all_q_list, all_q_node)
+                blk_mq_unfreeze_queue(q);
+
         mutex_unlock(&all_q_mutex);
         return NOTIFY_OK;
 }
@@ -2024,6 +2132,8 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
+        BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
+
         if (!set->nr_hw_queues)
                 return -EINVAL;
         if (!set->queue_depth)
@@ -2040,6 +2150,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
                 set->queue_depth = BLK_MQ_MAX_DEPTH;
         }
 
+        /*
+         * If a crashdump is active, then we are potentially in a very
+         * memory constrained environment. Limit us to 1 queue and
+         * 64 tags to prevent using too much memory.
+         */
+        if (is_kdump_kernel()) {
+                set->nr_hw_queues = 1;
+                set->queue_depth = min(64U, set->queue_depth);
+        }
+
         set->tags = kmalloc_node(set->nr_hw_queues *
                                  sizeof(struct blk_mq_tags *),
                                  GFP_KERNEL, set->numa_node);
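Note on the new ->queue_rq calling convention: the hunks above replace the old three-argument queue_rq(hctx, rq, last) with a single struct blk_mq_queue_data carrying .rq, .list and .last. The sketch below is illustrative only and not part of this diff; it shows how a driver-side ->queue_rq might consume that structure and use bd->last to batch doorbell writes. The mydrv_* helpers, struct mydrv_queue, and the exact prototype of the bd argument (declared in the blk-mq headers rather than in this file) are assumptions.

/* Illustrative sketch only -- not part of this commit. */
static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
                          const struct blk_mq_queue_data *bd)
{
        struct mydrv_queue *mq = hctx->driver_data; /* hypothetical per-hctx state */
        struct request *rq = bd->rq;

        blk_mq_start_request(rq);

        /* mydrv_post_cmd() is a hypothetical helper that places the
         * command in the hardware submission ring. */
        if (!mydrv_post_cmd(mq, rq))
                return BLK_MQ_RQ_QUEUE_BUSY; /* core stops dispatch and requeues */

        /*
         * Only ring the (hypothetical) doorbell when the core marks this
         * request as the last of the current batch, saving MMIO writes
         * when several requests are issued back to back.
         */
        if (bd->last)
                mydrv_ring_doorbell(mq);

        return BLK_MQ_RQ_QUEUE_OK;
}

A driver that also wants the deferred-issue list (bd->list) to be populated for synchronous requests would set BLK_MQ_F_DEFER_ISSUE in its tag set flags, which is what the blk_mq_make_request() hunk above checks for.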