diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-14 12:08:14 +0900 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-14 12:08:14 +0900 |
commit | 0910c0bdf7c291a41bc21e40a97389c9d4c1960d (patch) | |
tree | 177c4cb22ece78b18f64f548ae82b9a15edbb99c /block/blk-core.c | |
parent | 2821fe6b00a1e902fd399bb4b7e40bc3041f4d44 (diff) | |
parent | e37459b8e2c7db6735e39e019e448b76e5e77647 (diff) |
Merge branch 'for-3.13/core' of git://git.kernel.dk/linux-block
Pull block IO core updates from Jens Axboe:
"This is the pull request for the core changes in the block layer for
3.13. It contains:
- The new blk-mq request interface.
This is a new and more scalable queueing model that marries the
best part of the request based interface we currently have (which
is fully featured, but scales poorly) and the bio based "interface"
which the new drivers for high IOPS devices end up using because
it's much faster than the request based one.
The bio interface has no block layer support, since it taps into
the stack much earlier. This means that drivers end up having to
implement a lot of functionality on their own, like tagging,
timeout handling, requeue, etc. The blk-mq interface provides all
these. Some drivers even provide a switch to select bio or rq and
have code to handle both, since things like merging only work in
the rq model and hence is faster for some workloads. This is a
huge mess. Conversion of these drivers nets us a substantial code
reduction. Initial results on converting SCSI to this model even
show an 8x improvement on single queue devices. So while the
model was intended to work on the newer multiqueue devices, it has
substantial improvements for "classic" hardware as well. This code
has gone through extensive testing and development, it's now ready
to go. A pull request to convert virtio-blk to this model will be
coming as well, with more drivers scheduled for 3.14 conversion.
- Two blktrace fixes from Jan and Chen Gang.
- A plug merge fix from Alireza Haghdoost.
- Conversion of __get_cpu_var() from Christoph Lameter.
- Fix for sector_div() with 64-bit divider from Geert Uytterhoeven.
- A fix for a race between request completion and the timeout
handling from Jeff Moyer. This is what caused the merge conflict
with blk-mq/core, in case you are looking at that.
- A dm stacking fix from Mike Snitzer.
- A code consolidation fix and duplicated code removal from Kent
Overstreet.
- A handful of block bug fixes from Mikulas Patocka, fixing a loop
crash and memory corruption on blk cg.
- Elevator switch bug fix from Tomoki Sekiyama.
A heads-up that I had to rebase this branch. Initially the immutable
bio_vecs had been queued up for inclusion, but a week later, it became
clear that it wasn't fully cooked yet. So the decision was made to
pull this out and postpone it until 3.14. It was a straightforward
rebase, just pruning out the immutable series and the later fixes of
problems with it. The rest of the patches applied directly and no
further changes were made"
* 'for-3.13/core' of git://git.kernel.dk/linux-block: (31 commits)
block: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
block: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
block: Do not call sector_div() with a 64-bit divisor
kernel: trace: blktrace: remove redundent memcpy() in compat_blk_trace_setup()
block: Consolidate duplicated bio_trim() implementations
block: Use rw_copy_check_uvector()
block: Enable sysfs nomerge control for I/O requests in the plug list
block: properly stack underlying max_segment_size to DM device
elevator: acquire q->sysfs_lock in elevator_change()
elevator: Fix a race in elevator switching and md device initialization
block: Replace __get_cpu_var uses
bdi: test bdi_init failure
block: fix a probe argument to blk_register_region
loop: fix crash if blk_alloc_queue fails
blk-core: Fix memory corruption if blkcg_init_queue fails
block: fix race between request completion and timeout handling
blktrace: Send BLK_TN_PROCESS events to all running traces
blk-mq: don't disallow request merges for req->special being set
blk-mq: mq plug list breakage
blk-mq: fix for flush deadlock
...
Diffstat (limited to 'block/blk-core.c')
-rw-r--r-- | block/blk-core.c | 175 |
1 files changed, 109 insertions, 66 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index 0a00e4ecf87..8bdd0121212 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,6 +16,7 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/kernel_stat.h> @@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -static struct kmem_cache *request_cachep; +struct kmem_cache *request_cachep = NULL; /* * For queue allocation @@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; -static void drive_stat_acct(struct request *rq, int new_io) -{ - struct hd_struct *part; - int rw = rq_data_dir(rq); - int cpu; - - if (!blk_do_io_stat(rq)) - return; - - cpu = part_stat_lock(); - - if (!new_io) { - part = rq->part; - part_stat_inc(cpu, part, merges[rw]); - } else { - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!hd_struct_try_get(part)) { - /* - * The partition is already being removed, - * the request will be accounted on the disk only - * - * We take a reference on disk->part0 although that - * partition will never be deleted, so we can treat - * it as any other partition. - */ - part = &rq->rq_disk->part0; - hd_struct_get(part); - } - part_round_stats(cpu, part); - part_inc_in_flight(part, rw); - rq->part = part; - } - - part_stat_unlock(); -} - void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->cmd = rq->__cmd; rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; - rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; @@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg) { int bit; - printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, + printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, rq->rq_disk ? 
rq->rq_disk->disk_name : "?", rq->cmd_type, - rq->cmd_flags); + (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), @@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; + if (percpu_counter_init(&q->mq_usage_counter, 0)) + goto fail_q; + q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_q; + goto fail_c; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; @@ -644,13 +611,19 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->bypass_depth = 1; __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); + init_waitqueue_head(&q->mq_freeze_wq); + if (blkcg_init_queue(q)) - goto fail_id; + goto fail_bdi; return q; +fail_bdi: + bdi_destroy(&q->backing_dev_info); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_c: + percpu_counter_destroy(&q->mq_usage_counter); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -739,9 +712,17 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, q->sg_reserved_size = INT_MAX; + /* Protect q->elevator from elevator_change */ + mutex_lock(&q->sysfs_lock); + /* init elevator */ - if (elevator_init(q, NULL)) + if (elevator_init(q, NULL)) { + mutex_unlock(&q->sysfs_lock); return NULL; + } + + mutex_unlock(&q->sysfs_lock); + return q; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1109,7 +1090,8 @@ retry: goto retry; } -struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +static struct request *blk_old_get_request(struct request_queue *q, int rw, + gfp_t gfp_mask) { struct request *rq; @@ -1126,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) return rq; } + +struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +{ + if (q->mq_ops) + return blk_mq_alloc_request(q, rw, gfp_mask, false); + else + return 
blk_old_get_request(q, rw, gfp_mask); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1211,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request); static void add_acct_request(struct request_queue *q, struct request *rq, int where) { - drive_stat_acct(rq, 1); + blk_account_io_start(rq, true); __elv_add_request(q, rq, where); } @@ -1272,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req) { if (unlikely(!q)) return; - if (unlikely(--req->ref_count)) - return; blk_pm_put_request(req); @@ -1302,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request); void blk_put_request(struct request *req) { - unsigned long flags; struct request_queue *q = req->q; - spin_lock_irqsave(q->queue_lock, flags); - __blk_put_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); + if (q->mq_ops) + blk_mq_free_request(req); + else { + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __blk_put_request(q, req); + spin_unlock_irqrestore(q->queue_lock, flags); + } } EXPORT_SYMBOL(blk_put_request); @@ -1343,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page, } EXPORT_SYMBOL_GPL(blk_add_request_payload); -static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, - struct bio *bio) +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1361,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->__data_len += bio->bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); - drive_stat_acct(req, 0); + blk_account_io_start(req, false); return true; } -static bool bio_attempt_front_merge(struct request_queue *q, - struct request *req, struct bio *bio) +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, + struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1391,12 +1384,12 @@ static bool bio_attempt_front_merge(struct 
request_queue *q, req->__data_len += bio->bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); - drive_stat_acct(req, 0); + blk_account_io_start(req, false); return true; } /** - * attempt_plug_merge - try to merge with %current's plugged list + * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests @@ -1412,19 +1405,28 @@ static bool bio_attempt_front_merge(struct request_queue *q, * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. */ -static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count) { struct blk_plug *plug; struct request *rq; bool ret = false; + struct list_head *plug_list; + + if (blk_queue_nomerges(q)) + goto out; plug = current->plug; if (!plug) goto out; *request_count = 0; - list_for_each_entry_reverse(rq, &plug->list, queuelist) { + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; if (rq->q == q) @@ -1492,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. 
*/ - if (attempt_plug_merge(q, bio, &request_count)) + if (blk_attempt_plug_merge(q, bio, &request_count)) return; spin_lock_irq(q->queue_lock); @@ -1560,7 +1562,7 @@ get_rq: } } list_add_tail(&req->queuelist, &plug->list); - drive_stat_acct(req, 1); + blk_account_io_start(req, true); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); @@ -2014,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void blk_account_io_completion(struct request *req, unsigned int bytes) +void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { const int rw = rq_data_dir(req); @@ -2028,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) } } -static void blk_account_io_done(struct request *req) +void blk_account_io_done(struct request *req) { /* * Account IO completion. flush_rq isn't accounted as a @@ -2076,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q, } #endif +void blk_account_io_start(struct request *rq, bool new_io) +{ + struct hd_struct *part; + int rw = rq_data_dir(rq); + int cpu; + + if (!blk_do_io_stat(rq)) + return; + + cpu = part_stat_lock(); + + if (!new_io) { + part = rq->part; + part_stat_inc(cpu, part, merges[rw]); + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!hd_struct_try_get(part)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. 
+ */ + part = &rq->rq_disk->part0; + hd_struct_get(part); + } + part_round_stats(cpu, part); + part_inc_in_flight(part, rw); + rq->part = part; + } + + part_stat_unlock(); +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -2227,6 +2265,7 @@ void blk_start_request(struct request *req) if (unlikely(blk_bidi_rq(req))) req->next_rq->resid_len = blk_rq_bytes(req->next_rq); + BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); @@ -2451,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error) if (req->cmd_flags & REQ_DONTPREP) blk_unprep_request(req); - blk_account_io_done(req); if (req->end_io) @@ -2873,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug) plug->magic = PLUG_MAGIC; INIT_LIST_HEAD(&plug->list); + INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); /* @@ -2970,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(plug->magic != PLUG_MAGIC); flush_plug_callbacks(plug, from_schedule); + + if (!list_empty(&plug->mq_list)) + blk_mq_flush_plug_list(plug, from_schedule); + if (list_empty(&plug->list)) return; |