From dd840087086f3b93ac20f7472b4fca59aff7b79f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Aug 2014 23:16:32 +0800 Subject: blk-mq: fix WARNING "percpu_ref_kill() called more than once!" Before doing queue release, the queue has been freezed already by blk_cleanup_queue(), so needn't to freeze queue for deleting tag set. This patch fixes the WARNING of "percpu_ref_kill() called more than once!" which is triggered during unloading block driver. Cc: Tejun Heo Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1e478..ac8a0413664 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1713,14 +1713,10 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; - blk_mq_freeze_queue(q); - mutex_lock(&set->tag_list_lock); list_del_init(&q->tag_set_list); blk_mq_update_tag_set_depth(set); mutex_unlock(&set->tag_list_lock); - - blk_mq_unfreeze_queue(q); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, -- cgit v1.2.3-70-g09d2 From 8a58d1f1f373238cb0d6d7f8d3dd723aa164b8ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 12:38:41 -0600 Subject: blk-mq: get rid of unused BLK_MQ_F_SHOULD_SORT flag We used to use this for determining whether to sort the dispatch list, but it's unused now. Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index eb726b9c576..a1e31f274fc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -127,10 +127,9 @@ enum { BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */ BLK_MQ_F_SHOULD_MERGE = 1 << 0, - BLK_MQ_F_SHOULD_SORT = 1 << 1, - BLK_MQ_F_TAG_SHARED = 1 << 2, - BLK_MQ_F_SG_MERGE = 1 << 3, - BLK_MQ_F_SYSFS_UP = 1 << 4, + BLK_MQ_F_TAG_SHARED = 1 << 1, + BLK_MQ_F_SG_MERGE = 1 << 2, + BLK_MQ_F_SYSFS_UP = 1 << 3, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, -- cgit v1.2.3-70-g09d2 From 274a5843ff2f08a89464589d90c64eb65f2c0847 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 12:44:08 -0600 Subject: blk-mq: don't allow merges if turned off for the queue blk-mq uses BLK_MQ_F_SHOULD_MERGE, as set by the driver at init time, to determine whether it should merge IO or not. However, this could also be disabled by the admin, if merging is switched off through sysfs. So check the general queue state as well before attempting to merge IO. Reported-by: Rob Elliott Tested-by: Rob Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ac8a0413664..c9e89a8792e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1068,13 +1068,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, 1); } +static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) +{ + return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && + !blk_queue_nomerges(hctx->queue); +} + static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq, struct bio *bio) { - struct request_queue *q = hctx->queue; - - if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + if (!hctx_allow_merges(hctx)) { blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: @@ -1082,6 +1086,8 @@ insert_rq: spin_unlock(&ctx->lock); return false; } else { + struct request_queue *q = hctx->queue; + spin_lock(&ctx->lock); if (!blk_mq_attempt_merge(q, ctx, bio)) { blk_mq_bio_to_request(rq, bio); -- cgit v1.2.3-70-g09d2 From 16f408dc6b1c87f5e3a767626df09c1399c6bf70 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 13 Aug 2014 14:49:31 +0300 Subject: block: Fix BUG_ON when pi errors occur When getting a pi error we get to bio_integrity_end_io with bi_remaining already decremented to 0 where we will eventually need to call bio_endio with restored original bio completion handler. Calling bio_endio invokes a BUG_ON(). We should call bio_endio_nodec instead, like what is done in bio_integrity_verify_fn. Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bc423f7b02d..f14b4abbebd 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -520,7 +520,7 @@ void bio_integrity_endio(struct bio *bio, int error) */ if (error) { bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); + bio_endio_nodec(bio, error); return; } -- cgit v1.2.3-70-g09d2 From a68aafa5b297d99c2d0c38689089a752126e9e79 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 13:19:15 -0600 Subject: blk-mq: correct a few wrong/bad comments Just grammar or spelling errors, nothing major. Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c9e89a8792e..a0565bb20fd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1580,7 +1580,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->tags = set->tags[i]; /* - * Allocate space for all possible cpus to avoid allocation in + * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), @@ -1668,8 +1668,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { /* - * If not software queues are mapped to this hardware queue, - * disable it and free the request entries + * If no software queues are mapped to this hardware queue, + * disable it and free the request entries. */ if (!hctx->nr_ctx) { struct blk_mq_tag_set *set = q->tag_set; -- cgit v1.2.3-70-g09d2 From cddd5d17642cc6881352732693c2ae6930e9ce65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 16 Aug 2014 08:02:24 -0400 Subject: blk-mq: blk_mq_freeze_queue() should allow nesting While converting to percpu_ref for freezing, add703fda981 ("blk-mq: use percpu_ref for mq usage count") incorrectly made blk_mq_freeze_queue() misbehave when freezing is nested due to percpu_ref_kill() being invoked on an already killed ref. Fix it by making blk_mq_freeze_queue() kill and kick the queue only for the outermost freeze attempt. All the nested ones can simply wait for the ref to reach zero. While at it, remove unnecessary @wake initialization from blk_mq_unfreeze_queue(). Signed-off-by: Tejun Heo Reported-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a0565bb20fd..7950f8d7c1b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -112,18 +112,22 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) */ void blk_mq_freeze_queue(struct request_queue *q) { + bool freeze; + spin_lock_irq(q->queue_lock); - q->mq_freeze_depth++; + freeze = !q->mq_freeze_depth++; spin_unlock_irq(q->queue_lock); - percpu_ref_kill(&q->mq_usage_counter); - blk_mq_run_queues(q, false); + if (freeze) { + percpu_ref_kill(&q->mq_usage_counter); + blk_mq_run_queues(q, false); + } wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); } static void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake = false; + bool wake; spin_lock_irq(q->queue_lock); wake = !--q->mq_freeze_depth; -- cgit v1.2.3-70-g09d2 From ffb5db73ebe9b5c37fd741e1684833dd674372bd Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 13 Aug 2014 13:59:50 +0200 Subject: block: systemace: Remove .owner field for driver There is no need to init .owner field. Based on the patch from Peter Griffin "mmc: remove .owner field for drivers using module_platform_driver" This patch removes the superflous .owner field for drivers which use the module_platform_driver API, as this is overriden in platform_driver_register anyway." Signed-off-by: Michal Simek Signed-off-by: Jens Axboe --- drivers/block/xsysace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index ab3ea62e5df..c4328d9d998 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -1203,7 +1203,6 @@ static struct platform_driver ace_platform_driver = { .probe = ace_probe, .remove = ace_remove, .driver = { - .owner = THIS_MODULE, .name = "xsysace", .of_match_table = ace_of_match, }, -- cgit v1.2.3-70-g09d2 From aeac31819475ad0980cb3a13d5599f5a1127e83d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 18 Aug 2014 12:49:08 +0400 Subject: brd: add ram disk visibility option Currenly ram disk is not visiable inside /proc/partitions. This was done for compatibility reasons here: 53978d0a7a27. But some utilities expect disk presents in /proc/partitions. Let's add module's option and let's administrator chose visibility behaviour. By default, old behaviour preserved. Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- drivers/block/brd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c7d138eca73..3598110d2ce 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -442,12 +442,15 @@ static int rd_nr; int rd_size = CONFIG_BLK_DEV_RAM_SIZE; static int max_part; static int part_shift; +static int part_show = 0; module_param(rd_nr, int, S_IRUGO); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); module_param(rd_size, int, S_IRUGO); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); module_param(max_part, int, S_IRUGO); MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk"); +module_param(part_show, int, S_IRUGO); +MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); @@ -501,7 +504,8 @@ static struct brd_device *brd_alloc(int i) disk->fops = &brd_fops; disk->private_data = brd; disk->queue = brd->brd_queue; - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + if (!part_show) + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); -- cgit v1.2.3-70-g09d2 From 2cada584b20007470931080c49310d85908449b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Aug 2014 20:38:27 -0500 Subject: block: cleanup error handling in sg_io Make sure we always clean up through the out label and just have a single place to put the request. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 51bf5155ee7..ba8706f6cde 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -279,7 +279,6 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, r = blk_rq_unmap_user(bio); if (!ret) ret = r; - blk_put_request(rq); return ret; } @@ -322,10 +321,9 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, return -ENOMEM; blk_rq_set_block_pc(rq); - if (blk_fill_sghdr_rq(q, rq, hdr, mode)) { - blk_put_request(rq); - return -EFAULT; - } + ret = -EFAULT; + if (blk_fill_sghdr_rq(q, rq, hdr, mode)) + goto out; if (hdr->iovec_count) { size_t iov_data_len; @@ -376,7 +374,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, hdr->duration = jiffies_to_msecs(jiffies - start_time); - return blk_complete_sghdr_rq(rq, hdr, bio); + ret = blk_complete_sghdr_rq(rq, hdr, bio); out: blk_put_request(rq); return ret; -- cgit v1.2.3-70-g09d2 From a57821cac6bb6e46abea118e34d0e86444ec1410 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Aug 2014 20:39:53 -0500 Subject: block: support > 16 byte CDBs for SG_IO Signed-off-by: Christoph Hellwig Reviewed-by: Boaz Harrosh Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index ba8706f6cde..1d78e6cf9d6 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -296,8 +296,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->interface_id != 'S') return -EINVAL; - if (hdr->cmd_len > BLK_MAX_CDB) - return -EINVAL; if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) return -EIO; @@ -316,14 +314,21 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->flags & SG_FLAG_Q_AT_HEAD) at_head = 1; + ret = -ENOMEM; rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); if (!rq) - return -ENOMEM; + goto out; blk_rq_set_block_pc(rq); + if (hdr->cmd_len > BLK_MAX_CDB) { + rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); + if (!rq->cmd) + goto out_put_request; + } + ret = -EFAULT; if (blk_fill_sghdr_rq(q, rq, hdr, mode)) - goto out; + goto out_free_cdb; if (hdr->iovec_count) { size_t iov_data_len; @@ -333,7 +338,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, 0, NULL, &iov); if (ret < 0) { kfree(iov); - goto out; + goto out_free_cdb; } iov_data_len = ret; @@ -356,7 +361,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, GFP_KERNEL); if (ret) - goto out; + goto out_free_cdb; bio = rq->bio; memset(sense, 0, sizeof(sense)); @@ -375,8 +380,13 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, hdr->duration = jiffies_to_msecs(jiffies - start_time); ret = blk_complete_sghdr_rq(rq, hdr, bio); -out: + +out_free_cdb: + if (rq->cmd != rq->__cmd) + kfree(rq->cmd); +out_put_request: blk_put_request(rq); +out: return ret; } -- cgit v1.2.3-70-g09d2 From 6f4a16266fb3e58cd3e200eab51d2220ef92d604 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Fri, 22 Aug 2014 15:53:39 -0400 Subject: scsi-mq: fix requests that use a separate CDB buffer This patch fixes code such as the following with scsi-mq enabled: rq = blk_get_request(...); blk_rq_set_block_pc(rq); rq->cmd = my_cmd_buffer; /* separate CDB buffer */ blk_execute_rq_nowait(...); Code like this appears in e.g. sg_start_req() in drivers/scsi/sg.c (for large CDBs only). Without this patch, scsi_mq_prep_fn() will set rq->cmd back to rq->__cmd, causing the wrong CDB to be sent to the device. Signed-off-by: Tony Battersby Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk-mq.c | 2 ++ drivers/scsi/scsi_lib.c | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index c359d72e9d7..bf930f481d4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1252,7 +1252,6 @@ void blk_rq_set_block_pc(struct request *rq) rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; memset(rq->__cmd, 0, sizeof(rq->__cmd)); - rq->cmd = rq->__cmd; } EXPORT_SYMBOL(blk_rq_set_block_pc); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7950f8d7c1b..4aac82615a4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -176,6 +176,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, /* tag was already set */ rq->errors = 0; + rq->cmd = rq->__cmd; + rq->extra_len = 0; rq->sense_len = 0; rq->resid_len = 0; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9c44392b748..d86808f051e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1808,7 +1808,6 @@ static int scsi_mq_prep_fn(struct request *req) cmd->tag = req->tag; - req->cmd = req->__cmd; cmd->cmnd = req->cmd; cmd->prot_op = SCSI_PROT_NORMAL; -- cgit v1.2.3-70-g09d2 From 2ba136daa3ae1e881c9f586f283fcaa164767dce Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Fri, 22 Aug 2014 15:53:35 -0400 Subject: fix regression in SCSI_IOCTL_SEND_COMMAND blk_rq_set_block_pc() memsets rq->cmd to 0, so it should come immediately after blk_get_request() to avoid overwriting the user-supplied CDB. Also check for failure to allocate rq. Fixes: f27b087b81b7 ("block: add blk_rq_set_block_pc()") Cc: # 3.16.x Signed-off-by: Tony Battersby Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 1d78e6cf9d6..5dd477bfb4b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -456,6 +456,11 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + if (!rq) { + err = -ENOMEM; + goto error; + } + blk_rq_set_block_pc(rq); cmdlen = COMMAND_SIZE(opcode); @@ -509,7 +514,6 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, memset(sense, 0, sizeof(sense)); rq->sense = sense; rq->sense_len = 0; - blk_rq_set_block_pc(rq); blk_execute_rq(q, disk, rq, 0); @@ -529,7 +533,8 @@ out: error: kfree(buffer); - blk_put_request(rq); + if (rq) + blk_put_request(rq); return err; } EXPORT_SYMBOL_GPL(sg_scsi_ioctl); -- cgit v1.2.3-70-g09d2 From d19d744685f47f1bb3d39b3ec51eb50afe5ff47d Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Aug 2014 16:14:02 +0200 Subject: block: fix error handling in sg_io Before commit 2cada584b200 ("block: cleanup error handling in sg_io"), we had ret = 0 before entering the last big if block of sg_io. Since 2cada584b200, ret = -EFAULT, which breaks hdparm: /dev/sda: setting Advanced Power Management level to 0xc8 (200) HDIO_DRIVE_CMD failed: Bad address APM_level = 128 Signed-off-by: Sabrina Dubroca Fixes: 2cada584b200 ("block: cleanup error handling in sg_io") Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 5dd477bfb4b..9b8eaeca6a7 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -330,6 +330,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (blk_fill_sghdr_rq(q, rq, hdr, mode)) goto out_free_cdb; + ret = 0; if (hdr->iovec_count) { size_t iov_data_len; struct iovec *iov = NULL; -- cgit v1.2.3-70-g09d2 From e15693ef18e13e3e6bffe891fe140f18b8ff6d07 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Tue, 26 Aug 2014 20:56:36 +0900 Subject: cfq-iosched: Fix wrong children_weight calculation cfq_group_service_tree_add() is applying new_weight at the beginning of the function via cfq_update_group_weight(). This actually allows weight to change between adding it to and subtracting it from children_weight, and triggers WARN_ON_ONCE() in cfq_group_service_tree_del(), or even causes oops by divide error during vfr calculation in cfq_group_service_tree_add(). The detailed scenario is as follows: 1. Create blkio cgroups X and Y as a child of X. Set X's weight to 500 and perform some I/O to apply new_weight. This X's I/O completes before starting Y's I/O. 2. Y starts I/O and cfq_group_service_tree_add() is called with Y. 3. cfq_group_service_tree_add() walks up the tree during children_weight calculation and adds parent X's weight (500) to children_weight of root. children_weight becomes 500. 4. Set X's weight to 1000. 5. X starts I/O and cfq_group_service_tree_add() is called with X. 6. cfq_group_service_tree_add() applies its new_weight (1000). 7. I/O of Y completes and cfq_group_service_tree_del() is called with Y. 8. I/O of X completes and cfq_group_service_tree_del() is called with X. 9. cfq_group_service_tree_del() subtracts X's weight (1000) from children_weight of root. children_weight becomes -500. This triggers WARN_ON_ONCE(). 10. Set X's weight to 500. 11. X starts I/O and cfq_group_service_tree_add() is called with X. 12. cfq_group_service_tree_add() applies its new_weight (500) and adds it to children_weight of root. children_weight becomes 0. Calcularion of vfr triggers oops by divide error. weight should be updated right before adding it to children_weight. Reported-by: Ruki Sekiya Signed-off-by: Toshiaki Makita Acked-by: Tejun Heo Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cadc3784174..d7494637c5d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1275,12 +1275,16 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) static void cfq_update_group_weight(struct cfq_group *cfqg) { - BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - if (cfqg->new_weight) { cfqg->weight = cfqg->new_weight; cfqg->new_weight = 0; } +} + +static void +cfq_update_group_leaf_weight(struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); if (cfqg->new_leaf_weight) { cfqg->leaf_weight = cfqg->new_leaf_weight; @@ -1299,7 +1303,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) /* add to the service tree */ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - cfq_update_group_weight(cfqg); + cfq_update_group_leaf_weight(cfqg); __cfq_group_service_tree_add(st, cfqg); /* @@ -1323,6 +1327,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) */ while ((parent = cfqg_parent(pos))) { if (propagate) { + cfq_update_group_weight(pos); propagate = !parent->nr_active++; parent->children_weight += pos->weight; } -- cgit v1.2.3-70-g09d2 From 7b5af5cffce569298b1d03af1ddf1dc43570ab42 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 28 Aug 2014 17:14:58 +0900 Subject: cfq-iosched: Add comments on update timing of weight Explain that weight has to be updated on activation. This complements previous fix e15693ef18e1 ("cfq-iosched: Fix wrong children_weight calculation"). Signed-off-by: Toshiaki Makita Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d7494637c5d..3f31cf9508e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1272,6 +1272,9 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) rb_insert_color(&cfqg->rb_node, &st->rb); } +/* + * This has to be called only on activation of cfqg + */ static void cfq_update_group_weight(struct cfq_group *cfqg) { @@ -1303,6 +1306,11 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) /* add to the service tree */ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + /* + * Update leaf_weight. We cannot update weight at this point + * because cfqg might already have been activated and is + * contributing its current weight to the parent's child_weight. + */ cfq_update_group_leaf_weight(cfqg); __cfq_group_service_tree_add(st, cfqg); -- cgit v1.2.3-70-g09d2