From d2bf1b6723ed0eab378363649d15b7893bf14e91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:36 +0100 Subject: block: move register_disk() and del_gendisk() to block/genhd.c There's no reason for register_disk() and del_gendisk() to be in fs/partitions/check.c. Move both to genhd.c. While at it, collapse unlink_gendisk(), which was artificially in a separate function due to genhd.c / check.c split, into del_gendisk(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd10c4..83031bcf836 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -643,7 +643,6 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -extern void register_disk(struct gendisk *dev); extern void generic_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); -- cgit v1.2.3-70-g09d2 From 77ea887e433ad8389d416826936c110fa7910f80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:37 +0100 Subject: implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supercedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called parallelly. * gendisk->events and ->async_events are added. These should be initialized by block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs control polling intervals for individual devices (default is -1 meaning use system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of currently defined events are cleared after the device has been successfully opened. This information is passed to ->check_events() callback using @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_change() will be dropped. Signed-off-by: Tejun Heo Cc: Kay Sievers Cc: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 429 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/block_dev.c | 41 ++++- include/linux/blkdev.h | 3 + include/linux/fs.h | 1 + include/linux/genhd.h | 18 ++- 5 files changed, 484 insertions(+), 8 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/genhd.c b/block/genhd.c index 2e5e4c0a113..5465a824d48 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "blk.h" @@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_add_events(struct gendisk *disk); +static void disk_del_events(struct gendisk *disk); +static void disk_release_events(struct gendisk *disk); + /** * disk_get_part - get partition * @disk: disk to look partition from @@ -609,6 +614,8 @@ void add_disk(struct gendisk *disk) retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); + + disk_add_events(disk); } EXPORT_SYMBOL(add_disk); @@ -617,6 +624,8 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + disk_del_events(disk); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); @@ -1089,6 +1098,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); @@ -1350,3 +1360,422 @@ int invalidate_partition(struct gendisk *disk, int partno) } EXPORT_SYMBOL(invalidate_partition); + +/* + * Disk events - monitor disk events like media change and eject request. + */ +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs = 0; + +static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll iff there are events which + * can't be monitored asynchronously. + */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->events & ~disk->async_events) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +static void __disk_block_events(struct gendisk *disk, bool sync) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) { + if (sync) + cancel_delayed_work_sync(&disk->ev->dwork); + else + cancel_delayed_work(&disk->ev->dwork); + } +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + /* + * Not exactly a latency critical operation, set poll timer + * slack to 25% and kick event check. + */ + intv = disk_events_poll_jiffies(disk); + set_timer_slack(&ev->dwork.timer, intv / 4); + if (check_now) + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_block_events(disk, true); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, true); +} + +/** + * disk_check_events - schedule immediate event checking + * @disk: disk to check events for + * + * Schedule immediate event checking on @disk if not blocked. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_check_events(struct gendisk *disk) +{ + if (disk->ev) { + __disk_block_events(disk, false); + __disk_unblock_events(disk, true); + } +} +EXPORT_SYMBOL_GPL(disk_check_events); + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and clearted + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. + */ +unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + const struct block_device_operations *bdops = disk->fops; + struct disk_events *ev = disk->ev; + unsigned int pending; + + if (!ev) { + /* for drivers still using the old ->media_changed method */ + if ((mask & DISK_EVENT_MEDIA_CHANGE) && + bdops->media_changed && bdops->media_changed(disk)) + return DISK_EVENT_MEDIA_CHANGE; + return 0; + } + + /* tell the workfn about the events being cleared */ + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + spin_unlock_irq(&ev->lock); + + /* uncondtionally schedule event check and wait for it to finish */ + __disk_block_events(disk, true); + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + flush_delayed_work(&ev->dwork); + __disk_unblock_events(disk, false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + + return pending; +} + +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + struct gendisk *disk = ev->disk; + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + unsigned int clearing = ev->clearing; + unsigned int events; + unsigned long intv; + int nr_events = 0, i; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + ev->clearing &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + /* tell userland about new events */ + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + +/* + * A disk events enabled device has the following sysfs nodes under + * its /sys/block/X/ directory. + * + * events : list of all supported events + * events_async : list of events which can be detected w/o polling + * events_poll_msecs : polling interval, 0: disable, -1: system default + */ +static ssize_t __disk_events_show(unsigned int events, char *buf) +{ + const char *delim = ""; + ssize_t pos = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) + if (events & (1 << i)) { + pos += sprintf(buf + pos, "%s%s", + delim, disk_events_strs[i]); + delim = " "; + } + if (pos) + pos += sprintf(buf + pos, "\n"); + return pos; +} + +static ssize_t disk_events_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->events, buf); +} + +static ssize_t disk_events_async_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->async_events, buf); +} + +static ssize_t disk_events_poll_msecs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%ld\n", disk->ev->poll_msecs); +} + +static ssize_t disk_events_poll_msecs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + long intv; + + if (!count || !sscanf(buf, "%ld", &intv)) + return -EINVAL; + + if (intv < 0 && intv != -1) + return -EINVAL; + + __disk_block_events(disk, true); + disk->ev->poll_msecs = intv; + __disk_unblock_events(disk, true); + + return count; +} + +static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL); +static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL); +static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR, + disk_events_poll_msecs_show, + disk_events_poll_msecs_store); + +static const struct attribute *disk_events_attrs[] = { + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + NULL, +}; + +/* + * The default polling interval can be specified by the kernel + * parameter block.events_dfl_poll_msecs which defaults to 0 + * (disable). This can also be modified runtime by writing to + * /sys/module/block/events_dfl_poll_msecs. + */ +static int disk_events_set_dfl_poll_msecs(const char *val, + const struct kernel_param *kp) +{ + struct disk_events *ev; + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return ret; + + mutex_lock(&disk_events_mutex); + + list_for_each_entry(ev, &disk_events, node) + disk_check_events(ev->disk); + + mutex_unlock(&disk_events_mutex); + + return 0; +} + +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { + .set = disk_events_set_dfl_poll_msecs, + .get = param_get_ulong, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "block." + +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, + &disk_events_dfl_poll_msecs, 0644); + +/* + * disk_{add|del|release}_events - initialize and destroy disk_events. + */ +static void disk_add_events(struct gendisk *disk) +{ + struct disk_events *ev; + + if (!disk->fops->check_events || !(disk->events | disk->async_events)) + return; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) { + pr_warn("%s: failed to initialize events\n", disk->disk_name); + return; + } + + if (sysfs_create_files(&disk_to_dev(disk)->kobj, + disk_events_attrs) < 0) { + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + kfree(ev); + return; + } + + disk->ev = ev; + + INIT_LIST_HEAD(&ev->node); + ev->disk = disk; + spin_lock_init(&ev->lock); + ev->block = 1; + ev->poll_msecs = -1; + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + + mutex_lock(&disk_events_mutex); + list_add_tail(&ev->node, &disk_events); + mutex_unlock(&disk_events_mutex); + + /* + * Block count is initialized to 1 and the following initial + * unblock kicks it into action. + */ + __disk_unblock_events(disk, true); +} + +static void disk_del_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + __disk_block_events(disk, true); + + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + + sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); +} + +static void disk_release_events(struct gendisk *disk) +{ + /* the block count should be 1 from disk_del_events() */ + WARN_ON_ONCE(disk->ev && disk->ev->block != 1); + kfree(disk->ev); +} diff --git a/fs/block_dev.c b/fs/block_dev.c index c1c1b8c3fb9..6017389711e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -948,10 +948,11 @@ int check_disk_change(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; const struct block_device_operations *bdops = disk->fops; + unsigned int events; - if (!bdops->media_changed) - return 0; - if (!bdops->media_changed(bdev->bd_disk)) + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; flush_disk(bdev); @@ -1158,9 +1159,10 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) if (whole) { /* finish claiming */ + mutex_lock(&bdev->bd_mutex); spin_lock(&bdev_lock); - if (res == 0) { + if (!res) { BUG_ON(!bd_may_claim(bdev, whole, holder)); /* * Note that for a whole device bd_holders @@ -1180,6 +1182,20 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) wake_up_bit(&whole->bd_claiming, 0); spin_unlock(&bdev_lock); + + /* + * Block event polling for write claims. Any write + * holder makes the write_holder state stick until all + * are released. This is good enough and tracking + * individual writeable reference is too fragile given + * the way @mode is used in blkdev_get/put(). + */ + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { + bdev->bd_write_holder = true; + disk_block_events(bdev->bd_disk); + } + + mutex_unlock(&bdev->bd_mutex); bdput(whole); } @@ -1353,12 +1369,23 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) spin_unlock(&bdev_lock); - /* if this was the last claim, holder link should go too */ - if (bdev_free) + /* + * If this was the last claim, remove holder link and + * unblock evpoll if it was a write holder. + */ + if (bdev_free) { bd_unlink_disk_holder(bdev); + if (bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } else + disk_check_events(bdev->bd_disk); + } mutex_unlock(&bdev->bd_mutex); - } + } else + disk_check_events(bdev->bd_disk); + return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83031bcf836..05667e6989f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1251,6 +1251,9 @@ struct block_device_operations { int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); + unsigned int (*check_events) (struct gendisk *disk, + unsigned int clearing); + /* ->media_changed() is DEPRECATED, use ->check_events() instead */ int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); diff --git a/include/linux/fs.h b/include/linux/fs.h index f4850156391..997d22efdef 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -662,6 +662,7 @@ struct block_device { void * bd_claiming; void * bd_holder; int bd_holders; + bool bd_write_holder; #ifdef CONFIG_SYSFS struct gendisk * bd_holder_disk; /* for sysfs slave linkng */ #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 56e17ed2481..13893aa2ac9 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -127,6 +127,11 @@ struct hd_struct { #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ #define GENHD_FL_NATIVE_CAPACITY 128 +enum { + DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ + DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ +}; + #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) @@ -143,6 +148,8 @@ struct disk_part_tbl { struct hd_struct __rcu *part[]; }; +struct disk_events; + struct gendisk { /* major, first_minor and minors are input parameters only, * don't use directly. Use disk_devt() and disk_max_parts(). @@ -154,6 +161,10 @@ struct gendisk { char disk_name[DISK_NAME_LEN]; /* name of major driver */ char *(*devnode)(struct gendisk *gd, mode_t *mode); + + unsigned int events; /* supported events */ + unsigned int async_events; /* async events, subset of all */ + /* Array of pointers to partitions indexed by partno. * Protected with matching bdev lock but stat and other * non-critical accesses use RCU. Always access through @@ -171,8 +182,8 @@ struct gendisk { struct kobject *slave_dir; struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ + struct disk_events *ev; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif @@ -405,6 +416,11 @@ static inline int get_disk_ro(struct gendisk *disk) return disk->part0.policy; } +extern void disk_block_events(struct gendisk *disk); +extern void disk_unblock_events(struct gendisk *disk); +extern void disk_check_events(struct gendisk *disk); +extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); + /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk); extern void rand_initialize_disk(struct gendisk *disk); -- cgit v1.2.3-70-g09d2 From 09e099d4bafea3b15be003d548bdf94b4b6e0e17 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 5 Jan 2011 16:57:38 +0100 Subject: block: fix accounting bug on cross partition merges /proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's one is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio belongs to sda1 is issued and is merged into the request mentioned on step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed from sda2 region to sda1 region. However the two partition's hd_struct->in_flight are not changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not a sda1's one, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. Also add a refcount to struct hd_struct to keep the partition in memory as long as users exist. We use kref_test_and_get() to ensure we don't add a reference to a partition which is going away. Signed-off-by: Jerome Marchand Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 26 +++++++++++++++++++++----- block/blk-merge.c | 3 ++- block/genhd.c | 1 + fs/partitions/check.c | 10 +++++++++- include/linux/blkdev.h | 1 + include/linux/genhd.h | 2 ++ 6 files changed, 36 insertions(+), 7 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index 3689319a597..500c080a6a6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!kref_test_and_get(&part->ref)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. + */ + part = &rq->rq_disk->part0; + kref_get(&part->ref); + } part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); part_dec_in_flight(part, rw); + kref_put(&part->ref, __delete_partition); part_stat_unlock(); } } diff --git a/block/blk-merge.c b/block/blk-merge.c index 77b7c26df6b..b06b83b89d8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); + kref_put(&part->ref, __delete_partition); part_stat_unlock(); } } diff --git a/block/genhd.c b/block/genhd.c index 16ccc0d2d5d..85c15059883 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) return NULL; } disk->part_tbl->part[0] = &disk->part0; + kref_init(&disk->part0.ref); disk->minors = minors; rand_initialize_disk(disk); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index bdf8d3cc95a..48209f58522 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -381,6 +381,13 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } +void __delete_partition(struct kref *ref) +{ + struct hd_struct *part = container_of(ref, struct hd_struct, ref); + + call_rcu(&part->rcu_head, delete_partition_rcu_cb); +} + void delete_partition(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = disk->part_tbl; @@ -399,7 +406,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); + kref_put(&part->ref, __delete_partition); } static ssize_t whole_disk_show(struct device *dev, @@ -498,6 +505,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); + kref_init(&p->ref); return p; out_free_info: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd10c4..482a7fd4883 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -115,6 +115,7 @@ struct request { void *elevator_private3; struct gendisk *rq_disk; + struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP unsigned long long start_time_ns; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7a7b9c1644e..2ba2792a3dd 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -116,6 +116,7 @@ struct hd_struct { struct disk_stats dkstats; #endif struct rcu_head rcu_head; + struct kref ref; }; #define GENHD_FL_REMOVABLE 1 @@ -583,6 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk, sector_t len, int flags, struct partition_meta_info *info); +extern void __delete_partition(struct kref *ref); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); -- cgit v1.2.3-70-g09d2