diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 22 | ||||
-rw-r--r-- | block/Makefile | 2 | ||||
-rw-r--r-- | block/as-iosched.c | 144 | ||||
-rw-r--r-- | block/blktrace.c | 538 | ||||
-rw-r--r-- | block/cfq-iosched.c | 625 | ||||
-rw-r--r-- | block/deadline-iosched.c | 116 | ||||
-rw-r--r-- | block/elevator.c | 180 | ||||
-rw-r--r-- | block/genhd.c | 130 | ||||
-rw-r--r-- | block/ioctl.c | 28 | ||||
-rw-r--r-- | block/ll_rw_blk.c | 188 |
10 files changed, 1267 insertions, 706 deletions
diff --git a/block/Kconfig b/block/Kconfig index 377f6dd20e1..b6f5f0a7965 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -11,4 +11,26 @@ config LBD your machine, or if you want to have a raid or loopback device bigger than 2TB. Otherwise say N. +config BLK_DEV_IO_TRACE + bool "Support for tracing block io actions" + depends on SYSFS + select RELAY + select DEBUG_FS + help + Say Y here, if you want to be able to trace the block layer actions + on a given queue. Tracing allows you to see any traffic happening + on a block device queue. For more information (and the user space + support tools needed), fetch the blktrace app from: + + git://brick.kernel.dk/data/git/blktrace.git + +config LSF + bool "Support for Large Single Files" + depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML + help + Say Y here if you want to be able to handle very large files (bigger + than 2TB), otherwise say N. + + If unsure, say Y. + source block/Kconfig.iosched diff --git a/block/Makefile b/block/Makefile index 7e4f93e2b44..c05de0e0037 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o + +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o diff --git a/block/as-iosched.c b/block/as-iosched.c index 8da3cf66894..296708ceceb 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -182,6 +182,9 @@ struct as_rq { static kmem_cache_t *arq_pool; +static atomic_t ioc_count = ATOMIC_INIT(0); +static struct completion *ioc_gone; + static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq); static void as_antic_stop(struct as_data *ad); @@ -193,6 +196,15 @@ static void as_antic_stop(struct as_data *ad); static void free_as_io_context(struct as_io_context *aic) { kfree(aic); + if (atomic_dec_and_test(&ioc_count) && ioc_gone) + complete(ioc_gone); +} + +static void 
as_trim(struct io_context *ioc) +{ + if (ioc->aic) + free_as_io_context(ioc->aic); + ioc->aic = NULL; } /* Called when the task exits */ @@ -220,6 +232,7 @@ static struct as_io_context *alloc_as_io_context(void) ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; + atomic_inc(&ioc_count); } return ret; @@ -1696,11 +1709,6 @@ static int as_init_queue(request_queue_t *q, elevator_t *e) /* * sysfs parts below */ -struct as_fs_entry { - struct attribute attr; - ssize_t (*show)(struct as_data *, char *); - ssize_t (*store)(struct as_data *, const char *, size_t); -}; static ssize_t as_var_show(unsigned int var, char *page) @@ -1717,8 +1725,9 @@ as_var_store(unsigned long *var, const char *page, size_t count) return count; } -static ssize_t as_est_show(struct as_data *ad, char *page) +static ssize_t est_time_show(elevator_t *e, char *page) { + struct as_data *ad = e->elevator_data; int pos = 0; pos += sprintf(page+pos, "%lu %% exit probability\n", @@ -1734,21 +1743,23 @@ static ssize_t as_est_show(struct as_data *ad, char *page) } #define SHOW_FUNCTION(__FUNC, __VAR) \ -static ssize_t __FUNC(struct as_data *ad, char *page) \ +static ssize_t __FUNC(elevator_t *e, char *page) \ { \ + struct as_data *ad = e->elevator_data; \ return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ } -SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]); -SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]); -SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire); -SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]); -SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]); +SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]); +SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]); +SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire); +SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]); +SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]); #undef SHOW_FUNCTION #define 
STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \ +static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \ { \ - int ret = as_var_store(__PTR, (page), count); \ + struct as_data *ad = e->elevator_data; \ + int ret = as_var_store(__PTR, (page), count); \ if (*(__PTR) < (MIN)) \ *(__PTR) = (MIN); \ else if (*(__PTR) > (MAX)) \ @@ -1756,90 +1767,26 @@ static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \ *(__PTR) = msecs_to_jiffies(*(__PTR)); \ return ret; \ } -STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX); -STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX); -STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX); -STORE_FUNCTION(as_read_batchexpire_store, +STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX); +STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX); +STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX); +STORE_FUNCTION(as_read_batch_expire_store, &ad->batch_expire[REQ_SYNC], 0, INT_MAX); -STORE_FUNCTION(as_write_batchexpire_store, +STORE_FUNCTION(as_write_batch_expire_store, &ad->batch_expire[REQ_ASYNC], 0, INT_MAX); #undef STORE_FUNCTION -static struct as_fs_entry as_est_entry = { - .attr = {.name = "est_time", .mode = S_IRUGO }, - .show = as_est_show, -}; -static struct as_fs_entry as_readexpire_entry = { - .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR }, - .show = as_readexpire_show, - .store = as_readexpire_store, -}; -static struct as_fs_entry as_writeexpire_entry = { - .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR }, - .show = as_writeexpire_show, - .store = as_writeexpire_store, -}; -static struct as_fs_entry as_anticexpire_entry = { - .attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR }, - .show = as_anticexpire_show, - .store = as_anticexpire_store, -}; -static struct 
as_fs_entry as_read_batchexpire_entry = { - .attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR }, - .show = as_read_batchexpire_show, - .store = as_read_batchexpire_store, -}; -static struct as_fs_entry as_write_batchexpire_entry = { - .attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR }, - .show = as_write_batchexpire_show, - .store = as_write_batchexpire_store, -}; - -static struct attribute *default_attrs[] = { - &as_est_entry.attr, - &as_readexpire_entry.attr, - &as_writeexpire_entry.attr, - &as_anticexpire_entry.attr, - &as_read_batchexpire_entry.attr, - &as_write_batchexpire_entry.attr, - NULL, -}; - -#define to_as(atr) container_of((atr), struct as_fs_entry, attr) - -static ssize_t -as_attr_show(struct kobject *kobj, struct attribute *attr, char *page) -{ - elevator_t *e = container_of(kobj, elevator_t, kobj); - struct as_fs_entry *entry = to_as(attr); - - if (!entry->show) - return -EIO; - - return entry->show(e->elevator_data, page); -} - -static ssize_t -as_attr_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - elevator_t *e = container_of(kobj, elevator_t, kobj); - struct as_fs_entry *entry = to_as(attr); - - if (!entry->store) - return -EIO; - - return entry->store(e->elevator_data, page, length); -} - -static struct sysfs_ops as_sysfs_ops = { - .show = as_attr_show, - .store = as_attr_store, -}; - -static struct kobj_type as_ktype = { - .sysfs_ops = &as_sysfs_ops, - .default_attrs = default_attrs, +#define AS_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store) + +static struct elv_fs_entry as_attrs[] = { + __ATTR_RO(est_time), + AS_ATTR(read_expire), + AS_ATTR(write_expire), + AS_ATTR(antic_expire), + AS_ATTR(read_batch_expire), + AS_ATTR(write_batch_expire), + __ATTR_NULL }; static struct elevator_type iosched_as = { @@ -1860,9 +1807,10 @@ static struct elevator_type iosched_as = { .elevator_may_queue_fn = as_may_queue, .elevator_init_fn = as_init_queue, 
.elevator_exit_fn = as_exit_queue, + .trim = as_trim, }, - .elevator_ktype = &as_ktype, + .elevator_attrs = as_attrs, .elevator_name = "anticipatory", .elevator_owner = THIS_MODULE, }; @@ -1893,7 +1841,13 @@ static int __init as_init(void) static void __exit as_exit(void) { + DECLARE_COMPLETION(all_gone); elv_unregister(&iosched_as); + ioc_gone = &all_gone; + barrier(); + if (atomic_read(&ioc_count)) + complete(ioc_gone); + synchronize_rcu(); kmem_cache_destroy(arq_pool); } diff --git a/block/blktrace.c b/block/blktrace.c new file mode 100644 index 00000000000..36f3a172275 --- /dev/null +++ b/block/blktrace.c @@ -0,0 +1,538 @@ +/* + * Copyright (C) 2006 Jens Axboe <axboe@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/blktrace_api.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/debugfs.h> +#include <asm/uaccess.h> + +static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, }; +static unsigned int blktrace_seq __read_mostly = 1; + +/* + * Send out a notify for this process, if we haven't done so since a trace + * started + */ +static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +{ + struct blk_io_trace *t; + + t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm)); + if (t) { + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->device = bt->dev; + t->action = BLK_TC_ACT(BLK_TC_NOTIFY); + t->pid = tsk->pid; + t->cpu = smp_processor_id(); + t->pdu_len = sizeof(tsk->comm); + memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len); + tsk->btrace_seq = blktrace_seq; + } +} + +static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, + pid_t pid) +{ + if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) + return 1; + if (sector < bt->start_lba || sector > bt->end_lba) + return 1; + if (bt->pid && pid != bt->pid) + return 1; + + return 0; +} + +/* + * Data direction bit lookup + */ +static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; + +/* + * Bio action bits of interest + */ +static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) }; + +/* + * More could be added as needed, taking care to increment the decrementer + * to get correct indexing + */ +#define trace_barrier_bit(rw) \ + (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0)) +#define trace_sync_bit(rw) \ + (((rw) & (1 
<< BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1)) + +/* + * The worker for the various blk_add_trace*() types. Fills out a + * blk_io_trace structure and places it in a per-cpu subbuffer. + */ +void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + int rw, u32 what, int error, int pdu_len, void *pdu_data) +{ + struct task_struct *tsk = current; + struct blk_io_trace *t; + unsigned long flags; + unsigned long *sequence; + pid_t pid; + int cpu; + + if (unlikely(bt->trace_state != Blktrace_running)) + return; + + what |= ddir_act[rw & WRITE]; + what |= bio_act[trace_barrier_bit(rw)]; + what |= bio_act[trace_sync_bit(rw)]; + + pid = tsk->pid; + if (unlikely(act_log_check(bt, what, sector, pid))) + return; + + /* + * A word about the locking here - we disable interrupts to reserve + * some space in the relay per-cpu buffer, to prevent an irq + * from coming in and stepping on our toes. Once reserved, it's + * enough to get preemption disabled to prevent read of this data + * before we are through filling it. 
get_cpu()/put_cpu() does this + * for us + */ + local_irq_save(flags); + + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(bt, tsk); + + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + if (t) { + cpu = smp_processor_id(); + sequence = per_cpu_ptr(bt->sequence, cpu); + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = ++(*sequence); + t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu); + t->sector = sector; + t->bytes = bytes; + t->action = what; + t->pid = pid; + t->device = bt->dev; + t->cpu = cpu; + t->error = error; + t->pdu_len = pdu_len; + + if (pdu_len) + memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + } + + local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(__blk_add_trace); + +static struct dentry *blk_tree_root; +static struct mutex blk_tree_mutex; +static unsigned int root_users; + +static inline void blk_remove_root(void) +{ + if (blk_tree_root) { + debugfs_remove(blk_tree_root); + blk_tree_root = NULL; + } +} + +static void blk_remove_tree(struct dentry *dir) +{ + mutex_lock(&blk_tree_mutex); + debugfs_remove(dir); + if (--root_users == 0) + blk_remove_root(); + mutex_unlock(&blk_tree_mutex); +} + +static struct dentry *blk_create_tree(const char *blk_name) +{ + struct dentry *dir = NULL; + + mutex_lock(&blk_tree_mutex); + + if (!blk_tree_root) { + blk_tree_root = debugfs_create_dir("block", NULL); + if (!blk_tree_root) + goto err; + } + + dir = debugfs_create_dir(blk_name, blk_tree_root); + if (dir) + root_users++; + else + blk_remove_root(); + +err: + mutex_unlock(&blk_tree_mutex); + return dir; +} + +static void blk_trace_cleanup(struct blk_trace *bt) +{ + relay_close(bt->rchan); + debugfs_remove(bt->dropped_file); + blk_remove_tree(bt->dir); + free_percpu(bt->sequence); + kfree(bt); +} + +static int blk_trace_remove(request_queue_t *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (!bt) + return -EINVAL; + + if (bt->trace_state == Blktrace_setup || + bt->trace_state == 
Blktrace_stopped) + blk_trace_cleanup(bt); + + return 0; +} + +static int blk_dropped_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->u.generic_ip; + + return 0; +} + +static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct blk_trace *bt = filp->private_data; + char buf[16]; + + snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +static struct file_operations blk_dropped_fops = { + .owner = THIS_MODULE, + .open = blk_dropped_open, + .read = blk_dropped_read, +}; + +/* + * Keep track of how many times we encountered a full subbuffer, to aid + * the user space app in telling how many lost events there were. + */ +static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + struct blk_trace *bt; + + if (!relay_buf_full(buf)) + return 1; + + bt = buf->chan->private_data; + atomic_inc(&bt->dropped); + return 0; +} + +static int blk_remove_buf_file_callback(struct dentry *dentry) +{ + debugfs_remove(dentry); + return 0; +} + +static struct dentry *blk_create_buf_file_callback(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static struct rchan_callbacks blk_relay_callbacks = { + .subbuf_start = blk_subbuf_start_callback, + .create_buf_file = blk_create_buf_file_callback, + .remove_buf_file = blk_remove_buf_file_callback, +}; + +/* + * Setup everything required to start tracing + */ +static int blk_trace_setup(request_queue_t *q, struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + struct blk_trace *old_bt, *bt = NULL; + struct dentry *dir = NULL; + char b[BDEVNAME_SIZE]; + int ret, i; + + if (copy_from_user(&buts, arg, sizeof(buts))) + return -EFAULT; + + 
if (!buts.buf_size || !buts.buf_nr) + return -EINVAL; + + strcpy(buts.name, bdevname(bdev, b)); + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + for (i = 0; i < strlen(buts.name); i++) + if (buts.name[i] == '/') + buts.name[i] = '_'; + + if (copy_to_user(arg, &buts, sizeof(buts))) + return -EFAULT; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->sequence = alloc_percpu(unsigned long); + if (!bt->sequence) + goto err; + + ret = -ENOENT; + dir = blk_create_tree(buts.name); + if (!dir) + goto err; + + bt->dir = dir; + bt->dev = bdev->bd_dev; + atomic_set(&bt->dropped, 0); + + ret = -EIO; + bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); + if (!bt->dropped_file) + goto err; + + bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks); + if (!bt->rchan) + goto err; + bt->rchan->private_data = bt; + + bt->act_mask = buts.act_mask; + if (!bt->act_mask) + bt->act_mask = (u16) -1; + + bt->start_lba = buts.start_lba; + bt->end_lba = buts.end_lba; + if (!bt->end_lba) + bt->end_lba = -1ULL; + + bt->pid = buts.pid; + bt->trace_state = Blktrace_setup; + + ret = -EBUSY; + old_bt = xchg(&q->blk_trace, bt); + if (old_bt) { + (void) xchg(&q->blk_trace, old_bt); + goto err; + } + + return 0; +err: + if (dir) + blk_remove_tree(dir); + if (bt) { + if (bt->dropped_file) + debugfs_remove(bt->dropped_file); + if (bt->sequence) + free_percpu(bt->sequence); + if (bt->rchan) + relay_close(bt->rchan); + kfree(bt); + } + return ret; +} + +static int blk_trace_startstop(request_queue_t *q, int start) +{ + struct blk_trace *bt; + int ret; + + if ((bt = q->blk_trace) == NULL) + return -EINVAL; + + /* + * For starting a trace, we can transition from a setup or stopped + * trace. 
For stopping a trace, the state must be running + */ + ret = -EINVAL; + if (start) { + if (bt->trace_state == Blktrace_setup || + bt->trace_state == Blktrace_stopped) { + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + ret = 0; + } + } else { + if (bt->trace_state == Blktrace_running) { + bt->trace_state = Blktrace_stopped; + relay_flush(bt->rchan); + ret = 0; + } + } + + return ret; +} + +/** + * blk_trace_ioctl: - handle the ioctls associated with tracing + * @bdev: the block device + * @cmd: the ioctl cmd + * @arg: the argument data, if any + * + **/ +int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) +{ + request_queue_t *q; + int ret, start = 0; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + mutex_lock(&bdev->bd_mutex); + + switch (cmd) { + case BLKTRACESETUP: + ret = blk_trace_setup(q, bdev, arg); + break; + case BLKTRACESTART: + start = 1; + case BLKTRACESTOP: + ret = blk_trace_startstop(q, start); + break; + case BLKTRACETEARDOWN: + ret = blk_trace_remove(q); + break; + default: + ret = -ENOTTY; + break; + } + + mutex_unlock(&bdev->bd_mutex); + return ret; +} + +/** + * blk_trace_shutdown: - stop and cleanup trace structures + * @q: the request queue associated with the device + * + **/ +void blk_trace_shutdown(request_queue_t *q) +{ + blk_trace_startstop(q, 0); + blk_trace_remove(q); +} + +/* + * Average offset over two calls to sched_clock() with a gettimeofday() + * in the middle + */ +static void blk_check_time(unsigned long long *t) +{ + unsigned long long a, b; + struct timeval tv; + + a = sched_clock(); + do_gettimeofday(&tv); + b = sched_clock(); + + *t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000; + *t -= (a + b) / 2; +} + +static void blk_trace_check_cpu_time(void *data) +{ + unsigned long long *t; + int cpu = get_cpu(); + + t = &per_cpu(blk_trace_cpu_offset, cpu); + + /* + * Just call it twice, hopefully the second call will be cache hot + * and a little more precise + */ + 
blk_check_time(t); + blk_check_time(t); + + put_cpu(); +} + +/* + * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU + * timings + */ +static void blk_trace_calibrate_offsets(void) +{ + unsigned long flags; + + smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1); + local_irq_save(flags); + blk_trace_check_cpu_time(NULL); + local_irq_restore(flags); +} + +static void blk_trace_set_ht_offsets(void) +{ +#if defined(CONFIG_SCHED_SMT) + int cpu, i; + + /* + * now make sure HT siblings have the same time offset + */ + preempt_disable(); + for_each_online_cpu(cpu) { + unsigned long long *cpu_off, *sibling_off; + + for_each_cpu_mask(i, cpu_sibling_map[cpu]) { + if (i == cpu) + continue; + + cpu_off = &per_cpu(blk_trace_cpu_offset, cpu); + sibling_off = &per_cpu(blk_trace_cpu_offset, i); + *sibling_off = *cpu_off; + } + } + preempt_enable(); +#endif +} + +static __init int blk_trace_init(void) +{ + mutex_init(&blk_tree_mutex); + blk_trace_calibrate_offsets(); + blk_trace_set_ht_offsets(); + + return 0; +} + +module_init(blk_trace_init); + diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c8dbe38c81c..67d446de022 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -6,21 +6,13 @@ * * Copyright (C) 2003 Jens Axboe <axboe@suse.de> */ -#include <linux/kernel.h> -#include <linux/fs.h> -#include <linux/blkdev.h> -#include <linux/elevator.h> -#include <linux/bio.h> #include <linux/config.h> #include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/compiler.h> +#include <linux/blkdev.h> +#include <linux/elevator.h> #include <linux/hash.h> #include <linux/rbtree.h> -#include <linux/mempool.h> #include <linux/ioprio.h> -#include <linux/writeback.h> /* * tunables @@ -34,18 +26,14 @@ static const int cfq_back_penalty = 2; /* penalty of a backwards seek */ static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; -static int cfq_slice_idle = HZ 
/ 100; +static int cfq_slice_idle = HZ / 70; #define CFQ_IDLE_GRACE (HZ / 10) #define CFQ_SLICE_SCALE (5) #define CFQ_KEY_ASYNC (0) -#define CFQ_KEY_ANY (0xffff) -/* - * disable queueing at the driver/hardware level - */ -static const int cfq_max_depth = 2; +static DEFINE_RWLOCK(cfq_exit_lock); /* * for the hash of cfqq inside the cfqd @@ -89,6 +77,9 @@ static kmem_cache_t *crq_pool; static kmem_cache_t *cfq_pool; static kmem_cache_t *cfq_ioc_pool; +static atomic_t ioc_count = ATOMIC_INIT(0); +static struct completion *ioc_gone; + #define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) @@ -105,11 +96,12 @@ static kmem_cache_t *cfq_ioc_pool; #define cfq_cfqq_sync(cfqq) \ (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC]) +#define sample_valid(samples) ((samples) > 80) + /* * Per block device queue structure */ struct cfq_data { - atomic_t ref; request_queue_t *queue; /* @@ -174,7 +166,8 @@ struct cfq_data { unsigned int cfq_slice[2]; unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; - unsigned int cfq_max_depth; + + struct list_head cic_list; }; /* @@ -288,7 +281,7 @@ CFQ_CRQ_FNS(is_sync); static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *); -static void cfq_put_cfqd(struct cfq_data *cfqd); +static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); #define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE) @@ -345,17 +338,27 @@ static int cfq_queue_empty(request_queue_t *q) return !cfqd->busy_queues; } +static inline pid_t cfq_queue_pid(struct task_struct *task, int rw) +{ + if (rw == READ || process_sync(task)) + return task->pid; + + return CFQ_KEY_ASYNC; +} + /* * Lifted from AS - choose which of crq1 and crq2 that is best served now. 
* We choose the request that is closest to the head right now. Distance - * behind the head are penalized and only allowed to a certain extent. + * behind the head is penalized and only allowed to a certain extent. */ static struct cfq_rq * cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) { sector_t last, s1, s2, d1 = 0, d2 = 0; - int r1_wrap = 0, r2_wrap = 0; /* requests are behind the disk head */ unsigned long back_max; +#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ +#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ + unsigned wrap = 0; /* bit mask: requests behind the disk head? */ if (crq1 == NULL || crq1 == crq2) return crq2; @@ -387,35 +390,47 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) else if (s1 + back_max >= last) d1 = (last - s1) * cfqd->cfq_back_penalty; else - r1_wrap = 1; + wrap |= CFQ_RQ1_WRAP; if (s2 >= last) d2 = s2 - last; else if (s2 + back_max >= last) d2 = (last - s2) * cfqd->cfq_back_penalty; else - r2_wrap = 1; + wrap |= CFQ_RQ2_WRAP; /* Found required data */ - if (!r1_wrap && r2_wrap) - return crq1; - else if (!r2_wrap && r1_wrap) - return crq2; - else if (r1_wrap && r2_wrap) { - /* both behind the head */ - if (s1 <= s2) + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! 
+ */ + switch (wrap) { + case 0: /* common case for CFQ: crq1 and crq2 not wrapped */ + if (d1 < d2) return crq1; - else + else if (d2 < d1) return crq2; - } + else { + if (s1 >= s2) + return crq1; + else + return crq2; + } - /* Both requests in front of the head */ - if (d1 < d2) + case CFQ_RQ2_WRAP: return crq1; - else if (d2 < d1) + case CFQ_RQ1_WRAP: return crq2; - else { - if (s1 >= s2) + case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) return crq1; else return crq2; @@ -614,15 +629,20 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) cfq_add_crq_rb(crq); } -static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) - +static struct request * +cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) { - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY); + struct task_struct *tsk = current; + pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio)); + struct cfq_queue *cfqq; struct rb_node *n; + sector_t sector; + cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); if (!cfqq) goto out; + sector = bio->bi_sector + bio_sectors(bio); n = cfqq->sort_list.rb_node; while (n) { struct cfq_rq *crq = rb_entry_crq(n); @@ -676,7 +696,7 @@ cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) goto out; } - __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio)); + __rq = cfq_find_rq_fmerge(cfqd, bio); if (__rq && elv_rq_merge_ok(__rq, bio)) { ret = ELEVATOR_FRONT_MERGE; goto out; @@ -879,6 +899,7 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_io_context *cic; unsigned long sl; WARN_ON(!RB_EMPTY(&cfqq->sort_list)); @@ -894,13 +915,23 @@ static int 
cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * task has exited, don't wait */ - if (cfqd->active_cic && !cfqd->active_cic->ioc->task) + cic = cfqd->active_cic; + if (!cic || !cic->ioc->task) return 0; cfq_mark_cfqq_must_dispatch(cfqq); cfq_mark_cfqq_wait_request(cfqq); sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle); + + /* + * we don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. so allow a little bit of time for him to submit a new rq + */ + if (sample_valid(cic->seek_samples) && cic->seek_mean > 131072) + sl = 2; + mod_timer(&cfqd->idle_slice_timer, jiffies + sl); return 1; } @@ -1117,13 +1148,6 @@ cfq_dispatch_requests(request_queue_t *q, int force) if (cfqq) { int max_dispatch; - /* - * if idle window is disabled, allow queue buildup - */ - if (!cfq_cfqq_idle_window(cfqq) && - cfqd->rq_in_driver >= cfqd->cfq_max_depth) - return 0; - cfq_clear_cfqq_must_dispatch(cfqq); cfq_clear_cfqq_wait_request(cfqq); del_timer(&cfqd->idle_slice_timer); @@ -1160,8 +1184,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq) if (unlikely(cfqd->active_queue == cfqq)) __cfq_slice_expired(cfqd, cfqq, 0); - cfq_put_cfqd(cfqq->cfqd); - /* * it's on the empty list and still hashed */ @@ -1175,13 +1197,13 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, const int hashval) { struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; - struct hlist_node *entry, *next; + struct hlist_node *entry; + struct cfq_queue *__cfqq; - hlist_for_each_safe(entry, next, hash_list) { - struct cfq_queue *__cfqq = list_entry_qhash(entry); - const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio); + hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) { + const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio); - if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY)) + if 
(__cfqq->key == key && (__p == prio || !prio)) return __cfqq; } @@ -1194,17 +1216,27 @@ cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio) return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT)); } -static void cfq_free_io_context(struct cfq_io_context *cic) +static void cfq_free_io_context(struct io_context *ioc) { struct cfq_io_context *__cic; - struct list_head *entry, *next; + struct rb_node *n; + int freed = 0; - list_for_each_safe(entry, next, &cic->list) { - __cic = list_entry(entry, struct cfq_io_context, list); + while ((n = rb_first(&ioc->cic_root)) != NULL) { + __cic = rb_entry(n, struct cfq_io_context, rb_node); + rb_erase(&__cic->rb_node, &ioc->cic_root); kmem_cache_free(cfq_ioc_pool, __cic); + freed++; } - kmem_cache_free(cfq_ioc_pool, cic); + if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone) + complete(ioc_gone); +} + +static void cfq_trim(struct io_context *ioc) +{ + ioc->set_ioprio = NULL; + cfq_free_io_context(ioc); } /* @@ -1212,43 +1244,57 @@ static void cfq_free_io_context(struct cfq_io_context *cic) */ static void cfq_exit_single_io_context(struct cfq_io_context *cic) { - struct cfq_data *cfqd = cic->cfqq->cfqd; - request_queue_t *q = cfqd->queue; + struct cfq_data *cfqd = cic->key; + request_queue_t *q; + + if (!cfqd) + return; + + q = cfqd->queue; WARN_ON(!irqs_disabled()); spin_lock(q->queue_lock); - if (unlikely(cic->cfqq == cfqd->active_queue)) - __cfq_slice_expired(cfqd, cic->cfqq, 0); + if (cic->cfqq[ASYNC]) { + if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue)) + __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0); + cfq_put_queue(cic->cfqq[ASYNC]); + cic->cfqq[ASYNC] = NULL; + } - cfq_put_queue(cic->cfqq); - cic->cfqq = NULL; + if (cic->cfqq[SYNC]) { + if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue)) + __cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0); + cfq_put_queue(cic->cfqq[SYNC]); + cic->cfqq[SYNC] = NULL; + } + + cic->key = NULL; + list_del_init(&cic->queue_list); 
spin_unlock(q->queue_lock); } -/* - * Another task may update the task cic list, if it is doing a queue lookup - * on its behalf. cfq_cic_lock excludes such concurrent updates - */ -static void cfq_exit_io_context(struct cfq_io_context *cic) +static void cfq_exit_io_context(struct io_context *ioc) { struct cfq_io_context *__cic; - struct list_head *entry; unsigned long flags; - - local_irq_save(flags); + struct rb_node *n; /* * put the reference this task is holding to the various queues */ - list_for_each(entry, &cic->list) { - __cic = list_entry(entry, struct cfq_io_context, list); + read_lock_irqsave(&cfq_exit_lock, flags); + + n = rb_first(&ioc->cic_root); + while (n != NULL) { + __cic = rb_entry(n, struct cfq_io_context, rb_node); + cfq_exit_single_io_context(__cic); + n = rb_next(n); } - cfq_exit_single_io_context(cic); - local_irq_restore(flags); + read_unlock_irqrestore(&cfq_exit_lock, flags); } static struct cfq_io_context * @@ -1257,15 +1303,18 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask); if (cic) { - INIT_LIST_HEAD(&cic->list); - cic->cfqq = NULL; + RB_CLEAR(&cic->rb_node); cic->key = NULL; + cic->cfqq[ASYNC] = NULL; + cic->cfqq[SYNC] = NULL; cic->last_end_request = jiffies; cic->ttime_total = 0; cic->ttime_samples = 0; cic->ttime_mean = 0; cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; + INIT_LIST_HEAD(&cic->queue_list); + atomic_inc(&ioc_count); } return cic; @@ -1318,14 +1367,27 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq) cfq_clear_cfqq_prio_changed(cfqq); } -static inline void changed_ioprio(struct cfq_queue *cfqq) +static inline void changed_ioprio(struct cfq_io_context *cic) { - if (cfqq) { - struct cfq_data *cfqd = cfqq->cfqd; - + struct cfq_data *cfqd = cic->key; + struct cfq_queue *cfqq; + if (cfqd) { spin_lock(cfqd->queue->queue_lock); - cfq_mark_cfqq_prio_changed(cfqq); - cfq_init_prio_data(cfqq); + cfqq = cic->cfqq[ASYNC]; 
+ if (cfqq) { + struct cfq_queue *new_cfqq; + new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, + cic->ioc->task, GFP_ATOMIC); + if (new_cfqq) { + cic->cfqq[ASYNC] = new_cfqq; + cfq_put_queue(cfqq); + } + } + cfqq = cic->cfqq[SYNC]; + if (cfqq) { + cfq_mark_cfqq_prio_changed(cfqq); + cfq_init_prio_data(cfqq); + } spin_unlock(cfqd->queue->queue_lock); } } @@ -1335,24 +1397,34 @@ static inline void changed_ioprio(struct cfq_queue *cfqq) */ static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) { - struct cfq_io_context *cic = ioc->cic; + struct cfq_io_context *cic; + struct rb_node *n; - changed_ioprio(cic->cfqq); + write_lock(&cfq_exit_lock); - list_for_each_entry(cic, &cic->list, list) - changed_ioprio(cic->cfqq); + n = rb_first(&ioc->cic_root); + while (n != NULL) { + cic = rb_entry(n, struct cfq_io_context, rb_node); + + changed_ioprio(cic); + n = rb_next(n); + } + + write_unlock(&cfq_exit_lock); return 0; } static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio, +cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask) { const int hashval = hash_long(key, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; + unsigned short ioprio; retry: + ioprio = tsk->ioprio; cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval); if (!cfqq) { @@ -1381,7 +1453,6 @@ retry: hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); atomic_set(&cfqq->ref, 0); cfqq->cfqd = cfqd; - atomic_inc(&cfqd->ref); cfqq->service_last = 0; /* * set ->slice_left to allow preemption for a new process @@ -1401,14 +1472,67 @@ out: return cfqq; } +static struct cfq_io_context * +cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) +{ + struct rb_node *n = ioc->cic_root.rb_node; + struct cfq_io_context *cic; + void *key = cfqd; + + while (n) { + cic = rb_entry(n, struct cfq_io_context, rb_node); + + if (key < cic->key) + n = n->rb_left; + else if (key > cic->key) + n = 
n->rb_right; + else + return cic; + } + + return NULL; +} + +static inline void +cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, + struct cfq_io_context *cic) +{ + struct rb_node **p = &ioc->cic_root.rb_node; + struct rb_node *parent = NULL; + struct cfq_io_context *__cic; + + read_lock(&cfq_exit_lock); + + cic->ioc = ioc; + cic->key = cfqd; + + ioc->set_ioprio = cfq_ioc_set_ioprio; + + while (*p) { + parent = *p; + __cic = rb_entry(parent, struct cfq_io_context, rb_node); + + if (cic->key < __cic->key) + p = &(*p)->rb_left; + else if (cic->key > __cic->key) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cic->rb_node, parent, p); + rb_insert_color(&cic->rb_node, &ioc->cic_root); + list_add(&cic->queue_list, &cfqd->cic_list); + read_unlock(&cfq_exit_lock); +} + /* * Setup general io context and cfq io context. There can be several cfq * io contexts per general io context, if this process is doing io to more - * than one device managed by cfq. Note that caller is holding a reference to - * cfqq, so we don't need to worry about it disappearing + * than one device managed by cfq. 
*/ static struct cfq_io_context * -cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask) +cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { struct io_context *ioc = NULL; struct cfq_io_context *cic; @@ -1419,61 +1543,15 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask) if (!ioc) return NULL; - if ((cic = ioc->cic) == NULL) { - cic = cfq_alloc_io_context(cfqd, gfp_mask); - - if (cic == NULL) - goto err; - - /* - * manually increment generic io_context usage count, it - * cannot go away since we are already holding one ref to it - */ - ioc->cic = cic; - ioc->set_ioprio = cfq_ioc_set_ioprio; - cic->ioc = ioc; - cic->key = cfqd; - atomic_inc(&cfqd->ref); - } else { - struct cfq_io_context *__cic; - - /* - * the first cic on the list is actually the head itself - */ - if (cic->key == cfqd) - goto out; - - /* - * cic exists, check if we already are there. linear search - * should be ok here, the list will usually not be more than - * 1 or a few entries long - */ - list_for_each_entry(__cic, &cic->list, list) { - /* - * this process is already holding a reference to - * this queue, so no need to get one more - */ - if (__cic->key == cfqd) { - cic = __cic; - goto out; - } - } + cic = cfq_cic_rb_lookup(cfqd, ioc); + if (cic) + goto out; - /* - * nope, process doesn't have a cic assoicated with this - * cfqq yet. 
get a new one and add to list - */ - __cic = cfq_alloc_io_context(cfqd, gfp_mask); - if (__cic == NULL) - goto err; - - __cic->ioc = ioc; - __cic->key = cfqd; - atomic_inc(&cfqd->ref); - list_add(&__cic->list, &cic->list); - cic = __cic; - } + cic = cfq_alloc_io_context(cfqd, gfp_mask); + if (cic == NULL) + goto err; + cfq_cic_link(cfqd, ioc, cic); out: return cic; err: @@ -1506,7 +1584,33 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; } -#define sample_valid(samples) ((samples) > 80) +static void +cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, + struct cfq_rq *crq) +{ + sector_t sdist; + u64 total; + + if (cic->last_request_pos < crq->request->sector) + sdist = crq->request->sector - cic->last_request_pos; + else + sdist = cic->last_request_pos - crq->request->sector; + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, pagein, etc + */ + if (cic->seek_samples <= 60) /* second&third seek */ + sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); + else + sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); + + cic->seek_samples = (7*cic->seek_samples + 256) / 8; + cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; + total = cic->seek_total + (cic->seek_samples/2); + do_div(total, cic->seek_samples); + cic->seek_mean = (sector_t)total; +} /* * Disable idle window if the process thinks too long or seeks so much that @@ -1619,9 +1723,11 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cic = crq->io_context; cfq_update_io_thinktime(cfqd, cic); + cfq_update_io_seektime(cfqd, cic, crq); cfq_update_idle_window(cfqd, cfqq, cic); cic->last_queue = jiffies; + cic->last_request_pos = crq->request->sector + crq->request->nr_sectors; if (cfqq == cfqd->active_queue) { /* @@ -1754,14 +1860,6 @@ static void cfq_prio_boost(struct cfq_queue *cfqq) cfq_resort_rr_list(cfqq, 0); } -static inline pid_t 
cfq_queue_pid(struct task_struct *task, int rw) -{ - if (rw == READ || process_sync(task)) - return task->pid; - - return CFQ_KEY_ASYNC; -} - static inline int __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct task_struct *task, int rw) @@ -1890,24 +1988,25 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, struct cfq_queue *cfqq; struct cfq_rq *crq; unsigned long flags; + int is_sync = key != CFQ_KEY_ASYNC; might_sleep_if(gfp_mask & __GFP_WAIT); - cic = cfq_get_io_context(cfqd, key, gfp_mask); + cic = cfq_get_io_context(cfqd, gfp_mask); spin_lock_irqsave(q->queue_lock, flags); if (!cic) goto queue_fail; - if (!cic->cfqq) { - cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask); + if (!cic->cfqq[is_sync]) { + cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask); if (!cfqq) goto queue_fail; - cic->cfqq = cfqq; + cic->cfqq[is_sync] = cfqq; } else - cfqq = cic->cfqq; + cfqq = cic->cfqq[is_sync]; cfqq->allocated[rw]++; cfq_clear_cfqq_must_alloc(cfqq); @@ -1924,7 +2023,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, crq->cfq_queue = cfqq; crq->io_context = cic; - if (rw == READ || process_sync(tsk)) + if (is_sync) cfq_mark_crq_is_sync(crq); else cfq_clear_crq_is_sync(crq); @@ -2055,15 +2154,39 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) blk_sync_queue(cfqd->queue); } -static void cfq_put_cfqd(struct cfq_data *cfqd) +static void cfq_exit_queue(elevator_t *e) { + struct cfq_data *cfqd = e->elevator_data; request_queue_t *q = cfqd->queue; - if (!atomic_dec_and_test(&cfqd->ref)) - return; + cfq_shutdown_timer_wq(cfqd); + + write_lock(&cfq_exit_lock); + spin_lock_irq(q->queue_lock); + + if (cfqd->active_queue) + __cfq_slice_expired(cfqd, cfqd->active_queue, 0); + + while (!list_empty(&cfqd->cic_list)) { + struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, + struct cfq_io_context, + queue_list); + if (cic->cfqq[ASYNC]) { + cfq_put_queue(cic->cfqq[ASYNC]); + cic->cfqq[ASYNC] = 
NULL; + } + if (cic->cfqq[SYNC]) { + cfq_put_queue(cic->cfqq[SYNC]); + cic->cfqq[SYNC] = NULL; + } + cic->key = NULL; + list_del_init(&cic->queue_list); + } + + spin_unlock_irq(q->queue_lock); + write_unlock(&cfq_exit_lock); cfq_shutdown_timer_wq(cfqd); - blk_put_queue(q); mempool_destroy(cfqd->crq_pool); kfree(cfqd->crq_hash); @@ -2071,14 +2194,6 @@ static void cfq_put_cfqd(struct cfq_data *cfqd) kfree(cfqd); } -static void cfq_exit_queue(elevator_t *e) -{ - struct cfq_data *cfqd = e->elevator_data; - - cfq_shutdown_timer_wq(cfqd); - cfq_put_cfqd(cfqd); -} - static int cfq_init_queue(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; @@ -2097,6 +2212,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) INIT_LIST_HEAD(&cfqd->cur_rr); INIT_LIST_HEAD(&cfqd->idle_rr); INIT_LIST_HEAD(&cfqd->empty_list); + INIT_LIST_HEAD(&cfqd->cic_list); cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); if (!cfqd->crq_hash) @@ -2106,7 +2222,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) if (!cfqd->cfq_hash) goto out_cfqhash; - cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool); + cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool); if (!cfqd->crq_pool) goto out_crqpool; @@ -2118,7 +2234,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) e->elevator_data = cfqd; cfqd->queue = q; - atomic_inc(&q->refcnt); cfqd->max_queued = q->nr_requests / 4; q->nr_batching = cfq_queued; @@ -2133,8 +2248,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q); - atomic_set(&cfqd->ref, 1); - cfqd->cfq_queued = cfq_queued; cfqd->cfq_quantum = cfq_quantum; cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; @@ -2145,7 +2258,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = 
cfq_slice_idle; - cfqd->cfq_max_depth = cfq_max_depth; return 0; out_crqpool: @@ -2193,11 +2305,6 @@ fail: /* * sysfs parts below --> */ -struct cfq_fs_entry { - struct attribute attr; - ssize_t (*show)(struct cfq_data *, char *); - ssize_t (*store)(struct cfq_data *, const char *, size_t); -}; static ssize_t cfq_var_show(unsigned int var, char *page) @@ -2215,8 +2322,9 @@ cfq_var_store(unsigned int *var, const char *page, size_t count) } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ +static ssize_t __FUNC(elevator_t *e, char *page) \ { \ + struct cfq_data *cfqd = e->elevator_data; \ unsigned int __data = __VAR; \ if (__CONV) \ __data = jiffies_to_msecs(__data); \ @@ -2226,18 +2334,18 @@ SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); -SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0); -SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0); +SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); +SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); -SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ +static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \ { \ + struct cfq_data *cfqd = e->elevator_data; \ unsigned int __data; \ int ret = cfq_var_store(&__data, (page), count); \ if (__data < (MIN)) \ @@ -2254,121 +2362,29 @@ 
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); -STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); +STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); -STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0); #undef STORE_FUNCTION -static struct cfq_fs_entry cfq_quantum_entry = { - .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_quantum_show, - .store = cfq_quantum_store, -}; -static struct cfq_fs_entry cfq_queued_entry = { - .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_queued_show, - .store = cfq_queued_store, -}; -static struct cfq_fs_entry cfq_fifo_expire_sync_entry = { - .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_expire_sync_show, - .store = cfq_fifo_expire_sync_store, -}; -static struct cfq_fs_entry cfq_fifo_expire_async_entry = { - .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_expire_async_show, - .store = cfq_fifo_expire_async_store, -}; -static struct cfq_fs_entry cfq_back_max_entry = { - .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_back_max_show, - .store = cfq_back_max_store, -}; -static struct 
cfq_fs_entry cfq_back_penalty_entry = { - .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_back_penalty_show, - .store = cfq_back_penalty_store, -}; -static struct cfq_fs_entry cfq_slice_sync_entry = { - .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_slice_sync_show, - .store = cfq_slice_sync_store, -}; -static struct cfq_fs_entry cfq_slice_async_entry = { - .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_slice_async_show, - .store = cfq_slice_async_store, -}; -static struct cfq_fs_entry cfq_slice_async_rq_entry = { - .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_slice_async_rq_show, - .store = cfq_slice_async_rq_store, -}; -static struct cfq_fs_entry cfq_slice_idle_entry = { - .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_slice_idle_show, - .store = cfq_slice_idle_store, -}; -static struct cfq_fs_entry cfq_max_depth_entry = { - .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_max_depth_show, - .store = cfq_max_depth_store, -}; - -static struct attribute *default_attrs[] = { - &cfq_quantum_entry.attr, - &cfq_queued_entry.attr, - &cfq_fifo_expire_sync_entry.attr, - &cfq_fifo_expire_async_entry.attr, - &cfq_back_max_entry.attr, - &cfq_back_penalty_entry.attr, - &cfq_slice_sync_entry.attr, - &cfq_slice_async_entry.attr, - &cfq_slice_async_rq_entry.attr, - &cfq_slice_idle_entry.attr, - &cfq_max_depth_entry.attr, - NULL, -}; - -#define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr) - -static ssize_t -cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page) -{ - elevator_t *e = container_of(kobj, elevator_t, kobj); - struct cfq_fs_entry *entry = to_cfq(attr); - - if (!entry->show) - return -EIO; - - return entry->show(e->elevator_data, page); -} - -static ssize_t -cfq_attr_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - elevator_t *e 
= container_of(kobj, elevator_t, kobj); - struct cfq_fs_entry *entry = to_cfq(attr); - - if (!entry->store) - return -EIO; - - return entry->store(e->elevator_data, page, length); -} - -static struct sysfs_ops cfq_sysfs_ops = { - .show = cfq_attr_show, - .store = cfq_attr_store, -}; - -static struct kobj_type cfq_ktype = { - .sysfs_ops = &cfq_sysfs_ops, - .default_attrs = default_attrs, +#define CFQ_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) + +static struct elv_fs_entry cfq_attrs[] = { + CFQ_ATTR(quantum), + CFQ_ATTR(queued), + CFQ_ATTR(fifo_expire_sync), + CFQ_ATTR(fifo_expire_async), + CFQ_ATTR(back_seek_max), + CFQ_ATTR(back_seek_penalty), + CFQ_ATTR(slice_sync), + CFQ_ATTR(slice_async), + CFQ_ATTR(slice_async_rq), + CFQ_ATTR(slice_idle), + __ATTR_NULL }; static struct elevator_type iosched_cfq = { @@ -2389,8 +2405,9 @@ static struct elevator_type iosched_cfq = { .elevator_may_queue_fn = cfq_may_queue, .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, + .trim = cfq_trim, }, - .elevator_ktype = &cfq_ktype, + .elevator_attrs = cfq_attrs, .elevator_name = "cfq", .elevator_owner = THIS_MODULE, }; @@ -2419,7 +2436,13 @@ static int __init cfq_init(void) static void __exit cfq_exit(void) { + DECLARE_COMPLETION(all_gone); elv_unregister(&iosched_cfq); + ioc_gone = &all_gone; + barrier(); + if (atomic_read(&ioc_count)) + wait_for_completion(ioc_gone); + synchronize_rcu(); cfq_slab_kill(); } diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 27e494b1bf9..399fa1e60e1 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -694,11 +694,6 @@ deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio, /* * sysfs parts below */ -struct deadline_fs_entry { - struct attribute attr; - ssize_t (*show)(struct deadline_data *, char *); - ssize_t (*store)(struct deadline_data *, const char *, size_t); -}; static ssize_t deadline_var_show(int var, char *page) @@ -716,23 +711,25
@@ deadline_var_store(int *var, const char *page, size_t count) } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(struct deadline_data *dd, char *page) \ +static ssize_t __FUNC(elevator_t *e, char *page) \ { \ - int __data = __VAR; \ + struct deadline_data *dd = e->elevator_data; \ + int __data = __VAR; \ if (__CONV) \ __data = jiffies_to_msecs(__data); \ return deadline_var_show(__data, (page)); \ } -SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ], 1); -SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE], 1); -SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved, 0); -SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges, 0); -SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch, 0); +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count) \ +static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \ { \ + struct deadline_data *dd = e->elevator_data; \ int __data; \ int ret = deadline_var_store(&__data, (page), count); \ if (__data < (MIN)) \ @@ -745,83 +742,24 @@ static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count) *(__PTR) = __data; \ return ret; \ } -STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); -STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1, 0); -STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 
0, INT_MAX, 0); +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); #undef STORE_FUNCTION -static struct deadline_fs_entry deadline_readexpire_entry = { - .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR }, - .show = deadline_readexpire_show, - .store = deadline_readexpire_store, -}; -static struct deadline_fs_entry deadline_writeexpire_entry = { - .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR }, - .show = deadline_writeexpire_show, - .store = deadline_writeexpire_store, -}; -static struct deadline_fs_entry deadline_writesstarved_entry = { - .attr = {.name = "writes_starved", .mode = S_IRUGO | S_IWUSR }, - .show = deadline_writesstarved_show, - .store = deadline_writesstarved_store, -}; -static struct deadline_fs_entry deadline_frontmerges_entry = { - .attr = {.name = "front_merges", .mode = S_IRUGO | S_IWUSR }, - .show = deadline_frontmerges_show, - .store = deadline_frontmerges_store, -}; -static struct deadline_fs_entry deadline_fifobatch_entry = { - .attr = {.name = "fifo_batch", .mode = S_IRUGO | S_IWUSR }, - .show = deadline_fifobatch_show, - .store = deadline_fifobatch_store, -}; - -static struct attribute *default_attrs[] = { - &deadline_readexpire_entry.attr, - &deadline_writeexpire_entry.attr, - &deadline_writesstarved_entry.attr, - &deadline_frontmerges_entry.attr, - &deadline_fifobatch_entry.attr, - NULL, -}; - -#define to_deadline(atr) container_of((atr), struct deadline_fs_entry, attr) - -static ssize_t -deadline_attr_show(struct kobject *kobj, struct attribute *attr, char *page) -{ - elevator_t *e = container_of(kobj, elevator_t, kobj); - struct deadline_fs_entry *entry = 
to_deadline(attr); - - if (!entry->show) - return -EIO; - - return entry->show(e->elevator_data, page); -} - -static ssize_t -deadline_attr_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - elevator_t *e = container_of(kobj, elevator_t, kobj); - struct deadline_fs_entry *entry = to_deadline(attr); - - if (!entry->store) - return -EIO; - - return entry->store(e->elevator_data, page, length); -} - -static struct sysfs_ops deadline_sysfs_ops = { - .show = deadline_attr_show, - .store = deadline_attr_store, -}; - -static struct kobj_type deadline_ktype = { - .sysfs_ops = &deadline_sysfs_ops, - .default_attrs = default_attrs, +#define DD_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \ + deadline_##name##_store) + +static struct elv_fs_entry deadline_attrs[] = { + DD_ATTR(read_expire), + DD_ATTR(write_expire), + DD_ATTR(writes_starved), + DD_ATTR(front_merges), + DD_ATTR(fifo_batch), + __ATTR_NULL }; static struct elevator_type iosched_deadline = { @@ -840,7 +778,7 @@ static struct elevator_type iosched_deadline = { .elevator_exit_fn = deadline_exit_queue, }, - .elevator_ktype = &deadline_ktype, + .elevator_attrs = deadline_attrs, .elevator_name = "deadline", .elevator_owner = THIS_MODULE, }; diff --git a/block/elevator.c b/block/elevator.c index 24b702d649a..0d6be03d929 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -33,6 +33,7 @@ #include <linux/init.h> #include <linux/compiler.h> #include <linux/delay.h> +#include <linux/blktrace_api.h> #include <asm/uaccess.h> @@ -120,15 +121,10 @@ static struct elevator_type *elevator_get(const char *name) return e; } -static int elevator_attach(request_queue_t *q, struct elevator_type *e, - struct elevator_queue *eq) +static int elevator_attach(request_queue_t *q, struct elevator_queue *eq) { int ret = 0; - memset(eq, 0, sizeof(*eq)); - eq->ops = &e->ops; - eq->elevator_type = e; - q->elevator = eq; if (eq->ops->elevator_init_fn) @@ -149,11 +145,37 @@ static 
int __init elevator_setup(char *str) strcpy(chosen_elevator, "anticipatory"); else strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); - return 0; + return 1; } __setup("elevator=", elevator_setup); +static struct kobj_type elv_ktype; + +static elevator_t *elevator_alloc(struct elevator_type *e) +{ + elevator_t *eq = kmalloc(sizeof(elevator_t), GFP_KERNEL); + if (eq) { + memset(eq, 0, sizeof(*eq)); + eq->ops = &e->ops; + eq->elevator_type = e; + kobject_init(&eq->kobj); + snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched"); + eq->kobj.ktype = &elv_ktype; + mutex_init(&eq->sysfs_lock); + } else { + elevator_put(e); + } + return eq; +} + +static void elevator_release(struct kobject *kobj) +{ + elevator_t *e = container_of(kobj, elevator_t, kobj); + elevator_put(e->elevator_type); + kfree(e); +} + int elevator_init(request_queue_t *q, char *name) { struct elevator_type *e = NULL; @@ -176,29 +198,26 @@ int elevator_init(request_queue_t *q, char *name) e = elevator_get("noop"); } - eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL); - if (!eq) { - elevator_put(e); + eq = elevator_alloc(e); + if (!eq) return -ENOMEM; - } - ret = elevator_attach(q, e, eq); - if (ret) { - kfree(eq); - elevator_put(e); - } + ret = elevator_attach(q, eq); + if (ret) + kobject_put(&eq->kobj); return ret; } void elevator_exit(elevator_t *e) { + mutex_lock(&e->sysfs_lock); if (e->ops->elevator_exit_fn) e->ops->elevator_exit_fn(e); + e->ops = NULL; + mutex_unlock(&e->sysfs_lock); - elevator_put(e->elevator_type); - e->elevator_type = NULL; - kfree(e); + kobject_put(&e->kobj); } /* @@ -315,6 +334,8 @@ void elv_insert(request_queue_t *q, struct request *rq, int where) struct list_head *pos; unsigned ordseq; + blk_add_trace_rq(q, rq, BLK_TA_INSERT); + rq->q = q; switch (where) { @@ -481,6 +502,7 @@ struct request *elv_next_request(request_queue_t *q) * not be passed by new incoming requests */ rq->flags |= REQ_STARTED; + blk_add_trace_rq(q, rq, BLK_TA_ISSUE); } if (!q->boundary_rq 
|| q->boundary_rq == rq) { @@ -627,34 +649,86 @@ void elv_completed_request(request_queue_t *q, struct request *rq) } } -int elv_register_queue(struct request_queue *q) +#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) + +static ssize_t +elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { - elevator_t *e = q->elevator; + elevator_t *e = container_of(kobj, elevator_t, kobj); + struct elv_fs_entry *entry = to_elv(attr); + ssize_t error; + + if (!entry->show) + return -EIO; + + mutex_lock(&e->sysfs_lock); + error = e->ops ? entry->show(e, page) : -ENOENT; + mutex_unlock(&e->sysfs_lock); + return error; +} - e->kobj.parent = kobject_get(&q->kobj); - if (!e->kobj.parent) - return -EBUSY; +static ssize_t +elv_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + elevator_t *e = container_of(kobj, elevator_t, kobj); + struct elv_fs_entry *entry = to_elv(attr); + ssize_t error; - snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched"); - e->kobj.ktype = e->elevator_type->elevator_ktype; + if (!entry->store) + return -EIO; - return kobject_register(&e->kobj); + mutex_lock(&e->sysfs_lock); + error = e->ops ? 
entry->store(e, page, length) : -ENOENT; + mutex_unlock(&e->sysfs_lock); + return error; +} + +static struct sysfs_ops elv_sysfs_ops = { + .show = elv_attr_show, + .store = elv_attr_store, +}; + +static struct kobj_type elv_ktype = { + .sysfs_ops = &elv_sysfs_ops, + .release = elevator_release, +}; + +int elv_register_queue(struct request_queue *q) +{ + elevator_t *e = q->elevator; + int error; + + e->kobj.parent = &q->kobj; + + error = kobject_add(&e->kobj); + if (!error) { + struct elv_fs_entry *attr = e->elevator_type->elevator_attrs; + if (attr) { + while (attr->attr.name) { + if (sysfs_create_file(&e->kobj, &attr->attr)) + break; + attr++; + } + } + kobject_uevent(&e->kobj, KOBJ_ADD); + } + return error; } void elv_unregister_queue(struct request_queue *q) { if (q) { elevator_t *e = q->elevator; - kobject_unregister(&e->kobj); - kobject_put(&q->kobj); + kobject_uevent(&e->kobj, KOBJ_REMOVE); + kobject_del(&e->kobj); } } int elv_register(struct elevator_type *e) { spin_lock_irq(&elv_list_lock); - if (elevator_find(e->elevator_name)) - BUG(); + BUG_ON(elevator_find(e->elevator_name)); list_add_tail(&e->list, &elv_list); spin_unlock_irq(&elv_list_lock); @@ -675,21 +749,15 @@ void elv_unregister(struct elevator_type *e) /* * Iterate every thread in the process to remove the io contexts. 
*/ - read_lock(&tasklist_lock); - do_each_thread(g, p) { - struct io_context *ioc = p->io_context; - if (ioc && ioc->cic) { - ioc->cic->exit(ioc->cic); - ioc->cic->dtor(ioc->cic); - ioc->cic = NULL; - } - if (ioc && ioc->aic) { - ioc->aic->exit(ioc->aic); - ioc->aic->dtor(ioc->aic); - ioc->aic = NULL; - } - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + if (e->ops.trim) { + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + e->ops.trim(p->io_context); + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } spin_lock_irq(&elv_list_lock); list_del_init(&e->list); @@ -703,16 +771,16 @@ EXPORT_SYMBOL_GPL(elv_unregister); * need for the new one. this way we have a chance of going back to the old * one, if the new one fails init for some reason. */ -static void elevator_switch(request_queue_t *q, struct elevator_type *new_e) +static int elevator_switch(request_queue_t *q, struct elevator_type *new_e) { elevator_t *old_elevator, *e; /* * Allocate new elevator */ - e = kmalloc(sizeof(elevator_t), GFP_KERNEL); + e = elevator_alloc(new_e); if (!e) - goto error; + return 0; /* * Turn on BYPASS and drain all requests w/ elevator private data @@ -743,7 +811,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e) /* * attach and start new elevator */ - if (elevator_attach(q, new_e, e)) + if (elevator_attach(q, e)) goto fail; if (elv_register_queue(q)) @@ -754,7 +822,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e) */ elevator_exit(old_elevator); clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - return; + return 1; fail_register: /* @@ -767,10 +835,9 @@ fail: q->elevator = old_elevator; elv_register_queue(q); clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - kfree(e); -error: - elevator_put(new_e); - printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name); + if (e) + kobject_put(&e->kobj); + return 0; } ssize_t elv_iosched_store(request_queue_t 
*q, const char *name, size_t count) @@ -797,7 +864,8 @@ ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count) return count; } - elevator_switch(q, e); + if (!elevator_switch(q, e)) + printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name); return count; } diff --git a/block/genhd.c b/block/genhd.c index db57546a709..5a8d3bf02f1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -15,12 +15,11 @@ #include <linux/kmod.h> #include <linux/kobj_map.h> #include <linux/buffer_head.h> - -#define MAX_PROBE_HASH 255 /* random */ +#include <linux/mutex.h> static struct subsystem block_subsys; -static DECLARE_MUTEX(block_subsys_sem); +static DEFINE_MUTEX(block_subsys_lock); /* * Can be deleted altogether. Later. @@ -30,115 +29,36 @@ static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; -} *major_names[MAX_PROBE_HASH]; +} *major_names[BLKDEV_MAJOR_HASH_SIZE]; /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(int major) { - return major % MAX_PROBE_HASH; -} - -struct blkdev_info { - int index; - struct blk_major_name *bd; -}; - -/* - * iterate over a list of blkdev_info structures. allows - * the major_names array to be iterated over from outside this file - * must be called with the block_subsys_sem held - */ -void *get_next_blkdev(void *dev) -{ - struct blkdev_info *info; - - if (dev == NULL) { - info = kmalloc(sizeof(*info), GFP_KERNEL); - if (!info) - goto out; - info->index=0; - info->bd = major_names[info->index]; - if (info->bd) - goto out; - } else { - info = dev; - } - - while (info->index < ARRAY_SIZE(major_names)) { - if (info->bd) - info->bd = info->bd->next; - if (info->bd) - goto out; - /* - * No devices on this chain, move to the next - */ - info->index++; - info->bd = (info->index < ARRAY_SIZE(major_names)) ? 
- major_names[info->index] : NULL; - if (info->bd) - goto out; - } - -out: - return info; -} - -void *acquire_blkdev_list(void) -{ - down(&block_subsys_sem); - return get_next_blkdev(NULL); -} - -void release_blkdev_list(void *dev) -{ - up(&block_subsys_sem); - kfree(dev); + return major % BLKDEV_MAJOR_HASH_SIZE; } +#ifdef CONFIG_PROC_FS -/* - * Count the number of records in the blkdev_list. - * must be called with the block_subsys_sem held - */ -int count_blkdev_list(void) +void blkdev_show(struct seq_file *f, off_t offset) { - struct blk_major_name *n; - int i, count; + struct blk_major_name *dp; - count = 0; - - for (i = 0; i < ARRAY_SIZE(major_names); i++) { - for (n = major_names[i]; n; n = n->next) - count++; + if (offset < BLKDEV_MAJOR_HASH_SIZE) { + mutex_lock(&block_subsys_lock); + for (dp = major_names[offset]; dp; dp = dp->next) + seq_printf(f, "%3d %s\n", dp->major, dp->name); + mutex_unlock(&block_subsys_lock); } - - return count; -} - -/* - * extract the major and name values from a blkdev_info struct - * passed in as a void to *dev. 
Must be called with - * block_subsys_sem held - */ -int get_blkdev_info(void *dev, int *major, char **name) -{ - struct blkdev_info *info = dev; - - if (info->bd == NULL) - return 1; - - *major = info->bd->major; - *name = info->bd->name; - return 0; } +#endif /* CONFIG_PROC_FS */ int register_blkdev(unsigned int major, const char *name) { struct blk_major_name **n, *p; int index, ret = 0; - down(&block_subsys_sem); + mutex_lock(&block_subsys_lock); /* temporary */ if (major == 0) { @@ -183,7 +103,7 @@ int register_blkdev(unsigned int major, const char *name) kfree(p); } out: - up(&block_subsys_sem); + mutex_unlock(&block_subsys_lock); return ret; } @@ -197,7 +117,7 @@ int unregister_blkdev(unsigned int major, const char *name) int index = major_to_index(major); int ret = 0; - down(&block_subsys_sem); + mutex_lock(&block_subsys_lock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; @@ -207,7 +127,7 @@ int unregister_blkdev(unsigned int major, const char *name) p = *n; *n = p->next; } - up(&block_subsys_sem); + mutex_unlock(&block_subsys_lock); kfree(p); return ret; @@ -301,7 +221,7 @@ static void *part_start(struct seq_file *part, loff_t *pos) struct list_head *p; loff_t l = *pos; - down(&block_subsys_sem); + mutex_lock(&block_subsys_lock); list_for_each(p, &block_subsys.kset.list) if (!l--) return list_entry(p, struct gendisk, kobj.entry); @@ -318,7 +238,7 @@ static void *part_next(struct seq_file *part, void *v, loff_t *pos) static void part_stop(struct seq_file *part, void *v) { - up(&block_subsys_sem); + mutex_unlock(&block_subsys_lock); } static int show_partition(struct seq_file *part, void *v) @@ -377,7 +297,7 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data) static int __init genhd_device_init(void) { - bdev_map = kobj_map_init(base_probe, &block_subsys_sem); + bdev_map = kobj_map_init(base_probe, &block_subsys_lock); blk_dev_init(); subsystem_register(&block_subsys); return 0; @@ -453,8 +373,8 @@ 
static ssize_t disk_stats_read(struct gendisk * disk, char *page) disk_round_stats(disk); preempt_enable(); return sprintf(page, - "%8u %8u %8llu %8u " - "%8u %8u %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " "%8u %8u %8u" "\n", disk_stat_read(disk, ios[READ]), @@ -611,7 +531,7 @@ static void *diskstats_start(struct seq_file *part, loff_t *pos) loff_t k = *pos; struct list_head *p; - down(&block_subsys_sem); + mutex_lock(&block_subsys_lock); list_for_each(p, &block_subsys.kset.list) if (!k--) return list_entry(p, struct gendisk, kobj.entry); @@ -628,7 +548,7 @@ static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos) static void diskstats_stop(struct seq_file *part, void *v) { - up(&block_subsys_sem); + mutex_unlock(&block_subsys_lock); } static int diskstats_show(struct seq_file *s, void *v) @@ -648,7 +568,7 @@ static int diskstats_show(struct seq_file *s, void *v) preempt_disable(); disk_round_stats(gp); preempt_enable(); - seq_printf(s, "%4d %4d %s %u %u %llu %u %u %u %llu %u %u %u %u\n", + seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", gp->major, n + gp->first_minor, disk_name(gp, n, buf), disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), (unsigned long long)disk_stat_read(gp, sectors[0]), diff --git a/block/ioctl.c b/block/ioctl.c index e1109491c23..9cfa2e1ecb2 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -5,6 +5,7 @@ #include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/smp_lock.h> +#include <linux/blktrace_api.h> #include <asm/uaccess.h> static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) @@ -42,9 +43,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user return -EINVAL; } /* partition number in use? */ - down(&bdev->bd_sem); + mutex_lock(&bdev->bd_mutex); if (disk->part[part - 1]) { - up(&bdev->bd_sem); + mutex_unlock(&bdev->bd_mutex); return -EBUSY; } /* overlap? 
*/ @@ -55,13 +56,13 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user continue; if (!(start+length <= s->start_sect || start >= s->start_sect + s->nr_sects)) { - up(&bdev->bd_sem); + mutex_unlock(&bdev->bd_mutex); return -EBUSY; } } /* all seems OK */ add_partition(disk, part, start, length); - up(&bdev->bd_sem); + mutex_unlock(&bdev->bd_mutex); return 0; case BLKPG_DEL_PARTITION: if (!disk->part[part-1]) @@ -71,9 +72,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user bdevp = bdget_disk(disk, part); if (!bdevp) return -ENOMEM; - down(&bdevp->bd_sem); + mutex_lock(&bdevp->bd_mutex); if (bdevp->bd_openers) { - up(&bdevp->bd_sem); + mutex_unlock(&bdevp->bd_mutex); bdput(bdevp); return -EBUSY; } @@ -81,10 +82,10 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user fsync_bdev(bdevp); invalidate_bdev(bdevp, 0); - down(&bdev->bd_sem); + mutex_lock(&bdev->bd_mutex); delete_partition(disk, part); - up(&bdev->bd_sem); - up(&bdevp->bd_sem); + mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdevp->bd_mutex); bdput(bdevp); return 0; @@ -102,10 +103,10 @@ static int blkdev_reread_part(struct block_device *bdev) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (down_trylock(&bdev->bd_sem)) + if (!mutex_trylock(&bdev->bd_mutex)) return -EBUSY; res = rescan_partitions(disk, bdev); - up(&bdev->bd_sem); + mutex_unlock(&bdev->bd_mutex); return res; } @@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev, return put_ulong(arg, bdev->bd_inode->i_size >> 9); case BLKGETSIZE64: return put_u64(arg, bdev->bd_inode->i_size); + case BLKTRACESTART: + case BLKTRACESTOP: + case BLKTRACESETUP: + case BLKTRACETEARDOWN: + return blk_trace_ioctl(bdev, cmd, (char __user *) arg); } return -ENOIOCTLCMD; } diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 0ef2971a9e8..5b26af8597f 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -28,6 
+28,7 @@ #include <linux/writeback.h> #include <linux/interrupt.h> #include <linux/cpu.h> +#include <linux/blktrace_api.h> /* * for max sense size @@ -784,6 +785,8 @@ void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b) t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); t->max_segment_size = min(t->max_segment_size,b->max_segment_size); t->hardsect_size = max(t->hardsect_size,b->hardsect_size); + if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) + clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); } EXPORT_SYMBOL(blk_queue_stack_limits); @@ -905,17 +908,15 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) __FUNCTION__, depth); } - tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); + tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC); if (!tag_index) goto fail; nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; - tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); + tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); if (!tag_map) goto fail; - memset(tag_index, 0, depth * sizeof(struct request *)); - memset(tag_map, 0, nr_ulongs * sizeof(unsigned long)); tags->real_max_depth = depth; tags->max_depth = depth; tags->tag_index = tag_index; @@ -1556,8 +1557,10 @@ void blk_plug_device(request_queue_t *q) if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) return; - if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) + if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); + blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); + } } EXPORT_SYMBOL(blk_plug_device); @@ -1621,14 +1624,21 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi, /* * devices don't necessarily have an ->unplug_fn defined */ - if (q->unplug_fn) + if (q->unplug_fn) { + blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, + q->rq.count[READ] + q->rq.count[WRITE]); + q->unplug_fn(q); + } } static void 
blk_unplug_work(void *data) { request_queue_t *q = data; + blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, + q->rq.count[READ] + q->rq.count[WRITE]); + q->unplug_fn(q); } @@ -1636,6 +1646,9 @@ static void blk_unplug_timeout(unsigned long data) { request_queue_t *q = (request_queue_t *)data; + blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, + q->rq.count[READ] + q->rq.count[WRITE]); + kblockd_schedule_work(&q->unplug_work); } @@ -1740,16 +1753,11 @@ EXPORT_SYMBOL(blk_run_queue); * Hopefully the low level driver will have finished any * outstanding requests first... **/ -void blk_cleanup_queue(request_queue_t * q) +static void blk_release_queue(struct kobject *kobj) { + request_queue_t *q = container_of(kobj, struct request_queue, kobj); struct request_list *rl = &q->rq; - if (!atomic_dec_and_test(&q->refcnt)) - return; - - if (q->elevator) - elevator_exit(q->elevator); - blk_sync_queue(q); if (rl->rq_pool) @@ -1758,9 +1766,30 @@ void blk_cleanup_queue(request_queue_t * q) if (q->queue_tags) __blk_queue_free_tags(q); + if (q->blk_trace) + blk_trace_shutdown(q); + kmem_cache_free(requestq_cachep, q); } +void blk_put_queue(request_queue_t *q) +{ + kobject_put(&q->kobj); +} +EXPORT_SYMBOL(blk_put_queue); + +void blk_cleanup_queue(request_queue_t * q) +{ + mutex_lock(&q->sysfs_lock); + set_bit(QUEUE_FLAG_DEAD, &q->queue_flags); + mutex_unlock(&q->sysfs_lock); + + if (q->elevator) + elevator_exit(q->elevator); + + blk_put_queue(q); +} + EXPORT_SYMBOL(blk_cleanup_queue); static int blk_init_free_list(request_queue_t *q) @@ -1788,6 +1817,8 @@ request_queue_t *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); +static struct kobj_type queue_ktype; + request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { request_queue_t *q; @@ -1798,11 +1829,16 @@ request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) memset(q, 0, sizeof(*q)); init_timer(&q->unplug_timer); - atomic_set(&q->refcnt, 1); + + snprintf(q->kobj.name, KOBJ_NAME_LEN, 
"%s", "queue"); + q->kobj.ktype = &queue_ktype; + kobject_init(&q->kobj); q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; q->backing_dev_info.unplug_io_data = q; + mutex_init(&q->sysfs_lock); + return q; } EXPORT_SYMBOL(blk_alloc_queue_node); @@ -1854,8 +1890,10 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) return NULL; q->node = node_id; - if (blk_init_free_list(q)) - goto out_init; + if (blk_init_free_list(q)) { + kmem_cache_free(requestq_cachep, q); + return NULL; + } /* * if caller didn't supply a lock, they get per-queue locking with @@ -1891,9 +1929,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) return q; } - blk_cleanup_queue(q); -out_init: - kmem_cache_free(requestq_cachep, q); + blk_put_queue(q); return NULL; } EXPORT_SYMBOL(blk_init_queue_node); @@ -1901,7 +1937,7 @@ EXPORT_SYMBOL(blk_init_queue_node); int blk_get_queue(request_queue_t *q) { if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { - atomic_inc(&q->refcnt); + kobject_get(&q->kobj); return 0; } @@ -2109,6 +2145,8 @@ rq_starved: rq_init(q, rq); rq->rl = rl; + + blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); out: return rq; } @@ -2137,6 +2175,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw, if (!rq) { struct io_context *ioc; + blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); + __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); io_schedule(); @@ -2190,6 +2230,8 @@ EXPORT_SYMBOL(blk_get_request); */ void blk_requeue_request(request_queue_t *q, struct request *rq) { + blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); + if (blk_rq_tagged(rq)) blk_queue_end_tag(q, rq); @@ -2437,10 +2479,12 @@ void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->flags |= REQ_NOMERGE; rq->end_io = done; - elv_add_request(q, rq, where, 1); - generic_unplug_device(q); + WARN_ON(irqs_disabled()); + spin_lock_irq(q->queue_lock); + __elv_add_request(q, rq, where, 1); + 
__generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); } - EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); /** @@ -2824,6 +2868,8 @@ static int __make_request(request_queue_t *q, struct bio *bio) if (!q->back_merge_fn(q, req, bio)) break; + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); + req->biotail->bi_next = bio; req->biotail = bio; req->nr_sectors = req->hard_nr_sectors += nr_sectors; @@ -2839,6 +2885,8 @@ static int __make_request(request_queue_t *q, struct bio *bio) if (!q->front_merge_fn(q, req, bio)) break; + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); + bio->bi_next = req->bio; req->bio = bio; @@ -2956,6 +3004,7 @@ void generic_make_request(struct bio *bio) request_queue_t *q; sector_t maxsector; int ret, nr_sectors = bio_sectors(bio); + dev_t old_dev; might_sleep(); /* Test device or partition size, when known. */ @@ -2982,6 +3031,8 @@ void generic_make_request(struct bio *bio) * NOTE: we don't repeat the blk_size check for each new device. * Stacking drivers are expected to know what they are doing. 
*/ + maxsector = -1; + old_dev = 0; do { char b[BDEVNAME_SIZE]; @@ -3014,6 +3065,15 @@ end_io: */ blk_partition_remap(bio); + if (maxsector != -1) + blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, + maxsector); + + blk_add_trace_bio(q, bio, BLK_TA_QUEUE); + + maxsector = bio->bi_sector; + old_dev = bio->bi_bdev->bd_dev; + ret = q->make_request_fn(q, bio); } while (ret); } @@ -3133,6 +3193,8 @@ static int __end_that_request_first(struct request *req, int uptodate, int total_bytes, bio_nbytes, error, next_idx = 0; struct bio *bio; + blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); + /* * extend uptodate bool to allow < 0 value to be direct io error */ @@ -3452,7 +3514,7 @@ int __init blk_dev_init(void) iocontext_cachep = kmem_cache_create("blkdev_ioc", sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL); - for_each_cpu(i) + for_each_possible_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); @@ -3477,10 +3539,18 @@ void put_io_context(struct io_context *ioc) BUG_ON(atomic_read(&ioc->refcount) == 0); if (atomic_dec_and_test(&ioc->refcount)) { + struct cfq_io_context *cic; + + rcu_read_lock(); if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); - if (ioc->cic && ioc->cic->dtor) - ioc->cic->dtor(ioc->cic); + if (ioc->cic_root.rb_node != NULL) { + struct rb_node *n = rb_first(&ioc->cic_root); + + cic = rb_entry(n, struct cfq_io_context, rb_node); + cic->dtor(ioc); + } + rcu_read_unlock(); kmem_cache_free(iocontext_cachep, ioc); } @@ -3492,6 +3562,7 @@ void exit_io_context(void) { unsigned long flags; struct io_context *ioc; + struct cfq_io_context *cic; local_irq_save(flags); task_lock(current); @@ -3503,9 +3574,11 @@ void exit_io_context(void) if (ioc->aic && ioc->aic->exit) ioc->aic->exit(ioc->aic); - if (ioc->cic && ioc->cic->exit) - ioc->cic->exit(ioc->cic); - + if (ioc->cic_root.rb_node != NULL) { + cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); + cic->exit(ioc); + } + 
put_io_context(ioc); } @@ -3534,7 +3607,7 @@ struct io_context *current_io_context(gfp_t gfp_flags) ret->last_waited = jiffies; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; - ret->cic = NULL; + ret->cic_root.rb_node = NULL; tsk->io_context = ret; } @@ -3614,10 +3687,13 @@ static ssize_t queue_requests_store(struct request_queue *q, const char *page, size_t count) { struct request_list *rl = &q->rq; + unsigned long nr; + int ret = queue_var_store(&nr, page, count); + if (nr < BLKDEV_MIN_RQ) + nr = BLKDEV_MIN_RQ; - int ret = queue_var_store(&q->nr_requests, page, count); - if (q->nr_requests < BLKDEV_MIN_RQ) - q->nr_requests = BLKDEV_MIN_RQ; + spin_lock_irq(q->queue_lock); + q->nr_requests = nr; blk_queue_congestion_threshold(q); if (rl->count[READ] >= queue_congestion_on_threshold(q)) @@ -3643,6 +3719,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) blk_clear_queue_full(q, WRITE); wake_up(&rl->wait[WRITE]); } + spin_unlock_irq(q->queue_lock); return ret; } @@ -3758,13 +3835,19 @@ static ssize_t queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); - struct request_queue *q; + request_queue_t *q = container_of(kobj, struct request_queue, kobj); + ssize_t res; - q = container_of(kobj, struct request_queue, kobj); if (!entry->show) return -EIO; - - return entry->show(q, page); + mutex_lock(&q->sysfs_lock); + if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { + mutex_unlock(&q->sysfs_lock); + return -ENOENT; + } + res = entry->show(q, page); + mutex_unlock(&q->sysfs_lock); + return res; } static ssize_t @@ -3772,13 +3855,20 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct queue_sysfs_entry *entry = to_queue(attr); - struct request_queue *q; + request_queue_t *q = container_of(kobj, struct request_queue, kobj); + + ssize_t res; - q = container_of(kobj, struct 
request_queue, kobj); if (!entry->store) return -EIO; - - return entry->store(q, page, length); + mutex_lock(&q->sysfs_lock); + if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { + mutex_unlock(&q->sysfs_lock); + return -ENOENT; + } + res = entry->store(q, page, length); + mutex_unlock(&q->sysfs_lock); + return res; } static struct sysfs_ops queue_sysfs_ops = { @@ -3789,6 +3879,7 @@ static struct sysfs_ops queue_sysfs_ops = { static struct kobj_type queue_ktype = { .sysfs_ops = &queue_sysfs_ops, .default_attrs = default_attrs, + .release = blk_release_queue, }; int blk_register_queue(struct gendisk *disk) @@ -3801,19 +3892,17 @@ int blk_register_queue(struct gendisk *disk) return -ENXIO; q->kobj.parent = kobject_get(&disk->kobj); - if (!q->kobj.parent) - return -EBUSY; - snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); - q->kobj.ktype = &queue_ktype; - - ret = kobject_register(&q->kobj); + ret = kobject_add(&q->kobj); if (ret < 0) return ret; + kobject_uevent(&q->kobj, KOBJ_ADD); + ret = elv_register_queue(q); if (ret) { - kobject_unregister(&q->kobj); + kobject_uevent(&q->kobj, KOBJ_REMOVE); + kobject_del(&q->kobj); return ret; } @@ -3827,7 +3916,8 @@ void blk_unregister_queue(struct gendisk *disk) if (q && q->request_fn) { elv_unregister_queue(q); - kobject_unregister(&q->kobj); + kobject_uevent(&q->kobj, KOBJ_REMOVE); + kobject_del(&q->kobj); kobject_put(&disk->kobj); } } |