10 files changed, 1267 insertions, 706 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 377f6dd20e1..b6f5f0a7965 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -11,4 +11,26 @@ config LBD
 	  your machine, or if you want to have a raid or loopback device
 	  bigger than 2TB.  Otherwise say N.
 
+config BLK_DEV_IO_TRACE
+	bool "Support for tracing block io actions"
+	depends on SYSFS
+	select RELAY
+	select DEBUG_FS
+	help
+	  Say Y here, if you want to be able to trace the block layer actions
+	  on a given queue. Tracing allows you to see any traffic happening
+	  on a block device queue. For more information (and the user space
+	  support tools needed), fetch the blktrace app from:
+
+	  git://brick.kernel.dk/data/git/blktrace.git
+
+config LSF
+	bool "Support for Large Single Files"
+	depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML
+	help
+	  Say Y here if you want to be able to handle very large files (bigger
+	  than 2TB), otherwise say N.
+
+	  If unsure, say Y.
+
 source block/Kconfig.iosched
diff --git a/block/Makefile b/block/Makefile
index 7e4f93e2b44..c05de0e0037 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+
+obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 8da3cf66894..296708ceceb 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -182,6 +182,9 @@ struct as_rq {
 
 static kmem_cache_t *arq_pool;
 
+static atomic_t ioc_count = ATOMIC_INIT(0);
+static struct completion *ioc_gone;
+
 static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq);
 static void as_antic_stop(struct as_data *ad);
 
@@ -193,6 +196,15 @@ static void as_antic_stop(struct as_data *ad);
 static void free_as_io_context(struct as_io_context *aic)
 {
 	kfree(aic);
+	if (atomic_dec_and_test(&ioc_count) && ioc_gone)
+		complete(ioc_gone);
+}
+
+static void as_trim(struct io_context *ioc)
+{
+	if (ioc->aic)
+		free_as_io_context(ioc->aic);
+	ioc->aic = NULL;
 }
 
 /* Called when the task exits */
@@ -220,6 +232,7 @@ static struct as_io_context *alloc_as_io_context(void)
 		ret->seek_total = 0;
 		ret->seek_samples = 0;
 		ret->seek_mean = 0;
+		atomic_inc(&ioc_count);
 	}
 
 	return ret;
@@ -1696,11 +1709,6 @@ static int as_init_queue(request_queue_t *q, elevator_t *e)
 /*
  * sysfs parts below
  */
-struct as_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct as_data *, char *);
-	ssize_t (*store)(struct as_data *, const char *, size_t);
-};
 
 static ssize_t
 as_var_show(unsigned int var, char *page)
@@ -1717,8 +1725,9 @@ as_var_store(unsigned long *var, const char *page, size_t count)
 	return count;
 }
 
-static ssize_t as_est_show(struct as_data *ad, char *page)
+static ssize_t est_time_show(elevator_t *e, char *page)
 {
+	struct as_data *ad = e->elevator_data;
 	int pos = 0;
 
 	pos += sprintf(page+pos, "%lu %% exit probability\n",
@@ -1734,21 +1743,23 @@ static ssize_t as_est_show(struct as_data *ad, char *page)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR)				\
-static ssize_t __FUNC(struct as_data *ad, char *page)		\
+static ssize_t __FUNC(elevator_t *e, char *page)		\
 {								\
+	struct as_data *ad = e->elevator_data;			\
 	return as_var_show(jiffies_to_msecs((__VAR)), (page));	\
 }
-SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]);
-SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]);
-SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire);
-SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]);
-SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]);
+SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]);
+SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]);
+SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
+SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]);
+SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
-static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count)	\
+static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 {									\
-	int ret = as_var_store(__PTR, (page), count);		\
+	struct as_data *ad = e->elevator_data;				\
+	int ret = as_var_store(__PTR, (page), count);			\
 	if (*(__PTR) < (MIN))						\
 		*(__PTR) = (MIN);					\
 	else if (*(__PTR) > (MAX))					\
@@ -1756,90 +1767,26 @@ static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count)	\
 	*(__PTR) = msecs_to_jiffies(*(__PTR));				\
 	return ret;							\
 }
-STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
-STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
-STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX);
-STORE_FUNCTION(as_read_batchexpire_store,
+STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
+STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
+STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
+STORE_FUNCTION(as_read_batch_expire_store,
 			&ad->batch_expire[REQ_SYNC], 0, INT_MAX);
-STORE_FUNCTION(as_write_batchexpire_store,
+STORE_FUNCTION(as_write_batch_expire_store,
 			&ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
 #undef STORE_FUNCTION
 
-static struct as_fs_entry as_est_entry = {
-	.attr = {.name = "est_time", .mode = S_IRUGO },
-	.show = as_est_show,
-};
-static struct as_fs_entry as_readexpire_entry = {
-	.attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_readexpire_show,
-	.store = as_readexpire_store,
-};
-static struct as_fs_entry as_writeexpire_entry = {
-	.attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_writeexpire_show,
-	.store = as_writeexpire_store,
-};
-static struct as_fs_entry as_anticexpire_entry = {
-	.attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_anticexpire_show,
-	.store = as_anticexpire_store,
-};
-static struct as_fs_entry as_read_batchexpire_entry = {
-	.attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_read_batchexpire_show,
-	.store = as_read_batchexpire_store,
-};
-static struct as_fs_entry as_write_batchexpire_entry = {
-	.attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_write_batchexpire_show,
-	.store = as_write_batchexpire_store,
-};
-
-static struct attribute *default_attrs[] = {
-	&as_est_entry.attr,
-	&as_readexpire_entry.attr,
-	&as_writeexpire_entry.attr,
-	&as_anticexpire_entry.attr,
-	&as_read_batchexpire_entry.attr,
-	&as_write_batchexpire_entry.attr,
-	NULL,
-};
-
-#define to_as(atr) container_of((atr), struct as_fs_entry, attr)
-
-static ssize_t
-as_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct as_fs_entry *entry = to_as(attr);
-
-	if (!entry->show)
-		return -EIO;
-
-	return entry->show(e->elevator_data, page);
-}
-
-static ssize_t
-as_attr_store(struct kobject *kobj, struct attribute *attr,
-		    const char *page, size_t length)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct as_fs_entry *entry = to_as(attr);
-
-	if (!entry->store)
-		return -EIO;
-
-	return entry->store(e->elevator_data, page, length);
-}
-
-static struct sysfs_ops as_sysfs_ops = {
-	.show	= as_attr_show,
-	.store	= as_attr_store,
-};
-
-static struct kobj_type as_ktype = {
-	.sysfs_ops	= &as_sysfs_ops,
-	.default_attrs	= default_attrs,
+#define AS_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)
+
+static struct elv_fs_entry as_attrs[] = {
+	__ATTR_RO(est_time),
+	AS_ATTR(read_expire),
+	AS_ATTR(write_expire),
+	AS_ATTR(antic_expire),
+	AS_ATTR(read_batch_expire),
+	AS_ATTR(write_batch_expire),
+	__ATTR_NULL
 };
 
 static struct elevator_type iosched_as = {
@@ -1860,9 +1807,10 @@ static struct elevator_type iosched_as = {
 		.elevator_may_queue_fn =	as_may_queue,
 		.elevator_init_fn =		as_init_queue,
 		.elevator_exit_fn =		as_exit_queue,
+		.trim =				as_trim,
 	},
 
-	.elevator_ktype = &as_ktype,
+	.elevator_attrs = as_attrs,
 	.elevator_name = "anticipatory",
 	.elevator_owner = THIS_MODULE,
 };
@@ -1893,7 +1841,13 @@ static int __init as_init(void)
 
 static void __exit as_exit(void)
 {
+	DECLARE_COMPLETION(all_gone);
 	elv_unregister(&iosched_as);
+	ioc_gone = &all_gone;
+	barrier();
+	if (atomic_read(&ioc_count))
+		complete(ioc_gone);
+	synchronize_rcu();
 	kmem_cache_destroy(arq_pool);
 }
 
diff --git a/block/blktrace.c b/block/blktrace.c
new file mode 100644
index 00000000000..36f3a172275
--- /dev/null
+++ b/block/blktrace.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <asm/uaccess.h>
+
+static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
+static unsigned int blktrace_seq __read_mostly = 1;
+
+/*
+ * Send out a notify for this process, if we haven't done so since a trace
+ * started
+ */
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+{
+	struct blk_io_trace *t;
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm));
+	if (t) {
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->device = bt->dev;
+		t->action = BLK_TC_ACT(BLK_TC_NOTIFY);
+		t->pid = tsk->pid;
+		t->cpu = smp_processor_id();
+		t->pdu_len = sizeof(tsk->comm);
+		memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len);
+		tsk->btrace_seq = blktrace_seq;
+	}
+}
+
+static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+			 pid_t pid)
+{
+	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+		return 1;
+	if (sector < bt->start_lba || sector > bt->end_lba)
+		return 1;
+	if (bt->pid && pid != bt->pid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Data direction bit lookup
+ */
+static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
+
+/*
+ * Bio action bits of interest
+ */
+static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) };
+
+/*
+ * More could be added as needed, taking care to increment the decrementer
+ * to get correct indexing
+ */
+#define trace_barrier_bit(rw)	\
+	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
+#define trace_sync_bit(rw)	\
+	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
+
+/*
+ * The worker for the various blk_add_trace*() types. Fills out a
+ * blk_io_trace structure and places it in a per-cpu subbuffer.
+ */
+void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+	struct task_struct *tsk = current;
+	struct blk_io_trace *t;
+	unsigned long flags;
+	unsigned long *sequence;
+	pid_t pid;
+	int cpu;
+
+	if (unlikely(bt->trace_state != Blktrace_running))
+		return;
+
+	what |= ddir_act[rw & WRITE];
+	what |= bio_act[trace_barrier_bit(rw)];
+	what |= bio_act[trace_sync_bit(rw)];
+
+	pid = tsk->pid;
+	if (unlikely(act_log_check(bt, what, sector, pid)))
+		return;
+
+	/*
+	 * A word about the locking here - we disable interrupts to reserve
+	 * some space in the relay per-cpu buffer, to prevent an irq
+	 * from coming in and stepping on our toes. Once reserved, it's
+	 * enough to get preemption disabled to prevent read of this data
+	 * before we are through filling it. get_cpu()/put_cpu() does this
+	 * for us
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(tsk->btrace_seq != blktrace_seq))
+		trace_note_tsk(bt, tsk);
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+	if (t) {
+		cpu = smp_processor_id();
+		sequence = per_cpu_ptr(bt->sequence, cpu);
+
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->sequence = ++(*sequence);
+		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+		t->sector = sector;
+		t->bytes = bytes;
+		t->action = what;
+		t->pid = pid;
+		t->device = bt->dev;
+		t->cpu = cpu;
+		t->error = error;
+		t->pdu_len = pdu_len;
+
+		if (pdu_len)
+			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+	}
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(__blk_add_trace);
+
+static struct dentry *blk_tree_root;
+static struct mutex blk_tree_mutex;
+static unsigned int root_users;
+
+static inline void blk_remove_root(void)
+{
+	if (blk_tree_root) {
+		debugfs_remove(blk_tree_root);
+		blk_tree_root = NULL;
+	}
+}
+
+static void blk_remove_tree(struct dentry *dir)
+{
+	mutex_lock(&blk_tree_mutex);
+	debugfs_remove(dir);
+	if (--root_users == 0)
+		blk_remove_root();
+	mutex_unlock(&blk_tree_mutex);
+}
+
+static struct dentry *blk_create_tree(const char *blk_name)
+{
+	struct dentry *dir = NULL;
+
+	mutex_lock(&blk_tree_mutex);
+
+	if (!blk_tree_root) {
+		blk_tree_root = debugfs_create_dir("block", NULL);
+		if (!blk_tree_root)
+			goto err;
+	}
+
+	dir = debugfs_create_dir(blk_name, blk_tree_root);
+	if (dir)
+		root_users++;
+	else
+		blk_remove_root();
+
+err:
+	mutex_unlock(&blk_tree_mutex);
+	return dir;
+}
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+	relay_close(bt->rchan);
+	debugfs_remove(bt->dropped_file);
+	blk_remove_tree(bt->dir);
+	free_percpu(bt->sequence);
+	kfree(bt);
+}
+
+static int blk_trace_remove(request_queue_t *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (!bt)
+		return -EINVAL;
+
+	if (bt->trace_state == Blktrace_setup ||
+	    bt->trace_state == Blktrace_stopped)
+		blk_trace_cleanup(bt);
+
+	return 0;
+}
+
+static int blk_dropped_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->u.generic_ip;
+
+	return 0;
+}
+
+static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct blk_trace *bt = filp->private_data;
+	char buf[16];
+
+	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+
+	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static struct file_operations blk_dropped_fops = {
+	.owner =	THIS_MODULE,
+	.open =		blk_dropped_open,
+	.read =		blk_dropped_read,
+};
+
+/*
+ * Keep track of how many times we encountered a full subbuffer, to aid
+ * the user space app in telling how many lost events there were.
+ */
+static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+				     void *prev_subbuf, size_t prev_padding)
+{
+	struct blk_trace *bt;
+
+	if (!relay_buf_full(buf))
+		return 1;
+
+	bt = buf->chan->private_data;
+	atomic_inc(&bt->dropped);
+	return 0;
+}
+
+static int blk_remove_buf_file_callback(struct dentry *dentry)
+{
+	debugfs_remove(dentry);
+	return 0;
+}
+
+static struct dentry *blk_create_buf_file_callback(const char *filename,
+						   struct dentry *parent,
+						   int mode,
+						   struct rchan_buf *buf,
+						   int *is_global)
+{
+	return debugfs_create_file(filename, mode, parent, buf,
+					&relay_file_operations);
+}
+
+static struct rchan_callbacks blk_relay_callbacks = {
+	.subbuf_start		= blk_subbuf_start_callback,
+	.create_buf_file	= blk_create_buf_file_callback,
+	.remove_buf_file	= blk_remove_buf_file_callback,
+};
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
+			   char __user *arg)
+{
+	struct blk_user_trace_setup buts;
+	struct blk_trace *old_bt, *bt = NULL;
+	struct dentry *dir = NULL;
+	char b[BDEVNAME_SIZE];
+	int ret, i;
+
+	if (copy_from_user(&buts, arg, sizeof(buts)))
+		return -EFAULT;
+
+	if (!buts.buf_size || !buts.buf_nr)
+		return -EINVAL;
+
+	strcpy(buts.name, bdevname(bdev, b));
+
+	/*
+	 * some device names have larger paths - convert the slashes
+	 * to underscores for this to work as expected
+	 */
+	for (i = 0; i < strlen(buts.name); i++)
+		if (buts.name[i] == '/')
+			buts.name[i] = '_';
+
+	if (copy_to_user(arg, &buts, sizeof(buts)))
+		return -EFAULT;
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->sequence = alloc_percpu(unsigned long);
+	if (!bt->sequence)
+		goto err;
+
+	ret = -ENOENT;
+	dir = blk_create_tree(buts.name);
+	if (!dir)
+		goto err;
+
+	bt->dir = dir;
+	bt->dev = bdev->bd_dev;
+	atomic_set(&bt->dropped, 0);
+
+	ret = -EIO;
+	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+	if (!bt->dropped_file)
+		goto err;
+
+	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
+	if (!bt->rchan)
+		goto err;
+	bt->rchan->private_data = bt;
+
+	bt->act_mask = buts.act_mask;
+	if (!bt->act_mask)
+		bt->act_mask = (u16) -1;
+
+	bt->start_lba = buts.start_lba;
+	bt->end_lba = buts.end_lba;
+	if (!bt->end_lba)
+		bt->end_lba = -1ULL;
+
+	bt->pid = buts.pid;
+	bt->trace_state = Blktrace_setup;
+
+	ret = -EBUSY;
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt) {
+		(void) xchg(&q->blk_trace, old_bt);
+		goto err;
+	}
+
+	return 0;
+err:
+	if (dir)
+		blk_remove_tree(dir);
+	if (bt) {
+		if (bt->dropped_file)
+			debugfs_remove(bt->dropped_file);
+		if (bt->sequence)
+			free_percpu(bt->sequence);
+		if (bt->rchan)
+			relay_close(bt->rchan);
+		kfree(bt);
+	}
+	return ret;
+}
+
+static int blk_trace_startstop(request_queue_t *q, int start)
+{
+	struct blk_trace *bt;
+	int ret;
+
+	if ((bt = q->blk_trace) == NULL)
+		return -EINVAL;
+
+	/*
+	 * For starting a trace, we can transition from a setup or stopped
+	 * trace. For stopping a trace, the state must be running
+	 */
+	ret = -EINVAL;
+	if (start) {
+		if (bt->trace_state == Blktrace_setup ||
+		    bt->trace_state == Blktrace_stopped) {
+			blktrace_seq++;
+			smp_mb();
+			bt->trace_state = Blktrace_running;
+			ret = 0;
+		}
+	} else {
+		if (bt->trace_state == Blktrace_running) {
+			bt->trace_state = Blktrace_stopped;
+			relay_flush(bt->rchan);
+			ret = 0;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * @bdev:	the block device
+ * @cmd: 	the ioctl cmd
+ * @arg:	the argument data, if any
+ *
+ **/
+int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	request_queue_t *q;
+	int ret, start = 0;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	mutex_lock(&bdev->bd_mutex);
+
+	switch (cmd) {
+	case BLKTRACESETUP:
+		ret = blk_trace_setup(q, bdev, arg);
+		break;
+	case BLKTRACESTART:
+		start = 1;
+	case BLKTRACESTOP:
+		ret = blk_trace_startstop(q, start);
+		break;
+	case BLKTRACETEARDOWN:
+		ret = blk_trace_remove(q);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	return ret;
+}
+
+/**
+ * blk_trace_shutdown: - stop and cleanup trace structures
+ * @q:    the request queue associated with the device
+ *
+ **/
+void blk_trace_shutdown(request_queue_t *q)
+{
+	blk_trace_startstop(q, 0);
+	blk_trace_remove(q);
+}
+
+/*
+ * Average offset over two calls to sched_clock() with a gettimeofday()
+ * in the middle
+ */
+static void blk_check_time(unsigned long long *t)
+{
+	unsigned long long a, b;
+	struct timeval tv;
+
+	a = sched_clock();
+	do_gettimeofday(&tv);
+	b = sched_clock();
+
+	*t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
+	*t -= (a + b) / 2;
+}
+
+static void blk_trace_check_cpu_time(void *data)
+{
+	unsigned long long *t;
+	int cpu = get_cpu();
+
+	t = &per_cpu(blk_trace_cpu_offset, cpu);
+
+	/*
+	 * Just call it twice, hopefully the second call will be cache hot
+	 * and a little more precise
+	 */
+	blk_check_time(t);
+	blk_check_time(t);
+
+	put_cpu();
+}
+
+/*
+ * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU
+ * timings
+ */
+static void blk_trace_calibrate_offsets(void)
+{
+	unsigned long flags;
+
+	smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
+	local_irq_save(flags);
+	blk_trace_check_cpu_time(NULL);
+	local_irq_restore(flags);
+}
+
+static void blk_trace_set_ht_offsets(void)
+{
+#if defined(CONFIG_SCHED_SMT)
+	int cpu, i;
+
+	/*
+	 * now make sure HT siblings have the same time offset
+	 */
+	preempt_disable();
+	for_each_online_cpu(cpu) {
+		unsigned long long *cpu_off, *sibling_off;
+
+		for_each_cpu_mask(i, cpu_sibling_map[cpu]) {
+			if (i == cpu)
+				continue;
+
+			cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
+			sibling_off = &per_cpu(blk_trace_cpu_offset, i);
+			*sibling_off = *cpu_off;
+		}
+	}
+	preempt_enable();
+#endif
+}
+
+static __init int blk_trace_init(void)
+{
+	mutex_init(&blk_tree_mutex);
+	blk_trace_calibrate_offsets();
+	blk_trace_set_ht_offsets();
+
+	return 0;
+}
+
+module_init(blk_trace_init);
+
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c8dbe38c81c..67d446de022 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -6,21 +6,13 @@
  *
  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
  */
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/elevator.h>
-#include <linux/bio.h>
 #include <linux/config.h>
 #include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
 #include <linux/hash.h>
 #include <linux/rbtree.h>
-#include <linux/mempool.h>
 #include <linux/ioprio.h>
-#include <linux/writeback.h>
 
 /*
  * tunables
@@ -34,18 +26,14 @@ static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
 static const int cfq_slice_sync = HZ / 10;
 static int cfq_slice_async = HZ / 25;
 static const int cfq_slice_async_rq = 2;
-static int cfq_slice_idle = HZ / 100;
+static int cfq_slice_idle = HZ / 70;
 
 #define CFQ_IDLE_GRACE		(HZ / 10)
 #define CFQ_SLICE_SCALE		(5)
 
 #define CFQ_KEY_ASYNC		(0)
-#define CFQ_KEY_ANY		(0xffff)
 
-/*
- * disable queueing at the driver/hardware level
- */
-static const int cfq_max_depth = 2;
+static DEFINE_RWLOCK(cfq_exit_lock);
 
 /*
  * for the hash of cfqq inside the cfqd
@@ -89,6 +77,9 @@ static kmem_cache_t *crq_pool;
 static kmem_cache_t *cfq_pool;
 static kmem_cache_t *cfq_ioc_pool;
 
+static atomic_t ioc_count = ATOMIC_INIT(0);
+static struct completion *ioc_gone;
+
 #define CFQ_PRIO_LISTS		IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 #define cfq_class_be(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
@@ -105,11 +96,12 @@ static kmem_cache_t *cfq_ioc_pool;
 #define cfq_cfqq_sync(cfqq)		\
 	(cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
 
+#define sample_valid(samples)	((samples) > 80)
+
 /*
  * Per block device queue structure
  */
 struct cfq_data {
-	atomic_t ref;
 	request_queue_t *queue;
 
 	/*
@@ -174,7 +166,8 @@ struct cfq_data {
 	unsigned int cfq_slice[2];
 	unsigned int cfq_slice_async_rq;
 	unsigned int cfq_slice_idle;
-	unsigned int cfq_max_depth;
+
+	struct list_head cic_list;
 };
 
 /*
@@ -288,7 +281,7 @@ CFQ_CRQ_FNS(is_sync);
 
 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
-static void cfq_put_cfqd(struct cfq_data *cfqd);
+static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
 
 #define process_sync(tsk)	((tsk)->flags & PF_SYNCWRITE)
 
@@ -345,17 +338,27 @@ static int cfq_queue_empty(request_queue_t *q)
 	return !cfqd->busy_queues;
 }
 
+static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
+{
+	if (rw == READ || process_sync(task))
+		return task->pid;
+
+	return CFQ_KEY_ASYNC;
+}
+
 /*
  * Lifted from AS - choose which of crq1 and crq2 that is best served now.
  * We choose the request that is closest to the head right now. Distance
- * behind the head are penalized and only allowed to a certain extent.
+ * behind the head is penalized and only allowed to a certain extent.
  */
 static struct cfq_rq *
 cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
 {
 	sector_t last, s1, s2, d1 = 0, d2 = 0;
-	int r1_wrap = 0, r2_wrap = 0;	/* requests are behind the disk head */
 	unsigned long back_max;
+#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
+#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
+	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
 
 	if (crq1 == NULL || crq1 == crq2)
 		return crq2;
@@ -387,35 +390,47 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
 	else if (s1 + back_max >= last)
 		d1 = (last - s1) * cfqd->cfq_back_penalty;
 	else
-		r1_wrap = 1;
+		wrap |= CFQ_RQ1_WRAP;
 
 	if (s2 >= last)
 		d2 = s2 - last;
 	else if (s2 + back_max >= last)
 		d2 = (last - s2) * cfqd->cfq_back_penalty;
 	else
-		r2_wrap = 1;
+		wrap |= CFQ_RQ2_WRAP;
 
 	/* Found required data */
-	if (!r1_wrap && r2_wrap)
-		return crq1;
-	else if (!r2_wrap && r1_wrap)
-		return crq2;
-	else if (r1_wrap && r2_wrap) {
-		/* both behind the head */
-		if (s1 <= s2)
+
+	/*
+	 * By doing switch() on the bit mask "wrap" we avoid having to
+	 * check two variables for all permutations: --> faster!
+	 */
+	switch (wrap) {
+	case 0: /* common case for CFQ: crq1 and crq2 not wrapped */
+		if (d1 < d2)
 			return crq1;
-		else
+		else if (d2 < d1)
 			return crq2;
-	}
+		else {
+			if (s1 >= s2)
+				return crq1;
+			else
+				return crq2;
+		}
 
-	/* Both requests in front of the head */
-	if (d1 < d2)
+	case CFQ_RQ2_WRAP:
 		return crq1;
-	else if (d2 < d1)
+	case CFQ_RQ1_WRAP:
 		return crq2;
-	else {
-		if (s1 >= s2)
+	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */
+	default:
+		/*
+		 * Since both rqs are wrapped,
+		 * start with the one that's further behind head
+		 * (--> only *one* back seek required),
+		 * since back seek takes more time than forward.
+		 */
+		if (s1 <= s2)
 			return crq1;
 		else
 			return crq2;
@@ -614,15 +629,20 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
 	cfq_add_crq_rb(crq);
 }
 
-static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
-
+static struct request *
+cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 {
-	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY);
+	struct task_struct *tsk = current;
+	pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio));
+	struct cfq_queue *cfqq;
 	struct rb_node *n;
+	sector_t sector;
 
+	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
 	if (!cfqq)
 		goto out;
 
+	sector = bio->bi_sector + bio_sectors(bio);
 	n = cfqq->sort_list.rb_node;
 	while (n) {
 		struct cfq_rq *crq = rb_entry_crq(n);
@@ -676,7 +696,7 @@ cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
 		goto out;
 	}
 
-	__rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
+	__rq = cfq_find_rq_fmerge(cfqd, bio);
 	if (__rq && elv_rq_merge_ok(__rq, bio)) {
 		ret = ELEVATOR_FRONT_MERGE;
 		goto out;
@@ -879,6 +899,7 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 {
+	struct cfq_io_context *cic;
 	unsigned long sl;
 
 	WARN_ON(!RB_EMPTY(&cfqq->sort_list));
@@ -894,13 +915,23 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	/*
 	 * task has exited, don't wait
 	 */
-	if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
+	cic = cfqd->active_cic;
+	if (!cic || !cic->ioc->task)
 		return 0;
 
 	cfq_mark_cfqq_must_dispatch(cfqq);
 	cfq_mark_cfqq_wait_request(cfqq);
 
 	sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
+
+	/*
+	 * we don't want to idle for seeks, but we do want to allow
+	 * fair distribution of slice time for a process doing back-to-back
+	 * seeks. so allow a little bit of time for him to submit a new rq
+	 */
+	if (sample_valid(cic->seek_samples) && cic->seek_mean > 131072)
+		sl = 2;
+
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
 	return 1;
 }
@@ -1117,13 +1148,6 @@ cfq_dispatch_requests(request_queue_t *q, int force)
 	if (cfqq) {
 		int max_dispatch;
 
-		/*
-		 * if idle window is disabled, allow queue buildup
-		 */
-		if (!cfq_cfqq_idle_window(cfqq) &&
-		    cfqd->rq_in_driver >= cfqd->cfq_max_depth)
-			return 0;
-
 		cfq_clear_cfqq_must_dispatch(cfqq);
 		cfq_clear_cfqq_wait_request(cfqq);
 		del_timer(&cfqd->idle_slice_timer);
@@ -1160,8 +1184,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	if (unlikely(cfqd->active_queue == cfqq))
 		__cfq_slice_expired(cfqd, cfqq, 0);
 
-	cfq_put_cfqd(cfqq->cfqd);
-
 	/*
 	 * it's on the empty list and still hashed
 	 */
@@ -1175,13 +1197,13 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
 		    const int hashval)
 {
 	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
-	struct hlist_node *entry, *next;
+	struct hlist_node *entry;
+	struct cfq_queue *__cfqq;
 
-	hlist_for_each_safe(entry, next, hash_list) {
-		struct cfq_queue *__cfqq = list_entry_qhash(entry);
-		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio);
+	hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
+		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
 
-		if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
+		if (__cfqq->key == key && (__p == prio || !prio))
 			return __cfqq;
 	}
 
@@ -1194,17 +1216,27 @@ cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
 	return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
 }
 
-static void cfq_free_io_context(struct cfq_io_context *cic)
+static void cfq_free_io_context(struct io_context *ioc)
 {
 	struct cfq_io_context *__cic;
-	struct list_head *entry, *next;
+	struct rb_node *n;
+	int freed = 0;
 
-	list_for_each_safe(entry, next, &cic->list) {
-		__cic = list_entry(entry, struct cfq_io_context, list);
+	while ((n = rb_first(&ioc->cic_root)) != NULL) {
+		__cic = rb_entry(n, struct cfq_io_context, rb_node);
+		rb_erase(&__cic->rb_node, &ioc->cic_root);
 		kmem_cache_free(cfq_ioc_pool, __cic);
+		freed++;
 	}
 
-	kmem_cache_free(cfq_ioc_pool, cic);
+	if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone)
+		complete(ioc_gone);
+}
+
+static void cfq_trim(struct io_context *ioc)
+{
+	ioc->set_ioprio = NULL;
+	cfq_free_io_context(ioc);
 }
 
 /*
@@ -1212,43 +1244,57 @@ static void cfq_free_io_context(struct cfq_io_context *cic)
  */
 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 {
-	struct cfq_data *cfqd = cic->cfqq->cfqd;
-	request_queue_t *q = cfqd->queue;
+	struct cfq_data *cfqd = cic->key;
+	request_queue_t *q;
+
+	if (!cfqd)
+		return;
+
+	q = cfqd->queue;
 
 	WARN_ON(!irqs_disabled());
 
 	spin_lock(q->queue_lock);
 
-	if (unlikely(cic->cfqq == cfqd->active_queue))
-		__cfq_slice_expired(cfqd, cic->cfqq, 0);
+	if (cic->cfqq[ASYNC]) {
+		if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
+			__cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
+		cfq_put_queue(cic->cfqq[ASYNC]);
+		cic->cfqq[ASYNC] = NULL;
+	}
 
-	cfq_put_queue(cic->cfqq);
-	cic->cfqq = NULL;
+	if (cic->cfqq[SYNC]) {
+		if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
+			__cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
+		cfq_put_queue(cic->cfqq[SYNC]);
+		cic->cfqq[SYNC] = NULL;
+	}
+
+	cic->key = NULL;
+	list_del_init(&cic->queue_list);
 	spin_unlock(q->queue_lock);
 }
 
-/*
- * Another task may update the task cic list, if it is doing a queue lookup
- * on its behalf. cfq_cic_lock excludes such concurrent updates
- */
-static void cfq_exit_io_context(struct cfq_io_context *cic)
+static void cfq_exit_io_context(struct io_context *ioc)
 {
 	struct cfq_io_context *__cic;
-	struct list_head *entry;
 	unsigned long flags;
-
-	local_irq_save(flags);
+	struct rb_node *n;
 
 	/*
 	 * put the reference this task is holding to the various queues
 	 */
-	list_for_each(entry, &cic->list) {
-		__cic = list_entry(entry, struct cfq_io_context, list);
+	read_lock_irqsave(&cfq_exit_lock, flags);
+
+	n = rb_first(&ioc->cic_root);
+	while (n != NULL) {
+		__cic = rb_entry(n, struct cfq_io_context, rb_node);
+
 		cfq_exit_single_io_context(__cic);
+		n = rb_next(n);
 	}
 
-	cfq_exit_single_io_context(cic);
-	local_irq_restore(flags);
+	read_unlock_irqrestore(&cfq_exit_lock, flags);
 }
 
 static struct cfq_io_context *
@@ -1257,15 +1303,18 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
 
 	if (cic) {
-		INIT_LIST_HEAD(&cic->list);
-		cic->cfqq = NULL;
+		RB_CLEAR(&cic->rb_node);
 		cic->key = NULL;
+		cic->cfqq[ASYNC] = NULL;
+		cic->cfqq[SYNC] = NULL;
 		cic->last_end_request = jiffies;
 		cic->ttime_total = 0;
 		cic->ttime_samples = 0;
 		cic->ttime_mean = 0;
 		cic->dtor = cfq_free_io_context;
 		cic->exit = cfq_exit_io_context;
+		INIT_LIST_HEAD(&cic->queue_list);
+		atomic_inc(&ioc_count);
 	}
 
 	return cic;
@@ -1318,14 +1367,27 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
-static inline void changed_ioprio(struct cfq_queue *cfqq)
+static inline void changed_ioprio(struct cfq_io_context *cic)
 {
-	if (cfqq) {
-		struct cfq_data *cfqd = cfqq->cfqd;
-
+	struct cfq_data *cfqd = cic->key;
+	struct cfq_queue *cfqq;
+	if (cfqd) {
 		spin_lock(cfqd->queue->queue_lock);
-		cfq_mark_cfqq_prio_changed(cfqq);
-		cfq_init_prio_data(cfqq);
+		cfqq = cic->cfqq[ASYNC];
+		if (cfqq) {
+			struct cfq_queue *new_cfqq;
+			new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC,
+						cic->ioc->task, GFP_ATOMIC);
+			if (new_cfqq) {
+				cic->cfqq[ASYNC] = new_cfqq;
+				cfq_put_queue(cfqq);
+			}
+		}
+		cfqq = cic->cfqq[SYNC];
+		if (cfqq) {
+			cfq_mark_cfqq_prio_changed(cfqq);
+			cfq_init_prio_data(cfqq);
+		}
 		spin_unlock(cfqd->queue->queue_lock);
 	}
 }
@@ -1335,24 +1397,34 @@ static inline void changed_ioprio(struct cfq_queue *cfqq)
  */
 static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
 {
-	struct cfq_io_context *cic = ioc->cic;
+	struct cfq_io_context *cic;
+	struct rb_node *n;
 
-	changed_ioprio(cic->cfqq);
+	write_lock(&cfq_exit_lock);
 
-	list_for_each_entry(cic, &cic->list, list)
-		changed_ioprio(cic->cfqq);
+	n = rb_first(&ioc->cic_root);
+	while (n != NULL) {
+		cic = rb_entry(n, struct cfq_io_context, rb_node);
+ 
+		changed_ioprio(cic);
+		n = rb_next(n);
+	}
+
+	write_unlock(&cfq_exit_lock);
 
 	return 0;
 }
 
 static struct cfq_queue *
-cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio,
+cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
 	      gfp_t gfp_mask)
 {
 	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
+	unsigned short ioprio;
 
 retry:
+	ioprio = tsk->ioprio;
 	cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
 
 	if (!cfqq) {
@@ -1381,7 +1453,6 @@ retry:
 		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
 		atomic_set(&cfqq->ref, 0);
 		cfqq->cfqd = cfqd;
-		atomic_inc(&cfqd->ref);
 		cfqq->service_last = 0;
 		/*
 		 * set ->slice_left to allow preemption for a new process
@@ -1401,14 +1472,67 @@ out:
 	return cfqq;
 }
 
+static struct cfq_io_context *
+cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
+{
+	struct rb_node *n = ioc->cic_root.rb_node;
+	struct cfq_io_context *cic;
+	void *key = cfqd;
+
+	while (n) {
+		cic = rb_entry(n, struct cfq_io_context, rb_node);
+
+		if (key < cic->key)
+			n = n->rb_left;
+		else if (key > cic->key)
+			n = n->rb_right;
+		else
+			return cic;
+	}
+
+	return NULL;
+}
+
+static inline void
+cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
+	     struct cfq_io_context *cic)
+{
+	struct rb_node **p = &ioc->cic_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct cfq_io_context *__cic;
+
+	read_lock(&cfq_exit_lock);
+
+	cic->ioc = ioc;
+	cic->key = cfqd;
+
+	ioc->set_ioprio = cfq_ioc_set_ioprio;
+
+	while (*p) {
+		parent = *p;
+		__cic = rb_entry(parent, struct cfq_io_context, rb_node);
+
+		if (cic->key < __cic->key)
+			p = &(*p)->rb_left;
+		else if (cic->key > __cic->key)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&cic->rb_node, parent, p);
+	rb_insert_color(&cic->rb_node, &ioc->cic_root);
+	list_add(&cic->queue_list, &cfqd->cic_list);
+	read_unlock(&cfq_exit_lock);
+}
+
 /*
  * Setup general io context and cfq io context. There can be several cfq
  * io contexts per general io context, if this process is doing io to more
- * than one device managed by cfq. Note that caller is holding a reference to
- * cfqq, so we don't need to worry about it disappearing
+ * than one device managed by cfq.
  */
 static struct cfq_io_context *
-cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
+cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
 	struct io_context *ioc = NULL;
 	struct cfq_io_context *cic;
@@ -1419,61 +1543,15 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 	if (!ioc)
 		return NULL;
 
-	if ((cic = ioc->cic) == NULL) {
-		cic = cfq_alloc_io_context(cfqd, gfp_mask);
-
-		if (cic == NULL)
-			goto err;
-
-		/*
-		 * manually increment generic io_context usage count, it
-		 * cannot go away since we are already holding one ref to it
-		 */
-		ioc->cic = cic;
-		ioc->set_ioprio = cfq_ioc_set_ioprio;
-		cic->ioc = ioc;
-		cic->key = cfqd;
-		atomic_inc(&cfqd->ref);
-	} else {
-		struct cfq_io_context *__cic;
-
-		/*
-		 * the first cic on the list is actually the head itself
-		 */
-		if (cic->key == cfqd)
-			goto out;
-
-		/*
-		 * cic exists, check if we already are there. linear search
-		 * should be ok here, the list will usually not be more than
-		 * 1 or a few entries long
-		 */
-		list_for_each_entry(__cic, &cic->list, list) {
-			/*
-			 * this process is already holding a reference to
-			 * this queue, so no need to get one more
-			 */
-			if (__cic->key == cfqd) {
-				cic = __cic;
-				goto out;
-			}
-		}
+	cic = cfq_cic_rb_lookup(cfqd, ioc);
+	if (cic)
+		goto out;
 
-		/*
-		 * nope, process doesn't have a cic assoicated with this
-		 * cfqq yet. get a new one and add to list
-		 */
-		__cic = cfq_alloc_io_context(cfqd, gfp_mask);
-		if (__cic == NULL)
-			goto err;
-
-		__cic->ioc = ioc;
-		__cic->key = cfqd;
-		atomic_inc(&cfqd->ref);
-		list_add(&__cic->list, &cic->list);
-		cic = __cic;
-	}
+	cic = cfq_alloc_io_context(cfqd, gfp_mask);
+	if (cic == NULL)
+		goto err;
 
+	cfq_cic_link(cfqd, ioc, cic);
 out:
 	return cic;
 err:
@@ -1506,7 +1584,33 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
 	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
 }
 
-#define sample_valid(samples)	((samples) > 80)
+static void
+cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
+		       struct cfq_rq *crq)
+{
+	sector_t sdist;
+	u64 total;
+
+	if (cic->last_request_pos < crq->request->sector)
+		sdist = crq->request->sector - cic->last_request_pos;
+	else
+		sdist = cic->last_request_pos - crq->request->sector;
+
+	/*
+	 * Don't allow the seek distance to get too large from the
+	 * odd fragment, pagein, etc
+	 */
+	if (cic->seek_samples <= 60) /* second&third seek */
+		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
+	else
+		sdist = min(sdist, (cic->seek_mean * 4)	+ 2*1024*64);
+
+	cic->seek_samples = (7*cic->seek_samples + 256) / 8;
+	cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
+	total = cic->seek_total + (cic->seek_samples/2);
+	do_div(total, cic->seek_samples);
+	cic->seek_mean = (sector_t)total;
+}
 
 /*
  * Disable idle window if the process thinks too long or seeks so much that
@@ -1619,9 +1723,11 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	cic = crq->io_context;
 
 	cfq_update_io_thinktime(cfqd, cic);
+	cfq_update_io_seektime(cfqd, cic, crq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
 	cic->last_queue = jiffies;
+	cic->last_request_pos = crq->request->sector + crq->request->nr_sectors;
 
 	if (cfqq == cfqd->active_queue) {
 		/*
@@ -1754,14 +1860,6 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 		cfq_resort_rr_list(cfqq, 0);
 }
 
-static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
-{
-	if (rw == READ || process_sync(task))
-		return task->pid;
-
-	return CFQ_KEY_ASYNC;
-}
-
 static inline int
 __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		struct task_struct *task, int rw)
@@ -1890,24 +1988,25 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 	struct cfq_queue *cfqq;
 	struct cfq_rq *crq;
 	unsigned long flags;
+	int is_sync = key != CFQ_KEY_ASYNC;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
-	cic = cfq_get_io_context(cfqd, key, gfp_mask);
+	cic = cfq_get_io_context(cfqd, gfp_mask);
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
 	if (!cic)
 		goto queue_fail;
 
-	if (!cic->cfqq) {
-		cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask);
+	if (!cic->cfqq[is_sync]) {
+		cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
 		if (!cfqq)
 			goto queue_fail;
 
-		cic->cfqq = cfqq;
+		cic->cfqq[is_sync] = cfqq;
 	} else
-		cfqq = cic->cfqq;
+		cfqq = cic->cfqq[is_sync];
 
 	cfqq->allocated[rw]++;
 	cfq_clear_cfqq_must_alloc(cfqq);
@@ -1924,7 +2023,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 		crq->cfq_queue = cfqq;
 		crq->io_context = cic;
 
-		if (rw == READ || process_sync(tsk))
+		if (is_sync)
 			cfq_mark_crq_is_sync(crq);
 		else
 			cfq_clear_crq_is_sync(crq);
@@ -2055,15 +2154,39 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
 	blk_sync_queue(cfqd->queue);
 }
 
-static void cfq_put_cfqd(struct cfq_data *cfqd)
+static void cfq_exit_queue(elevator_t *e)
 {
+	struct cfq_data *cfqd = e->elevator_data;
 	request_queue_t *q = cfqd->queue;
 
-	if (!atomic_dec_and_test(&cfqd->ref))
-		return;
+	cfq_shutdown_timer_wq(cfqd);
+
+	write_lock(&cfq_exit_lock);
+	spin_lock_irq(q->queue_lock);
+
+	if (cfqd->active_queue)
+		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
+
+	while (!list_empty(&cfqd->cic_list)) {
+		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
+							struct cfq_io_context,
+							queue_list);
+		if (cic->cfqq[ASYNC]) {
+			cfq_put_queue(cic->cfqq[ASYNC]);
+			cic->cfqq[ASYNC] = NULL;
+		}
+		if (cic->cfqq[SYNC]) {
+			cfq_put_queue(cic->cfqq[SYNC]);
+			cic->cfqq[SYNC] = NULL;
+		}
+		cic->key = NULL;
+		list_del_init(&cic->queue_list);
+	}
+
+	spin_unlock_irq(q->queue_lock);
+	write_unlock(&cfq_exit_lock);
 
 	cfq_shutdown_timer_wq(cfqd);
-	blk_put_queue(q);
 
 	mempool_destroy(cfqd->crq_pool);
 	kfree(cfqd->crq_hash);
@@ -2071,14 +2194,6 @@ static void cfq_put_cfqd(struct cfq_data *cfqd)
 	kfree(cfqd);
 }
 
-static void cfq_exit_queue(elevator_t *e)
-{
-	struct cfq_data *cfqd = e->elevator_data;
-
-	cfq_shutdown_timer_wq(cfqd);
-	cfq_put_cfqd(cfqd);
-}
-
 static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 {
 	struct cfq_data *cfqd;
@@ -2097,6 +2212,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->empty_list);
+	INIT_LIST_HEAD(&cfqd->cic_list);
 
 	cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
 	if (!cfqd->crq_hash)
@@ -2106,7 +2222,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	if (!cfqd->cfq_hash)
 		goto out_cfqhash;
 
-	cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
+	cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool);
 	if (!cfqd->crq_pool)
 		goto out_crqpool;
 
@@ -2118,7 +2234,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	e->elevator_data = cfqd;
 
 	cfqd->queue = q;
-	atomic_inc(&q->refcnt);
 
 	cfqd->max_queued = q->nr_requests / 4;
 	q->nr_batching = cfq_queued;
@@ -2133,8 +2248,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 
 	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
 
-	atomic_set(&cfqd->ref, 1);
-
 	cfqd->cfq_queued = cfq_queued;
 	cfqd->cfq_quantum = cfq_quantum;
 	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
@@ -2145,7 +2258,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
-	cfqd->cfq_max_depth = cfq_max_depth;
 
 	return 0;
 out_crqpool:
@@ -2193,11 +2305,6 @@ fail:
 /*
  * sysfs parts below -->
  */
-struct cfq_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct cfq_data *, char *);
-	ssize_t (*store)(struct cfq_data *, const char *, size_t);
-};
 
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
@@ -2215,8 +2322,9 @@ cfq_var_store(unsigned int *var, const char *page, size_t count)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(struct cfq_data *cfqd, char *page)		\
+static ssize_t __FUNC(elevator_t *e, char *page)			\
 {									\
+	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data = __VAR;					\
 	if (__CONV)							\
 		__data = jiffies_to_msecs(__data);			\
@@ -2226,18 +2334,18 @@ SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
-SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
-SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
+SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
+SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
-SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)	\
+static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 {									\
+	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data;						\
 	int ret = cfq_var_store(&__data, (page), count);		\
 	if (__data < (MIN))						\
@@ -2254,121 +2362,29 @@ STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
-STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
-STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
+STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
+STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
-STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
 #undef STORE_FUNCTION
 
-static struct cfq_fs_entry cfq_quantum_entry = {
-	.attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_quantum_show,
-	.store = cfq_quantum_store,
-};
-static struct cfq_fs_entry cfq_queued_entry = {
-	.attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_queued_show,
-	.store = cfq_queued_store,
-};
-static struct cfq_fs_entry cfq_fifo_expire_sync_entry = {
-	.attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_fifo_expire_sync_show,
-	.store = cfq_fifo_expire_sync_store,
-};
-static struct cfq_fs_entry cfq_fifo_expire_async_entry = {
-	.attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_fifo_expire_async_show,
-	.store = cfq_fifo_expire_async_store,
-};
-static struct cfq_fs_entry cfq_back_max_entry = {
-	.attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_back_max_show,
-	.store = cfq_back_max_store,
-};
-static struct cfq_fs_entry cfq_back_penalty_entry = {
-	.attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_back_penalty_show,
-	.store = cfq_back_penalty_store,
-};
-static struct cfq_fs_entry cfq_slice_sync_entry = {
-	.attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_sync_show,
-	.store = cfq_slice_sync_store,
-};
-static struct cfq_fs_entry cfq_slice_async_entry = {
-	.attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_async_show,
-	.store = cfq_slice_async_store,
-};
-static struct cfq_fs_entry cfq_slice_async_rq_entry = {
-	.attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_async_rq_show,
-	.store = cfq_slice_async_rq_store,
-};
-static struct cfq_fs_entry cfq_slice_idle_entry = {
-	.attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_idle_show,
-	.store = cfq_slice_idle_store,
-};
-static struct cfq_fs_entry cfq_max_depth_entry = {
-	.attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_max_depth_show,
-	.store = cfq_max_depth_store,
-};
-
-static struct attribute *default_attrs[] = {
-	&cfq_quantum_entry.attr,
-	&cfq_queued_entry.attr,
-	&cfq_fifo_expire_sync_entry.attr,
-	&cfq_fifo_expire_async_entry.attr,
-	&cfq_back_max_entry.attr,
-	&cfq_back_penalty_entry.attr,
-	&cfq_slice_sync_entry.attr,
-	&cfq_slice_async_entry.attr,
-	&cfq_slice_async_rq_entry.attr,
-	&cfq_slice_idle_entry.attr,
-	&cfq_max_depth_entry.attr,
-	NULL,
-};
-
-#define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
-
-static ssize_t
-cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct cfq_fs_entry *entry = to_cfq(attr);
-
-	if (!entry->show)
-		return -EIO;
-
-	return entry->show(e->elevator_data, page);
-}
-
-static ssize_t
-cfq_attr_store(struct kobject *kobj, struct attribute *attr,
-	       const char *page, size_t length)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct cfq_fs_entry *entry = to_cfq(attr);
-
-	if (!entry->store)
-		return -EIO;
-
-	return entry->store(e->elevator_data, page, length);
-}
-
-static struct sysfs_ops cfq_sysfs_ops = {
-	.show	= cfq_attr_show,
-	.store	= cfq_attr_store,
-};
-
-static struct kobj_type cfq_ktype = {
-	.sysfs_ops	= &cfq_sysfs_ops,
-	.default_attrs	= default_attrs,
+#define CFQ_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
+
+static struct elv_fs_entry cfq_attrs[] = {
+	CFQ_ATTR(quantum),
+	CFQ_ATTR(queued),
+	CFQ_ATTR(fifo_expire_sync),
+	CFQ_ATTR(fifo_expire_async),
+	CFQ_ATTR(back_seek_max),
+	CFQ_ATTR(back_seek_penalty),
+	CFQ_ATTR(slice_sync),
+	CFQ_ATTR(slice_async),
+	CFQ_ATTR(slice_async_rq),
+	CFQ_ATTR(slice_idle),
+	__ATTR_NULL
 };
 
 static struct elevator_type iosched_cfq = {
@@ -2389,8 +2405,9 @@ static struct elevator_type iosched_cfq = {
 		.elevator_may_queue_fn =	cfq_may_queue,
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
+		.trim =				cfq_trim,
 	},
-	.elevator_ktype =	&cfq_ktype,
+	.elevator_attrs =	cfq_attrs,
 	.elevator_name =	"cfq",
 	.elevator_owner =	THIS_MODULE,
 };
@@ -2419,7 +2436,13 @@ static int __init cfq_init(void)
 
 static void __exit cfq_exit(void)
 {
+	DECLARE_COMPLETION(all_gone);
 	elv_unregister(&iosched_cfq);
+	ioc_gone = &all_gone;
+	barrier();
+	if (atomic_read(&ioc_count))
+		complete(ioc_gone);
+	synchronize_rcu();
 	cfq_slab_kill();
 }
 
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 27e494b1bf9..399fa1e60e1 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -694,11 +694,6 @@ deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 /*
  * sysfs parts below
  */
-struct deadline_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct deadline_data *, char *);
-	ssize_t (*store)(struct deadline_data *, const char *, size_t);
-};
 
 static ssize_t
 deadline_var_show(int var, char *page)
@@ -716,23 +711,25 @@ deadline_var_store(int *var, const char *page, size_t count)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(struct deadline_data *dd, char *page)		\
+static ssize_t __FUNC(elevator_t *e, char *page)			\
 {									\
-	int __data = __VAR;					\
+	struct deadline_data *dd = e->elevator_data;			\
+	int __data = __VAR;						\
 	if (__CONV)							\
 		__data = jiffies_to_msecs(__data);			\
 	return deadline_var_show(__data, (page));			\
 }
-SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ], 1);
-SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE], 1);
-SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved, 0);
-SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges, 0);
-SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch, 0);
+SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
+SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
+SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
+SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
+SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count)	\
+static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 {									\
+	struct deadline_data *dd = e->elevator_data;			\
 	int __data;							\
 	int ret = deadline_var_store(&__data, (page), count);		\
 	if (__data < (MIN))						\
@@ -745,83 +742,24 @@ static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count)
 		*(__PTR) = __data;					\
 	return ret;							\
 }
-STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
-STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1, 0);
-STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX, 0);
+STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
+STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
+STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
 #undef STORE_FUNCTION
 
-static struct deadline_fs_entry deadline_readexpire_entry = {
-	.attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_readexpire_show,
-	.store = deadline_readexpire_store,
-};
-static struct deadline_fs_entry deadline_writeexpire_entry = {
-	.attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_writeexpire_show,
-	.store = deadline_writeexpire_store,
-};
-static struct deadline_fs_entry deadline_writesstarved_entry = {
-	.attr = {.name = "writes_starved", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_writesstarved_show,
-	.store = deadline_writesstarved_store,
-};
-static struct deadline_fs_entry deadline_frontmerges_entry = {
-	.attr = {.name = "front_merges", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_frontmerges_show,
-	.store = deadline_frontmerges_store,
-};
-static struct deadline_fs_entry deadline_fifobatch_entry = {
-	.attr = {.name = "fifo_batch", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_fifobatch_show,
-	.store = deadline_fifobatch_store,
-};
-
-static struct attribute *default_attrs[] = {
-	&deadline_readexpire_entry.attr,
-	&deadline_writeexpire_entry.attr,
-	&deadline_writesstarved_entry.attr,
-	&deadline_frontmerges_entry.attr,
-	&deadline_fifobatch_entry.attr,
-	NULL,
-};
-
-#define to_deadline(atr) container_of((atr), struct deadline_fs_entry, attr)
-
-static ssize_t
-deadline_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct deadline_fs_entry *entry = to_deadline(attr);
-
-	if (!entry->show)
-		return -EIO;
-
-	return entry->show(e->elevator_data, page);
-}
-
-static ssize_t
-deadline_attr_store(struct kobject *kobj, struct attribute *attr,
-		    const char *page, size_t length)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct deadline_fs_entry *entry = to_deadline(attr);
-
-	if (!entry->store)
-		return -EIO;
-
-	return entry->store(e->elevator_data, page, length);
-}
-
-static struct sysfs_ops deadline_sysfs_ops = {
-	.show	= deadline_attr_show,
-	.store	= deadline_attr_store,
-};
-
-static struct kobj_type deadline_ktype = {
-	.sysfs_ops	= &deadline_sysfs_ops,
-	.default_attrs	= default_attrs,
+#define DD_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
+				      deadline_##name##_store)
+
+static struct elv_fs_entry deadline_attrs[] = {
+	DD_ATTR(read_expire),
+	DD_ATTR(write_expire),
+	DD_ATTR(writes_starved),
+	DD_ATTR(front_merges),
+	DD_ATTR(fifo_batch),
+	__ATTR_NULL
 };
 
 static struct elevator_type iosched_deadline = {
@@ -840,7 +778,7 @@ static struct elevator_type iosched_deadline = {
 		.elevator_exit_fn =		deadline_exit_queue,
 	},
 
-	.elevator_ktype = &deadline_ktype,
+	.elevator_attrs = deadline_attrs,
 	.elevator_name = "deadline",
 	.elevator_owner = THIS_MODULE,
 };
diff --git a/block/elevator.c b/block/elevator.c
index 24b702d649a..0d6be03d929 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
+#include <linux/blktrace_api.h>
 
 #include <asm/uaccess.h>
 
@@ -120,15 +121,10 @@ static struct elevator_type *elevator_get(const char *name)
 	return e;
 }
 
-static int elevator_attach(request_queue_t *q, struct elevator_type *e,
-			   struct elevator_queue *eq)
+static int elevator_attach(request_queue_t *q, struct elevator_queue *eq)
 {
 	int ret = 0;
 
-	memset(eq, 0, sizeof(*eq));
-	eq->ops = &e->ops;
-	eq->elevator_type = e;
-
 	q->elevator = eq;
 
 	if (eq->ops->elevator_init_fn)
@@ -149,11 +145,37 @@ static int __init elevator_setup(char *str)
 		strcpy(chosen_elevator, "anticipatory");
 	else
 		strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
-	return 0;
+	return 1;
 }
 
 __setup("elevator=", elevator_setup);
 
+static struct kobj_type elv_ktype;
+
+static elevator_t *elevator_alloc(struct elevator_type *e)
+{
+	elevator_t *eq = kmalloc(sizeof(elevator_t), GFP_KERNEL);
+	if (eq) {
+		memset(eq, 0, sizeof(*eq));
+		eq->ops = &e->ops;
+		eq->elevator_type = e;
+		kobject_init(&eq->kobj);
+		snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
+		eq->kobj.ktype = &elv_ktype;
+		mutex_init(&eq->sysfs_lock);
+	} else {
+		elevator_put(e);
+	}
+	return eq;
+}
+
+static void elevator_release(struct kobject *kobj)
+{
+	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	elevator_put(e->elevator_type);
+	kfree(e);
+}
+
 int elevator_init(request_queue_t *q, char *name)
 {
 	struct elevator_type *e = NULL;
@@ -176,29 +198,26 @@ int elevator_init(request_queue_t *q, char *name)
 		e = elevator_get("noop");
 	}
 
-	eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL);
-	if (!eq) {
-		elevator_put(e);
+	eq = elevator_alloc(e);
+	if (!eq)
 		return -ENOMEM;
-	}
 
-	ret = elevator_attach(q, e, eq);
-	if (ret) {
-		kfree(eq);
-		elevator_put(e);
-	}
+	ret = elevator_attach(q, eq);
+	if (ret)
+		kobject_put(&eq->kobj);
 
 	return ret;
 }
 
 void elevator_exit(elevator_t *e)
 {
+	mutex_lock(&e->sysfs_lock);
 	if (e->ops->elevator_exit_fn)
 		e->ops->elevator_exit_fn(e);
+	e->ops = NULL;
+	mutex_unlock(&e->sysfs_lock);
 
-	elevator_put(e->elevator_type);
-	e->elevator_type = NULL;
-	kfree(e);
+	kobject_put(&e->kobj);
 }
 
 /*
@@ -315,6 +334,8 @@ void elv_insert(request_queue_t *q, struct request *rq, int where)
 	struct list_head *pos;
 	unsigned ordseq;
 
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+
 	rq->q = q;
 
 	switch (where) {
@@ -481,6 +502,7 @@ struct request *elv_next_request(request_queue_t *q)
 			 * not be passed by new incoming requests
 			 */
 			rq->flags |= REQ_STARTED;
+			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -627,34 +649,86 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
 	}
 }
 
-int elv_register_queue(struct request_queue *q)
+#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
+
+static ssize_t
+elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
-	elevator_t *e = q->elevator;
+	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	struct elv_fs_entry *entry = to_elv(attr);
+	ssize_t error;
+
+	if (!entry->show)
+		return -EIO;
+
+	mutex_lock(&e->sysfs_lock);
+	error = e->ops ? entry->show(e, page) : -ENOENT;
+	mutex_unlock(&e->sysfs_lock);
+	return error;
+}
 
-	e->kobj.parent = kobject_get(&q->kobj);
-	if (!e->kobj.parent)
-		return -EBUSY;
+static ssize_t
+elv_attr_store(struct kobject *kobj, struct attribute *attr,
+	       const char *page, size_t length)
+{
+	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	struct elv_fs_entry *entry = to_elv(attr);
+	ssize_t error;
 
-	snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
-	e->kobj.ktype = e->elevator_type->elevator_ktype;
+	if (!entry->store)
+		return -EIO;
 
-	return kobject_register(&e->kobj);
+	mutex_lock(&e->sysfs_lock);
+	error = e->ops ? entry->store(e, page, length) : -ENOENT;
+	mutex_unlock(&e->sysfs_lock);
+	return error;
+}
+
+static struct sysfs_ops elv_sysfs_ops = {
+	.show	= elv_attr_show,
+	.store	= elv_attr_store,
+};
+
+static struct kobj_type elv_ktype = {
+	.sysfs_ops	= &elv_sysfs_ops,
+	.release	= elevator_release,
+};
+
+int elv_register_queue(struct request_queue *q)
+{
+	elevator_t *e = q->elevator;
+	int error;
+
+	e->kobj.parent = &q->kobj;
+
+	error = kobject_add(&e->kobj);
+	if (!error) {
+		struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
+		if (attr) {
+			while (attr->attr.name) {
+				if (sysfs_create_file(&e->kobj, &attr->attr))
+					break;
+				attr++;
+			}
+		}
+		kobject_uevent(&e->kobj, KOBJ_ADD);
+	}
+	return error;
 }
 
 void elv_unregister_queue(struct request_queue *q)
 {
 	if (q) {
 		elevator_t *e = q->elevator;
-		kobject_unregister(&e->kobj);
-		kobject_put(&q->kobj);
+		kobject_uevent(&e->kobj, KOBJ_REMOVE);
+		kobject_del(&e->kobj);
 	}
 }
 
 int elv_register(struct elevator_type *e)
 {
 	spin_lock_irq(&elv_list_lock);
-	if (elevator_find(e->elevator_name))
-		BUG();
+	BUG_ON(elevator_find(e->elevator_name));
 	list_add_tail(&e->list, &elv_list);
 	spin_unlock_irq(&elv_list_lock);
 
@@ -675,21 +749,15 @@ void elv_unregister(struct elevator_type *e)
 	/*
 	 * Iterate every thread in the process to remove the io contexts.
 	 */
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		struct io_context *ioc = p->io_context;
-		if (ioc && ioc->cic) {
-			ioc->cic->exit(ioc->cic);
-			ioc->cic->dtor(ioc->cic);
-			ioc->cic = NULL;
-		}
-		if (ioc && ioc->aic) {
-			ioc->aic->exit(ioc->aic);
-			ioc->aic->dtor(ioc->aic);
-			ioc->aic = NULL;
-		}
-	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
+	if (e->ops.trim) {
+		read_lock(&tasklist_lock);
+		do_each_thread(g, p) {
+			task_lock(p);
+			e->ops.trim(p->io_context);
+			task_unlock(p);
+		} while_each_thread(g, p);
+		read_unlock(&tasklist_lock);
+	}
 
 	spin_lock_irq(&elv_list_lock);
 	list_del_init(&e->list);
@@ -703,16 +771,16 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  * need for the new one. this way we have a chance of going back to the old
  * one, if the new one fails init for some reason.
  */
-static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
+static int elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 {
 	elevator_t *old_elevator, *e;
 
 	/*
 	 * Allocate new elevator
 	 */
-	e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
+	e = elevator_alloc(new_e);
 	if (!e)
-		goto error;
+		return 0;
 
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data
@@ -743,7 +811,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 	/*
 	 * attach and start new elevator
 	 */
-	if (elevator_attach(q, new_e, e))
+	if (elevator_attach(q, e))
 		goto fail;
 
 	if (elv_register_queue(q))
@@ -754,7 +822,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 	 */
 	elevator_exit(old_elevator);
 	clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	return;
+	return 1;
 
 fail_register:
 	/*
@@ -767,10 +835,9 @@ fail:
 	q->elevator = old_elevator;
 	elv_register_queue(q);
 	clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	kfree(e);
-error:
-	elevator_put(new_e);
-	printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
+	if (e)
+		kobject_put(&e->kobj);
+	return 0;
 }
 
 ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
@@ -797,7 +864,8 @@ ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
 		return count;
 	}
 
-	elevator_switch(q, e);
+	if (!elevator_switch(q, e))
+		printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name);
 	return count;
 }
 
diff --git a/block/genhd.c b/block/genhd.c
index db57546a709..5a8d3bf02f1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -15,12 +15,11 @@
 #include <linux/kmod.h>
 #include <linux/kobj_map.h>
 #include <linux/buffer_head.h>
-
-#define MAX_PROBE_HASH 255	/* random */
+#include <linux/mutex.h>
 
 static struct subsystem block_subsys;
 
-static DECLARE_MUTEX(block_subsys_sem);
+static DEFINE_MUTEX(block_subsys_lock);
 
 /*
  * Can be deleted altogether. Later.
@@ -30,115 +29,36 @@ static struct blk_major_name {
 	struct blk_major_name *next;
 	int major;
 	char name[16];
-} *major_names[MAX_PROBE_HASH];
+} *major_names[BLKDEV_MAJOR_HASH_SIZE];
 
 /* index in the above - for now: assume no multimajor ranges */
 static inline int major_to_index(int major)
 {
-	return major % MAX_PROBE_HASH;
-}
-
-struct blkdev_info {
-        int index;
-        struct blk_major_name *bd;
-};
-
-/*
- * iterate over a list of blkdev_info structures.  allows
- * the major_names array to be iterated over from outside this file
- * must be called with the block_subsys_sem held
- */
-void *get_next_blkdev(void *dev)
-{
-        struct blkdev_info *info;
-
-        if (dev == NULL) {
-                info = kmalloc(sizeof(*info), GFP_KERNEL);
-                if (!info)
-                        goto out;
-                info->index=0;
-                info->bd = major_names[info->index];
-                if (info->bd)
-                        goto out;
-        } else {
-                info = dev;
-        }
-
-        while (info->index < ARRAY_SIZE(major_names)) {
-                if (info->bd)
-                        info->bd = info->bd->next;
-                if (info->bd)
-                        goto out;
-                /*
-                 * No devices on this chain, move to the next
-                 */
-                info->index++;
-                info->bd = (info->index < ARRAY_SIZE(major_names)) ?
-			major_names[info->index] : NULL;
-                if (info->bd)
-                        goto out;
-        }
-
-out:
-        return info;
-}
-
-void *acquire_blkdev_list(void)
-{
-        down(&block_subsys_sem);
-        return get_next_blkdev(NULL);
-}
-
-void release_blkdev_list(void *dev)
-{
-        up(&block_subsys_sem);
-        kfree(dev);
+	return major % BLKDEV_MAJOR_HASH_SIZE;
 }
 
+#ifdef CONFIG_PROC_FS
 
-/*
- * Count the number of records in the blkdev_list.
- * must be called with the block_subsys_sem held
- */
-int count_blkdev_list(void)
+void blkdev_show(struct seq_file *f, off_t offset)
 {
-	struct blk_major_name *n;
-	int i, count;
+	struct blk_major_name *dp;
 
-	count = 0;
-
-	for (i = 0; i < ARRAY_SIZE(major_names); i++) {
-		for (n = major_names[i]; n; n = n->next)
-				count++;
+	if (offset < BLKDEV_MAJOR_HASH_SIZE) {
+		mutex_lock(&block_subsys_lock);
+		for (dp = major_names[offset]; dp; dp = dp->next)
+			seq_printf(f, "%3d %s\n", dp->major, dp->name);
+		mutex_unlock(&block_subsys_lock);
 	}
-
-	return count;
-}
-
-/*
- * extract the major and name values from a blkdev_info struct
- * passed in as a void to *dev.  Must be called with
- * block_subsys_sem held
- */
-int get_blkdev_info(void *dev, int *major, char **name)
-{
-        struct blkdev_info *info = dev;
-
-        if (info->bd == NULL)
-                return 1;
-
-        *major = info->bd->major;
-        *name = info->bd->name;
-        return 0;
 }
 
+#endif /* CONFIG_PROC_FS */
 
 int register_blkdev(unsigned int major, const char *name)
 {
 	struct blk_major_name **n, *p;
 	int index, ret = 0;
 
-	down(&block_subsys_sem);
+	mutex_lock(&block_subsys_lock);
 
 	/* temporary */
 	if (major == 0) {
@@ -183,7 +103,7 @@ int register_blkdev(unsigned int major, const char *name)
 		kfree(p);
 	}
 out:
-	up(&block_subsys_sem);
+	mutex_unlock(&block_subsys_lock);
 	return ret;
 }
 
@@ -197,7 +117,7 @@ int unregister_blkdev(unsigned int major, const char *name)
 	int index = major_to_index(major);
 	int ret = 0;
 
-	down(&block_subsys_sem);
+	mutex_lock(&block_subsys_lock);
 	for (n = &major_names[index]; *n; n = &(*n)->next)
 		if ((*n)->major == major)
 			break;
@@ -207,7 +127,7 @@ int unregister_blkdev(unsigned int major, const char *name)
 		p = *n;
 		*n = p->next;
 	}
-	up(&block_subsys_sem);
+	mutex_unlock(&block_subsys_lock);
 	kfree(p);
 
 	return ret;
@@ -301,7 +221,7 @@ static void *part_start(struct seq_file *part, loff_t *pos)
 	struct list_head *p;
 	loff_t l = *pos;
 
-	down(&block_subsys_sem);
+	mutex_lock(&block_subsys_lock);
 	list_for_each(p, &block_subsys.kset.list)
 		if (!l--)
 			return list_entry(p, struct gendisk, kobj.entry);
@@ -318,7 +238,7 @@ static void *part_next(struct seq_file *part, void *v, loff_t *pos)
 
 static void part_stop(struct seq_file *part, void *v)
 {
-	up(&block_subsys_sem);
+	mutex_unlock(&block_subsys_lock);
 }
 
 static int show_partition(struct seq_file *part, void *v)
@@ -377,7 +297,7 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
 
 static int __init genhd_device_init(void)
 {
-	bdev_map = kobj_map_init(base_probe, &block_subsys_sem);
+	bdev_map = kobj_map_init(base_probe, &block_subsys_lock);
 	blk_dev_init();
 	subsystem_register(&block_subsys);
 	return 0;
@@ -453,8 +373,8 @@ static ssize_t disk_stats_read(struct gendisk * disk, char *page)
 	disk_round_stats(disk);
 	preempt_enable();
 	return sprintf(page,
-		"%8u %8u %8llu %8u "
-		"%8u %8u %8llu %8u "
+		"%8lu %8lu %8llu %8u "
+		"%8lu %8lu %8llu %8u "
 		"%8u %8u %8u"
 		"\n",
 		disk_stat_read(disk, ios[READ]),
@@ -611,7 +531,7 @@ static void *diskstats_start(struct seq_file *part, loff_t *pos)
 	loff_t k = *pos;
 	struct list_head *p;
 
-	down(&block_subsys_sem);
+	mutex_lock(&block_subsys_lock);
 	list_for_each(p, &block_subsys.kset.list)
 		if (!k--)
 			return list_entry(p, struct gendisk, kobj.entry);
@@ -628,7 +548,7 @@ static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
 
 static void diskstats_stop(struct seq_file *part, void *v)
 {
-	up(&block_subsys_sem);
+	mutex_unlock(&block_subsys_lock);
 }
 
 static int diskstats_show(struct seq_file *s, void *v)
@@ -648,7 +568,7 @@ static int diskstats_show(struct seq_file *s, void *v)
 	preempt_disable();
 	disk_round_stats(gp);
 	preempt_enable();
-	seq_printf(s, "%4d %4d %s %u %u %llu %u %u %u %llu %u %u %u %u\n",
+	seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
 		gp->major, n + gp->first_minor, disk_name(gp, n, buf),
 		disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
 		(unsigned long long)disk_stat_read(gp, sectors[0]),
diff --git a/block/ioctl.c b/block/ioctl.c
index e1109491c23..9cfa2e1ecb2 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,6 +5,7 @@
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 
 static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
@@ -42,9 +43,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 					return -EINVAL;
 			}
 			/* partition number in use? */
-			down(&bdev->bd_sem);
+			mutex_lock(&bdev->bd_mutex);
 			if (disk->part[part - 1]) {
-				up(&bdev->bd_sem);
+				mutex_unlock(&bdev->bd_mutex);
 				return -EBUSY;
 			}
 			/* overlap? */
@@ -55,13 +56,13 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 					continue;
 				if (!(start+length <= s->start_sect ||
 				      start >= s->start_sect + s->nr_sects)) {
-					up(&bdev->bd_sem);
+					mutex_unlock(&bdev->bd_mutex);
 					return -EBUSY;
 				}
 			}
 			/* all seems OK */
 			add_partition(disk, part, start, length);
-			up(&bdev->bd_sem);
+			mutex_unlock(&bdev->bd_mutex);
 			return 0;
 		case BLKPG_DEL_PARTITION:
 			if (!disk->part[part-1])
@@ -71,9 +72,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			bdevp = bdget_disk(disk, part);
 			if (!bdevp)
 				return -ENOMEM;
-			down(&bdevp->bd_sem);
+			mutex_lock(&bdevp->bd_mutex);
 			if (bdevp->bd_openers) {
-				up(&bdevp->bd_sem);
+				mutex_unlock(&bdevp->bd_mutex);
 				bdput(bdevp);
 				return -EBUSY;
 			}
@@ -81,10 +82,10 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			fsync_bdev(bdevp);
 			invalidate_bdev(bdevp, 0);
 
-			down(&bdev->bd_sem);
+			mutex_lock(&bdev->bd_mutex);
 			delete_partition(disk, part);
-			up(&bdev->bd_sem);
-			up(&bdevp->bd_sem);
+			mutex_unlock(&bdev->bd_mutex);
+			mutex_unlock(&bdevp->bd_mutex);
 			bdput(bdevp);
 
 			return 0;
@@ -102,10 +103,10 @@ static int blkdev_reread_part(struct block_device *bdev)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
-	if (down_trylock(&bdev->bd_sem))
+	if (!mutex_trylock(&bdev->bd_mutex))
 		return -EBUSY;
 	res = rescan_partitions(disk, bdev);
-	up(&bdev->bd_sem);
+	mutex_unlock(&bdev->bd_mutex);
 	return res;
 }
 
@@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
 		return put_ulong(arg, bdev->bd_inode->i_size >> 9);
 	case BLKGETSIZE64:
 		return put_u64(arg, bdev->bd_inode->i_size);
+	case BLKTRACESTART:
+	case BLKTRACESTOP:
+	case BLKTRACESETUP:
+	case BLKTRACETEARDOWN:
+		return blk_trace_ioctl(bdev, cmd, (char __user *) arg);
 	}
 	return -ENOIOCTLCMD;
 }
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 0ef2971a9e8..5b26af8597f 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/blktrace_api.h>
 
 /*
  * for max sense size
@@ -784,6 +785,8 @@ void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
 	t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
 	t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
 	t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
+	if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
+		clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
 }
 
 EXPORT_SYMBOL(blk_queue_stack_limits);
@@ -905,17 +908,15 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
 				__FUNCTION__, depth);
 	}
 
-	tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
+	tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
 	if (!tag_index)
 		goto fail;
 
 	nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
-	tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
+	tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
 	if (!tag_map)
 		goto fail;
 
-	memset(tag_index, 0, depth * sizeof(struct request *));
-	memset(tag_map, 0, nr_ulongs * sizeof(unsigned long));
 	tags->real_max_depth = depth;
 	tags->max_depth = depth;
 	tags->tag_index = tag_index;
@@ -1556,8 +1557,10 @@ void blk_plug_device(request_queue_t *q)
 	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
 		return;
 
-	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
+		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1621,14 +1624,21 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
 	/*
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
-	if (q->unplug_fn)
+	if (q->unplug_fn) {
+		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+					q->rq.count[READ] + q->rq.count[WRITE]);
+
 		q->unplug_fn(q);
+	}
 }
 
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	q->unplug_fn(q);
 }
 
@@ -1636,6 +1646,9 @@ static void blk_unplug_timeout(unsigned long data)
 {
 	request_queue_t *q = (request_queue_t *)data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	kblockd_schedule_work(&q->unplug_work);
 }
 
@@ -1740,16 +1753,11 @@ EXPORT_SYMBOL(blk_run_queue);
  *     Hopefully the low level driver will have finished any
  *     outstanding requests first...
  **/
-void blk_cleanup_queue(request_queue_t * q)
+static void blk_release_queue(struct kobject *kobj)
 {
+	request_queue_t *q = container_of(kobj, struct request_queue, kobj);
 	struct request_list *rl = &q->rq;
 
-	if (!atomic_dec_and_test(&q->refcnt))
-		return;
-
-	if (q->elevator)
-		elevator_exit(q->elevator);
-
 	blk_sync_queue(q);
 
 	if (rl->rq_pool)
@@ -1758,9 +1766,30 @@ void blk_cleanup_queue(request_queue_t * q)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	if (q->blk_trace)
+		blk_trace_shutdown(q);
+
 	kmem_cache_free(requestq_cachep, q);
 }
 
+void blk_put_queue(request_queue_t *q)
+{
+	kobject_put(&q->kobj);
+}
+EXPORT_SYMBOL(blk_put_queue);
+
+void blk_cleanup_queue(request_queue_t * q)
+{
+	mutex_lock(&q->sysfs_lock);
+	set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
+	mutex_unlock(&q->sysfs_lock);
+
+	if (q->elevator)
+		elevator_exit(q->elevator);
+
+	blk_put_queue(q);
+}
+
 EXPORT_SYMBOL(blk_cleanup_queue);
 
 static int blk_init_free_list(request_queue_t *q)
@@ -1788,6 +1817,8 @@ request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
+static struct kobj_type queue_ktype;
+
 request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
 	request_queue_t *q;
@@ -1798,11 +1829,16 @@ request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	memset(q, 0, sizeof(*q));
 	init_timer(&q->unplug_timer);
-	atomic_set(&q->refcnt, 1);
+
+	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
+	q->kobj.ktype = &queue_ktype;
+	kobject_init(&q->kobj);
 
 	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
 	q->backing_dev_info.unplug_io_data = q;
 
+	mutex_init(&q->sysfs_lock);
+
 	return q;
 }
 EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -1854,8 +1890,10 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 		return NULL;
 
 	q->node = node_id;
-	if (blk_init_free_list(q))
-		goto out_init;
+	if (blk_init_free_list(q)) {
+		kmem_cache_free(requestq_cachep, q);
+		return NULL;
+	}
 
 	/*
 	 * if caller didn't supply a lock, they get per-queue locking with
@@ -1891,9 +1929,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 		return q;
 	}
 
-	blk_cleanup_queue(q);
-out_init:
-	kmem_cache_free(requestq_cachep, q);
+	blk_put_queue(q);
 	return NULL;
 }
 EXPORT_SYMBOL(blk_init_queue_node);
@@ -1901,7 +1937,7 @@ EXPORT_SYMBOL(blk_init_queue_node);
 int blk_get_queue(request_queue_t *q)
 {
 	if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-		atomic_inc(&q->refcnt);
+		kobject_get(&q->kobj);
 		return 0;
 	}
 
@@ -2109,6 +2145,8 @@ rq_starved:
 	
 	rq_init(q, rq);
 	rq->rl = rl;
+
+	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
 out:
 	return rq;
 }
@@ -2137,6 +2175,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw,
 		if (!rq) {
 			struct io_context *ioc;
 
+			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
 			io_schedule();
@@ -2190,6 +2230,8 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_requeue_request(request_queue_t *q, struct request *rq)
 {
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
@@ -2437,10 +2479,12 @@ void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
 	rq->rq_disk = bd_disk;
 	rq->flags |= REQ_NOMERGE;
 	rq->end_io = done;
-	elv_add_request(q, rq, where, 1);
-	generic_unplug_device(q);
+	WARN_ON(irqs_disabled());
+	spin_lock_irq(q->queue_lock);
+	__elv_add_request(q, rq, where, 1);
+	__generic_unplug_device(q);
+	spin_unlock_irq(q->queue_lock);
 }
-
 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
 
 /**
@@ -2824,6 +2868,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->back_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
@@ -2839,6 +2885,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->front_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+
 			bio->bi_next = req->bio;
 			req->bio = bio;
 
@@ -2956,6 +3004,7 @@ void generic_make_request(struct bio *bio)
 	request_queue_t *q;
 	sector_t maxsector;
 	int ret, nr_sectors = bio_sectors(bio);
+	dev_t old_dev;
 
 	might_sleep();
 	/* Test device or partition size, when known. */
@@ -2982,6 +3031,8 @@ void generic_make_request(struct bio *bio)
 	 * NOTE: we don't repeat the blk_size check for each new device.
 	 * Stacking drivers are expected to know what they are doing.
 	 */
+	maxsector = -1;
+	old_dev = 0;
 	do {
 		char b[BDEVNAME_SIZE];
 
@@ -3014,6 +3065,15 @@ end_io:
 		 */
 		blk_partition_remap(bio);
 
+		if (maxsector != -1)
+			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
+					    maxsector);
+
+		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+
+		maxsector = bio->bi_sector;
+		old_dev = bio->bi_bdev->bd_dev;
+
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
 }
@@ -3133,6 +3193,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
+	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
 	 */
@@ -3452,7 +3514,7 @@ int __init blk_dev_init(void)
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
 			sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
@@ -3477,10 +3539,18 @@ void put_io_context(struct io_context *ioc)
 	BUG_ON(atomic_read(&ioc->refcount) == 0);
 
 	if (atomic_dec_and_test(&ioc->refcount)) {
+		struct cfq_io_context *cic;
+
+		rcu_read_lock();
 		if (ioc->aic && ioc->aic->dtor)
 			ioc->aic->dtor(ioc->aic);
-		if (ioc->cic && ioc->cic->dtor)
-			ioc->cic->dtor(ioc->cic);
+		if (ioc->cic_root.rb_node != NULL) {
+			struct rb_node *n = rb_first(&ioc->cic_root);
+
+			cic = rb_entry(n, struct cfq_io_context, rb_node);
+			cic->dtor(ioc);
+		}
+		rcu_read_unlock();
 
 		kmem_cache_free(iocontext_cachep, ioc);
 	}
@@ -3492,6 +3562,7 @@ void exit_io_context(void)
 {
 	unsigned long flags;
 	struct io_context *ioc;
+	struct cfq_io_context *cic;
 
 	local_irq_save(flags);
 	task_lock(current);
@@ -3503,9 +3574,11 @@ void exit_io_context(void)
 
 	if (ioc->aic && ioc->aic->exit)
 		ioc->aic->exit(ioc->aic);
-	if (ioc->cic && ioc->cic->exit)
-		ioc->cic->exit(ioc->cic);
-
+	if (ioc->cic_root.rb_node != NULL) {
+		cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
+		cic->exit(ioc);
+	}
+ 
 	put_io_context(ioc);
 }
 
@@ -3534,7 +3607,7 @@ struct io_context *current_io_context(gfp_t gfp_flags)
 		ret->last_waited = jiffies; /* doesn't matter... */
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		ret->aic = NULL;
-		ret->cic = NULL;
+		ret->cic_root.rb_node = NULL;
 		tsk->io_context = ret;
 	}
 
@@ -3614,10 +3687,13 @@ static ssize_t
 queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
 	struct request_list *rl = &q->rq;
+	unsigned long nr;
+	int ret = queue_var_store(&nr, page, count);
+	if (nr < BLKDEV_MIN_RQ)
+		nr = BLKDEV_MIN_RQ;
 
-	int ret = queue_var_store(&q->nr_requests, page, count);
-	if (q->nr_requests < BLKDEV_MIN_RQ)
-		q->nr_requests = BLKDEV_MIN_RQ;
+	spin_lock_irq(q->queue_lock);
+	q->nr_requests = nr;
 	blk_queue_congestion_threshold(q);
 
 	if (rl->count[READ] >= queue_congestion_on_threshold(q))
@@ -3643,6 +3719,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 		blk_clear_queue_full(q, WRITE);
 		wake_up(&rl->wait[WRITE]);
 	}
+	spin_unlock_irq(q->queue_lock);
 	return ret;
 }
 
@@ -3758,13 +3835,19 @@ static ssize_t
 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
 	struct queue_sysfs_entry *entry = to_queue(attr);
-	struct request_queue *q;
+	request_queue_t *q = container_of(kobj, struct request_queue, kobj);
+	ssize_t res;
 
-	q = container_of(kobj, struct request_queue, kobj);
 	if (!entry->show)
 		return -EIO;
-
-	return entry->show(q, page);
+	mutex_lock(&q->sysfs_lock);
+	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+		mutex_unlock(&q->sysfs_lock);
+		return -ENOENT;
+	}
+	res = entry->show(q, page);
+	mutex_unlock(&q->sysfs_lock);
+	return res;
 }
 
 static ssize_t
@@ -3772,13 +3855,20 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 		    const char *page, size_t length)
 {
 	struct queue_sysfs_entry *entry = to_queue(attr);
-	struct request_queue *q;
+	request_queue_t *q = container_of(kobj, struct request_queue, kobj);
+
+	ssize_t res;
 
-	q = container_of(kobj, struct request_queue, kobj);
 	if (!entry->store)
 		return -EIO;
-
-	return entry->store(q, page, length);
+	mutex_lock(&q->sysfs_lock);
+	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+		mutex_unlock(&q->sysfs_lock);
+		return -ENOENT;
+	}
+	res = entry->store(q, page, length);
+	mutex_unlock(&q->sysfs_lock);
+	return res;
 }
 
 static struct sysfs_ops queue_sysfs_ops = {
@@ -3789,6 +3879,7 @@ static struct sysfs_ops queue_sysfs_ops = {
 static struct kobj_type queue_ktype = {
 	.sysfs_ops	= &queue_sysfs_ops,
 	.default_attrs	= default_attrs,
+	.release	= blk_release_queue,
 };
 
 int blk_register_queue(struct gendisk *disk)
@@ -3801,19 +3892,17 @@ int blk_register_queue(struct gendisk *disk)
 		return -ENXIO;
 
 	q->kobj.parent = kobject_get(&disk->kobj);
-	if (!q->kobj.parent)
-		return -EBUSY;
 
-	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
-	q->kobj.ktype = &queue_ktype;
-
-	ret = kobject_register(&q->kobj);
+	ret = kobject_add(&q->kobj);
 	if (ret < 0)
 		return ret;
 
+	kobject_uevent(&q->kobj, KOBJ_ADD);
+
 	ret = elv_register_queue(q);
 	if (ret) {
-		kobject_unregister(&q->kobj);
+		kobject_uevent(&q->kobj, KOBJ_REMOVE);
+		kobject_del(&q->kobj);
 		return ret;
 	}
 
@@ -3827,7 +3916,8 @@ void blk_unregister_queue(struct gendisk *disk)
 	if (q && q->request_fn) {
 		elv_unregister_queue(q);
 
-		kobject_unregister(&q->kobj);
+		kobject_uevent(&q->kobj, KOBJ_REMOVE);
+		kobject_del(&q->kobj);
 		kobject_put(&disk->kobj);
 	}
 }