30 files changed, 573 insertions, 595 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 98d9ec85e0e..8420129fc5e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -327,4 +327,10 @@ config DM_UEVENT
 	---help---
 	Generate udev events for DM events.
 
+config DM_FLAKEY
+       tristate "Flakey target (EXPERIMENTAL)"
+       depends on BLK_DEV_DM && EXPERIMENTAL
+       ---help---
+         A target that intermittently fails I/O for debugging purposes.
+
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d0138606c2e..448838b1f92 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
+obj-$(CONFIG_DM_FLAKEY)		+= dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9a35320fb59..5c9362792f1 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -347,7 +347,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 			atomic_inc(&bitmap->pending_writes);
 			set_buffer_locked(bh);
 			set_buffer_mapped(bh);
-			submit_bh(WRITE | REQ_UNPLUG | REQ_SYNC, bh);
+			submit_bh(WRITE | REQ_SYNC, bh);
 			bh = bh->b_this_page;
 		}
 
@@ -854,7 +854,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 		if (bitmap->flags & BITMAP_HOSTENDIAN)
 			set_bit(bit, kaddr);
 		else
-			ext2_set_bit(bit, kaddr);
+			__test_and_set_bit_le(bit, kaddr);
 		kunmap_atomic(kaddr, KM_USER0);
 		PRINTK("set file bit %lu page %lu\n", bit, page->index);
 	}
@@ -1050,7 +1050,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		if (bitmap->flags & BITMAP_HOSTENDIAN)
 			b = test_bit(bit, paddr);
 		else
-			b = ext2_test_bit(bit, paddr);
+			b = test_bit_le(bit, paddr);
 		kunmap_atomic(paddr, KM_USER0);
 		if (b) {
 			/* if the disk bit is set, set the memory bit */
@@ -1226,7 +1226,7 @@ void bitmap_daemon_work(mddev_t *mddev)
 						clear_bit(file_page_offset(bitmap, j),
 							  paddr);
 					else
-						ext2_clear_bit(file_page_offset(bitmap, j),
+						__test_and_clear_bit_le(file_page_offset(bitmap, j),
 							       paddr);
 					kunmap_atomic(paddr, KM_USER0);
 				} else
@@ -1339,8 +1339,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 			prepare_to_wait(&bitmap->overflow_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
 			spin_unlock_irq(&bitmap->lock);
-			md_unplug(bitmap->mddev);
-			schedule();
+			io_schedule();
 			finish_wait(&bitmap->overflow_wait, &__wait);
 			continue;
 		}
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 931a7a7c379..d0aeaf46d93 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -45,7 +45,7 @@
  *
  * The counter counts pending write requests, plus the on-disk bit.
  * When the counter is '1' and the resync bits are clear, the on-disk
- * bit can be cleared aswell, thus setting the counter to 0.
+ * bit can be cleared as well, thus setting the counter to 0.
  * When we set a bit, or in the counter (to start a write), if the fields is
  * 0, we first set the disk bit and set the counter to 1.
  *
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4e054bd9166..c8827ffd85b 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -991,11 +991,6 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 	clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
-static void kcryptd_unplug(struct crypt_config *cc)
-{
-	blk_unplug(bdev_get_queue(cc->dev->bdev));
-}
-
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
 	struct crypt_config *cc = io->target->private;
@@ -1008,10 +1003,8 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 	 * one in order to decrypt the whole bio data *afterwards*.
 	 */
 	clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
-	if (!clone) {
-		kcryptd_unplug(cc);
+	if (!clone)
 		return 1;
-	}
 
 	crypt_inc_pending(io);
 
@@ -1331,20 +1324,29 @@ static int crypt_setkey_allcpus(struct crypt_config *cc)
 
 static int crypt_set_key(struct crypt_config *cc, char *key)
 {
+	int r = -EINVAL;
+	int key_string_len = strlen(key);
+
 	/* The key size may not be changed. */
-	if (cc->key_size != (strlen(key) >> 1))
-		return -EINVAL;
+	if (cc->key_size != (key_string_len >> 1))
+		goto out;
 
 	/* Hyphen (which gives a key_size of zero) means there is no key. */
 	if (!cc->key_size && strcmp(key, "-"))
-		return -EINVAL;
+		goto out;
 
 	if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
-		return -EINVAL;
+		goto out;
 
 	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
-	return crypt_setkey_allcpus(cc);
+	r = crypt_setkey_allcpus(cc);
+
+out:
+	/* Hex key string not needed after here, so wipe it. */
+	memset(key, '0', key_string_len);
+
+	return r;
 }
 
 static int crypt_wipe_key(struct crypt_config *cc)
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
new file mode 100644
index 00000000000..ea790623c30
--- /dev/null
+++ b/drivers/md/dm-flakey.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2003 Sistina Software (UK) Limited.
+ * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+
+#define DM_MSG_PREFIX "flakey"
+
+/*
+ * Flakey: Used for testing only, simulates intermittent,
+ * catastrophic device failure.
+ */
+struct flakey_c {
+	struct dm_dev *dev;
+	unsigned long start_time;
+	sector_t start;
+	unsigned up_interval;
+	unsigned down_interval;
+};
+
+/*
+ * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval>
+ */
+static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct flakey_c *fc;
+	unsigned long long tmp;
+
+	if (argc != 4) {
+		ti->error = "dm-flakey: Invalid argument count";
+		return -EINVAL;
+	}
+
+	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+	if (!fc) {
+		ti->error = "dm-flakey: Cannot allocate linear context";
+		return -ENOMEM;
+	}
+	fc->start_time = jiffies;
+
+	if (sscanf(argv[1], "%llu", &tmp) != 1) {
+		ti->error = "dm-flakey: Invalid device sector";
+		goto bad;
+	}
+	fc->start = tmp;
+
+	if (sscanf(argv[2], "%u", &fc->up_interval) != 1) {
+		ti->error = "dm-flakey: Invalid up interval";
+		goto bad;
+	}
+
+	if (sscanf(argv[3], "%u", &fc->down_interval) != 1) {
+		ti->error = "dm-flakey: Invalid down interval";
+		goto bad;
+	}
+
+	if (!(fc->up_interval + fc->down_interval)) {
+		ti->error = "dm-flakey: Total (up + down) interval is zero";
+		goto bad;
+	}
+
+	if (fc->up_interval + fc->down_interval < fc->up_interval) {
+		ti->error = "dm-flakey: Interval overflow";
+		goto bad;
+	}
+
+	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) {
+		ti->error = "dm-flakey: Device lookup failed";
+		goto bad;
+	}
+
+	ti->num_flush_requests = 1;
+	ti->private = fc;
+	return 0;
+
+bad:
+	kfree(fc);
+	return -EINVAL;
+}
+
+static void flakey_dtr(struct dm_target *ti)
+{
+	struct flakey_c *fc = ti->private;
+
+	dm_put_device(ti, fc->dev);
+	kfree(fc);
+}
+
+static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
+{
+	struct flakey_c *fc = ti->private;
+
+	return fc->start + (bi_sector - ti->begin);
+}
+
+static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct flakey_c *fc = ti->private;
+
+	bio->bi_bdev = fc->dev->bdev;
+	if (bio_sectors(bio))
+		bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
+}
+
+static int flakey_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct flakey_c *fc = ti->private;
+	unsigned elapsed;
+
+	/* Are we alive ? */
+	elapsed = (jiffies - fc->start_time) / HZ;
+	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval)
+		return -EIO;
+
+	flakey_map_bio(ti, bio);
+
+	return DM_MAPIO_REMAPPED;
+}
+
+static int flakey_status(struct dm_target *ti, status_type_t type,
+			 char *result, unsigned int maxlen)
+{
+	struct flakey_c *fc = ti->private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name,
+			 (unsigned long long)fc->start, fc->up_interval,
+			 fc->down_interval);
+		break;
+	}
+	return 0;
+}
+
+static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
+{
+	struct flakey_c *fc = ti->private;
+
+	return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg);
+}
+
+static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+			struct bio_vec *biovec, int max_size)
+{
+	struct flakey_c *fc = ti->private;
+	struct request_queue *q = bdev_get_queue(fc->dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = fc->dev->bdev;
+	bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector);
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+{
+	struct flakey_c *fc = ti->private;
+
+	return fn(ti, fc->dev, fc->start, ti->len, data);
+}
+
+static struct target_type flakey_target = {
+	.name   = "flakey",
+	.version = {1, 1, 0},
+	.module = THIS_MODULE,
+	.ctr    = flakey_ctr,
+	.dtr    = flakey_dtr,
+	.map    = flakey_map,
+	.status = flakey_status,
+	.ioctl	= flakey_ioctl,
+	.merge	= flakey_merge,
+	.iterate_devices = flakey_iterate_devices,
+};
+
+static int __init dm_flakey_init(void)
+{
+	int r = dm_register_target(&flakey_target);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	return r;
+}
+
+static void __exit dm_flakey_exit(void)
+{
+	dm_unregister_target(&flakey_target);
+}
+
+/* Module hooks */
+module_init(dm_flakey_init);
+module_exit(dm_flakey_exit);
+
+MODULE_DESCRIPTION(DM_NAME " flakey target");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 136d4f71a11..76a5af00a26 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -352,7 +352,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	BUG_ON(num_regions > DM_IO_MAX_REGIONS);
 
 	if (sync)
-		rw |= REQ_SYNC | REQ_UNPLUG;
+		rw |= REQ_SYNC;
 
 	/*
 	 * For multiple regions we need to be careful to rewind
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 6d12775a106..4cacdad2270 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1501,14 +1501,10 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
 	return r;
 }
 
-static void free_params(struct dm_ioctl *param)
-{
-	vfree(param);
-}
-
 static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
 {
 	struct dm_ioctl tmp, *dmi;
+	int secure_data;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data)))
 		return -EFAULT;
@@ -1516,17 +1512,30 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
 	if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data)))
 		return -EINVAL;
 
+	secure_data = tmp.flags & DM_SECURE_DATA_FLAG;
+
 	dmi = vmalloc(tmp.data_size);
-	if (!dmi)
+	if (!dmi) {
+		if (secure_data && clear_user(user, tmp.data_size))
+			return -EFAULT;
 		return -ENOMEM;
-
-	if (copy_from_user(dmi, user, tmp.data_size)) {
-		vfree(dmi);
-		return -EFAULT;
 	}
 
+	if (copy_from_user(dmi, user, tmp.data_size))
+		goto bad;
+
+	/* Wipe the user buffer so we do not return it to userspace */
+	if (secure_data && clear_user(user, tmp.data_size))
+		goto bad;
+
 	*param = dmi;
 	return 0;
+
+bad:
+	if (secure_data)
+		memset(dmi, 0, tmp.data_size);
+	vfree(dmi);
+	return -EFAULT;
 }
 
 static int validate_params(uint cmd, struct dm_ioctl *param)
@@ -1534,6 +1543,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
 	/* Always clear this flag */
 	param->flags &= ~DM_BUFFER_FULL_FLAG;
 	param->flags &= ~DM_UEVENT_GENERATED_FLAG;
+	param->flags &= ~DM_SECURE_DATA_FLAG;
 
 	/* Ignores parameters */
 	if (cmd == DM_REMOVE_ALL_CMD ||
@@ -1561,10 +1571,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
 static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 {
 	int r = 0;
+	int wipe_buffer;
 	unsigned int cmd;
 	struct dm_ioctl *uninitialized_var(param);
 	ioctl_fn fn = NULL;
-	size_t param_size;
+	size_t input_param_size;
 
 	/* only root can play with this */
 	if (!capable(CAP_SYS_ADMIN))
@@ -1611,13 +1622,15 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 	if (r)
 		return r;
 
+	input_param_size = param->data_size;
+	wipe_buffer = param->flags & DM_SECURE_DATA_FLAG;
+
 	r = validate_params(cmd, param);
 	if (r)
 		goto out;
 
-	param_size = param->data_size;
 	param->data_size = sizeof(*param);
-	r = fn(param, param_size);
+	r = fn(param, input_param_size);
 
 	/*
 	 * Copy the results back to userland.
@@ -1625,8 +1638,11 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 	if (!r && copy_to_user(user, param, param->data_size))
 		r = -EFAULT;
 
- out:
-	free_params(param);
+out:
+	if (wipe_buffer)
+		memset(param, 0, input_param_size);
+
+	vfree(param);
 	return r;
 }
 
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 924f5f0084c..1bb73a13ca4 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,13 +37,6 @@ struct dm_kcopyd_client {
 	unsigned int nr_pages;
 	unsigned int nr_free_pages;
 
-	/*
-	 * Block devices to unplug.
-	 * Non-NULL pointer means that a block device has some pending requests
-	 * and needs to be unplugged.
-	 */
-	struct block_device *unplug[2];
-
 	struct dm_io_client *io_client;
 
 	wait_queue_head_t destroyq;
@@ -315,31 +308,6 @@ static int run_complete_job(struct kcopyd_job *job)
 	return 0;
 }
 
-/*
- * Unplug the block device at the specified index.
- */
-static void unplug(struct dm_kcopyd_client *kc, int rw)
-{
-	if (kc->unplug[rw] != NULL) {
-		blk_unplug(bdev_get_queue(kc->unplug[rw]));
-		kc->unplug[rw] = NULL;
-	}
-}
-
-/*
- * Prepare block device unplug. If there's another device
- * to be unplugged at the same array index, we unplug that
- * device first.
- */
-static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
-			   struct block_device *bdev)
-{
-	if (likely(kc->unplug[rw] == bdev))
-		return;
-	unplug(kc, rw);
-	kc->unplug[rw] = bdev;
-}
-
 static void complete_io(unsigned long error, void *context)
 {
 	struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -386,16 +354,10 @@ static int run_io_job(struct kcopyd_job *job)
 		.client = job->kc->io_client,
 	};
 
-	if (job->rw == READ) {
+	if (job->rw == READ)
 		r = dm_io(&io_req, 1, &job->source, NULL);
-		prepare_unplug(job->kc, READ, job->source.bdev);
-	} else {
-		if (job->num_dests > 1)
-			io_req.bi_rw |= REQ_UNPLUG;
+	else
 		r = dm_io(&io_req, job->num_dests, job->dests, NULL);
-		if (!(io_req.bi_rw & REQ_UNPLUG))
-			prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
-	}
 
 	return r;
 }
@@ -466,6 +428,7 @@ static void do_work(struct work_struct *work)
 {
 	struct dm_kcopyd_client *kc = container_of(work,
 					struct dm_kcopyd_client, kcopyd_work);
+	struct blk_plug plug;
 
 	/*
 	 * The order that these are called is *very* important.
@@ -473,18 +436,12 @@ static void do_work(struct work_struct *work)
 	 * Pages jobs when successful will jump onto the io jobs
 	 * list.  io jobs call wake when they complete and it all
 	 * starts again.
-	 *
-	 * Note that io_jobs add block devices to the unplug array,
-	 * this array is cleared with "unplug" calls. It is thus
-	 * forbidden to run complete_jobs after io_jobs and before
-	 * unplug because the block device could be destroyed in
-	 * job completion callback.
 	 */
+	blk_start_plug(&plug);
 	process_jobs(&kc->complete_jobs, kc, run_complete_job);
 	process_jobs(&kc->pages_jobs, kc, run_pages_job);
 	process_jobs(&kc->io_jobs, kc, run_io_job);
-	unplug(kc, READ);
-	unplug(kc, WRITE);
+	blk_finish_plug(&plug);
 }
 
 /*
@@ -665,8 +622,6 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
 	INIT_LIST_HEAD(&kc->io_jobs);
 	INIT_LIST_HEAD(&kc->pages_jobs);
 
-	memset(kc->unplug, 0, sizeof(kc->unplug));
-
 	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
 	if (!kc->job_pool)
 		goto bad_slab;
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6951536ea29..a1f32188967 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -251,20 +251,20 @@ struct log_c {
  */
 static inline int log_test_bit(uint32_t *bs, unsigned bit)
 {
-	return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0;
+	return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0;
 }
 
 static inline void log_set_bit(struct log_c *l,
 			       uint32_t *bs, unsigned bit)
 {
-	ext2_set_bit(bit, (unsigned long *) bs);
+	__test_and_set_bit_le(bit, (unsigned long *) bs);
 	l->touched_cleaned = 1;
 }
 
 static inline void log_clear_bit(struct log_c *l,
 				 uint32_t *bs, unsigned bit)
 {
-	ext2_clear_bit(bit, (unsigned long *) bs);
+	__test_and_clear_bit_le(bit, (unsigned long *) bs);
 	l->touched_dirtied = 1;
 }
 
@@ -543,7 +543,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 		return -EINVAL;
 	}
 
-	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev);
+	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
 	if (r)
 		return r;
 
@@ -740,7 +740,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
 		return 0;
 
 	do {
-		*region = ext2_find_next_zero_bit(
+		*region = find_next_zero_bit_le(
 					     (unsigned long *) lc->sync_bits,
 					     lc->region_count,
 					     lc->sync_search);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 4b0b63c290a..a550a057d99 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -844,8 +844,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 {
 	/* target parameters */
 	static struct param _params[] = {
-		{1, 1024, "invalid number of priority groups"},
-		{1, 1024, "invalid initial priority group number"},
+		{0, 1024, "invalid number of priority groups"},
+		{0, 1024, "invalid initial priority group number"},
 	};
 
 	int r;
@@ -879,6 +879,13 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	if (r)
 		goto bad;
 
+	if ((!m->nr_priority_groups && next_pg_num) ||
+	    (m->nr_priority_groups && !next_pg_num)) {
+		ti->error = "invalid initial priority group";
+		r = -EINVAL;
+		goto bad;
+	}
+
 	/* parse the priority groups */
 	while (as.argc) {
 		struct priority_group *pg;
@@ -1065,7 +1072,7 @@ out:
 static int action_dev(struct multipath *m, struct dm_dev *dev,
 		      action_fn action)
 {
-	int r = 0;
+	int r = -EINVAL;
 	struct pgpath *pgpath;
 	struct priority_group *pg;
 
@@ -1415,7 +1422,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
 	else if (m->current_pg)
 		pg_num = m->current_pg->pg_num;
 	else
-			pg_num = 1;
+		pg_num = (m->nr_priority_groups ? 1 : 0);
 
 	DMEMIT("%u ", pg_num);
 
@@ -1669,7 +1676,7 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b9e1e15ef11..e5d8904fc8f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -390,13 +390,6 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 	return md_raid5_congested(&rs->md, bits);
 }
 
-static void raid_unplug(struct dm_target_callbacks *cb)
-{
-	struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
-
-	md_raid5_unplug_device(rs->md.private);
-}
-
 /*
  * Construct a RAID4/5/6 mapping:
  * Args:
@@ -487,7 +480,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	rs->callbacks.congested_fn = raid_is_congested;
-	rs->callbacks.unplug_fn = raid_unplug;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
 	return 0;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index dee326775c6..976ad4688af 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -842,8 +842,6 @@ static void do_mirror(struct work_struct *work)
 	do_reads(ms, &reads);
 	do_writes(ms, &writes);
 	do_failures(ms, &failures);
-
-	dm_table_unplug_all(ms->ti->table);
 }
 
 /*-----------------------------------------------------------------
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index dad011aed0c..7771ed21218 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -419,7 +419,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
 	/*
 	 * Possible cases:
 	 *   1) DM_RH_DIRTY
-	 *   2) DM_RH_NOSYNC: was dirty, other preceeding writes failed
+	 *   2) DM_RH_NOSYNC: was dirty, other preceding writes failed
 	 *   3) DM_RH_RECOVERING: flushing pending writes
 	 * Either case, the region should have not been connected to list.
 	 */
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index fdde53cd12b..a2d330942cb 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1080,7 +1080,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	argv++;
 	argc--;
 
-	r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow);
+	r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
 	if (r) {
 		ti->error = "Cannot get COW device";
 		goto bad_cow;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index dddfa14f298..3d80cf0c152 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -396,9 +396,29 @@ static void stripe_io_hints(struct dm_target *ti,
 	blk_limits_io_opt(limits, chunk_size * sc->stripes);
 }
 
+static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+			struct bio_vec *biovec, int max_size)
+{
+	struct stripe_c *sc = ti->private;
+	sector_t bvm_sector = bvm->bi_sector;
+	uint32_t stripe;
+	struct request_queue *q;
+
+	stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector);
+
+	q = bdev_get_queue(sc->stripe[stripe].dev->bdev);
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = sc->stripe[stripe].dev->bdev;
+	bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector;
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
 static struct target_type stripe_target = {
 	.name   = "striped",
-	.version = {1, 3, 1},
+	.version = {1, 4, 0},
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
@@ -407,6 +427,7 @@ static struct target_type stripe_target = {
 	.status = stripe_status,
 	.iterate_devices = stripe_iterate_devices,
 	.io_hints = stripe_io_hints,
+	.merge  = stripe_merge,
 };
 
 int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 38e4eb1bb96..cb8380c9767 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -55,6 +55,7 @@ struct dm_table {
 	struct dm_target *targets;
 
 	unsigned discards_supported:1;
+	unsigned integrity_supported:1;
 
 	/*
 	 * Indicates the rw permissions for the new logical
@@ -859,7 +860,7 @@ int dm_table_alloc_md_mempools(struct dm_table *t)
 		return -EINVAL;
 	}
 
-	t->mempools = dm_alloc_md_mempools(type);
+	t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
 	if (!t->mempools)
 		return -ENOMEM;
 
@@ -926,18 +927,80 @@ static int dm_table_build_index(struct dm_table *t)
 }
 
 /*
+ * Get a disk whose integrity profile reflects the table's profile.
+ * If %match_all is true, all devices' profiles must match.
+ * If %match_all is false, all devices must at least have an
+ * allocated integrity profile; but uninitialized is ok.
+ * Returns NULL if integrity support was inconsistent or unavailable.
+ */
+static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
+						    bool match_all)
+{
+	struct list_head *devices = dm_table_get_devices(t);
+	struct dm_dev_internal *dd = NULL;
+	struct gendisk *prev_disk = NULL, *template_disk = NULL;
+
+	list_for_each_entry(dd, devices, list) {
+		template_disk = dd->dm_dev.bdev->bd_disk;
+		if (!blk_get_integrity(template_disk))
+			goto no_integrity;
+		if (!match_all && !blk_integrity_is_initialized(template_disk))
+			continue; /* skip uninitialized profiles */
+		else if (prev_disk &&
+			 blk_integrity_compare(prev_disk, template_disk) < 0)
+			goto no_integrity;
+		prev_disk = template_disk;
+	}
+
+	return template_disk;
+
+no_integrity:
+	if (prev_disk)
+		DMWARN("%s: integrity not set: %s and %s profile mismatch",
+		       dm_device_name(t->md),
+		       prev_disk->disk_name,
+		       template_disk->disk_name);
+	return NULL;
+}
+
+/*
  * Register the mapped device for blk_integrity support if
- * the underlying devices support it.
+ * the underlying devices have an integrity profile.  But all devices
+ * may not have matching profiles (checking all devices isn't reliable
+ * during table load because this table may use other DM device(s) which
+ * must be resumed before they will have an initialized integity profile).
+ * Stacked DM devices force a 2 stage integrity profile validation:
+ * 1 - during load, validate all initialized integrity profiles match
+ * 2 - during resume, validate all integrity profiles match
  */
 static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
 {
-	struct list_head *devices = dm_table_get_devices(t);
-	struct dm_dev_internal *dd;
+	struct gendisk *template_disk = NULL;
 
-	list_for_each_entry(dd, devices, list)
-		if (bdev_get_integrity(dd->dm_dev.bdev))
-			return blk_integrity_register(dm_disk(md), NULL);
+	template_disk = dm_table_get_integrity_disk(t, false);
+	if (!template_disk)
+		return 0;
 
+	if (!blk_integrity_is_initialized(dm_disk(md))) {
+		t->integrity_supported = 1;
+		return blk_integrity_register(dm_disk(md), NULL);
+	}
+
+	/*
+	 * If DM device already has an initalized integrity
+	 * profile the new profile should not conflict.
+	 */
+	if (blk_integrity_is_initialized(template_disk) &&
+	    blk_integrity_compare(dm_disk(md), template_disk) < 0) {
+		DMWARN("%s: conflict with existing integrity profile: "
+		       "%s profile mismatch",
+		       dm_device_name(t->md),
+		       template_disk->disk_name);
+		return 1;
+	}
+
+	/* Preserve existing initialized integrity profile */
+	t->integrity_supported = 1;
 	return 0;
 }
 
@@ -1091,41 +1154,27 @@ combine_limits:
 
 /*
  * Set the integrity profile for this device if all devices used have
- * matching profiles.
+ * matching profiles.  We're quite deep in the resume path but still
+ * don't know if all devices (particularly DM devices this device
+ * may be stacked on) have matching profiles.  Even if the profiles
+ * don't match we have no way to fail (to resume) at this point.
  */
 static void dm_table_set_integrity(struct dm_table *t)
 {
-	struct list_head *devices = dm_table_get_devices(t);
-	struct dm_dev_internal *prev = NULL, *dd = NULL;
+	struct gendisk *template_disk = NULL;
 
 	if (!blk_get_integrity(dm_disk(t->md)))
 		return;
 
-	list_for_each_entry(dd, devices, list) {
-		if (prev &&
-		    blk_integrity_compare(prev->dm_dev.bdev->bd_disk,
-					  dd->dm_dev.bdev->bd_disk) < 0) {
-			DMWARN("%s: integrity not set: %s and %s mismatch",
-			       dm_device_name(t->md),
-			       prev->dm_dev.bdev->bd_disk->disk_name,
-			       dd->dm_dev.bdev->bd_disk->disk_name);
-			goto no_integrity;
-		}
-		prev = dd;
+	template_disk = dm_table_get_integrity_disk(t, true);
+	if (!template_disk &&
+	    blk_integrity_is_initialized(dm_disk(t->md))) {
+		DMWARN("%s: device no longer has a valid integrity profile",
+		       dm_device_name(t->md));
+		return;
 	}
-
-	if (!prev || !bdev_get_integrity(prev->dm_dev.bdev))
-		goto no_integrity;
-
 	blk_integrity_register(dm_disk(t->md),
-			       bdev_get_integrity(prev->dm_dev.bdev));
-
-	return;
-
-no_integrity:
-	blk_integrity_register(dm_disk(t->md), NULL);
-
-	return;
+			       blk_get_integrity(template_disk));
 }
 
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -1275,29 +1324,6 @@ int dm_table_any_busy_target(struct dm_table *t)
 	return 0;
 }
 
-void dm_table_unplug_all(struct dm_table *t)
-{
-	struct dm_dev_internal *dd;
-	struct list_head *devices = dm_table_get_devices(t);
-	struct dm_target_callbacks *cb;
-
-	list_for_each_entry(dd, devices, list) {
-		struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
-		char b[BDEVNAME_SIZE];
-
-		if (likely(q))
-			blk_unplug(q);
-		else
-			DMWARN_LIMIT("%s: Cannot unplug nonexistent device %s",
-				     dm_device_name(t->md),
-				     bdevname(dd->dm_dev.bdev, b));
-	}
-
-	list_for_each_entry(cb, &t->target_callbacks, list)
-		if (cb->unplug_fn)
-			cb->unplug_fn(cb);
-}
-
 struct mapped_device *dm_table_get_md(struct dm_table *t)
 {
 	return t->md;
@@ -1345,4 +1371,3 @@ EXPORT_SYMBOL(dm_table_get_mode);
 EXPORT_SYMBOL(dm_table_get_md);
 EXPORT_SYMBOL(dm_table_put);
 EXPORT_SYMBOL(dm_table_get);
-EXPORT_SYMBOL(dm_table_unplug_all);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index eaa3af0e063..0cf68b47887 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -477,7 +477,8 @@ static void start_io_acct(struct dm_io *io)
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
 	part_stat_unlock();
-	dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
+	atomic_set(&dm_disk(md)->part0.in_flight[rw],
+		atomic_inc_return(&md->pending[rw]));
 }
 
 static void end_io_acct(struct dm_io *io)
@@ -497,8 +498,8 @@ static void end_io_acct(struct dm_io *io)
 	 * After this is decremented the bio must not be touched if it is
 	 * a flush.
 	 */
-	dm_disk(md)->part0.in_flight[rw] = pending =
-		atomic_dec_return(&md->pending[rw]);
+	pending = atomic_dec_return(&md->pending[rw]);
+	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 	pending += atomic_read(&md->pending[rw^0x1]);
 
 	/* nudge anyone waiting on suspend queue */
@@ -807,8 +808,6 @@ void dm_requeue_unmapped_request(struct request *clone)
 	dm_unprep_request(rq);
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (elv_queue_empty(q))
-		blk_plug_device(q);
 	blk_requeue_request(q, rq);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
@@ -1613,10 +1612,10 @@ static void dm_request_fn(struct request_queue *q)
 	 * number of in-flight I/Os after the queue is stopped in
 	 * dm_suspend().
 	 */
-	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+	while (!blk_queue_stopped(q)) {
 		rq = blk_peek_request(q);
 		if (!rq)
-			goto plug_and_out;
+			goto delay_and_out;
 
 		/* always use block 0 to find the target for flushes for now */
 		pos = 0;
@@ -1627,7 +1626,7 @@ static void dm_request_fn(struct request_queue *q)
 		BUG_ON(!dm_target_is_valid(ti));
 
 		if (ti->type->busy && ti->type->busy(ti))
-			goto plug_and_out;
+			goto delay_and_out;
 
 		blk_start_request(rq);
 		clone = rq->special;
@@ -1647,11 +1646,8 @@ requeued:
 	BUG_ON(!irqs_disabled());
 	spin_lock(q->queue_lock);
 
-plug_and_out:
-	if (!elv_queue_empty(q))
-		/* Some requests still remain, retry later */
-		blk_plug_device(q);
-
+delay_and_out:
+	blk_delay_queue(q, HZ / 10);
 out:
 	dm_table_put(map);
 
@@ -1680,20 +1676,6 @@ static int dm_lld_busy(struct request_queue *q)
 	return r;
 }
 
-static void dm_unplug_all(struct request_queue *q)
-{
-	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
-
-	if (map) {
-		if (dm_request_based(md))
-			generic_unplug_device(q);
-
-		dm_table_unplug_all(map);
-		dm_table_put(map);
-	}
-}
-
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r = bdi_bits;
@@ -1817,7 +1799,6 @@ static void dm_init_md_queue(struct mapped_device *md)
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
-	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
@@ -2263,8 +2244,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	int r = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
-	dm_unplug_all(md->queue);
-
 	add_wait_queue(&md->wait, &wait);
 
 	while (1) {
@@ -2539,7 +2518,6 @@ int dm_resume(struct mapped_device *md)
 
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
-	dm_table_unplug_all(map);
 	r = 0;
 out:
 	dm_table_put(map);
@@ -2643,9 +2621,10 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
 {
 	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
+	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
 
 	if (!pools)
 		return NULL;
@@ -2662,13 +2641,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
 	if (!pools->tio_pool)
 		goto free_io_pool_and_out;
 
-	pools->bs = (type == DM_TYPE_BIO_BASED) ?
-		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
+	pools->bs = bioset_create(pool_size, 0);
 	if (!pools->bs)
 		goto free_tio_pool_and_out;
 
+	if (integrity && bioset_integrity_create(pools->bs, pool_size))
+		goto free_bioset_and_out;
+
 	return pools;
 
+free_bioset_and_out:
+	bioset_free(pools->bs);
+
 free_tio_pool_and_out:
 	mempool_destroy(pools->tio_pool);
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0c2dd5f4af7..1aaf16746da 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -149,7 +149,7 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 #endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 339fdc67075..23078dabb6d 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -30,7 +30,7 @@
  *
  * Different modes can be active at a time, but only
  * one can be set at array creation.  Others can be added later.
- * A mode can be one-shot or recurrent with the recurrance being
+ * A mode can be one-shot or recurrent with the recurrence being
  * once in every N requests.
  * The bottom 5 bits of the "layout" indicate the mode.  The
  * remainder indicate a period, or 0 for one-shot.
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 0ed7f6bc2a7..abfb59a61ed 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -87,22 +87,6 @@ static int linear_mergeable_bvec(struct request_queue *q,
 	return maxsectors << 9;
 }
 
-static void linear_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	linear_conf_t *conf;
-	int i;
-
-	rcu_read_lock();
-	conf = rcu_dereference(mddev->private);
-
-	for (i=0; i < mddev->raid_disks; i++) {
-		struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
-		blk_unplug(r_queue);
-	}
-	rcu_read_unlock();
-}
-
 static int linear_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -224,11 +208,9 @@ static int linear_run (mddev_t *mddev)
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
-	mddev->queue->unplug_fn = linear_unplug;
 	mddev->queue->backing_dev_info.congested_fn = linear_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-	md_integrity_register(mddev);
-	return 0;
+	return md_integrity_register(mddev);
 }
 
 static void free_conf(struct rcu_head *head)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d5ad7723b17..7d6f7f18a92 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -447,48 +447,59 @@ EXPORT_SYMBOL(md_flush_request);
 
 /* Support for plugging.
  * This mirrors the plugging support in request_queue, but does not
- * require having a whole queue
+ * require having a whole queue or request structures.
+ * We allocate an md_plug_cb for each md device and each thread it gets
+ * plugged on.  This links tot the private plug_handle structure in the
+ * personality data where we keep a count of the number of outstanding
+ * plugs so other code can see if a plug is active.
  */
-static void plugger_work(struct work_struct *work)
-{
-	struct plug_handle *plug =
-		container_of(work, struct plug_handle, unplug_work);
-	plug->unplug_fn(plug);
-}
-static void plugger_timeout(unsigned long data)
-{
-	struct plug_handle *plug = (void *)data;
-	kblockd_schedule_work(NULL, &plug->unplug_work);
-}
-void plugger_init(struct plug_handle *plug,
-		  void (*unplug_fn)(struct plug_handle *))
-{
-	plug->unplug_flag = 0;
-	plug->unplug_fn = unplug_fn;
-	init_timer(&plug->unplug_timer);
-	plug->unplug_timer.function = plugger_timeout;
-	plug->unplug_timer.data = (unsigned long)plug;
-	INIT_WORK(&plug->unplug_work, plugger_work);
-}
-EXPORT_SYMBOL_GPL(plugger_init);
+struct md_plug_cb {
+	struct blk_plug_cb cb;
+	mddev_t *mddev;
+};
 
-void plugger_set_plug(struct plug_handle *plug)
+static void plugger_unplug(struct blk_plug_cb *cb)
 {
-	if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag))
-		mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1);
+	struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
+	if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
+		md_wakeup_thread(mdcb->mddev->thread);
+	kfree(mdcb);
 }
-EXPORT_SYMBOL_GPL(plugger_set_plug);
 
-int plugger_remove_plug(struct plug_handle *plug)
+/* Check that an unplug wakeup will come shortly.
+ * If not, wakeup the md thread immediately
+ */
+int mddev_check_plugged(mddev_t *mddev)
 {
-	if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) {
-		del_timer(&plug->unplug_timer);
-		return 1;
-	} else
+	struct blk_plug *plug = current->plug;
+	struct md_plug_cb *mdcb;
+
+	if (!plug)
+		return 0;
+
+	list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
+		if (mdcb->cb.callback == plugger_unplug &&
+		    mdcb->mddev == mddev) {
+			/* Already on the list, move to top */
+			if (mdcb != list_first_entry(&plug->cb_list,
+						    struct md_plug_cb,
+						    cb.list))
+				list_move(&mdcb->cb.list, &plug->cb_list);
+			return 1;
+		}
+	}
+	/* Not currently on the callback list */
+	mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
+	if (!mdcb)
 		return 0;
-}
-EXPORT_SYMBOL_GPL(plugger_remove_plug);
 
+	mdcb->mddev = mddev;
+	mdcb->cb.callback = plugger_unplug;
+	atomic_inc(&mddev->plug_cnt);
+	list_add(&mdcb->cb.list, &plug->cb_list);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(mddev_check_plugged);
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
@@ -538,6 +549,7 @@ void mddev_init(mddev_t *mddev)
 	atomic_set(&mddev->active, 1);
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
+	atomic_set(&mddev->plug_cnt, 0);
 	spin_lock_init(&mddev->write_lock);
 	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
@@ -780,8 +792,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	bio->bi_end_io = super_written;
 
 	atomic_inc(&mddev->pending_writes);
-	submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
-		   bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
 }
 
 void md_super_wait(mddev_t *mddev)
@@ -809,7 +820,7 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
 	struct completion event;
 	int ret;
 
-	rw |= REQ_SYNC | REQ_UNPLUG;
+	rw |= REQ_SYNC;
 
 	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
 		rdev->meta_bdev : rdev->bdev;
@@ -1778,12 +1789,6 @@ int md_integrity_register(mddev_t *mddev)
 			continue;
 		if (rdev->raid_disk < 0)
 			continue;
-		/*
-		 * If at least one rdev is not integrity capable, we can not
-		 * enable data integrity for the md device.
-		 */
-		if (!bdev_get_integrity(rdev->bdev))
-			return -EINVAL;
 		if (!reference) {
 			/* Use the first rdev as the reference */
 			reference = rdev;
@@ -1794,6 +1799,8 @@ int md_integrity_register(mddev_t *mddev)
 				rdev->bdev->bd_disk) < 0)
 			return -EINVAL;
 	}
+	if (!reference || !bdev_get_integrity(reference->bdev))
+		return 0;
 	/*
 	 * All component devices are integrity capable and have matching
 	 * profiles, register the common profile for the md device.
@@ -1804,8 +1811,12 @@ int md_integrity_register(mddev_t *mddev)
 			mdname(mddev));
 		return -EINVAL;
 	}
-	printk(KERN_NOTICE "md: data integrity on %s enabled\n",
-		mdname(mddev));
+	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
+	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
+		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
+		       mdname(mddev));
+		return -EINVAL;
+	}
 	return 0;
 }
 EXPORT_SYMBOL(md_integrity_register);
@@ -3159,6 +3170,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	mddev->layout = mddev->new_layout;
 	mddev->chunk_sectors = mddev->new_chunk_sectors;
 	mddev->delta_disks = 0;
+	mddev->degraded = 0;
 	if (mddev->pers->sync_request == NULL) {
 		/* this is now an array without redundancy, so
 		 * it must always be in_sync
@@ -4724,7 +4736,6 @@ static void md_clean(mddev_t *mddev)
 	mddev->bitmap_info.chunksize = 0;
 	mddev->bitmap_info.daemon_sleep = 0;
 	mddev->bitmap_info.max_write_behind = 0;
-	mddev->plug = NULL;
 }
 
 static void __md_stop_writes(mddev_t *mddev)
@@ -4817,7 +4828,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		__md_stop_writes(mddev);
 		md_stop(mddev);
 		mddev->queue->merge_bvec_fn = NULL;
-		mddev->queue->unplug_fn = NULL;
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 
 		/* tell userspace to handle 'inactive' */
@@ -6268,7 +6278,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 	 * rt is a sector_t, so could be 32bit or 64bit.
 	 * So we divide before multiply in case it is 32bit and close
 	 * to the limit.
-	 * We scale the divisor (db) by 32 to avoid loosing precision
+	 * We scale the divisor (db) by 32 to avoid losing precision
 	 * near the end of resync when the number of remaining sectors
 	 * is close to 'db'.
 	 * We then divide rt by 32 after multiplying by db to compensate.
@@ -6690,14 +6700,6 @@ int md_allow_write(mddev_t *mddev)
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 
-void md_unplug(mddev_t *mddev)
-{
-	if (mddev->queue)
-		blk_unplug(mddev->queue);
-	if (mddev->plug)
-		mddev->plug->unplug_fn(mddev->plug);
-}
-
 #define SYNC_MARKS	10
 #define	SYNC_MARK_STEP	(3*HZ)
 void md_do_sync(mddev_t *mddev)
@@ -6876,7 +6878,6 @@ void md_do_sync(mddev_t *mddev)
 		     >= mddev->resync_max - mddev->curr_resync_completed
 			    )) {
 			/* time to update curr_resync_completed */
-			md_unplug(mddev);
 			wait_event(mddev->recovery_wait,
 				   atomic_read(&mddev->recovery_active) == 0);
 			mddev->curr_resync_completed = j;
@@ -6952,7 +6953,6 @@ void md_do_sync(mddev_t *mddev)
 		 * about not overloading the IO subsystem. (things like an
 		 * e2fsck being done on the RAID array should execute fast)
 		 */
-		md_unplug(mddev);
 		cond_resched();
 
 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
@@ -6971,8 +6971,6 @@ void md_do_sync(mddev_t *mddev)
 	 * this also signals 'finished resyncing' to md_stop
 	 */
  out:
-	md_unplug(mddev);
-
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
 	/* tell personality that we are finished */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 12215d437fc..0b1fd3f1d85 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,26 +29,6 @@
 typedef struct mddev_s mddev_t;
 typedef struct mdk_rdev_s mdk_rdev_t;
 
-/* generic plugging support - like that provided with request_queue,
- * but does not require a request_queue
- */
-struct plug_handle {
-	void			(*unplug_fn)(struct plug_handle *);
-	struct timer_list	unplug_timer;
-	struct work_struct	unplug_work;
-	unsigned long		unplug_flag;
-};
-#define	PLUGGED_FLAG 1
-void plugger_init(struct plug_handle *plug,
-		  void (*unplug_fn)(struct plug_handle *));
-void plugger_set_plug(struct plug_handle *plug);
-int plugger_remove_plug(struct plug_handle *plug);
-static inline void plugger_flush(struct plug_handle *plug)
-{
-	del_timer_sync(&plug->unplug_timer);
-	cancel_work_sync(&plug->unplug_work);
-}
-
 /*
  * MD's 'extended' device
  */
@@ -94,7 +74,7 @@ struct mdk_rdev_s
 #define	In_sync		2		/* device is in_sync with rest of array */
 #define	WriteMostly	4		/* Avoid reading if at all possible */
 #define	AutoDetected	7		/* added by auto-detect */
-#define Blocked		8		/* An error occured on an externally
+#define Blocked		8		/* An error occurred on an externally
 					 * managed array, don't allow writes
 					 * until it is cleared */
 	wait_queue_head_t blocked_wait;
@@ -199,6 +179,9 @@ struct mddev_s
 	int				delta_disks, new_level, new_layout;
 	int				new_chunk_sectors;
 
+	atomic_t			plug_cnt;	/* If device is expecting
+							 * more bios soon.
+							 */
 	struct mdk_thread_s		*thread;	/* management thread */
 	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
 	sector_t			curr_resync;	/* last block scheduled */
@@ -336,7 +319,6 @@ struct mddev_s
 	struct list_head		all_mddevs;
 
 	struct attribute_group		*to_remove;
-	struct plug_handle		*plug; /* if used by personality */
 
 	struct bio_set			*bio_set;
 
@@ -516,7 +498,6 @@ extern int md_integrity_register(mddev_t *mddev);
 extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
 extern void restore_bitmap_write_access(struct file *file);
-extern void md_unplug(mddev_t *mddev);
 
 extern void mddev_init(mddev_t *mddev);
 extern int md_run(mddev_t *mddev);
@@ -530,4 +511,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 				   mddev_t *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   mddev_t *mddev);
+extern int mddev_check_plugged(mddev_t *mddev);
 #endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 3a62d440e27..c3589099098 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -106,36 +106,6 @@ static void multipath_end_request(struct bio *bio, int error)
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	multipath_conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags)
-		    && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void multipath_unplug(struct request_queue *q)
-{
-	unplug_slaves(q->queuedata);
-}
-
-
 static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 {
 	multipath_conf_t *conf = mddev->private;
@@ -345,7 +315,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -517,10 +487,12 @@ static int multipath_run (mddev_t *mddev)
 	 */
 	md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
 
-	mddev->queue->unplug_fn = multipath_unplug;
 	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-	md_integrity_register(mddev);
+
+	if (md_integrity_register(mddev))
+		goto out_free_conf;
+
 	return 0;
 
 out_free_conf:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c0ac457f121..e86bf3682e1 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,21 +25,6 @@
 #include "raid0.h"
 #include "raid5.h"
 
-static void raid0_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	raid0_conf_t *conf = mddev->private;
-	mdk_rdev_t **devlist = conf->devlist;
-	int raid_disks = conf->strip_zone[0].nb_dev;
-	int i;
-
-	for (i=0; i < raid_disks; i++) {
-		struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);
-
-		blk_unplug(r_queue);
-	}
-}
-
 static int raid0_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -272,7 +257,6 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 		       mdname(mddev),
 		       (unsigned long long)smallest->sectors);
 	}
-	mddev->queue->unplug_fn = raid0_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
@@ -395,8 +379,7 @@ static int raid0_run(mddev_t *mddev)
 
 	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
 	dump_zones(mddev);
-	md_integrity_register(mddev);
-	return 0;
+	return md_integrity_register(mddev);
 }
 
 static int raid0_stop(mddev_t *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 06cd712807d..2b7a7ff401d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,23 +52,16 @@
 #define	NR_RAID1_BIOS 256
 
 
-static void unplug_slaves(mddev_t *mddev);
-
 static void allow_barrier(conf_t *conf);
 static void lower_barrier(conf_t *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
-	r1bio_t *r1_bio;
 	int size = offsetof(r1bio_t, bios[pi->raid_disks]);
 
 	/* allocate a r1bio with room for raid_disks entries in the bios array */
-	r1_bio = kzalloc(size, gfp_flags);
-	if (!r1_bio && pi->mddev)
-		unplug_slaves(pi->mddev);
-
-	return r1_bio;
+	return kzalloc(size, gfp_flags);
 }
 
 static void r1bio_pool_free(void *r1_bio, void *data)
@@ -91,10 +84,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int i, j;
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
-	if (!r1_bio) {
-		unplug_slaves(pi->mddev);
+	if (!r1_bio)
 		return NULL;
-	}
 
 	/*
 	 * Allocate bios : 1 for reading, n-1 for writing
@@ -520,37 +511,6 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 	return new_disk;
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void raid1_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	unplug_slaves(mddev);
-	md_wakeup_thread(mddev->thread);
-}
-
 static int raid1_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -580,23 +540,16 @@ static int raid1_congested(void *data, int bits)
 }
 
 
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
-	 * We return 1 if any requests were actually submitted.
 	 */
-	int rv = 0;
-
 	spin_lock_irq(&conf->device_lock);
 
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
-		/* Only take the spinlock to quiet a warning */
-		spin_lock(conf->mddev->queue->queue_lock);
-		blk_remove_plug(conf->mddev->queue);
-		spin_unlock(conf->mddev->queue->queue_lock);
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to
 		 * disk before proceeding w/ I/O */
@@ -608,10 +561,8 @@ static int flush_pending_writes(conf_t *conf)
 			generic_make_request(bio);
 			bio = next;
 		}
-		rv = 1;
 	} else
 		spin_unlock_irq(&conf->device_lock);
-	return rv;
 }
 
 /* Barriers....
@@ -643,8 +594,7 @@ static void raise_barrier(conf_t *conf)
 
 	/* Wait until no block IO is waiting */
 	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
-			    conf->resync_lock,
-			    raid1_unplug(conf->mddev->queue));
+			    conf->resync_lock, );
 
 	/* block any new IO from starting */
 	conf->barrier++;
@@ -652,8 +602,7 @@ static void raise_barrier(conf_t *conf)
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock,
-			    raid1_unplug(conf->mddev->queue));
+			    conf->resync_lock, );
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -675,7 +624,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    raid1_unplug(conf->mddev->queue));
+				    );
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -711,8 +660,7 @@ static void freeze_array(conf_t *conf)
 	wait_event_lock_irq(conf->wait_barrier,
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
-			    ({ flush_pending_writes(conf);
-			       raid1_unplug(conf->mddev->queue); }));
+			    flush_pending_writes(conf));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(conf_t *conf)
@@ -774,6 +722,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	mdk_rdev_t *blocked_rdev;
+	int plugged;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -865,6 +814,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
 	 */
+	plugged = mddev_check_plugged(mddev);
+
 	disks = conf->raid_disks;
  retry_write:
 	blocked_rdev = NULL;
@@ -962,7 +913,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		atomic_inc(&r1_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
-		blk_plug_device_unlocked(mddev->queue);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
@@ -971,7 +921,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync)
+	if (do_sync || !bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
 
 	return 0;
@@ -1178,7 +1128,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -1561,15 +1511,17 @@ static void raid1d(mddev_t *mddev)
 	unsigned long flags;
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
-	int unplug=0;
 	mdk_rdev_t *rdev;
+	struct blk_plug plug;
 
 	md_check_recovery(mddev);
-	
+
+	blk_start_plug(&plug);
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		unplug += flush_pending_writes(conf);
+		if (atomic_read(&mddev->plug_cnt) == 0)
+			flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -1583,10 +1535,9 @@ static void raid1d(mddev_t *mddev)
 
 		mddev = r1_bio->mddev;
 		conf = mddev->private;
-		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
+		if (test_bit(R1BIO_IsSync, &r1_bio->state))
 			sync_request_write(mddev, r1_bio);
-			unplug = 1;
-		} else {
+		else {
 			int disk;
 
 			/* we got a read error. Maybe the drive is bad.  Maybe just
@@ -1636,14 +1587,12 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_end_io = raid1_end_read_request;
 				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r1_bio;
-				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 		cond_resched();
 	}
-	if (unplug)
-		unplug_slaves(mddev);
+	blk_finish_plug(&plug);
 }
 
 
@@ -2066,11 +2015,9 @@ static int run(mddev_t *mddev)
 
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
-	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-	md_integrity_register(mddev);
-	return 0;
+	return md_integrity_register(mddev);
 }
 
 static int stop(mddev_t *mddev)
@@ -2092,7 +2039,6 @@ static int stop(mddev_t *mddev)
 
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
-	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	if (conf->r1bio_pool)
 		mempool_destroy(conf->r1bio_pool);
 	kfree(conf->mirrors);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 747d061d8e0..8e9462626ec 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -5,7 +5,7 @@
  *
  * RAID-10 support for md.
  *
- * Base on code in raid1.c.  See raid1.c for futher copyright information.
+ * Base on code in raid1.c.  See raid1.c for further copyright information.
  *
  *
  * This program is free software; you can redistribute it and/or modify
@@ -57,23 +57,16 @@
  */
 #define	NR_RAID10_BIOS 256
 
-static void unplug_slaves(mddev_t *mddev);
-
 static void allow_barrier(conf_t *conf);
 static void lower_barrier(conf_t *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	conf_t *conf = data;
-	r10bio_t *r10_bio;
 	int size = offsetof(struct r10bio_s, devs[conf->copies]);
 
 	/* allocate a r10bio with room for raid_disks entries in the bios array */
-	r10_bio = kzalloc(size, gfp_flags);
-	if (!r10_bio && conf->mddev)
-		unplug_slaves(conf->mddev);
-
-	return r10_bio;
+	return kzalloc(size, gfp_flags);
 }
 
 static void r10bio_pool_free(void *r10_bio, void *data)
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int nalloc;
 
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-	if (!r10_bio) {
-		unplug_slaves(conf->mddev);
+	if (!r10_bio)
 		return NULL;
-	}
 
 	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
 		nalloc = conf->copies; /* resync */
@@ -349,14 +340,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
 
 /*
  * RAID10 layout manager
- * Aswell as the chunksize and raid_disks count, there are two
+ * As well as the chunksize and raid_disks count, there are two
  * parameters: near_copies and far_copies.
  * near_copies * far_copies must be <= raid_disks.
  * Normally one of these will be 1.
  * If both are 1, we get raid0.
  * If near_copies == raid_disks, we get raid1.
  *
- * Chunks are layed out in raid0 style with near_copies copies of the
+ * Chunks are laid out in raid0 style with near_copies copies of the
  * first chunk, followed by near_copies copies of the next chunk and
  * so on.
  * If far_copies > 1, then after 1/far_copies of the array has been assigned
@@ -597,37 +588,6 @@ rb_out:
 	return disk;
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void raid10_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	unplug_slaves(q->queuedata);
-	md_wakeup_thread(mddev->thread);
-}
-
 static int raid10_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -649,23 +609,16 @@ static int raid10_congested(void *data, int bits)
 	return ret;
 }
 
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
-	 * We return 1 if any requests were actually submitted.
 	 */
-	int rv = 0;
-
 	spin_lock_irq(&conf->device_lock);
 
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
-		/* Spinlock only taken to quiet a warning */
-		spin_lock(conf->mddev->queue->queue_lock);
-		blk_remove_plug(conf->mddev->queue);
-		spin_unlock(conf->mddev->queue->queue_lock);
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to disk
 		 * before proceeding w/ I/O */
@@ -677,11 +630,10 @@ static int flush_pending_writes(conf_t *conf)
 			generic_make_request(bio);
 			bio = next;
 		}
-		rv = 1;
 	} else
 		spin_unlock_irq(&conf->device_lock);
-	return rv;
 }
+
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -711,17 +663,15 @@ static void raise_barrier(conf_t *conf, int force)
 
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, );
 
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* No wait for all pending IO to complete */
+	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, );
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -742,7 +692,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    raid10_unplug(conf->mddev->queue));
+				    );
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -778,8 +728,8 @@ static void freeze_array(conf_t *conf)
 	wait_event_lock_irq(conf->wait_barrier,
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
-			    ({ flush_pending_writes(conf);
-			       raid10_unplug(conf->mddev->queue); }));
+			    flush_pending_writes(conf));
+
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -806,6 +756,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
+	int plugged;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
@@ -914,6 +865,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
 	 */
+	plugged = mddev_check_plugged(mddev);
+
 	raid10_find_phys(conf, r10_bio);
  retry_write:
 	blocked_rdev = NULL;
@@ -974,7 +927,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		atomic_inc(&r10_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
-		blk_plug_device_unlocked(mddev->queue);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
@@ -991,9 +943,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync)
+	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
-
 	return 0;
 }
 
@@ -1233,7 +1184,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -1684,15 +1635,16 @@ static void raid10d(mddev_t *mddev)
 	unsigned long flags;
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
-	int unplug=0;
 	mdk_rdev_t *rdev;
+	struct blk_plug plug;
 
 	md_check_recovery(mddev);
 
+	blk_start_plug(&plug);
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		unplug += flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -1706,13 +1658,11 @@ static void raid10d(mddev_t *mddev)
 
 		mddev = r10_bio->mddev;
 		conf = mddev->private;
-		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
+		if (test_bit(R10BIO_IsSync, &r10_bio->state))
 			sync_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else 	if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
 			recovery_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else {
+		else {
 			int mirror;
 			/* we got a read error. Maybe the drive is bad.  Maybe just
 			 * the block and we can fix it.
@@ -1759,14 +1709,12 @@ static void raid10d(mddev_t *mddev)
 				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = raid10_end_read_request;
-				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 		cond_resched();
 	}
-	if (unplug)
-		unplug_slaves(mddev);
+	blk_finish_plug(&plug);
 }
 
 
@@ -2377,7 +2325,6 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 
-	mddev->queue->unplug_fn = raid10_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
@@ -2395,7 +2342,10 @@ static int run(mddev_t *mddev)
 
 	if (conf->near_copies < conf->raid_disks)
 		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
-	md_integrity_register(mddev);
+
+	if (md_integrity_register(mddev))
+		goto out_free_conf;
+
 	return 0;
 
 out_free_conf:
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 2316ac2e8e2..944b1104d3b 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -17,8 +17,8 @@ struct r10_private_data_s {
 	spinlock_t		device_lock;
 
 	/* geometry */
-	int			near_copies;  /* number of copies layed out raid0 style */
-	int 			far_copies;   /* number of copies layed out
+	int			near_copies;  /* number of copies laid out raid0 style */
+	int 			far_copies;   /* number of copies laid out
 					       * at large strides across drives
 					       */
 	int			far_offset;   /* far_copies are offset by 1 stripe
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 78536fdbd87..49bf5f89143 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -27,12 +27,12 @@
  *
  * We group bitmap updates into batches.  Each batch has a number.
  * We may write out several batches at once, but that isn't very important.
- * conf->bm_write is the number of the last batch successfully written.
- * conf->bm_flush is the number of the last batch that was closed to
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
  *    new additions.
  * When we discover that we will need to write to any block in a stripe
  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
- * the number of the batch it will be in. This is bm_flush+1.
+ * the number of the batch it will be in. This is seq_flush+1.
  * When we are ready to do a write, if that batch hasn't been written yet,
  *   we plug the array and queue the stripe for later.
  * When an unplug happens, we increment bm_flush, thus closing the current
@@ -199,14 +199,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 		BUG_ON(!list_empty(&sh->lru));
 		BUG_ON(atomic_read(&conf->active_stripes)==0);
 		if (test_bit(STRIPE_HANDLE, &sh->state)) {
-			if (test_bit(STRIPE_DELAYED, &sh->state)) {
+			if (test_bit(STRIPE_DELAYED, &sh->state))
 				list_add_tail(&sh->lru, &conf->delayed_list);
-				plugger_set_plug(&conf->plug);
-			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-				   sh->bm_seq - conf->seq_write > 0) {
+			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+				   sh->bm_seq - conf->seq_write > 0)
 				list_add_tail(&sh->lru, &conf->bitmap_list);
-				plugger_set_plug(&conf->plug);
-			} else {
+			else {
 				clear_bit(STRIPE_BIT_DELAY, &sh->state);
 				list_add_tail(&sh->lru, &conf->handle_list);
 			}
@@ -433,8 +431,6 @@ static int has_failed(raid5_conf_t *conf)
 	return 0;
 }
 
-static void unplug_slaves(mddev_t *mddev);
-
 static struct stripe_head *
 get_active_stripe(raid5_conf_t *conf, sector_t sector,
 		  int previous, int noblock, int noquiesce)
@@ -463,8 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
 						     < (conf->max_nr_stripes *3/4)
 						     || !conf->inactive_blocked),
 						    conf->device_lock,
-						    md_raid5_unplug_device(conf)
-					);
+						    );
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, previous);
@@ -1473,8 +1468,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    !list_empty(&conf->inactive_list),
 				    conf->device_lock,
-				    unplug_slaves(conf->mddev)
-			);
+				    );
 		osh = get_free_stripe(conf);
 		spin_unlock_irq(&conf->device_lock);
 		atomic_set(&nsh->count, 1);
@@ -3627,8 +3621,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
 				atomic_inc(&conf->preread_active_stripes);
 			list_add_tail(&sh->lru, &conf->hold_list);
 		}
-	} else
-		plugger_set_plug(&conf->plug);
+	}
 }
 
 static void activate_bit_delay(raid5_conf_t *conf)
@@ -3645,60 +3638,6 @@ static void activate_bit_delay(raid5_conf_t *conf)
 	}
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	raid5_conf_t *conf = mddev->private;
-	int i;
-	int devs = max(conf->raid_disks, conf->previous_raid_disks);
-
-	rcu_read_lock();
-	for (i = 0; i < devs; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-void md_raid5_unplug_device(raid5_conf_t *conf)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&conf->device_lock, flags);
-
-	if (plugger_remove_plug(&conf->plug)) {
-		conf->seq_flush++;
-		raid5_activate_delayed(conf);
-	}
-	md_wakeup_thread(conf->mddev->thread);
-
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-
-	unplug_slaves(conf->mddev);
-}
-EXPORT_SYMBOL_GPL(md_raid5_unplug_device);
-
-static void raid5_unplug(struct plug_handle *plug)
-{
-	raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
-	md_raid5_unplug_device(conf);
-}
-
-static void raid5_unplug_queue(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	md_raid5_unplug_device(mddev->private);
-}
-
 int md_raid5_congested(mddev_t *mddev, int bits)
 {
 	raid5_conf_t *conf = mddev->private;
@@ -3988,6 +3927,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
 	int remaining;
+	int plugged;
 
 	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bi);
@@ -4006,6 +3946,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 
+	plugged = mddev_check_plugged(mddev);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 		int disks, data_disks;
@@ -4100,7 +4041,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 				 * add failed due to overlap.  Flush everything
 				 * and wait a while
 				 */
-				md_raid5_unplug_device(conf);
+				md_wakeup_thread(mddev->thread);
 				release_stripe(sh);
 				schedule();
 				goto retry;
@@ -4120,6 +4061,9 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 		}
 			
 	}
+	if (!plugged)
+		md_wakeup_thread(mddev->thread);
+
 	spin_lock_irq(&conf->device_lock);
 	remaining = raid5_dec_bi_phys_segments(bi);
 	spin_unlock_irq(&conf->device_lock);
@@ -4365,7 +4309,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 
 	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
-		unplug_slaves(mddev);
 
 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
 			end_reshape(conf);
@@ -4522,24 +4465,30 @@ static void raid5d(mddev_t *mddev)
 	struct stripe_head *sh;
 	raid5_conf_t *conf = mddev->private;
 	int handled;
+	struct blk_plug plug;
 
 	pr_debug("+++ raid5d active\n");
 
 	md_check_recovery(mddev);
 
+	blk_start_plug(&plug);
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
 		struct bio *bio;
 
-		if (conf->seq_flush != conf->seq_write) {
-			int seq = conf->seq_flush;
+		if (atomic_read(&mddev->plug_cnt) == 0 &&
+		    !list_empty(&conf->bitmap_list)) {
+			/* Now is a good time to flush some bitmap updates */
+			conf->seq_flush++;
 			spin_unlock_irq(&conf->device_lock);
 			bitmap_unplug(mddev->bitmap);
 			spin_lock_irq(&conf->device_lock);
-			conf->seq_write = seq;
+			conf->seq_write = conf->seq_flush;
 			activate_bit_delay(conf);
 		}
+		if (atomic_read(&mddev->plug_cnt) == 0)
+			raid5_activate_delayed(conf);
 
 		while ((bio = remove_bio_from_retry(conf))) {
 			int ok;
@@ -4569,7 +4518,7 @@ static void raid5d(mddev_t *mddev)
 	spin_unlock_irq(&conf->device_lock);
 
 	async_tx_issue_pending_all();
-	unplug_slaves(mddev);
+	blk_finish_plug(&plug);
 
 	pr_debug("--- raid5d inactive\n");
 }
@@ -5186,8 +5135,6 @@ static int run(mddev_t *mddev)
 		       mdname(mddev));
 	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
-	plugger_init(&conf->plug, raid5_unplug);
-	mddev->plug = &conf->plug;
 	if (mddev->queue) {
 		int chunk_size;
 		/* read-ahead size must cover two whole stripes, which
@@ -5204,7 +5151,6 @@ static int run(mddev_t *mddev)
 
 		mddev->queue->backing_dev_info.congested_data = mddev;
 		mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-		mddev->queue->unplug_fn = raid5_unplug_queue;
 
 		chunk_size = mddev->chunk_sectors << 9;
 		blk_queue_io_min(mddev->queue, chunk_size);
@@ -5237,7 +5183,6 @@ static int stop(mddev_t *mddev)
 	mddev->thread = NULL;
 	if (mddev->queue)
 		mddev->queue->backing_dev_info.congested_fn = NULL;
-	plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/
 	free_conf(conf);
 	mddev->private = NULL;
 	mddev->to_remove = &raid5_attrs_group;
@@ -5733,6 +5678,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 static void *raid45_takeover_raid0(mddev_t *mddev, int level)
 {
 	struct raid0_private_data *raid0_priv = mddev->private;
+	sector_t sectors;
 
 	/* for raid0 takeover only one zone is supported */
 	if (raid0_priv->nr_strip_zones > 1) {
@@ -5741,6 +5687,9 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level)
 		return ERR_PTR(-EINVAL);
 	}
 
+	sectors = raid0_priv->strip_zone[0].zone_end;
+	sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
+	mddev->dev_sectors = sectors;
 	mddev->new_level = level;
 	mddev->new_layout = ALGORITHM_PARITY_N;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2ace0582b40..3ca77a2613b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -400,8 +400,6 @@ struct raid5_private_data {
 					    * Cleared when a sync completes.
 					    */
 
-	struct plug_handle	plug;
-
 	/* per cpu variables */
 	struct raid5_percpu {
 		struct page	*spare_page; /* Used when checking P/Q in raid6 */
@@ -503,6 +501,6 @@ static inline int algorithm_is_DDF(int layout)
 }
 
 extern int md_raid5_congested(mddev_t *mddev, int bits);
-extern void md_raid5_unplug_device(raid5_conf_t *conf);
+extern void md_raid5_kick_device(raid5_conf_t *conf);
 extern int raid5_set_cache_size(mddev_t *mddev, int size);
 #endif