author    Ingo Molnar <mingo@kernel.org>  2012-04-14 13:18:27 +0200
committer Ingo Molnar <mingo@kernel.org>  2012-04-14 13:19:04 +0200
commit    6ac1ef482d7ae0c690f1640bf6eb818ff9a2d91e (patch)
tree      021cc9f6b477146fcebe6f3be4752abfa2ba18a9 /drivers/md
parent    682968e0c425c60f0dde37977e5beb2b12ddc4cc (diff)
parent    a385ec4f11bdcf81af094c03e2444ee9b7fad2e5 (diff)
Merge branch 'perf/core' into perf/uprobes
Merge in latest upstream (and the latest perf development tree), to prepare for tooling changes, and also to pick up v3.4 MM changes that the uprobes code needs to take care of.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 28
-rw-r--r--  drivers/md/Makefile | 1
-rw-r--r--  drivers/md/bitmap.c | 194
-rw-r--r--  drivers/md/bitmap.h | 22
-rw-r--r--  drivers/md/dm-bufio.c | 109
-rw-r--r--  drivers/md/dm-bufio.h | 8
-rw-r--r--  drivers/md/dm-crypt.c | 54
-rw-r--r--  drivers/md/dm-delay.c | 9
-rw-r--r--  drivers/md/dm-exception-store.c | 2
-rw-r--r--  drivers/md/dm-flakey.c | 5
-rw-r--r--  drivers/md/dm-io.c | 23
-rw-r--r--  drivers/md/dm-ioctl.c | 7
-rw-r--r--  drivers/md/dm-linear.c | 3
-rw-r--r--  drivers/md/dm-log.c | 3
-rw-r--r--  drivers/md/dm-mpath.c | 52
-rw-r--r--  drivers/md/dm-queue-length.c | 3
-rw-r--r--  drivers/md/dm-raid.c | 86
-rw-r--r--  drivers/md/dm-raid1.c | 12
-rw-r--r--  drivers/md/dm-round-robin.c | 3
-rw-r--r--  drivers/md/dm-service-time.c | 5
-rw-r--r--  drivers/md/dm-stripe.c | 3
-rw-r--r--  drivers/md/dm-table.c | 9
-rw-r--r--  drivers/md/dm-thin-metadata.c | 30
-rw-r--r--  drivers/md/dm-thin-metadata.h | 13
-rw-r--r--  drivers/md/dm-thin.c | 680
-rw-r--r--  drivers/md/dm-verity.c | 913
-rw-r--r--  drivers/md/dm.c | 1
-rw-r--r--  drivers/md/faulty.c | 2
-rw-r--r--  drivers/md/linear.c | 41
-rw-r--r--  drivers/md/md.c | 140
-rw-r--r--  drivers/md/md.h | 13
-rw-r--r--  drivers/md/multipath.c | 2
-rw-r--r--  drivers/md/persistent-data/dm-btree-internal.h | 7
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c | 202
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 27
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c | 3
-rw-r--r--  drivers/md/raid0.c | 191
-rw-r--r--  drivers/md/raid0.h | 11
-rw-r--r--  drivers/md/raid1.c | 113
-rw-r--r--  drivers/md/raid10.c | 227
-rw-r--r--  drivers/md/raid5.c | 84
41 files changed, 2499 insertions, 842 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index faa4741df6d..10f122a3a85 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -277,8 +277,8 @@ config DM_MIRROR
needed for live data migration tools such as 'pvmove'.
config DM_RAID
- tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
- depends on BLK_DEV_DM && EXPERIMENTAL
+ tristate "RAID 1/4/5/6 target"
+ depends on BLK_DEV_DM
select MD_RAID1
select MD_RAID456
select BLK_DEV_MD
@@ -359,8 +359,8 @@ config DM_DELAY
If unsure, say N.
config DM_UEVENT
- bool "DM uevents (EXPERIMENTAL)"
- depends on BLK_DEV_DM && EXPERIMENTAL
+ bool "DM uevents"
+ depends on BLK_DEV_DM
---help---
Generate udev events for DM events.
@@ -370,4 +370,24 @@ config DM_FLAKEY
---help---
A target that intermittently fails I/O for debugging purposes.
+config DM_VERITY
+ tristate "Verity target support (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ select CRYPTO
+ select CRYPTO_HASH
+ select DM_BUFIO
+ ---help---
+ This device-mapper target creates a read-only device that
+ transparently validates the data on one underlying device against
+ a pre-generated tree of cryptographic checksums stored on a second
+ device.
+
+ You'll need to activate the digests you're going to use in the
+ cryptoapi configuration.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-verity.
+
+ If unsure, say N.
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 046860c7a16..8b2e0dffe82 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
obj-$(CONFIG_DM_RAID) += dm-raid.o
obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
+obj-$(CONFIG_DM_VERITY) += dm-verity.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index cdf36b1e9aa..3d0dfa7a89a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/buffer_head.h>
+#include <linux/seq_file.h>
#include "md.h"
#include "bitmap.h"
@@ -35,31 +36,6 @@ static inline char *bmname(struct bitmap *bitmap)
}
/*
- * just a placeholder - calls kmalloc for bitmap pages
- */
-static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
-{
- unsigned char *page;
-
- page = kzalloc(PAGE_SIZE, GFP_NOIO);
- if (!page)
- printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
- else
- pr_debug("%s: bitmap_alloc_page: allocated page at %p\n",
- bmname(bitmap), page);
- return page;
-}
-
-/*
- * for now just a placeholder -- just calls kfree for bitmap pages
- */
-static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
-{
- pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
- kfree(page);
-}
-
-/*
* check a page and, if necessary, allocate it (or hijack it if the alloc fails)
*
* 1) check to see if this page is allocated, if it's not then try to alloc
@@ -96,7 +72,7 @@ __acquires(bitmap->lock)
/* this page has not been allocated yet */
spin_unlock_irq(&bitmap->lock);
- mappage = bitmap_alloc_page(bitmap);
+ mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
spin_lock_irq(&bitmap->lock);
if (mappage == NULL) {
@@ -109,7 +85,7 @@ __acquires(bitmap->lock)
} else if (bitmap->bp[page].map ||
bitmap->bp[page].hijacked) {
/* somebody beat us to getting the page */
- bitmap_free_page(bitmap, mappage);
+ kfree(mappage);
return 0;
} else {
@@ -141,7 +117,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
ptr = bitmap->bp[page].map;
bitmap->bp[page].map = NULL;
bitmap->missing_pages++;
- bitmap_free_page(bitmap, ptr);
+ kfree(ptr);
}
}
@@ -171,7 +147,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
did_alloc = 1;
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (! test_bit(In_sync, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
@@ -445,19 +421,14 @@ out:
void bitmap_update_sb(struct bitmap *bitmap)
{
bitmap_super_t *sb;
- unsigned long flags;
if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
return;
if (bitmap->mddev->bitmap_info.external)
return;
- spin_lock_irqsave(&bitmap->lock, flags);
- if (!bitmap->sb_page) { /* no superblock */
- spin_unlock_irqrestore(&bitmap->lock, flags);
+ if (!bitmap->sb_page) /* no superblock */
return;
- }
- spin_unlock_irqrestore(&bitmap->lock, flags);
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
sb->events = cpu_to_le64(bitmap->mddev->events);
if (bitmap->mddev->events < bitmap->events_cleared)
/* rocking back to read-only */
@@ -467,7 +438,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
/* Just in case these have been changed via sysfs: */
sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
write_page(bitmap, bitmap->sb_page, 1);
}
@@ -478,7 +449,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
if (!bitmap || !bitmap->sb_page)
return;
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -497,7 +468,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
printk(KERN_DEBUG " sync size: %llu KB\n",
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
}
/*
@@ -525,7 +496,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
}
bitmap->sb_page->index = 0;
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
sb->magic = cpu_to_le32(BITMAP_MAGIC);
sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
@@ -533,7 +504,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
chunksize = bitmap->mddev->bitmap_info.chunksize;
BUG_ON(!chunksize);
if (!is_power_of_2(chunksize)) {
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
printk(KERN_ERR "bitmap chunksize not a power of 2\n");
return -EINVAL;
}
@@ -571,7 +542,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
bitmap->flags |= BITMAP_HOSTENDIAN;
sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN);
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
return 0;
}
@@ -603,7 +574,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
return err;
}
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
chunksize = le32_to_cpu(sb->chunksize);
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
@@ -632,26 +603,28 @@ static int bitmap_read_sb(struct bitmap *bitmap)
/* keep the array size field of the bitmap superblock up to date */
sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
- if (!bitmap->mddev->persistent)
- goto success;
-
- /*
- * if we have a persistent array superblock, compare the
- * bitmap's UUID and event counter to the mddev's
- */
- if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
- printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n",
- bmname(bitmap));
- goto out;
- }
- events = le64_to_cpu(sb->events);
- if (events < bitmap->mddev->events) {
- printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
- "-- forcing full recovery\n", bmname(bitmap), events,
- (unsigned long long) bitmap->mddev->events);
- sb->state |= cpu_to_le32(BITMAP_STALE);
+ if (bitmap->mddev->persistent) {
+ /*
+ * We have a persistent array superblock, so compare the
+ * bitmap's UUID and event counter to the mddev's
+ */
+ if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
+ printk(KERN_INFO
+ "%s: bitmap superblock UUID mismatch\n",
+ bmname(bitmap));
+ goto out;
+ }
+ events = le64_to_cpu(sb->events);
+ if (events < bitmap->mddev->events) {
+ printk(KERN_INFO
+ "%s: bitmap file is out of date (%llu < %llu) "
+ "-- forcing full recovery\n",
+ bmname(bitmap), events,
+ (unsigned long long) bitmap->mddev->events);
+ sb->state |= cpu_to_le32(BITMAP_STALE);
+ }
}
-success:
+
/* assign fields using values from superblock */
bitmap->mddev->bitmap_info.chunksize = chunksize;
bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
@@ -664,7 +637,7 @@ success:
bitmap->events_cleared = bitmap->mddev->events;
err = 0;
out:
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
if (err)
bitmap_print_sb(bitmap);
return err;
@@ -680,16 +653,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
enum bitmap_mask_op op)
{
bitmap_super_t *sb;
- unsigned long flags;
int old;
- spin_lock_irqsave(&bitmap->lock, flags);
- if (!bitmap->sb_page) { /* can't set the state */
- spin_unlock_irqrestore(&bitmap->lock, flags);
+ if (!bitmap->sb_page) /* can't set the state */
return 0;
- }
- spin_unlock_irqrestore(&bitmap->lock, flags);
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
old = le32_to_cpu(sb->state) & bits;
switch (op) {
case MASK_SET:
@@ -703,7 +671,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
default:
BUG();
}
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
return old;
}
@@ -870,7 +838,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
unsigned long bit;
struct page *page;
void *kaddr;
- unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
+ unsigned long chunk = block >> bitmap->chunkshift;
if (!bitmap->filemap)
return;
@@ -881,12 +849,12 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
bit = file_page_offset(bitmap, chunk);
/* set the bit */
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (bitmap->flags & BITMAP_HOSTENDIAN)
set_bit(bit, kaddr);
else
__set_bit_le(bit, kaddr);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
pr_debug("set file bit %lu page %lu\n", bit, page->index);
/* record page number so it gets flushed to disk when unplug occurs */
set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
@@ -1050,10 +1018,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
* if bitmap is out of date, dirty the
* whole page and write it out
*/
- paddr = kmap_atomic(page, KM_USER0);
+ paddr = kmap_atomic(page);
memset(paddr + offset, 0xff,
PAGE_SIZE - offset);
- kunmap_atomic(paddr, KM_USER0);
+ kunmap_atomic(paddr);
write_page(bitmap, page, 1);
ret = -EIO;
@@ -1061,18 +1029,18 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
goto err;
}
}
- paddr = kmap_atomic(page, KM_USER0);
+ paddr = kmap_atomic(page);
if (bitmap->flags & BITMAP_HOSTENDIAN)
b = test_bit(bit, paddr);
else
b = test_bit_le(bit, paddr);
- kunmap_atomic(paddr, KM_USER0);
+ kunmap_atomic(paddr);
if (b) {
/* if the disk bit is set, set the memory bit */
- int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap))
+ int needed = ((sector_t)(i+1) << bitmap->chunkshift
>= start);
bitmap_set_memory_bits(bitmap,
- (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
+ (sector_t)i << bitmap->chunkshift,
needed);
bit_cnt++;
}
@@ -1116,7 +1084,7 @@ void bitmap_write_all(struct bitmap *bitmap)
static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
{
- sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t chunk = offset >> bitmap->chunkshift;
unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
bitmap->bp[page].count += inc;
bitmap_checkfree(bitmap, page);
@@ -1209,10 +1177,10 @@ void bitmap_daemon_work(struct mddev *mddev)
mddev->bitmap_info.external == 0) {
bitmap_super_t *sb;
bitmap->need_sync = 0;
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
sb->events_cleared =
cpu_to_le64(bitmap->events_cleared);
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
write_page(bitmap, bitmap->sb_page, 1);
}
spin_lock_irqsave(&bitmap->lock, flags);
@@ -1222,7 +1190,7 @@ void bitmap_daemon_work(struct mddev *mddev)
bitmap->allclean = 0;
}
bmc = bitmap_get_counter(bitmap,
- (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+ (sector_t)j << bitmap->chunkshift,
&blocks, 0);
if (!bmc)
j |= PAGE_COUNTER_MASK;
@@ -1231,11 +1199,11 @@ void bitmap_daemon_work(struct mddev *mddev)
/* we can clear the bit */
*bmc = 0;
bitmap_count_page(bitmap,
- (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+ (sector_t)j << bitmap->chunkshift,
-1);
/* clear the bit */
- paddr = kmap_atomic(page, KM_USER0);
+ paddr = kmap_atomic(page);
if (bitmap->flags & BITMAP_HOSTENDIAN)
clear_bit(file_page_offset(bitmap, j),
paddr);
@@ -1244,7 +1212,7 @@ void bitmap_daemon_work(struct mddev *mddev)
file_page_offset(bitmap,
j),
paddr);
- kunmap_atomic(paddr, KM_USER0);
+ kunmap_atomic(paddr);
} else if (*bmc <= 2) {
*bmc = 1; /* maybe clear the bit next time */
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
@@ -1285,7 +1253,7 @@ __acquires(bitmap->lock)
* The lock must have been taken with interrupts enabled.
* If !create, we don't release the lock.
*/
- sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t chunk = offset >> bitmap->chunkshift;
unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
sector_t csize;
@@ -1295,10 +1263,10 @@ __acquires(bitmap->lock)
if (bitmap->bp[page].hijacked ||
bitmap->bp[page].map == NULL)
- csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) +
+ csize = ((sector_t)1) << (bitmap->chunkshift +
PAGE_COUNTER_SHIFT - 1);
else
- csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
+ csize = ((sector_t)1) << bitmap->chunkshift;
*blocks = csize - (offset & (csize - 1));
if (err < 0)
@@ -1424,7 +1392,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
set_page_attr(bitmap,
filemap_get_page(
bitmap,
- offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+ offset >> bitmap->chunkshift),
BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
@@ -1512,7 +1480,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
else {
if (*bmc <= 2) {
set_page_attr(bitmap,
- filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+ filemap_get_page(bitmap, offset >> bitmap->chunkshift),
BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
@@ -1559,7 +1527,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
bitmap->mddev->curr_resync_completed = sector;
set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
- sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
+ sector &= ~((1ULL << bitmap->chunkshift) - 1);
s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) {
bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1589,7 +1557,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
struct page *page;
*bmc = 2 | (needed ? NEEDED_MASK : 0);
bitmap_count_page(bitmap, offset, 1);
- page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
+ page = filemap_get_page(bitmap, offset >> bitmap->chunkshift);
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
@@ -1602,7 +1570,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
unsigned long chunk;
for (chunk = s; chunk <= e; chunk++) {
- sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t sec = (sector_t)chunk << bitmap->chunkshift;
bitmap_set_memory_bits(bitmap, sec, 1);
spin_lock_irq(&bitmap->lock);
bitmap_file_set_bit(bitmap, sec);
@@ -1759,11 +1727,12 @@ int bitmap_create(struct mddev *mddev)
goto error;
bitmap->daemon_lastrun = jiffies;
- bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
+ bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize)
+ - BITMAP_BLOCK_SHIFT);
/* now that chunksize and chunkshift are set, we can use these macros */
- chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
- CHUNK_BLOCK_SHIFT(bitmap);
+ chunks = (blocks + bitmap->chunkshift - 1) >>
+ bitmap->chunkshift;
pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
BUG_ON(!pages);
@@ -1836,6 +1805,33 @@ out:
}
EXPORT_SYMBOL_GPL(bitmap_load);
+void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
+{
+ unsigned long chunk_kb;
+ unsigned long flags;
+
+ if (!bitmap)
+ return;
+
+ spin_lock_irqsave(&bitmap->lock, flags);
+ chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
+ seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
+ "%lu%s chunk",
+ bitmap->pages - bitmap->missing_pages,
+ bitmap->pages,
+ (bitmap->pages - bitmap->missing_pages)
+ << (PAGE_SHIFT - 10),
+ chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
+ chunk_kb ? "KB" : "B");
+ if (bitmap->file) {
+ seq_printf(seq, ", file: ");
+ seq_path(seq, &bitmap->file->f_path, " \t\n");
+ }
+
+ seq_printf(seq, "\n");
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+}
+
static ssize_t
location_show(struct mddev *mddev, char *page)
{
@@ -1904,6 +1900,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers) {
mddev->pers->quiesce(mddev, 1);
rv = bitmap_create(mddev);
+ if (!rv)
+ rv = bitmap_load(mddev);
if (rv) {
bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
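
The new bitmap_status() helper above provides the bitmap summary line used by md's seq_file status output (the bitmap line in /proc/mdstat). Going by the format string in the patch, an array with 3 of 8 counter pages allocated and a 64 KiB chunk would print a line like the following (illustrative values, not taken from the patch):

bitmap: 3/8 pages [12KB], 64KB chunk

with ", file: <path>" appended when a bitmap file is in use.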
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index a15436dd9b3..55ca5aec84e 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -13,8 +13,6 @@
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_HOSTENDIAN 3
-#define BITMAP_MINOR 39
-
/*
* in-memory bitmap:
*
@@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t;
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
-#define BITMAP_BLOCK_SIZE 512
#define BITMAP_BLOCK_SHIFT 9
/* how many blocks per chunk? (this is variable) */
#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
-#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
-#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
-
-/* when hijacked, the counters and bits represent even larger "chunks" */
-/* there will be 1024 chunks represented by each counter in the page pointers */
-#define PAGEPTR_BLOCK_RATIO(bitmap) \
- (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
-#define PAGEPTR_BLOCK_SHIFT(bitmap) \
- (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
-#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
#endif
@@ -181,12 +168,6 @@ struct bitmap_page {
unsigned int count:31;
};
-/* keep track of bitmap file pages that have pending writes on them */
-struct page_list {
- struct list_head list;
- struct page *page;
-};
-
/* the main bitmap structure - one per mddev */
struct bitmap {
struct bitmap_page *bp;
@@ -196,7 +177,7 @@ struct bitmap {
struct mddev *mddev; /* the md device that the bitmap is for */
/* bitmap chunksize -- how much data does each bit represent? */
- unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
+ unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
unsigned long chunks; /* total number of data chunks for the array */
__u64 events_cleared;
@@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev);
void bitmap_print_sb(struct bitmap *bitmap);
void bitmap_update_sb(struct bitmap *bitmap);
+void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
int bitmap_setallbits(struct bitmap *bitmap);
void bitmap_write_all(struct bitmap *bitmap);
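
A note on the chunkshift change running through bitmap.c and bitmap.h above: the field is rebased so that it already excludes BITMAP_BLOCK_SHIFT (9), i.e. chunksize = 2^(chunkshift + 9) bytes, which is why every former CHUNK_BLOCK_SHIFT(bitmap) user can now read bitmap->chunkshift directly. As a worked example (values chosen for illustration): for a 64 KiB chunk, ffz(~65536) = 16, so chunkshift = 16 - 9 = 7 and each chunk spans 2^7 = 128 sectors of 512 bytes = 64 KiB.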
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0a6806f80ab..cc06a1e5242 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -12,7 +12,6 @@
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/version.h>
#include <linux/shrinker.h>
#include <linux/module.h>
@@ -579,7 +578,7 @@ static void write_endio(struct bio *bio, int error)
struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
b->write_error = error;
- if (error) {
+ if (unlikely(error)) {
struct dm_bufio_client *c = b->c;
(void)cmpxchg(&c->async_write_error, 0, error);
}
@@ -698,13 +697,20 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
dm_bufio_lock(c);
}
+enum new_flag {
+ NF_FRESH = 0,
+ NF_READ = 1,
+ NF_GET = 2,
+ NF_PREFETCH = 3
+};
+
/*
* Allocate a new buffer. If the allocation is not possible, wait until
* some other thread frees a buffer.
*
* May drop the lock and regain it.
*/
-static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c)
+static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
struct dm_buffer *b;
@@ -727,6 +733,9 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
return b;
}
+ if (nf == NF_PREFETCH)
+ return NULL;
+
if (!list_empty(&c->reserved_buffers)) {
b = list_entry(c->reserved_buffers.next,
struct dm_buffer, lru_list);
@@ -744,9 +753,12 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
}
}
-static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c)
+static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
- struct dm_buffer *b = __alloc_buffer_wait_no_callback(c);
+ struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
+
+ if (!b)
+ return NULL;
if (c->alloc_callback)
c->alloc_callback(b);
@@ -866,32 +878,23 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
* Getting a buffer
*--------------------------------------------------------------*/
-enum new_flag {
- NF_FRESH = 0,
- NF_READ = 1,
- NF_GET = 2
-};
-
static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
- enum new_flag nf, struct dm_buffer **bp,
- int *need_submit)
+ enum new_flag nf, int *need_submit)
{
struct dm_buffer *b, *new_b = NULL;
*need_submit = 0;
b = __find(c, block);
- if (b) {
- b->hold_count++;
- __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
- test_bit(B_WRITING, &b->state));
- return b;
- }
+ if (b)
+ goto found_buffer;
if (nf == NF_GET)
return NULL;
- new_b = __alloc_buffer_wait(c);
+ new_b = __alloc_buffer_wait(c, nf);
+ if (!new_b)
+ return NULL;
/*
* We've had a period where the mutex was unlocked, so need to
@@ -900,10 +903,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
b = __find(c, block);
if (b) {
__free_buffer_wake(new_b);
- b->hold_count++;
- __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
- test_bit(B_WRITING, &b->state));
- return b;
+ goto found_buffer;
}
__check_watermark(c);
@@ -923,6 +923,24 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
*need_submit = 1;
return b;
+
+found_buffer:
+ if (nf == NF_PREFETCH)
+ return NULL;
+ /*
+ * Note: it is essential that we don't wait for the buffer to be
+ * read if dm_bufio_get function is used. Both dm_bufio_get and
+ * dm_bufio_prefetch can be used in the driver request routine.
+ * If the user called both dm_bufio_prefetch and dm_bufio_get on
+ * the same buffer, it would deadlock if we waited.
+ */
+ if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
+ return NULL;
+
+ b->hold_count++;
+ __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
+ test_bit(B_WRITING, &b->state));
+ return b;
}
/*
@@ -957,10 +975,10 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
struct dm_buffer *b;
dm_bufio_lock(c);
- b = __bufio_new(c, block, nf, bp, &need_submit);
+ b = __bufio_new(c, block, nf, &need_submit);
dm_bufio_unlock(c);
- if (!b || IS_ERR(b))
+ if (!b)
return b;
if (need_submit)
@@ -1006,13 +1024,47 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
}
EXPORT_SYMBOL_GPL(dm_bufio_new);
+void dm_bufio_prefetch(struct dm_bufio_client *c,
+ sector_t block, unsigned n_blocks)
+{
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ dm_bufio_lock(c);
+
+ for (; n_blocks--; block++) {
+ int need_submit;
+ struct dm_buffer *b;
+ b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
+ if (unlikely(b != NULL)) {
+ dm_bufio_unlock(c);
+
+ if (need_submit)
+ submit_io(b, READ, b->block, read_endio);
+ dm_bufio_release(b);
+
+ dm_bufio_cond_resched();
+
+ if (!n_blocks)
+ goto flush_plug;
+ dm_bufio_lock(c);
+ }
+
+ }
+
+ dm_bufio_unlock(c);
+
+flush_plug:
+ blk_finish_plug(&plug);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
+
void dm_bufio_release(struct dm_buffer *b)
{
struct dm_bufio_client *c = b->c;
dm_bufio_lock(c);
- BUG_ON(test_bit(B_READING, &b->state));
BUG_ON(!b->hold_count);
b->hold_count--;
@@ -1025,6 +1077,7 @@ void dm_bufio_release(struct dm_buffer *b)
* invalid buffer.
*/
if ((b->read_error || b->write_error) &&
+ !test_bit(B_READING, &b->state) &&
!test_bit(B_WRITING, &b->state) &&
!test_bit(B_DIRTY, &b->state)) {
__unlink_buffer(b);
@@ -1042,6 +1095,8 @@ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
dm_bufio_lock(c);
+ BUG_ON(test_bit(B_READING, &b->state));
+
if (!test_and_set_bit(B_DIRTY, &b->state))
__relink_lru(b, LIST_DIRTY);
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index 5c4c3a04e38..b142946a9e3 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -63,6 +63,14 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
struct dm_buffer **bp);
/*
+ * Prefetch the specified blocks to the cache.
+ * The function starts to read the blocks and returns without waiting for
+ * I/O to finish.
+ */
+void dm_bufio_prefetch(struct dm_bufio_client *c,
+ sector_t block, unsigned n_blocks);
+
+/*
* Release a reference obtained with dm_bufio_{read,get,new}. The data
* pointer and dm_buffer pointer is no longer valid after this call.
*/
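
dm_bufio_prefetch() is intended to be issued ahead of the blocking dm_bufio_read()/dm_bufio_get() path; the comment added to __bufio_new() above explains why a buffer still being read for a prefetch must not be waited on from dm_bufio_get(). Below is a minimal caller sketch using only the interfaces declared in this header; the helper name and block range are hypothetical, not part of the patch.

#include <linux/err.h>

#include "dm-bufio.h"

/*
 * Hypothetical helper: warm the cache for [block, block + count) and then
 * read each block through the normal blocking path.
 */
static int example_read_run(struct dm_bufio_client *c,
			    sector_t block, unsigned count)
{
	unsigned i;

	/* Starts the reads and returns without waiting for the I/O. */
	dm_bufio_prefetch(c, block, count);

	for (i = 0; i < count; i++) {
		struct dm_buffer *b;
		void *data;

		/* Usually a cache hit now, thanks to the prefetch. */
		data = dm_bufio_read(c, block + i, &b);
		if (IS_ERR(data))
			return PTR_ERR(data);

		/* ... consume the block-sized buffer at 'data' ... */

		dm_bufio_release(b);
	}

	return 0;
}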
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8c2a000cf3f..3f06df59fd8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -176,7 +176,6 @@ struct crypt_config {
#define MIN_IOS 16
#define MIN_POOL_PAGES 32
-#define MIN_BIO_PAGES 8
static struct kmem_cache *_crypt_io_pool;
@@ -590,9 +589,9 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
int r = 0;
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
- src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
+ src = kmap_atomic(sg_page(&dmreq->sg_in));
r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
- kunmap_atomic(src, KM_USER0);
+ kunmap_atomic(src);
} else
memset(iv, 0, cc->iv_size);
@@ -608,14 +607,14 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
return 0;
- dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
+ dst = kmap_atomic(sg_page(&dmreq->sg_out));
r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
/* Tweak the first block of plaintext sector */
if (!r)
crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
- kunmap_atomic(dst, KM_USER0);
+ kunmap_atomic(dst);
return r;
}
@@ -848,12 +847,11 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
}
/*
- * if additional pages cannot be allocated without waiting,
- * return a partially allocated bio, the caller will then try
- * to allocate additional bios while submitting this partial bio
+ * If additional pages cannot be allocated without waiting,
+ * return a partially-allocated bio. The caller will then try
+ * to allocate more bios while submitting this partial bio.
*/
- if (i == (MIN_BIO_PAGES - 1))
- gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
+ gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
@@ -1046,16 +1044,14 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
queue_work(cc->io_queue, &io->work);
}
-static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
- int error, int async)
+static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
{
struct bio *clone = io->ctx.bio_out;
struct crypt_config *cc = io->target->private;
- if (unlikely(error < 0)) {
+ if (unlikely(io->error < 0)) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
- io->error = -EIO;
crypt_dec_pending(io);
return;
}
@@ -1106,12 +1102,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
sector += bio_sectors(clone);
crypt_inc_pending(io);
+
r = crypt_convert(cc, &io->ctx);
+ if (r < 0)
+ io->error = -EIO;
+
crypt_finished = atomic_dec_and_test(&io->ctx.pending);
/* Encryption was already finished, submit io now */
if (crypt_finished) {
- kcryptd_crypt_write_io_submit(io, r, 0);
+ kcryptd_crypt_write_io_submit(io, 0);
/*
* If there was an error, do not try next fragments.
@@ -1162,11 +1162,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
crypt_dec_pending(io);
}
-static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
+static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
{
- if (unlikely(error < 0))
- io->error = -EIO;
-
crypt_dec_pending(io);
}
@@ -1181,9 +1178,11 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
io->sector);
r = crypt_convert(cc, &io->ctx);
+ if (r < 0)
+ io->error = -EIO;
if (atomic_dec_and_test(&io->ctx.pending))
- kcryptd_crypt_read_done(io, r);
+ kcryptd_crypt_read_done(io);
crypt_dec_pending(io);
}
@@ -1204,15 +1203,18 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+ if (error < 0)
+ io->error = -EIO;
+
mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
if (!atomic_dec_and_test(&ctx->pending))
return;
if (bio_data_dir(io->base_bio) == READ)
- kcryptd_crypt_read_done(io, error);
+ kcryptd_crypt_read_done(io);
else
- kcryptd_crypt_write_io_submit(io, error, 1);
+ kcryptd_crypt_write_io_submit(io, 1);
}
static void kcryptd_crypt(struct work_struct *work)
@@ -1413,6 +1415,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
char *cipher_api = NULL;
int cpu, ret = -EINVAL;
+ char dummy;
/* Convert to crypto api definition? */
if (strchr(cipher_in, '(')) {
@@ -1434,7 +1437,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
if (!keycount)
cc->tfms_count = 1;
- else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
+ else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 ||
!is_power_of_2(cc->tfms_count)) {
ti->error = "Bad cipher key count specification";
return -EINVAL;
@@ -1579,6 +1582,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
int ret;
struct dm_arg_set as;
const char *opt_string;
+ char dummy;
static struct dm_arg _args[] = {
{0, 1, "Invalid number of feature args"},
@@ -1636,7 +1640,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ret = -EINVAL;
- if (sscanf(argv[2], "%llu", &tmpll) != 1) {
+ if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid iv_offset sector";
goto bad;
}
@@ -1647,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+ if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
goto bad;
}
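
The recurring sscanf change in this series ("%llu" becoming "%llu%c" with an extra dummy char and a != 1 check) makes the constructors reject numbers followed by trailing junk: sscanf() returns the count of successful conversions, so any extra character satisfies the %c and bumps the count to 2, while a non-number still yields 0. A standalone userspace sketch of the same idiom (not kernel code, names are illustrative):

#include <stdio.h>

/*
 * Accept the string only if it is exactly one unsigned long long with
 * nothing after it -- the same "%llu%c" trick used in the dm constructors.
 */
static int parse_sector(const char *str, unsigned long long *out)
{
	unsigned long long tmp;
	char dummy;

	if (sscanf(str, "%llu%c", &tmp, &dummy) != 1)
		return -1;	/* "1024k" gives 2, "junk" gives 0 */

	*out = tmp;
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("%d\n", parse_sector("1024", &v));	/* 0: accepted */
	printf("%d\n", parse_sector("1024k", &v));	/* -1: trailing junk */
	printf("%d\n", parse_sector("junk", &v));	/* -1: not a number */
	return 0;
}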
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f18375dcedd..2dc22dddb2a 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -131,6 +131,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct delay_c *dc;
unsigned long long tmpll;
+ char dummy;
if (argc != 3 && argc != 6) {
ti->error = "requires exactly 3 or 6 arguments";
@@ -145,13 +146,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
dc->reads = dc->writes = 0;
- if (sscanf(argv[1], "%llu", &tmpll) != 1) {
+ if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
goto bad;
}
dc->start_read = tmpll;
- if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
+ if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) {
ti->error = "Invalid delay";
goto bad;
}
@@ -166,13 +167,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (argc == 3)
goto out;
- if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+ if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid write device sector";
goto bad_dev_read;
}
dc->start_write = tmpll;
- if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
+ if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) {
ti->error = "Invalid write delay";
goto bad_dev_read;
}
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 042e7199656..aa70f7d43a1 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -283,7 +283,7 @@ int dm_exception_store_init(void)
return 0;
persistent_fail:
- dm_persistent_snapshot_exit();
+ dm_transient_snapshot_exit();
transient_fail:
return r;
}
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 9fb18c14782..ac49c01f1a4 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -160,6 +160,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
unsigned long long tmpll;
struct dm_arg_set as;
const char *devname;
+ char dummy;
as.argc = argc;
as.argv = argv;
@@ -178,7 +179,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
devname = dm_shift_arg(&as);
- if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
+ if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
goto bad;
}
@@ -323,7 +324,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
* Corrupt successful READs while in down state.
* If flags were specified, only corrupt those that match.
*/
- if (!error && bio_submitted_while_down &&
+ if (fc->corrupt_bio_byte && !error && bio_submitted_while_down &&
(bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
all_corrupt_bio_flags_match(bio, fc))
corrupt_bio_data(bio, fc);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ad2eba40e31..ea5dd289fe2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -296,6 +296,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
unsigned offset;
unsigned num_bvecs;
sector_t remaining = where->count;
+ struct request_queue *q = bdev_get_queue(where->bdev);
+ sector_t discard_sectors;
/*
* where->count may be zero if rw holds a flush and we need to
@@ -305,9 +307,12 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
/*
* Allocate a suitably sized-bio.
*/
- num_bvecs = dm_sector_div_up(remaining,
- (PAGE_SIZE >> SECTOR_SHIFT));
- num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
+ if (rw & REQ_DISCARD)
+ num_bvecs = 1;
+ else
+ num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
+ dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
@@ -315,10 +320,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
bio->bi_destructor = dm_bio_destructor;
store_io_and_region_in_bio(bio, io, region);
- /*
- * Try and add as many pages as possible.
- */
- while (remaining) {
+ if (rw & REQ_DISCARD) {
+ discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
+ bio->bi_size = discard_sectors << SECTOR_SHIFT;
+ remaining -= discard_sectors;
+ } else while (remaining) {
+ /*
+ * Try and add as many pages as possible.
+ */
dp->get_page(dp, &page, &len, &offset);
len = min(len, to_bytes(remaining));
if (!bio_add_page(bio, page, len, offset))
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 31c2dc25886..a1a3e6df17b 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -880,6 +880,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
struct hd_geometry geometry;
unsigned long indata[4];
char *geostr = (char *) param + param->data_start;
+ char dummy;
md = find_device(param);
if (!md)
@@ -891,8 +892,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
goto out;
}
- x = sscanf(geostr, "%lu %lu %lu %lu", indata,
- indata + 1, indata + 2, indata + 3);
+ x = sscanf(geostr, "%lu %lu %lu %lu%c", indata,
+ indata + 1, indata + 2, indata + 3, &dummy);
if (x != 4) {
DMWARN("Unable to interpret geometry settings.");
@@ -1437,7 +1438,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
if (!argc) {
DMWARN("Empty message received.");
- goto out;
+ goto out_argv;
}
table = dm_get_live_table(md);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9728839f844..3639eeab604 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct linear_c *lc;
unsigned long long tmp;
+ char dummy;
if (argc != 2) {
ti->error = "Invalid argument count";
@@ -41,7 +42,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM;
}
- if (sscanf(argv[1], "%llu", &tmp) != 1) {
+ if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
ti->error = "dm-linear: Invalid device sector";
goto bad;
}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 3b52bb72bd1..65ebaebf502 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -369,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
unsigned int region_count;
size_t bitset_size, buf_size;
int r;
+ char dummy;
if (argc < 1 || argc > 2) {
DMWARN("wrong number of arguments to dirty region log");
@@ -387,7 +388,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
}
}
- if (sscanf(argv[0], "%u", &region_size) != 1 ||
+ if (sscanf(argv[0], "%u%c", &region_size, &dummy) != 1 ||
!_check_region_size(ti, region_size)) {
DMWARN("invalid region size %s", argv[0]);
return -EINVAL;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 801d92d237c..922a3385eea 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -226,6 +226,27 @@ static void free_multipath(struct multipath *m)
kfree(m);
}
+static int set_mapinfo(struct multipath *m, union map_info *info)
+{
+ struct dm_mpath_io *mpio;
+
+ mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
+ if (!mpio)
+ return -ENOMEM;
+
+ memset(mpio, 0, sizeof(*mpio));
+ info->ptr = mpio;
+
+ return 0;
+}
+
+static void clear_mapinfo(struct multipath *m, union map_info *info)
+{
+ struct dm_mpath_io *mpio = info->ptr;
+
+ info->ptr = NULL;
+ mempool_free(mpio, m->mpio_pool);
+}
/*-----------------------------------------------
* Path selection
@@ -341,13 +362,14 @@ static int __must_push_back(struct multipath *m)
}
static int map_io(struct multipath *m, struct request *clone,
- struct dm_mpath_io *mpio, unsigned was_queued)
+ union map_info *map_context, unsigned was_queued)
{
int r = DM_MAPIO_REMAPPED;
size_t nr_bytes = blk_rq_bytes(clone);
unsigned long flags;
struct pgpath *pgpath;
struct block_device *bdev;
+ struct dm_mpath_io *mpio = map_context->ptr;
spin_lock_irqsave(&m->lock, flags);
@@ -423,7 +445,6 @@ static void dispatch_queued_ios(struct multipath *m)
{
int r;
unsigned long flags;
- struct dm_mpath_io *mpio;
union map_info *info;
struct request *clone, *n;
LIST_HEAD(cl);
@@ -436,16 +457,15 @@ static void dispatch_queued_ios(struct multipath *m)
list_del_init(&clone->queuelist);
info = dm_get_rq_mapinfo(clone);
- mpio = info->ptr;
- r = map_io(m, clone, mpio, 1);
+ r = map_io(m, clone, info, 1);
if (r < 0) {
- mempool_free(mpio, m->mpio_pool);
+ clear_mapinfo(m, info);
dm_kill_unmapped_request(clone, r);
} else if (r == DM_MAPIO_REMAPPED)
dm_dispatch_request(clone);
else if (r == DM_MAPIO_REQUEUE) {
- mempool_free(mpio, m->mpio_pool);
+ clear_mapinfo(m, info);
dm_requeue_unmapped_request(clone);
}
}
@@ -908,20 +928,16 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
union map_info *map_context)
{
int r;
- struct dm_mpath_io *mpio;
struct multipath *m = (struct multipath *) ti->private;
- mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
- if (!mpio)
+ if (set_mapinfo(m, map_context) < 0)
/* ENOMEM, requeue */
return DM_MAPIO_REQUEUE;
- memset(mpio, 0, sizeof(*mpio));
- map_context->ptr = mpio;
clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
- r = map_io(m, clone, mpio, 0);
+ r = map_io(m, clone, map_context, 0);
if (r < 0 || r == DM_MAPIO_REQUEUE)
- mempool_free(mpio, m->mpio_pool);
+ clear_mapinfo(m, map_context);
return r;
}
@@ -1054,8 +1070,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
struct priority_group *pg;
unsigned pgnum;
unsigned long flags;
+ char dummy;
- if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
+ if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
(pgnum > m->nr_priority_groups)) {
DMWARN("invalid PG number supplied to switch_pg_num");
return -EINVAL;
@@ -1085,8 +1102,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
{
struct priority_group *pg;
unsigned pgnum;
+ char dummy;
- if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
+ if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
(pgnum > m->nr_priority_groups)) {
DMWARN("invalid PG number supplied to bypass_pg");
return -EINVAL;
@@ -1261,13 +1279,15 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
struct path_selector *ps;
int r;
+ BUG_ON(!mpio);
+
r = do_end_io(m, clone, error, mpio);
if (pgpath) {
ps = &pgpath->pg->ps;
if (ps->type->end_io)
ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
}
- mempool_free(mpio, m->mpio_pool);
+ clear_mapinfo(m, map_context);
return r;
}
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 03a837aa5ce..3941fae0de9 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -112,6 +112,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
struct selector *s = ps->context;
struct path_info *pi;
unsigned repeat_count = QL_MIN_IO;
+ char dummy;
/*
* Arguments: [<repeat_count>]
@@ -123,7 +124,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
return -EINVAL;
}
- if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+ if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
*error = "queue-length ps: invalid repeat count";
return -EINVAL;
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 86cb7e5d83d..b0ba52459ed 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -604,7 +604,9 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
return 0;
if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
- DMERR("Failed to read device superblock");
+ DMERR("Failed to read superblock of device at position %d",
+ rdev->raid_disk);
+ set_bit(Faulty, &rdev->flags);
return -EINVAL;
}
@@ -615,14 +617,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
{
- struct md_rdev *r, *t;
+ struct md_rdev *r;
uint64_t failed_devices;
struct dm_raid_superblock *sb;
sb = page_address(rdev->sb_page);
failed_devices = le64_to_cpu(sb->failed_devices);
- rdev_for_each(r, t, mddev)
+ rdev_for_each(r, mddev)
if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
failed_devices |= (1ULL << r->raid_disk);
@@ -668,7 +670,14 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
return ret;
sb = page_address(rdev->sb_page);
- if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
+
+ /*
+ * Two cases that we want to write new superblocks and rebuild:
+ * 1) New device (no matching magic number)
+ * 2) Device specified for rebuild (!In_sync w/ offset == 0)
+ */
+ if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
+ (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
super_sync(rdev->mddev, rdev);
set_bit(FirstUse, &rdev->flags);
@@ -700,7 +709,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
struct dm_raid_superblock *sb;
uint32_t new_devs = 0;
uint32_t rebuilds = 0;
- struct md_rdev *r, *t;
+ struct md_rdev *r;
struct dm_raid_superblock *sb2;
sb = page_address(rdev->sb_page);
@@ -743,13 +752,10 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
* case the In_sync bit will /not/ be set and
* recovery_cp must be MaxSector.
*/
- rdev_for_each(r, t, mddev) {
+ rdev_for_each(r, mddev) {
if (!test_bit(In_sync, &r->flags)) {
- if (!test_bit(FirstUse, &r->flags))
- DMERR("Superblock area of "
- "rebuild device %d should have been "
- "cleared.", r->raid_disk);
- set_bit(FirstUse, &r->flags);
+ DMINFO("Device %d specified for rebuild: "
+ "Clearing superblock", r->raid_disk);
rebuilds++;
} else if (test_bit(FirstUse, &r->flags))
new_devs++;
@@ -778,7 +784,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
* Now we set the Faulty bit for those devices that are
* recorded in the superblock as failed.
*/
- rdev_for_each(r, t, mddev) {
+ rdev_for_each(r, mddev) {
if (!r->sb_page)
continue;
sb2 = page_address(r->sb_page);
@@ -851,11 +857,27 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
int ret;
- struct md_rdev *rdev, *freshest, *tmp;
+ unsigned redundancy = 0;
+ struct raid_dev *dev;
+ struct md_rdev *rdev, *freshest;
struct mddev *mddev = &rs->md;
+ switch (rs->raid_type->level) {
+ case 1:
+ redundancy = rs->md.raid_disks - 1;
+ break;
+ case 4:
+ case 5:
+ case 6:
+ redundancy = rs->raid_type->parity_devs;
+ break;
+ default:
+ ti->error = "Unknown RAID type";
+ return -EINVAL;
+ }
+
freshest = NULL;
- rdev_for_each(rdev, tmp, mddev) {
+ rdev_for_each(rdev, mddev) {
if (!rdev->meta_bdev)
continue;
@@ -868,6 +890,37 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
case 0:
break;
default:
+ dev = container_of(rdev, struct raid_dev, rdev);
+ if (redundancy--) {
+ if (dev->meta_dev)
+ dm_put_device(ti, dev->meta_dev);
+
+ dev->meta_dev = NULL;
+ rdev->meta_bdev = NULL;
+
+ if (rdev->sb_page)
+ put_page(rdev->sb_page);
+
+ rdev->sb_page = NULL;
+
+ rdev->sb_loaded = 0;
+
+ /*
+ * We might be able to salvage the data device
+ * even though the meta device has failed. For
+ * now, we behave as though '- -' had been
+ * set for this device in the table.
+ */
+ if (dev->data_dev)
+ dm_put_device(ti, dev->data_dev);
+
+ dev->data_dev = NULL;
+ rdev->bdev = NULL;
+
+ list_del(&rdev->same_set);
+
+ continue;
+ }
ti->error = "Failed to load superblock";
return ret;
}
@@ -884,7 +937,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (super_validate(mddev, freshest))
return -EINVAL;
- rdev_for_each(rdev, tmp, mddev)
+ rdev_for_each(rdev, mddev)
if ((rdev != freshest) && super_validate(mddev, rdev))
return -EINVAL;
@@ -971,6 +1024,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
+ ti->num_flush_requests = 1;
mutex_lock(&rs->md.reconfig_mutex);
ret = md_run(&rs->md);
@@ -1209,7 +1263,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 1, 0},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9bfd057be68..d039de8322f 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -924,8 +924,9 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
unsigned int mirror, char **argv)
{
unsigned long long offset;
+ char dummy;
- if (sscanf(argv[1], "%llu", &offset) != 1) {
+ if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) {
ti->error = "Invalid offset";
return -EINVAL;
}
@@ -953,13 +954,14 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
{
unsigned param_count;
struct dm_dirty_log *dl;
+ char dummy;
if (argc < 2) {
ti->error = "Insufficient mirror log arguments";
return NULL;
}
- if (sscanf(argv[1], "%u", &param_count) != 1) {
+ if (sscanf(argv[1], "%u%c", &param_count, &dummy) != 1) {
ti->error = "Invalid mirror log argument count";
return NULL;
}
@@ -986,13 +988,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
{
unsigned num_features;
struct dm_target *ti = ms->ti;
+ char dummy;
*args_used = 0;
if (!argc)
return 0;
- if (sscanf(argv[0], "%u", &num_features) != 1) {
+ if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) {
ti->error = "Invalid number of features";
return -EINVAL;
}
@@ -1036,6 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
unsigned int nr_mirrors, m, args_used;
struct mirror_set *ms;
struct dm_dirty_log *dl;
+ char dummy;
dl = create_dirty_log(ti, argc, argv, &args_used);
if (!dl)
@@ -1044,7 +1048,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
argv += args_used;
argc -= args_used;
- if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
+ if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 ||
nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
ti->error = "Invalid number of mirrors";
dm_dirty_log_destroy(dl);
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 27f1d423b76..6ab1192cdd5 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -114,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
struct selector *s = (struct selector *) ps->context;
struct path_info *pi;
unsigned repeat_count = RR_MIN_IO;
+ char dummy;
if (argc > 1) {
*error = "round-robin ps: incorrect number of arguments";
@@ -121,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
}
/* First path argument is number of I/Os before switching path */
- if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+ if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
*error = "round-robin ps: invalid repeat count";
return -EINVAL;
}
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 59883bd7821..9df8f6bd641 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -110,6 +110,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
struct path_info *pi;
unsigned repeat_count = ST_MIN_IO;
unsigned relative_throughput = 1;
+ char dummy;
/*
* Arguments: [<repeat_count> [<relative_throughput>]]
@@ -128,13 +129,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
return -EINVAL;
}
- if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+ if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
*error = "service-time ps: invalid repeat count";
return -EINVAL;
}
if ((argc == 2) &&
- (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
+ (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
*error = "service-time ps: invalid relative_throughput value";
return -EINVAL;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3d80cf0c152..35c94ff24ad 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -75,8 +75,9 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
unsigned int stripe, char **argv)
{
unsigned long long start;
+ char dummy;
- if (sscanf(argv[1], "%llu", &start) != 1)
+ if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1)
return -EINVAL;
if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 63cc54289af..2e227fbf162 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -268,8 +268,7 @@ void dm_table_destroy(struct dm_table *t)
vfree(t->highs);
/* free the device list */
- if (t->devices.next != &t->devices)
- free_devices(&t->devices);
+ free_devices(&t->devices);
dm_free_md_mempools(t->mempools);
@@ -464,10 +463,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
struct dm_dev_internal *dd;
unsigned int major, minor;
struct dm_table *t = ti->table;
+ char dummy;
BUG_ON(!t);
- if (sscanf(path, "%u:%u", &major, &minor) == 2) {
+ if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
/* Extract the major/minor numbers */
dev = MKDEV(major, minor);
if (MAJOR(dev) != major || MINOR(dev) != minor)
@@ -842,9 +842,10 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
unsigned *value, char **error, unsigned grouped)
{
const char *arg_str = dm_shift_arg(arg_set);
+ char dummy;
if (!arg_str ||
- (sscanf(arg_str, "%u", value) != 1) ||
+ (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
(*value < arg->min) ||
(*value > arg->max) ||
(grouped && arg_set->argc < *value)) {
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 59c4f0446ff..737d38865b6 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -385,6 +385,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
data_sm = dm_sm_disk_create(tm, nr_blocks);
if (IS_ERR(data_sm)) {
DMERR("sm_disk_create failed");
+ dm_tm_unlock(tm, sblock);
r = PTR_ERR(data_sm);
goto bad;
}
@@ -613,7 +614,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
if (r < 0)
goto out;
- r = dm_sm_root_size(pmd->metadata_sm, &data_len);
+ r = dm_sm_root_size(pmd->data_sm, &data_len);
if (r < 0)
goto out;
@@ -712,6 +713,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
if (r)
goto bad;
+ if (bdev_size > THIN_METADATA_MAX_SECTORS)
+ bdev_size = THIN_METADATA_MAX_SECTORS;
+
disk_super = dm_block_data(sblock);
disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
disk_super->version = cpu_to_le32(THIN_VERSION);
@@ -789,6 +793,11 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
return 0;
}
+/*
+ * __open_device: Returns @td corresponding to device with id @dev,
+ * creating it if @create is set and incrementing @td->open_count.
+ * On failure, @td is undefined.
+ */
static int __open_device(struct dm_pool_metadata *pmd,
dm_thin_id dev, int create,
struct dm_thin_device **td)
@@ -799,10 +808,16 @@ static int __open_device(struct dm_pool_metadata *pmd,
struct disk_device_details details_le;
/*
- * Check the device isn't already open.
+ * If the device is already open, return it.
*/
list_for_each_entry(td2, &pmd->thin_devices, list)
if (td2->id == dev) {
+ /*
+ * May not create an already-open device.
+ */
+ if (create)
+ return -EEXIST;
+
td2->open_count++;
*td = td2;
return 0;
@@ -817,6 +832,9 @@ static int __open_device(struct dm_pool_metadata *pmd,
if (r != -ENODATA || !create)
return r;
+ /*
+ * Create new device.
+ */
changed = 1;
details_le.mapped_blocks = 0;
details_le.transaction_id = cpu_to_le64(pmd->trans_id);
@@ -882,12 +900,10 @@ static int __create_thin(struct dm_pool_metadata *pmd,
r = __open_device(pmd, dev, 1, &td);
if (r) {
- __close_device(td);
dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
dm_btree_del(&pmd->bl_info, dev_root);
return r;
}
- td->changed = 1;
__close_device(td);
return r;
@@ -967,14 +983,14 @@ static int __create_snap(struct dm_pool_metadata *pmd,
goto bad;
r = __set_snapshot_details(pmd, td, origin, pmd->time);
+ __close_device(td);
+
if (r)
goto bad;
- __close_device(td);
return 0;
bad:
- __close_device(td);
dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
dm_btree_remove(&pmd->details_info, pmd->details_root,
&key, &pmd->details_root);
@@ -1211,6 +1227,8 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
if (r)
return r;
+ td->mapped_blocks--;
+ td->changed = 1;
pmd->need_commit = 1;
return 0;
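Taken together, the __open_device() comment and the hunks above give the intended usage: look the device up in pmd->thin_devices, refuse to create an id that is already open, bump open_count on success, and pair every successful open with __close_device(). A stripped-down sketch of that call pattern, with the metadata locking and full error handling omitted:

    struct dm_thin_device *td;
    int r;

    r = __open_device(pmd, dev, 0 /* don't create */, &td);
    if (r)
            return r;              /* td is undefined on failure */

    /* ... use td: td->mapped_blocks, td->changed, ... */

    __close_device(td);            /* drops the open_count taken above */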
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 859c1689687..ed4725e67c9 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -11,6 +11,19 @@
#define THIN_METADATA_BLOCK_SIZE 4096
+/*
+ * The metadata device is currently limited in size.
+ *
+ * We have one block of index, which can hold 255 index entries. Each
+ * index entry contains allocation info about 16k metadata blocks.
+ */
+#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+
+/*
+ * A metadata device larger than 16GB triggers a warning.
+ */
+#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
+
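For reference, with 4096-byte metadata blocks (8 sectors each) the two limits work out to:

    THIN_METADATA_MAX_SECTORS         = 255 * 16384 * 8 = 33,423,360 sectors  (~15.94 GiB)
    THIN_METADATA_MAX_SECTORS_WARNING = 16 GiB          = 33,554,432 sectors

so the warning threshold sits just above what the current on-disk format can actually address, which is why pool_ctr() below only warns that the excess space will go unused instead of rejecting the device.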
/*----------------------------------------------------------------*/
struct dm_pool_metadata;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c3087575fef..213ae32a0fc 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -23,6 +23,7 @@
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
+#define COMMIT_PERIOD HZ
/*
* The block size of the device holding pool data must be
@@ -32,16 +33,6 @@
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
/*
- * The metadata device is currently limited in size. The limitation is
- * checked lower down in dm-space-map-metadata, but we also check it here
- * so we can fail early.
- *
- * We have one block of index, which can hold 255 index entries. Each
- * index entry contains allocation info about 16k metadata blocks.
- */
-#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
-
-/*
* Device id is restricted to 24 bits.
*/
#define MAX_DEV_ID ((1 << 24) - 1)
@@ -72,7 +63,7 @@
* missed out if the io covers the block. (schedule_copy).
*
* iv) insert the new mapping into the origin's btree
- * (process_prepared_mappings). This act of inserting breaks some
+ * (process_prepared_mapping). This act of inserting breaks some
* sharing of btree nodes between the two devices. Breaking sharing only
* affects the btree of that specific device. Btrees for the other
* devices that share the block never change. The btree for the origin
@@ -124,7 +115,7 @@ struct cell {
struct hlist_node list;
struct bio_prison *prison;
struct cell_key key;
- unsigned count;
+ struct bio *holder;
struct bio_list bios;
};
@@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
* This may block if a new cell needs allocating. You must ensure that
* cells will be unlocked even if the calling thread is blocked.
*
- * Returns the number of entries in the cell prior to the new addition
- * or < 0 on failure.
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
*/
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
struct bio *inmate, struct cell **ref)
{
- int r;
+ int r = 1;
unsigned long flags;
uint32_t hash = hash_key(prison, key);
- struct cell *uninitialized_var(cell), *cell2 = NULL;
+ struct cell *cell, *cell2;
BUG_ON(hash > prison->nr_buckets);
spin_lock_irqsave(&prison->lock, flags);
+
cell = __search_bucket(prison->cells + hash, key);
+ if (cell) {
+ bio_list_add(&cell->bios, inmate);
+ goto out;
+ }
- if (!cell) {
- /*
- * Allocate a new cell
- */
- spin_unlock_irqrestore(&prison->lock, flags);
- cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
- spin_lock_irqsave(&prison->lock, flags);
+ /*
+ * Allocate a new cell
+ */
+ spin_unlock_irqrestore(&prison->lock, flags);
+ cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
+ spin_lock_irqsave(&prison->lock, flags);
- /*
- * We've been unlocked, so we have to double check that
- * nobody else has inserted this cell in the meantime.
- */
- cell = __search_bucket(prison->cells + hash, key);
+ /*
+ * We've been unlocked, so we have to double check that
+ * nobody else has inserted this cell in the meantime.
+ */
+ cell = __search_bucket(prison->cells + hash, key);
+ if (cell) {
+ mempool_free(cell2, prison->cell_pool);
+ bio_list_add(&cell->bios, inmate);
+ goto out;
+ }
- if (!cell) {
- cell = cell2;
- cell2 = NULL;
+ /*
+ * Use new cell.
+ */
+ cell = cell2;
- cell->prison = prison;
- memcpy(&cell->key, key, sizeof(cell->key));
- cell->count = 0;
- bio_list_init(&cell->bios);
- hlist_add_head(&cell->list, prison->cells + hash);
- }
- }
+ cell->prison = prison;
+ memcpy(&cell->key, key, sizeof(cell->key));
+ cell->holder = inmate;
+ bio_list_init(&cell->bios);
+ hlist_add_head(&cell->list, prison->cells + hash);
- r = cell->count++;
- bio_list_add(&cell->bios, inmate);
- spin_unlock_irqrestore(&prison->lock, flags);
+ r = 0;
- if (cell2)
- mempool_free(cell2, prison->cell_pool);
+out:
+ spin_unlock_irqrestore(&prison->lock, flags);
*ref = cell;
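The callers added later in this patch rely on the new return convention: 0 means the bio just became the cell's holder, non-zero means another bio already holds the block and this one has been queued behind it. A minimal caller-side sketch, mirroring process_discard() below:

    struct cell *cell;
    struct cell_key key;

    build_virtual_key(tc->td, block, &key);
    if (bio_detain(tc->pool->prison, &key, bio, &cell))
            return;        /* already held: the bio waits in the cell */

    /* we are the holder: process the bio, then release the cell */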
@@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
hlist_del(&cell->list);
- if (inmates)
- bio_list_merge(inmates, &cell->bios);
+ bio_list_add(inmates, cell->holder);
+ bio_list_merge(inmates, &cell->bios);
mempool_free(cell, prison->cell_pool);
}
@@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
* bio may be in the cell. This function releases the cell, and also does
* a sanity check.
*/
+static void __cell_release_singleton(struct cell *cell, struct bio *bio)
+{
+ hlist_del(&cell->list);
+ BUG_ON(cell->holder != bio);
+ BUG_ON(!bio_list_empty(&cell->bios));
+}
+
static void cell_release_singleton(struct cell *cell, struct bio *bio)
{
- struct bio_prison *prison = cell->prison;
- struct bio_list bios;
- struct bio *b;
unsigned long flags;
-
- bio_list_init(&bios);
+ struct bio_prison *prison = cell->prison;
spin_lock_irqsave(&prison->lock, flags);
- __cell_release(cell, &bios);
+ __cell_release_singleton(cell, bio);
spin_unlock_irqrestore(&prison->lock, flags);
+}
+
+/*
+ * Sometimes we don't want the holder, just the additional bios.
+ */
+static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+ struct bio_prison *prison = cell->prison;
+
+ hlist_del(&cell->list);
+ bio_list_merge(inmates, &cell->bios);
- b = bio_list_pop(&bios);
- BUG_ON(b != bio);
- BUG_ON(!bio_list_empty(&bios));
+ mempool_free(cell, prison->cell_pool);
+}
+
+static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+ unsigned long flags;
+ struct bio_prison *prison = cell->prison;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release_no_holder(cell, inmates);
+ spin_unlock_irqrestore(&prison->lock, flags);
}
static void cell_error(struct cell *cell)
@@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
* devices.
*/
struct new_mapping;
+
+struct pool_features {
+ unsigned zero_new_blocks:1;
+ unsigned discard_enabled:1;
+ unsigned discard_passdown:1;
+};
+
struct pool {
struct list_head list;
struct dm_target *ti; /* Only set if a pool target is bound */
@@ -484,7 +509,7 @@ struct pool {
dm_block_t offset_mask;
dm_block_t low_water_blocks;
- unsigned zero_new_blocks:1;
+ struct pool_features pf;
unsigned low_water_triggered:1; /* A dm event has been sent */
unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
@@ -493,17 +518,21 @@ struct pool {
struct workqueue_struct *wq;
struct work_struct worker;
+ struct delayed_work waker;
unsigned ref_count;
+ unsigned long last_commit_jiffies;
spinlock_t lock;
struct bio_list deferred_bios;
struct bio_list deferred_flush_bios;
struct list_head prepared_mappings;
+ struct list_head prepared_discards;
struct bio_list retry_on_resume_list;
- struct deferred_set ds; /* FIXME: move to thin_c */
+ struct deferred_set shared_read_ds;
+ struct deferred_set all_io_ds;
struct new_mapping *next_mapping;
mempool_t *mapping_pool;
@@ -521,7 +550,7 @@ struct pool_c {
struct dm_target_callbacks callbacks;
dm_block_t low_water_blocks;
- unsigned zero_new_blocks:1;
+ struct pool_features pf;
};
/*
@@ -529,6 +558,7 @@ struct pool_c {
*/
struct thin_c {
struct dm_dev *pool_dev;
+ struct dm_dev *origin_dev;
dm_thin_id dev_id;
struct pool *pool;
@@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
/*----------------------------------------------------------------*/
+struct endio_hook {
+ struct thin_c *tc;
+ struct deferred_entry *shared_read_entry;
+ struct deferred_entry *all_io_entry;
+ struct new_mapping *overwrite_mapping;
+};
+
static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
struct bio *bio;
@@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
bio_list_init(master);
while ((bio = bio_list_pop(&bios))) {
- if (dm_get_mapinfo(bio)->ptr == tc)
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ if (h->tc == tc)
bio_endio(bio, DM_ENDIO_REQUEUE);
else
bio_list_add(master, bio);
@@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
(bio->bi_sector & pool->offset_mask);
}
-static void remap_and_issue(struct thin_c *tc, struct bio *bio,
- dm_block_t block)
+static void remap_to_origin(struct thin_c *tc, struct bio *bio)
+{
+ bio->bi_bdev = tc->origin_dev->bdev;
+}
+
+static void issue(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
unsigned long flags;
- remap(tc, bio, block);
-
/*
* Batch together any FUA/FLUSH bios we find and then issue
* a single commit for them in process_deferred_bios().
@@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
generic_make_request(bio);
}
+static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
+{
+ remap_to_origin(tc, bio);
+ issue(tc, bio);
+}
+
+static void remap_and_issue(struct thin_c *tc, struct bio *bio,
+ dm_block_t block)
+{
+ remap(tc, bio, block);
+ issue(tc, bio);
+}
+
/*
* wake_worker() is used when new work is queued and when pool_resume is
* ready to continue deferred IO processing.
@@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool)
/*
* Bio endio functions.
*/
-struct endio_hook {
- struct thin_c *tc;
- bio_end_io_t *saved_bi_end_io;
- struct deferred_entry *entry;
-};
-
struct new_mapping {
struct list_head list;
- int prepared;
+ unsigned quiesced:1;
+ unsigned prepared:1;
+ unsigned pass_discard:1;
struct thin_c *tc;
dm_block_t virt_block;
dm_block_t data_block;
- struct cell *cell;
+ struct cell *cell, *cell2;
int err;
/*
@@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
{
struct pool *pool = m->tc->pool;
- if (list_empty(&m->list) && m->prepared) {
+ if (m->quiesced && m->prepared) {
list_add(&m->list, &pool->prepared_mappings);
wake_worker(pool);
}
@@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
static void overwrite_endio(struct bio *bio, int err)
{
unsigned long flags;
- struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct new_mapping *m = h->overwrite_mapping;
struct pool *pool = m->tc->pool;
m->err = err;
@@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err)
spin_unlock_irqrestore(&pool->lock, flags);
}
-static void shared_read_endio(struct bio *bio, int err)
-{
- struct list_head mappings;
- struct new_mapping *m, *tmp;
- struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
- unsigned long flags;
- struct pool *pool = h->tc->pool;
-
- bio->bi_end_io = h->saved_bi_end_io;
- bio_endio(bio, err);
-
- INIT_LIST_HEAD(&mappings);
- ds_dec(h->entry, &mappings);
-
- spin_lock_irqsave(&pool->lock, flags);
- list_for_each_entry_safe(m, tmp, &mappings, list) {
- list_del(&m->list);
- INIT_LIST_HEAD(&m->list);
- __maybe_add_mapping(m);
- }
- spin_unlock_irqrestore(&pool->lock, flags);
-
- mempool_free(h, pool->endio_hook_pool);
-}
-
/*----------------------------------------------------------------*/
/*
@@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
* Same as cell_defer above, except it omits one particular detainee,
* a write bio that covers the block and has already been processed.
*/
-static void cell_defer_except(struct thin_c *tc, struct cell *cell,
- struct bio *exception)
+static void cell_defer_except(struct thin_c *tc, struct cell *cell)
{
struct bio_list bios;
- struct bio *bio;
struct pool *pool = tc->pool;
unsigned long flags;
bio_list_init(&bios);
- cell_release(cell, &bios);
spin_lock_irqsave(&pool->lock, flags);
- while ((bio = bio_list_pop(&bios)))
- if (bio != exception)
- bio_list_add(&pool->deferred_bios, bio);
+ cell_release_no_holder(cell, &pool->deferred_bios);
spin_unlock_irqrestore(&pool->lock, flags);
wake_worker(pool);
@@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m)
* the bios in the cell.
*/
if (bio) {
- cell_defer_except(tc, m->cell, bio);
+ cell_defer_except(tc, m->cell);
bio_endio(bio, 0);
} else
cell_defer(tc, m->cell, m->data_block);
@@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m)
mempool_free(m, tc->pool->mapping_pool);
}
-static void process_prepared_mappings(struct pool *pool)
+static void process_prepared_discard(struct new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_block(tc->td, m->virt_block);
+ if (r)
+ DMERR("dm_thin_remove_block() failed");
+
+ /*
+ * Pass the discard down to the underlying device?
+ */
+ if (m->pass_discard)
+ remap_and_issue(tc, m->bio, m->data_block);
+ else
+ bio_endio(m->bio, 0);
+
+ cell_defer_except(tc, m->cell);
+ cell_defer_except(tc, m->cell2);
+ mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared(struct pool *pool, struct list_head *head,
+ void (*fn)(struct new_mapping *))
{
unsigned long flags;
struct list_head maps;
@@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool)
INIT_LIST_HEAD(&maps);
spin_lock_irqsave(&pool->lock, flags);
- list_splice_init(&pool->prepared_mappings, &maps);
+ list_splice_init(head, &maps);
spin_unlock_irqrestore(&pool->lock, flags);
list_for_each_entry_safe(m, tmp, &maps, list)
- process_prepared_mapping(m);
+ fn(m);
}
/*
* Deferred bio jobs.
*/
-static int io_overwrites_block(struct pool *pool, struct bio *bio)
+static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
- return ((bio_data_dir(bio) == WRITE) &&
- !(bio->bi_sector & pool->offset_mask)) &&
+ return !(bio->bi_sector & pool->offset_mask) &&
(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
+
+}
+
+static int io_overwrites_block(struct pool *pool, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE) &&
+ io_overlaps_block(pool, bio);
}
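A concrete reading of the two helpers, assuming a pool created with 128-sector (64 KiB) data blocks so that offset_mask == 127:

    WRITE, bi_sector = 256, bi_size = 65536  ->  io_overlaps_block() and io_overwrites_block() both true
    READ,  bi_sector = 256, bi_size = 65536  ->  io_overlaps_block() true, io_overwrites_block() false
    WRITE, bi_sector = 256, bi_size = 4096   ->  both false (aligned, but does not cover the whole block)
    WRITE, bi_sector = 300, bi_size = 65536  ->  both false (not block-aligned)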
static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
}
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
- dm_block_t data_origin, dm_block_t data_dest,
+ struct dm_dev *origin, dm_block_t data_origin,
+ dm_block_t data_dest,
struct cell *cell, struct bio *bio)
{
int r;
@@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct new_mapping *m = get_next_mapping(pool);
INIT_LIST_HEAD(&m->list);
+ m->quiesced = 0;
m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
@@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
m->err = 0;
m->bio = NULL;
- ds_add_work(&pool->ds, &m->list);
+ if (!ds_add_work(&pool->shared_read_ds, &m->list))
+ m->quiesced = 1;
/*
* IO to pool_dev remaps to the pool target's data_dev.
@@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
* bio immediately. Otherwise we use kcopyd to clone the data first.
*/
if (io_overwrites_block(pool, bio)) {
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- dm_get_mapinfo(bio)->ptr = m;
remap_and_issue(tc, bio, data_dest);
} else {
struct dm_io_region from, to;
- from.bdev = tc->pool_dev->bdev;
+ from.bdev = origin->bdev;
from.sector = data_origin * pool->sectors_per_block;
from.count = pool->sectors_per_block;
@@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
}
}
+static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_origin, dm_block_t data_dest,
+ struct cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->pool_dev,
+ data_origin, data_dest, cell, bio);
+}
+
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_dest,
+ struct cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->origin_dev,
+ virt_block, data_dest, cell, bio);
+}
+
static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
dm_block_t data_block, struct cell *cell,
struct bio *bio)
@@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
struct new_mapping *m = get_next_mapping(pool);
INIT_LIST_HEAD(&m->list);
+ m->quiesced = 1;
m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
@@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
* zeroing pre-existing data, we can issue the bio immediately.
* Otherwise we use kcopyd to zero the data first.
*/
- if (!pool->zero_new_blocks)
+ if (!pool->pf.zero_new_blocks)
process_prepared_mapping(m);
else if (io_overwrites_block(pool, bio)) {
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- dm_get_mapinfo(bio)->ptr = m;
remap_and_issue(tc, bio, data_block);
} else {
@@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
*/
static void retry_on_resume(struct bio *bio)
{
- struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct thin_c *tc = h->tc;
struct pool *pool = tc->pool;
unsigned long flags;
@@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell)
retry_on_resume(bio);
}
+static void process_discard(struct thin_c *tc, struct bio *bio)
+{
+ int r;
+ struct pool *pool = tc->pool;
+ struct cell *cell, *cell2;
+ struct cell_key key, key2;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_thin_lookup_result lookup_result;
+ struct new_mapping *m;
+
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool->prison, &key, bio, &cell))
+ return;
+
+ r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+ switch (r) {
+ case 0:
+ /*
+ * Check nobody is fiddling with this pool block. This can
+ * happen if someone's in the process of breaking sharing
+ * on this block.
+ */
+ build_data_key(tc->td, lookup_result.block, &key2);
+ if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
+ cell_release_singleton(cell, bio);
+ break;
+ }
+
+ if (io_overlaps_block(pool, bio)) {
+ /*
+ * IO may still be going to the destination block. We must
+ * quiesce before we can do the removal.
+ */
+ m = get_next_mapping(pool);
+ m->tc = tc;
+ m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
+ m->virt_block = block;
+ m->data_block = lookup_result.block;
+ m->cell = cell;
+ m->cell2 = cell2;
+ m->err = 0;
+ m->bio = bio;
+
+ if (!ds_add_work(&pool->all_io_ds, &m->list)) {
+ list_add(&m->list, &pool->prepared_discards);
+ wake_worker(pool);
+ }
+ } else {
+ /*
+ * This path is hit if people are ignoring
+ * limits->discard_granularity. It ignores any
+ * part of the discard that is in a subsequent
+ * block.
+ */
+ sector_t offset = bio->bi_sector - (block << pool->block_shift);
+ unsigned remaining = (pool->sectors_per_block - offset) << 9;
+ bio->bi_size = min(bio->bi_size, remaining);
+
+ cell_release_singleton(cell, bio);
+ cell_release_singleton(cell2, bio);
+ remap_and_issue(tc, bio, lookup_result.block);
+ }
+ break;
+
+ case -ENODATA:
+ /*
+ * It isn't provisioned, just forget it.
+ */
+ cell_release_singleton(cell, bio);
+ bio_endio(bio, 0);
+ break;
+
+ default:
+ DMERR("discard: find block unexpectedly returned %d", r);
+ cell_release_singleton(cell, bio);
+ bio_io_error(bio);
+ break;
+ }
+}
+
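The non-aligned branch of process_discard() simply clips the discard to the end of the block it starts in. Again assuming 128-sector blocks (block_shift == 7):

    bio->bi_sector = 300                   /* block 2 covers sectors 256..383 */
    offset    = 300 - (2 << 7)  = 44 sectors
    remaining = (128 - 44) << 9 = 43008 bytes
    bio->bi_size = min(bio->bi_size, 43008)

Any part of the discard that spills into block 3 is silently dropped, as the comment above says.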
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
struct cell_key *key,
struct dm_thin_lookup_result *lookup_result,
@@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
r = alloc_data_block(tc, &data_block);
switch (r) {
case 0:
- schedule_copy(tc, block, lookup_result->block,
- data_block, cell, bio);
+ schedule_internal_copy(tc, block, lookup_result->block,
+ data_block, cell, bio);
break;
case -ENOSPC:
@@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
if (bio_data_dir(bio) == WRITE)
break_sharing(tc, bio, block, &key, lookup_result, cell);
else {
- struct endio_hook *h;
- h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
- h->tc = tc;
- h->entry = ds_inc(&pool->ds);
- save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
- dm_get_mapinfo(bio)->ptr = h;
+ h->shared_read_entry = ds_inc(&pool->shared_read_ds);
cell_release_singleton(cell, bio);
remap_and_issue(tc, bio, lookup_result->block);
@@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
r = alloc_data_block(tc, &data_block);
switch (r) {
case 0:
- schedule_zero(tc, block, data_block, cell, bio);
+ if (tc->origin_dev)
+ schedule_external_copy(tc, block, data_block, cell, bio);
+ else
+ schedule_zero(tc, block, data_block, cell, bio);
break;
case -ENOSPC:
@@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
break;
case -ENODATA:
- provision_block(tc, bio, block, cell);
+ if (bio_data_dir(bio) == READ && tc->origin_dev) {
+ cell_release_singleton(cell, bio);
+ remap_to_origin_and_issue(tc, bio);
+ } else
+ provision_block(tc, bio, block, cell);
break;
default:
DMERR("dm_thin_find_block() failed, error = %d", r);
+ cell_release_singleton(cell, bio);
bio_io_error(bio);
break;
}
}
+static int need_commit_due_to_time(struct pool *pool)
+{
+ return jiffies < pool->last_commit_jiffies ||
+ jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
+}
+
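COMMIT_PERIOD is HZ jiffies, i.e. roughly one second whatever the kernel's HZ configuration, so metadata is committed at least that often even when no FLUSH/FUA bios arrive. The first half of the comparison (jiffies < last_commit_jiffies) presumably forces a commit if the jiffies counter has wrapped, rather than stalling commits until the counter catches up again.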
static void process_deferred_bios(struct pool *pool)
{
unsigned long flags;
@@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool)
spin_unlock_irqrestore(&pool->lock, flags);
while ((bio = bio_list_pop(&bios))) {
- struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct thin_c *tc = h->tc;
+
/*
* If we've got no free new_mapping structs, and processing
* this bio might require one, we pause until there are some
@@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool)
break;
}
- process_bio(tc, bio);
+
+ if (bio->bi_rw & REQ_DISCARD)
+ process_discard(tc, bio);
+ else
+ process_bio(tc, bio);
}
/*
@@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool)
bio_list_init(&pool->deferred_flush_bios);
spin_unlock_irqrestore(&pool->lock, flags);
- if (bio_list_empty(&bios))
+ if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
return;
r = dm_pool_commit_metadata(pool->pmd);
@@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool)
bio_io_error(bio);
return;
}
+ pool->last_commit_jiffies = jiffies;
while ((bio = bio_list_pop(&bios)))
generic_make_request(bio);
@@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws)
{
struct pool *pool = container_of(ws, struct pool, worker);
- process_prepared_mappings(pool);
+ process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
+ process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
process_deferred_bios(pool);
}
+/*
+ * We want to commit periodically so that not too much
+ * unwritten data builds up.
+ */
+static void do_waker(struct work_struct *ws)
+{
+ struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
+ wake_worker(pool);
+ queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
+}
+
/*----------------------------------------------------------------*/
/*
@@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
wake_worker(pool);
}
+static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+
+ h->tc = tc;
+ h->shared_read_entry = NULL;
+ h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
+ h->overwrite_mapping = NULL;
+
+ return h;
+}
+
/*
* Non-blocking function called from the thin target's map function.
*/
@@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
struct dm_thin_device *td = tc->td;
struct dm_thin_lookup_result result;
- /*
- * Save the thin context for easy access from the deferred bio later.
- */
- map_context->ptr = tc;
-
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+ map_context->ptr = thin_hook_bio(tc, bio);
+ if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
thin_defer_bio(tc, bio);
return DM_MAPIO_SUBMITTED;
}
@@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
pool->ti = ti;
pool->low_water_blocks = pt->low_water_blocks;
- pool->zero_new_blocks = pt->zero_new_blocks;
+ pool->pf = pt->pf;
return 0;
}
@@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
/*----------------------------------------------------------------
* Pool creation
*--------------------------------------------------------------*/
+/* Initialize pool features. */
+static void pool_features_init(struct pool_features *pf)
+{
+ pf->zero_new_blocks = 1;
+ pf->discard_enabled = 1;
+ pf->discard_passdown = 1;
+}
+
static void __pool_destroy(struct pool *pool)
{
__pool_table_remove(pool);
@@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->block_shift = ffs(block_size) - 1;
pool->offset_mask = block_size - 1;
pool->low_water_blocks = 0;
- pool->zero_new_blocks = 1;
+ pool_features_init(&pool->pf);
pool->prison = prison_create(PRISON_CELLS);
if (!pool->prison) {
*error = "Error creating pool's bio prison";
@@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md,
}
INIT_WORK(&pool->worker, do_worker);
+ INIT_DELAYED_WORK(&pool->waker, do_waker);
spin_lock_init(&pool->lock);
bio_list_init(&pool->deferred_bios);
bio_list_init(&pool->deferred_flush_bios);
INIT_LIST_HEAD(&pool->prepared_mappings);
+ INIT_LIST_HEAD(&pool->prepared_discards);
pool->low_water_triggered = 0;
pool->no_free_space = 0;
bio_list_init(&pool->retry_on_resume_list);
- ds_init(&pool->ds);
+ ds_init(&pool->shared_read_ds);
+ ds_init(&pool->all_io_ds);
pool->next_mapping = NULL;
pool->mapping_pool =
@@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
goto bad_endio_hook_pool;
}
pool->ref_count = 1;
+ pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
pool->md_dev = metadata_dev;
__pool_table_insert(pool);
@@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool)
static struct pool *__pool_find(struct mapped_device *pool_md,
struct block_device *metadata_dev,
- unsigned long block_size, char **error)
+ unsigned long block_size, char **error,
+ int *created)
{
struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
@@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
return ERR_PTR(-EINVAL);
__pool_inc(pool);
- } else
+ } else {
pool = pool_create(pool_md, metadata_dev, block_size, error);
+ *created = 1;
+ }
}
return pool;
@@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti)
mutex_unlock(&dm_thin_pool_table.mutex);
}
-struct pool_features {
- unsigned zero_new_blocks:1;
-};
-
static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
struct dm_target *ti)
{
@@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
const char *arg_name;
static struct dm_arg _args[] = {
- {0, 1, "Invalid number of pool feature arguments"},
+ {0, 3, "Invalid number of pool feature arguments"},
};
/*
@@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
if (!strcasecmp(arg_name, "skip_block_zeroing")) {
pf->zero_new_blocks = 0;
continue;
+ } else if (!strcasecmp(arg_name, "ignore_discard")) {
+ pf->discard_enabled = 0;
+ continue;
+ } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
+ pf->discard_passdown = 0;
+ continue;
}
ti->error = "Unrecognised pool feature requested";
@@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
*
* Optional feature arguments are:
* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
+ * ignore_discard: disable discard
+ * no_discard_passdown: don't pass discards down to the data device
*/
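Together with the positional arguments (metadata device, data device, data block size in sectors, low water mark in blocks), a pool table line using the new feature arguments might look like the following; the device paths and sizes are placeholders only:

    0 4194304 thin-pool /dev/mapper/meta /dev/mapper/data 128 16384 2 ignore_discard skip_block_zeroing

The count ("2") tells parse_pool_features() how many feature words follow, matching the {0, 3, ...} bounds set above.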
static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
- int r;
+ int r, pool_created = 0;
struct pool_c *pt;
struct pool *pool;
struct pool_features pf;
@@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
dm_block_t low_water_blocks;
struct dm_dev *metadata_dev;
sector_t metadata_dev_size;
+ char b[BDEVNAME_SIZE];
/*
* FIXME Remove validation from scope of lock.
@@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
- if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
- ti->error = "Metadata device is too large";
- r = -EINVAL;
- goto out_metadata;
- }
+ if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
+ DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+ bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
if (r) {
@@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
/*
* Set default pool features.
*/
- memset(&pf, 0, sizeof(pf));
- pf.zero_new_blocks = 1;
+ pool_features_init(&pf);
dm_consume_args(&as, 4);
r = parse_pool_features(&as, &pf, ti);
@@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
- block_size, &ti->error);
+ block_size, &ti->error, &pool_created);
if (IS_ERR(pool)) {
r = PTR_ERR(pool);
goto out_free_pt;
}
+ /*
+ * 'pool_created' reflects whether this is the first table load.
+ * Top level discard support is not allowed to be changed after
+ * initial load. This would require a pool reload to trigger thin
+ * device changes.
+ */
+ if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
+ ti->error = "Discard support cannot be disabled once enabled";
+ r = -EINVAL;
+ goto out_flags_changed;
+ }
+
+ /*
+ * If discard_passdown was enabled verify that the data device
+ * supports discards. Disable discard_passdown if not; otherwise
+ * -EOPNOTSUPP will be returned.
+ */
+ if (pf.discard_passdown) {
+ struct request_queue *q = bdev_get_queue(data_dev->bdev);
+ if (!q || !blk_queue_discard(q)) {
+ DMWARN("Discard unsupported by data device: Disabling discard passdown.");
+ pf.discard_passdown = 0;
+ }
+ }
+
pt->pool = pool;
pt->ti = ti;
pt->metadata_dev = metadata_dev;
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
- pt->zero_new_blocks = pf.zero_new_blocks;
+ pt->pf = pf;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 0;
+ /*
+ * Only need to enable discards if the pool should pass
+ * them down to the data device. The thin device's discard
+ * processing will cause mappings to be removed from the btree.
+ */
+ if (pf.discard_enabled && pf.discard_passdown) {
+ ti->num_discard_requests = 1;
+ /*
+ * Setting 'discards_supported' circumvents the normal
+ * stacking of discard limits (this keeps the pool and
+ * thin devices' discard limits consistent).
+ */
+ ti->discards_supported = 1;
+ }
ti->private = pt;
pt->callbacks.congested_fn = pool_is_congested;
@@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
return 0;
+out_flags_changed:
+ __pool_dec(pool);
out_free_pt:
kfree(pt);
out:
@@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti)
__requeue_bios(pool);
spin_unlock_irqrestore(&pool->lock, flags);
- wake_worker(pool);
+ do_waker(&pool->waker.work);
}
static void pool_postsuspend(struct dm_target *ti)
@@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti)
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
+ cancel_delayed_work(&pool->waker);
flush_workqueue(pool->wq);
r = dm_pool_commit_metadata(pool->pmd);
@@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
static int pool_status(struct dm_target *ti, status_type_t type,
char *result, unsigned maxlen)
{
- int r;
+ int r, count;
unsigned sz = 0;
uint64_t transaction_id;
dm_block_t nr_free_blocks_data;
@@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
(unsigned long)pool->sectors_per_block,
(unsigned long long)pt->low_water_blocks);
- DMEMIT("%u ", !pool->zero_new_blocks);
+ count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
+ !pool->pf.discard_passdown;
+ DMEMIT("%u ", count);
- if (!pool->zero_new_blocks)
+ if (!pool->pf.zero_new_blocks)
DMEMIT("skip_block_zeroing ");
+
+ if (!pool->pf.discard_enabled)
+ DMEMIT("ignore_discard ");
+
+ if (!pool->pf.discard_passdown)
+ DMEMIT("no_discard_passdown ");
+
break;
}
@@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
+static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
+{
+ /*
+ * FIXME: these limits may be incompatible with the pool's data device
+ */
+ limits->max_discard_sectors = pool->sectors_per_block;
+
+ /*
+ * This is just a hint, and not enforced. We have to cope with
+ * bios that overlap 2 blocks.
+ */
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+ limits->discard_zeroes_data = pool->pf.zero_new_blocks;
+}
+
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
@@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_min(limits, 0);
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ if (pool->pf.discard_enabled)
+ set_discard_limits(pool, limits);
}
static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti)
__pool_dec(tc->pool);
dm_pool_close_thin_device(tc->td);
dm_put_device(ti, tc->pool_dev);
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
kfree(tc);
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti)
/*
* Thin target parameters:
*
- * <pool_dev> <dev_id>
+ * <pool_dev> <dev_id> [origin_dev]
*
* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
* dev_id: the internal device identifier
+ * origin_dev: a device external to the pool that should act as the origin
+ *
+ * If the pool device has discards disabled, they get disabled for the thin
+ * device as well.
*/
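For comparison, a thin table line with and without the new external origin (paths and sizes are placeholders):

    0 2097152 thin /dev/mapper/pool 0
    0 2097152 thin /dev/mapper/pool 1 /dev/vg/readonly-origin

With the three-argument form, reads of unprovisioned blocks are remapped straight to the origin and first writes provision a block by copying from it, as process_bio() and provision_block() above show.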
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
int r;
struct thin_c *tc;
- struct dm_dev *pool_dev;
+ struct dm_dev *pool_dev, *origin_dev;
struct mapped_device *pool_md;
mutex_lock(&dm_thin_pool_table.mutex);
- if (argc != 2) {
+ if (argc != 2 && argc != 3) {
ti->error = "Invalid argument count";
r = -EINVAL;
goto out_unlock;
@@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out_unlock;
}
+ if (argc == 3) {
+ r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
+ if (r) {
+ ti->error = "Error opening origin device";
+ goto bad_origin_dev;
+ }
+ tc->origin_dev = origin_dev;
+ }
+
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
if (r) {
ti->error = "Error opening pool device";
@@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->split_io = tc->pool->sectors_per_block;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 0;
- ti->discards_supported = 0;
+
+ /* In case the pool supports discards, pass them on. */
+ if (tc->pool->pf.discard_enabled) {
+ ti->discards_supported = 1;
+ ti->num_discard_requests = 1;
+ }
dm_put(pool_md);
@@ -2289,6 +2582,9 @@ bad_pool_lookup:
bad_common:
dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
+bad_origin_dev:
kfree(tc);
out_unlock:
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2299,11 +2595,46 @@ out_unlock:
static int thin_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
- bio->bi_sector -= ti->begin;
+ bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
return thin_bio_map(ti, bio, map_context);
}
+static int thin_endio(struct dm_target *ti,
+ struct bio *bio, int err,
+ union map_info *map_context)
+{
+ unsigned long flags;
+ struct endio_hook *h = map_context->ptr;
+ struct list_head work;
+ struct new_mapping *m, *tmp;
+ struct pool *pool = h->tc->pool;
+
+ if (h->shared_read_entry) {
+ INIT_LIST_HEAD(&work);
+ ds_dec(h->shared_read_entry, &work);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ list_for_each_entry_safe(m, tmp, &work, list) {
+ list_del(&m->list);
+ m->quiesced = 1;
+ __maybe_add_mapping(m);
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ if (h->all_io_entry) {
+ INIT_LIST_HEAD(&work);
+ ds_dec(h->all_io_entry, &work);
+ list_for_each_entry_safe(m, tmp, &work, list)
+ list_add(&m->list, &pool->prepared_discards);
+ }
+
+ mempool_free(h, pool->endio_hook_pool);
+
+ return 0;
+}
+
static void thin_postsuspend(struct dm_target *ti)
{
if (dm_noflush_suspending(ti))
@@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type,
DMEMIT("%s %lu",
format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
(unsigned long) tc->dev_id);
+ if (tc->origin_dev)
+ DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
break;
}
}
@@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti,
static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
blk_limits_io_min(limits, 0);
- blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
+ blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ set_discard_limits(pool, limits);
}
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
.map = thin_map,
+ .end_io = thin_endio,
.postsuspend = thin_postsuspend,
.status = thin_status,
.iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
new file mode 100644
index 00000000000..fa365d39b61
--- /dev/null
+++ b/drivers/md/dm-verity.c
@@ -0,0 +1,913 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * Author: Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
+ *
+ * This file is released under the GPLv2.
+ *
+ * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
+ * default prefetch value. Data are read in "prefetch_cluster" chunks from the
+ * hash device. Setting this greatly improves performance when data and hash
+ * are on the same disk on different partitions on devices with poor random
+ * access behavior.
+ */
+
+#include "dm-bufio.h"
+
+#include <linux/module.h>
+#include <linux/device-mapper.h>
+#include <crypto/hash.h>
+
+#define DM_MSG_PREFIX "verity"
+
+#define DM_VERITY_IO_VEC_INLINE 16
+#define DM_VERITY_MEMPOOL_SIZE 4
+#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
+
+#define DM_VERITY_MAX_LEVELS 63
+
+static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
+
+module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
+
+struct dm_verity {
+ struct dm_dev *data_dev;
+ struct dm_dev *hash_dev;
+ struct dm_target *ti;
+ struct dm_bufio_client *bufio;
+ char *alg_name;
+ struct crypto_shash *tfm;
+ u8 *root_digest; /* digest of the root block */
+ u8 *salt; /* salt: its size is salt_size */
+ unsigned salt_size;
+ sector_t data_start; /* data offset in 512-byte sectors */
+ sector_t hash_start; /* hash start in blocks */
+ sector_t data_blocks; /* the number of data blocks */
+ sector_t hash_blocks; /* the number of hash blocks */
+ unsigned char data_dev_block_bits; /* log2(data blocksize) */
+ unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
+ unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
+ unsigned char levels; /* the number of tree levels */
+ unsigned char version;
+ unsigned digest_size; /* digest size for the current hash algorithm */
+ unsigned shash_descsize;/* the size of temporary space for crypto */
+ int hash_failed; /* set to 1 if hash of any block failed */
+
+ mempool_t *io_mempool; /* mempool of struct dm_verity_io */
+ mempool_t *vec_mempool; /* mempool of bio vector */
+
+ struct workqueue_struct *verify_wq;
+
+ /* starting blocks for each tree level. 0 is the lowest level. */
+ sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
+};
+
+struct dm_verity_io {
+ struct dm_verity *v;
+ struct bio *bio;
+
+ /* original values of bio->bi_end_io and bio->bi_private */
+ bio_end_io_t *orig_bi_end_io;
+ void *orig_bi_private;
+
+ sector_t block;
+ unsigned n_blocks;
+
+ /* saved bio vector */
+ struct bio_vec *io_vec;
+ unsigned io_vec_size;
+
+ struct work_struct work;
+
+ /* A space for short vectors; longer vectors are allocated separately. */
+ struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
+
+ /*
+ * Three variably-sized fields follow this struct:
+ *
+ * u8 hash_desc[v->shash_descsize];
+ * u8 real_digest[v->digest_size];
+ * u8 want_digest[v->digest_size];
+ *
+ * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
+ */
+};
+
+static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
+{
+ return (struct shash_desc *)(io + 1);
+}
+
+static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
+{
+ return (u8 *)(io + 1) + v->shash_descsize;
+}
+
+static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
+{
+ return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
+}
+
+/*
+ * Auxiliary structure appended to each dm-bufio buffer. If the value
+ * hash_verified is nonzero, hash of the block has been verified.
+ *
+ * The variable hash_verified is set to 0 when allocating the buffer, then
+ * it can be changed to 1 and it is never reset to 0 again.
+ *
+ * There is no lock around this value; at worst, a race can cause multiple
+ * processes to verify the hash of the same buffer simultaneously and to
+ * write 1 to hash_verified at the same time.
+ * This condition is harmless, so we don't need locking.
+ */
+struct buffer_aux {
+ int hash_verified;
+};
+
+/*
+ * Initialize struct buffer_aux for a freshly created buffer.
+ */
+static void dm_bufio_alloc_callback(struct dm_buffer *buf)
+{
+ struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+
+ aux->hash_verified = 0;
+}
+
+/*
+ * Translate input sector number to the sector number on the target device.
+ */
+static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
+{
+ return v->data_start + dm_target_offset(v->ti, bi_sector);
+}
+
+/*
+ * Return hash position of a specified block at a specified tree level
+ * (0 is the lowest level).
+ * The lowest "hash_per_block_bits"-bits of the result denote hash position
+ * inside a hash block. The remaining bits denote location of the hash block.
+ */
+static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
+ int level)
+{
+ return block >> (level * v->hash_per_block_bits);
+}
+
+static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
+ sector_t *hash_block, unsigned *offset)
+{
+ sector_t position = verity_position_at_level(v, block, level);
+ unsigned idx;
+
+ *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits);
+
+ if (!offset)
+ return;
+
+ idx = position & ((1 << v->hash_per_block_bits) - 1);
+ if (!v->version)
+ *offset = idx * v->digest_size;
+ else
+ *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
+}
+
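A concrete reading of verity_hash_at_level(), assuming 4096-byte hash blocks and a 32-byte (sha256) digest, so 128 hash entries fit per block and hash_per_block_bits == 7:

    block = 1000, level = 0:
        position   = 1000 >> (0 * 7)                   = 1000
        hash_block = hash_level_block[0] + (1000 >> 7) = hash_level_block[0] + 7
        idx        = 1000 & 127                        = 104
        offset     = 104 * 32       = 3328   (version 0: digests packed back to back)
                   = 104 << (12 - 7) = 3328  (version 1: power-of-two slots)

The two offset formulas only differ for digest sizes that are not a power of two (e.g. sha1's 20 bytes), where version 1 pads each entry to a power-of-two stride and version 0 packs them tightly.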
+/*
+ * Verify hash of a metadata block pertaining to the specified data block
+ * ("block" argument) at a specified level ("level" argument).
+ *
+ * On successful return, io_want_digest(v, io) contains the hash value for
+ * a lower tree level or for the data block (if we're at the lowest level).
+ *
+ * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned.
+ * If "skip_unverified" is false, unverified buffer is hashed and verified
+ * against current value of io_want_digest(v, io).
+ */
+static int verity_verify_level(struct dm_verity_io *io, sector_t block,
+ int level, bool skip_unverified)
+{
+ struct dm_verity *v = io->v;
+ struct dm_buffer *buf;
+ struct buffer_aux *aux;
+ u8 *data;
+ int r;
+ sector_t hash_block;
+ unsigned offset;
+
+ verity_hash_at_level(v, block, level, &hash_block, &offset);
+
+ data = dm_bufio_read(v->bufio, hash_block, &buf);
+ if (unlikely(IS_ERR(data)))
+ return PTR_ERR(data);
+
+ aux = dm_bufio_get_aux_data(buf);
+
+ if (!aux->hash_verified) {
+ struct shash_desc *desc;
+ u8 *result;
+
+ if (skip_unverified) {
+ r = 1;
+ goto release_ret_r;
+ }
+
+ desc = io_hash_desc(v, io);
+ desc->tfm = v->tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ r = crypto_shash_init(desc);
+ if (r < 0) {
+ DMERR("crypto_shash_init failed: %d", r);
+ goto release_ret_r;
+ }
+
+ if (likely(v->version >= 1)) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ goto release_ret_r;
+ }
+ }
+
+ r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ goto release_ret_r;
+ }
+
+ if (!v->version) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ goto release_ret_r;
+ }
+ }
+
+ result = io_real_digest(v, io);
+ r = crypto_shash_final(desc, result);
+ if (r < 0) {
+ DMERR("crypto_shash_final failed: %d", r);
+ goto release_ret_r;
+ }
+ if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
+ DMERR_LIMIT("metadata block %llu is corrupted",
+ (unsigned long long)hash_block);
+ v->hash_failed = 1;
+ r = -EIO;
+ goto release_ret_r;
+ } else
+ aux->hash_verified = 1;
+ }
+
+ data += offset;
+
+ memcpy(io_want_digest(v, io), data, v->digest_size);
+
+ dm_bufio_release(buf);
+ return 0;
+
+release_ret_r:
+ dm_bufio_release(buf);
+
+ return r;
+}
+
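The only difference between the two on-disk format versions in this function is where the salt enters the hash; reading off the crypto_shash_update() calls above:

    version 0 (Chromium OS compatible):  digest = H(block_data || salt)
    version 1 (current format):          digest = H(salt || block_data)

verity_verify_io() below applies the same ordering when hashing the data blocks themselves.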
+/*
+ * Verify one "dm_verity_io" structure.
+ */
+static int verity_verify_io(struct dm_verity_io *io)
+{
+ struct dm_verity *v = io->v;
+ unsigned b;
+ int i;
+ unsigned vector = 0, offset = 0;
+
+ for (b = 0; b < io->n_blocks; b++) {
+ struct shash_desc *desc;
+ u8 *result;
+ int r;
+ unsigned todo;
+
+ if (likely(v->levels)) {
+ /*
+ * First, we try to get the requested hash for
+ * the current block. If the hash block itself is
+ * verified, zero is returned. If it isn't, this
+ * function returns 1 and we fall back to whole
+ * chain verification.
+ */
+ int r = verity_verify_level(io, io->block + b, 0, true);
+ if (likely(!r))
+ goto test_block_hash;
+ if (r < 0)
+ return r;
+ }
+
+ memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
+
+ for (i = v->levels - 1; i >= 0; i--) {
+ int r = verity_verify_level(io, io->block + b, i, false);
+ if (unlikely(r))
+ return r;
+ }
+
+test_block_hash:
+ desc = io_hash_desc(v, io);
+ desc->tfm = v->tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ r = crypto_shash_init(desc);
+ if (r < 0) {
+ DMERR("crypto_shash_init failed: %d", r);
+ return r;
+ }
+
+ if (likely(v->version >= 1)) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+ }
+
+ todo = 1 << v->data_dev_block_bits;
+ do {
+ struct bio_vec *bv;
+ u8 *page;
+ unsigned len;
+
+ BUG_ON(vector >= io->io_vec_size);
+ bv = &io->io_vec[vector];
+ page = kmap_atomic(bv->bv_page);
+ len = bv->bv_len - offset;
+ if (likely(len >= todo))
+ len = todo;
+ r = crypto_shash_update(desc,
+ page + bv->bv_offset + offset, len);
+ kunmap_atomic(page);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+ offset += len;
+ if (likely(offset == bv->bv_len)) {
+ offset = 0;
+ vector++;
+ }
+ todo -= len;
+ } while (todo);
+
+ if (!v->version) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+ }
+
+ result = io_real_digest(v, io);
+ r = crypto_shash_final(desc, result);
+ if (r < 0) {
+ DMERR("crypto_shash_final failed: %d", r);
+ return r;
+ }
+ if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
+ DMERR_LIMIT("data block %llu is corrupted",
+ (unsigned long long)(io->block + b));
+ v->hash_failed = 1;
+ return -EIO;
+ }
+ }
+ BUG_ON(vector != io->io_vec_size);
+ BUG_ON(offset);
+
+ return 0;
+}
+
+/*
+ * End one "io" structure with a given error.
+ */
+static void verity_finish_io(struct dm_verity_io *io, int error)
+{
+ struct bio *bio = io->bio;
+ struct dm_verity *v = io->v;
+
+ bio->bi_end_io = io->orig_bi_end_io;
+ bio->bi_private = io->orig_bi_private;
+
+ if (io->io_vec != io->io_vec_inline)
+ mempool_free(io->io_vec, v->vec_mempool);
+
+ mempool_free(io, v->io_mempool);
+
+ bio_endio(bio, error);
+}
+
+static void verity_work(struct work_struct *w)
+{
+ struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
+
+ verity_finish_io(io, verity_verify_io(io));
+}
+
+static void verity_end_io(struct bio *bio, int error)
+{
+ struct dm_verity_io *io = bio->bi_private;
+
+ if (error) {
+ verity_finish_io(io, error);
+ return;
+ }
+
+ INIT_WORK(&io->work, verity_work);
+ queue_work(io->v->verify_wq, &io->work);
+}
+
+/*
+ * Prefetch buffers for the specified io.
+ * The root buffer is not prefetched; it is assumed that it will be cached
+ * all the time.
+ */
+static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
+{
+ int i;
+
+ for (i = v->levels - 2; i >= 0; i--) {
+ sector_t hash_block_start;
+ sector_t hash_block_end;
+ verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
+ verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
+ if (!i) {
+ unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster;
+
+ cluster >>= v->data_dev_block_bits;
+ if (unlikely(!cluster))
+ goto no_prefetch_cluster;
+
+ if (unlikely(cluster & (cluster - 1)))
+ cluster = 1 << (fls(cluster) - 1);
+
+ hash_block_start &= ~(sector_t)(cluster - 1);
+ hash_block_end |= cluster - 1;
+ if (unlikely(hash_block_end >= v->hash_blocks))
+ hash_block_end = v->hash_blocks - 1;
+ }
+no_prefetch_cluster:
+ dm_bufio_prefetch(v->bufio, hash_block_start,
+ hash_block_end - hash_block_start + 1);
+ }
+}
+
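With the default prefetch_cluster of 262144 bytes and 4096-byte data blocks, the level-0 prefetch window is widened to 64-block boundaries:

    cluster = 262144 >> 12 = 64 blocks
    hash_block_start &= ~63;     /* round down to a multiple of 64 */
    hash_block_end   |= 63;      /* round up, then clamp to hash_blocks - 1 */

A cluster value that is not a power of two (in blocks) is first rounded down to one, e.g. a setting that works out to 100 blocks becomes 64 via 1 << (fls(100) - 1).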
+/*
+ * Bio map function. It allocates dm_verity_io structure and bio vector and
+ * fills them. Then it issues prefetches and the I/O.
+ */
+static int verity_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ struct dm_verity *v = ti->private;
+ struct dm_verity_io *io;
+
+ bio->bi_bdev = v->data_dev->bdev;
+ bio->bi_sector = verity_map_sector(v, bio->bi_sector);
+
+ if (((unsigned)bio->bi_sector | bio_sectors(bio)) &
+ ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
+ DMERR_LIMIT("unaligned io");
+ return -EIO;
+ }
+
+ if ((bio->bi_sector + bio_sectors(bio)) >>
+ (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
+ DMERR_LIMIT("io out of range");
+ return -EIO;
+ }
+
+ if (bio_data_dir(bio) == WRITE)
+ return -EIO;
+
+ io = mempool_alloc(v->io_mempool, GFP_NOIO);
+ io->v = v;
+ io->bio = bio;
+ io->orig_bi_end_io = bio->bi_end_io;
+ io->orig_bi_private = bio->bi_private;
+ io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
+ io->n_blocks = bio->bi_size >> v->data_dev_block_bits;
+
+ bio->bi_end_io = verity_end_io;
+ bio->bi_private = io;
+ io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
+ if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
+ io->io_vec = io->io_vec_inline;
+ else
+ io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
+ memcpy(io->io_vec, bio_iovec(bio),
+ io->io_vec_size * sizeof(struct bio_vec));
+
+ verity_prefetch_io(v, io);
+
+ generic_make_request(bio);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * Status: V (valid) or C (corruption found)
+ */
+static int verity_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned maxlen)
+{
+ struct dm_verity *v = ti->private;
+ unsigned sz = 0;
+ unsigned x;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%c", v->hash_failed ? 'C' : 'V');
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u %s %s %u %u %llu %llu %s ",
+ v->version,
+ v->data_dev->name,
+ v->hash_dev->name,
+ 1 << v->data_dev_block_bits,
+ 1 << v->hash_dev_block_bits,
+ (unsigned long long)v->data_blocks,
+ (unsigned long long)v->hash_start,
+ v->alg_name
+ );
+ for (x = 0; x < v->digest_size; x++)
+ DMEMIT("%02x", v->root_digest[x]);
+ DMEMIT(" ");
+ if (!v->salt_size)
+ DMEMIT("-");
+ else
+ for (x = 0; x < v->salt_size; x++)
+ DMEMIT("%02x", v->salt[x]);
+ break;
+ }
+
+ return 0;
+}
+
+static int verity_ioctl(struct dm_target *ti, unsigned cmd,
+ unsigned long arg)
+{
+ struct dm_verity *v = ti->private;
+ int r = 0;
+
+ if (v->data_start ||
+ ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
+ cmd, arg);
+}
+
+static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct dm_verity *v = ti->private;
+ struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = v->data_dev->bdev;
+ bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int verity_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct dm_verity *v = ti->private;
+
+ return fn(ti, v->data_dev, v->data_start, ti->len, data);
+}
+
+static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct dm_verity *v = ti->private;
+
+ if (limits->logical_block_size < 1 << v->data_dev_block_bits)
+ limits->logical_block_size = 1 << v->data_dev_block_bits;
+
+ if (limits->physical_block_size < 1 << v->data_dev_block_bits)
+ limits->physical_block_size = 1 << v->data_dev_block_bits;
+
+ blk_limits_io_min(limits, limits->logical_block_size);
+}
+
+static void verity_dtr(struct dm_target *ti)
+{
+ struct dm_verity *v = ti->private;
+
+ if (v->verify_wq)
+ destroy_workqueue(v->verify_wq);
+
+ if (v->vec_mempool)
+ mempool_destroy(v->vec_mempool);
+
+ if (v->io_mempool)
+ mempool_destroy(v->io_mempool);
+
+ if (v->bufio)
+ dm_bufio_client_destroy(v->bufio);
+
+ kfree(v->salt);
+ kfree(v->root_digest);
+
+ if (v->tfm)
+ crypto_free_shash(v->tfm);
+
+ kfree(v->alg_name);
+
+ if (v->hash_dev)
+ dm_put_device(ti, v->hash_dev);
+
+ if (v->data_dev)
+ dm_put_device(ti, v->data_dev);
+
+ kfree(v);
+}
+
+/*
+ * Target parameters:
+ * <version> The current format is version 1.
+ *	Version 0 is compatible with the original Chromium OS releases.
+ * <data device>
+ * <hash device>
+ * <data block size>
+ * <hash block size>
+ * <the number of data blocks>
+ * <hash start block>
+ * <algorithm>
+ * <digest>
+ * <salt> Hex string or "-" if no salt.
+ */
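+/*
+ * Example table line (illustrative only; the devices, sizes, digest and
+ * salt below are placeholders rather than values from a real setup):
+ *
+ *   0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 \
+ *       <64-character hex root digest> <hex salt or "-">
+ *
+ * i.e. 262144 data blocks of 4096 bytes (2097152 512-byte sectors), with
+ * the hash tree starting at block 1 of the hash device.
+ */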
+static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct dm_verity *v;
+ unsigned num;
+ unsigned long long num_ll;
+ int r;
+ int i;
+ sector_t hash_position;
+ char dummy;
+
+ v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
+ if (!v) {
+ ti->error = "Cannot allocate verity structure";
+ return -ENOMEM;
+ }
+ ti->private = v;
+ v->ti = ti;
+
+ if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
+ ti->error = "Device must be readonly";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (argc != 10) {
+ ti->error = "Invalid argument count: exactly 10 arguments required";
+ r = -EINVAL;
+ goto bad;
+ }
+
+	if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 ||
+	    num > 1) {
+ ti->error = "Invalid version";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->version = num;
+
+ r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
+ if (r) {
+ ti->error = "Data device lookup failed";
+ goto bad;
+ }
+
+ r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
+ if (r) {
+		ti->error = "Hash device lookup failed";
+ goto bad;
+ }
+
+ if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
+ !num || (num & (num - 1)) ||
+ num < bdev_logical_block_size(v->data_dev->bdev) ||
+ num > PAGE_SIZE) {
+ ti->error = "Invalid data device block size";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->data_dev_block_bits = ffs(num) - 1;
+
+ if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
+ !num || (num & (num - 1)) ||
+ num < bdev_logical_block_size(v->hash_dev->bdev) ||
+ num > INT_MAX) {
+ ti->error = "Invalid hash device block size";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->hash_dev_block_bits = ffs(num) - 1;
+
+ if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
+ num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) !=
+ (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) {
+ ti->error = "Invalid data blocks";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->data_blocks = num_ll;
+
+ if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
+ ti->error = "Data device is too small";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
+ num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) !=
+ (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) {
+ ti->error = "Invalid hash start";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->hash_start = num_ll;
+
+ v->alg_name = kstrdup(argv[7], GFP_KERNEL);
+ if (!v->alg_name) {
+ ti->error = "Cannot allocate algorithm name";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
+ if (IS_ERR(v->tfm)) {
+ ti->error = "Cannot initialize hash function";
+ r = PTR_ERR(v->tfm);
+ v->tfm = NULL;
+ goto bad;
+ }
+ v->digest_size = crypto_shash_digestsize(v->tfm);
+ if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
+ ti->error = "Digest size too big";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->shash_descsize =
+ sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
+
+ v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
+ if (!v->root_digest) {
+ ti->error = "Cannot allocate root digest";
+ r = -ENOMEM;
+ goto bad;
+ }
+ if (strlen(argv[8]) != v->digest_size * 2 ||
+ hex2bin(v->root_digest, argv[8], v->digest_size)) {
+ ti->error = "Invalid root digest";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (strcmp(argv[9], "-")) {
+ v->salt_size = strlen(argv[9]) / 2;
+ v->salt = kmalloc(v->salt_size, GFP_KERNEL);
+ if (!v->salt) {
+ ti->error = "Cannot allocate salt";
+ r = -ENOMEM;
+ goto bad;
+ }
+ if (strlen(argv[9]) != v->salt_size * 2 ||
+ hex2bin(v->salt, argv[9], v->salt_size)) {
+ ti->error = "Invalid salt";
+ r = -EINVAL;
+ goto bad;
+ }
+ }
+
+ v->hash_per_block_bits =
+ fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
+
+ v->levels = 0;
+ if (v->data_blocks)
+ while (v->hash_per_block_bits * v->levels < 64 &&
+ (unsigned long long)(v->data_blocks - 1) >>
+ (v->hash_per_block_bits * v->levels))
+ v->levels++;
+
+ if (v->levels > DM_VERITY_MAX_LEVELS) {
+ ti->error = "Too many tree levels";
+ r = -E2BIG;
+ goto bad;
+ }
+
+ hash_position = v->hash_start;
+ for (i = v->levels - 1; i >= 0; i--) {
+ sector_t s;
+ v->hash_level_block[i] = hash_position;
+ s = verity_position_at_level(v, v->data_blocks, i);
+ s = (s >> v->hash_per_block_bits) +
+ !!(s & ((1 << v->hash_per_block_bits) - 1));
+ if (hash_position + s < hash_position) {
+ ti->error = "Hash device offset overflow";
+ r = -E2BIG;
+ goto bad;
+ }
+ hash_position += s;
+ }
+ v->hash_blocks = hash_position;
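+ /*
+ * Worked example (illustrative): with 4096-byte hash blocks and sha256
+ * (32-byte digests) each hash block holds 128 digests, so
+ * hash_per_block_bits = fls(128) - 1 = 7.  262144 data blocks then need
+ * 3 levels (128^2 < 262144 <= 128^3), laid out from hash_start starting
+ * with the level nearest the root.
+ */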
+
+ v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
+ 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
+ dm_bufio_alloc_callback, NULL);
+ if (IS_ERR(v->bufio)) {
+ ti->error = "Cannot initialize dm-bufio";
+ r = PTR_ERR(v->bufio);
+ v->bufio = NULL;
+ goto bad;
+ }
+
+ if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) {
+ ti->error = "Hash device is too small";
+ r = -E2BIG;
+ goto bad;
+ }
+
+ v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
+ sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2);
+ if (!v->io_mempool) {
+ ti->error = "Cannot allocate io mempool";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
+ BIO_MAX_PAGES * sizeof(struct bio_vec));
+ if (!v->vec_mempool) {
+ ti->error = "Cannot allocate vector mempool";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ /* WQ_UNBOUND greatly improves performance when running on ramdisk */
+ v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
+ if (!v->verify_wq) {
+ ti->error = "Cannot allocate workqueue";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ return 0;
+
+bad:
+ verity_dtr(ti);
+
+ return r;
+}
+
+static struct target_type verity_target = {
+ .name = "verity",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = verity_ctr,
+ .dtr = verity_dtr,
+ .map = verity_map,
+ .status = verity_status,
+ .ioctl = verity_ioctl,
+ .merge = verity_merge,
+ .iterate_devices = verity_iterate_devices,
+ .io_hints = verity_io_hints,
+};
+
+static int __init dm_verity_init(void)
+{
+ int r;
+
+ r = dm_register_target(&verity_target);
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_verity_exit(void)
+{
+ dm_unregister_target(&verity_target);
+}
+
+module_init(dm_verity_init);
+module_exit(dm_verity_exit);
+
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
+MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
+MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b89c548ec3f..e24143cc204 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,6 +1016,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
/*
* Store bio_set for cleanup.
*/
+ clone->bi_end_io = NULL;
clone->bi_private = md->bs;
bio_put(clone);
free_tio(md, tio);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index feb2c3c7bb4..45135f69509 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -315,7 +315,7 @@ static int run(struct mddev *mddev)
}
conf->nfaults = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
conf->rdev = rdev;
md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 627456542fb..fa211d80fc0 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q,
struct dev_info *dev0;
unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ int maxbytes = biovec->bv_len;
+ struct request_queue *subq;
rcu_read_lock();
dev0 = which_dev(mddev, sector);
maxsectors = dev0->end_sector - sector;
+ subq = bdev_get_queue(dev0->rdev->bdev);
+ if (subq->merge_bvec_fn) {
+ bvm->bi_bdev = dev0->rdev->bdev;
+ bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
+ maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
+ biovec));
+ }
rcu_read_unlock();
if (maxsectors < bio_sectors)
@@ -80,12 +89,12 @@ static int linear_mergeable_bvec(struct request_queue *q,
maxsectors -= bio_sectors;
if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
- return biovec->bv_len;
- /* The bytes available at this offset could be really big,
- * so we cap at 2^31 to avoid overflow */
- if (maxsectors > (1 << (31-9)))
- return 1<<31;
- return maxsectors << 9;
+ return maxbytes;
+
+ if (maxsectors > (maxbytes >> 9))
+ return maxbytes;
+ else
+ return maxsectors << 9;
}
static int linear_congested(void *data, int bits)
@@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
cnt = 0;
conf->array_sectors = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
int j = rdev->raid_disk;
struct dev_info *disk = conf->disks + j;
sector_t sectors;
@@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit max_segments to 1 lying within
- * a single page.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
conf->array_sectors += rdev->sectors;
cnt++;
@@ -198,6 +198,7 @@ out:
static int linear_run (struct mddev *mddev)
{
struct linear_conf *conf;
+ int ret;
if (md_check_no_bitmap(mddev))
return -EINVAL;
@@ -211,7 +212,13 @@ static int linear_run (struct mddev *mddev)
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
mddev->queue->backing_dev_info.congested_fn = linear_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
- return md_integrity_register(mddev);
+
+ ret = md_integrity_register(mddev);
+ if (ret) {
+ kfree(conf);
+ mddev->private = NULL;
+ }
+ return ret;
}
static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ce88755baf4..b572e1e386c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws)
INIT_WORK(&mddev->flush_work, md_submit_flush_data);
atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
- list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags)) {
/* Take two references, one is dropped
@@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
{
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->desc_nr == nr)
return rdev;
@@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
{
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->bdev->bd_dev == dev)
return rdev;
@@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->state |= (1<<MD_SB_BITMAP_PRESENT);
sb->disks[0].state = (1<<MD_DISK_REMOVED);
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev2, mddev) {
mdp_disk_t *d;
int desc_nr;
int is_active = test_bit(In_sync, &rdev2->flags);
@@ -1805,18 +1805,18 @@ retry:
| BB_LEN(internal_bb));
*bbp++ = cpu_to_le64(store_bb);
}
+ bb->changed = 0;
if (read_seqretry(&bb->lock, seq))
goto retry;
bb->sector = (rdev->sb_start +
(int)le32_to_cpu(sb->bblog_offset));
bb->size = le16_to_cpu(sb->bblog_size);
- bb->changed = 0;
}
}
max_dev = 0;
- list_for_each_entry(rdev2, &mddev->disks, same_set)
+ rdev_for_each(rdev2, mddev)
if (rdev2->desc_nr+1 > max_dev)
max_dev = rdev2->desc_nr+1;
@@ -1833,7 +1833,7 @@ retry:
for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe);
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev)
return 0; /* nothing to do */
if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
return 0; /* shouldn't register, or already is */
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
/* skip spares and non-functional disks */
if (test_bit(Faulty, &rdev->flags))
continue;
@@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev)
{
struct md_rdev *rdev, *tmp;
- rdev_for_each(rdev, tmp, mddev) {
+ rdev_for_each_safe(rdev, tmp, mddev) {
if (!rdev->mddev) {
MD_BUG();
continue;
@@ -2307,11 +2307,11 @@ static void md_print_devices(void)
bitmap_print_sb(mddev->bitmap);
else
printk("%s: ", mdname(mddev));
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
printk("<%s>", bdevname(rdev->bdev,b));
printk("\n");
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
print_rdev(rdev, mddev->major_version);
}
printk("md: **********************************\n");
@@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares)
* with the rest of the array)
*/
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->sb_events == mddev->events ||
(nospares &&
rdev->raid_disk < 0 &&
@@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change)
repeat:
/* First make sure individual recovery_offsets are correct */
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
@@ -2364,8 +2364,9 @@ repeat:
clear_bit(MD_CHANGE_DEVS, &mddev->flags);
if (!mddev->external) {
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->badblocks.changed) {
+ rdev->badblocks.changed = 0;
md_ack_all_badblocks(&rdev->badblocks);
md_error(mddev, rdev);
}
@@ -2430,7 +2431,7 @@ repeat:
mddev->events --;
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->badblocks.changed)
any_badblocks_changed++;
if (test_bit(Faulty, &rdev->flags))
@@ -2444,7 +2445,7 @@ repeat:
mdname(mddev), mddev->in_sync);
bitmap_update_sb(mddev->bitmap);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
char b[BDEVNAME_SIZE];
if (rdev->sb_loaded != 1)
@@ -2493,7 +2494,7 @@ repeat:
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (test_and_clear_bit(FaultRecorded, &rdev->flags))
clear_bit(Blocked, &rdev->flags);
@@ -2896,7 +2897,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
struct md_rdev *rdev2;
mddev_lock(mddev);
- list_for_each_entry(rdev2, &mddev->disks, same_set)
+ rdev_for_each(rdev2, mddev)
if (rdev->bdev == rdev2->bdev &&
rdev != rdev2 &&
overlaps(rdev->data_offset, rdev->sectors,
@@ -3193,7 +3194,7 @@ static void analyze_sbs(struct mddev * mddev)
char b[BDEVNAME_SIZE];
freshest = NULL;
- rdev_for_each(rdev, tmp, mddev)
+ rdev_for_each_safe(rdev, tmp, mddev)
switch (super_types[mddev->major_version].
load_super(rdev, freshest, mddev->minor_version)) {
case 1:
@@ -3214,7 +3215,7 @@ static void analyze_sbs(struct mddev * mddev)
validate_super(mddev, freshest);
i = 0;
- rdev_for_each(rdev, tmp, mddev) {
+ rdev_for_each_safe(rdev, tmp, mddev) {
if (mddev->max_disks &&
(rdev->desc_nr >= mddev->max_disks ||
i > mddev->max_disks)) {
@@ -3403,7 +3404,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
return -EINVAL;
}
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
rdev->new_raid_disk = rdev->raid_disk;
/* ->takeover must set new_* and/or delta_disks
@@ -3456,7 +3457,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->safemode = 0;
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
continue;
if (rdev->new_raid_disk >= mddev->raid_disks)
@@ -3465,7 +3466,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
continue;
sysfs_unlink_rdev(mddev, rdev);
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
continue;
if (rdev->new_raid_disk == rdev->raid_disk)
@@ -4796,7 +4797,7 @@ int md_run(struct mddev *mddev)
* the only valid external interface is through the md
* device.
*/
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (test_bit(Faulty, &rdev->flags))
continue;
sync_blockdev(rdev->bdev);
@@ -4867,8 +4868,8 @@ int md_run(struct mddev *mddev)
struct md_rdev *rdev2;
int warned = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev)
+ rdev_for_each(rdev2, mddev) {
if (rdev < rdev2 &&
rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) {
@@ -4945,7 +4946,7 @@ int md_run(struct mddev *mddev)
mddev->in_sync = 1;
smp_wmb();
mddev->ready = 1;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
@@ -5073,6 +5074,7 @@ static void md_clean(struct mddev *mddev)
mddev->changed = 0;
mddev->degraded = 0;
mddev->safemode = 0;
+ mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
mddev->bitmap_info.chunksize = 0;
@@ -5175,7 +5177,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open)
/* tell userspace to handle 'inactive' */
sysfs_notify_dirent_safe(mddev->sysfs_state);
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
sysfs_unlink_rdev(mddev, rdev);
@@ -5226,7 +5228,7 @@ static void autorun_array(struct mddev *mddev)
printk(KERN_INFO "md: running: ");
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
char b[BDEVNAME_SIZE];
printk("<%s>", bdevname(rdev->bdev,b));
}
@@ -5356,7 +5358,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
struct md_rdev *rdev;
nr=working=insync=failed=spare=0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
nr++;
if (test_bit(Faulty, &rdev->flags))
failed++;
@@ -5923,7 +5925,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
* grow, and re-add.
*/
return -EBUSY;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
sector_t avail = rdev->sectors;
if (fit && (num_sectors == 0 || num_sectors > avail))
@@ -6724,7 +6726,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
struct mddev *mddev = v;
sector_t sectors;
struct md_rdev *rdev;
- struct bitmap *bitmap;
if (v == (void*)1) {
struct md_personality *pers;
@@ -6758,7 +6759,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
}
sectors = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]",
bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6812,27 +6813,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
} else
seq_printf(seq, "\n ");
- if ((bitmap = mddev->bitmap)) {
- unsigned long chunk_kb;
- unsigned long flags;
- spin_lock_irqsave(&bitmap->lock, flags);
- chunk_kb = mddev->bitmap_info.chunksize >> 10;
- seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
- "%lu%s chunk",
- bitmap->pages - bitmap->missing_pages,
- bitmap->pages,
- (bitmap->pages - bitmap->missing_pages)
- << (PAGE_SHIFT - 10),
- chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
- chunk_kb ? "KB" : "B");
- if (bitmap->file) {
- seq_printf(seq, ", file: ");
- seq_path(seq, &bitmap->file->f_path, " \t\n");
- }
-
- seq_printf(seq, "\n");
- spin_unlock_irqrestore(&bitmap->lock, flags);
- }
+ bitmap_status(seq, mddev->bitmap);
seq_printf(seq, "\n");
}
@@ -7170,7 +7151,7 @@ void md_do_sync(struct mddev *mddev)
max_sectors = mddev->dev_sectors;
j = MaxSector;
rcu_read_lock();
- list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
@@ -7342,7 +7323,7 @@ void md_do_sync(struct mddev *mddev)
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
rcu_read_lock();
- list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
@@ -7388,7 +7369,7 @@ static int remove_and_add_spares(struct mddev *mddev)
mddev->curr_resync_completed = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) ||
@@ -7406,7 +7387,7 @@ static int remove_and_add_spares(struct mddev *mddev)
"degraded");
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
@@ -7451,7 +7432,7 @@ static void reap_sync_thread(struct mddev *mddev)
* do the superblock for an incrementally recovered device
* written out.
*/
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (!mddev->degraded ||
test_bit(In_sync, &rdev->flags))
rdev->saved_raid_disk = -1;
@@ -7529,7 +7510,7 @@ void md_check_recovery(struct mddev *mddev)
* failed devices.
*/
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
test_bit(Faulty, &rdev->flags) &&
@@ -8040,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
return;
write_seqlock_irq(&bb->lock);
- if (bb->changed == 0) {
+ if (bb->changed == 0 && bb->unacked_exist) {
u64 *p = bb->page;
int i;
for (i = 0; i < bb->count ; i++) {
@@ -8157,30 +8138,23 @@ static int md_notify_reboot(struct notifier_block *this,
struct mddev *mddev;
int need_delay = 0;
- if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
-
- printk(KERN_INFO "md: stopping all md devices.\n");
-
- for_each_mddev(mddev, tmp) {
- if (mddev_trylock(mddev)) {
- /* Force a switch to readonly even array
- * appears to still be in use. Hence
- * the '100'.
- */
- md_set_readonly(mddev, 100);
- mddev_unlock(mddev);
- }
- need_delay = 1;
+ for_each_mddev(mddev, tmp) {
+ if (mddev_trylock(mddev)) {
+ __md_stop_writes(mddev);
+ mddev->safemode = 2;
+ mddev_unlock(mddev);
}
- /*
- * certain more exotic SCSI devices are known to be
- * volatile wrt too early system reboots. While the
- * right place to handle this issue is the given
- * driver, we do want to have a safe RAID driver ...
- */
- if (need_delay)
- mdelay(1000*1);
+ need_delay = 1;
}
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ if (need_delay)
+ mdelay(1000*1);
+
return NOTIFY_DONE;
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 44c63dfeeb2..1c2063ccf48 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -128,6 +128,10 @@ struct md_rdev {
enum flag_bits {
Faulty, /* device is known to have a fault */
In_sync, /* device is in_sync with rest of array */
+ Unmerged, /* device is being added to array and should
+				 * be considered for merge_bvec_fn but not
+ * yet for actual IO
+ */
WriteMostly, /* Avoid reading if at all possible */
AutoDetected, /* added by auto-detect */
Blocked, /* An error occurred but has not yet
@@ -345,6 +349,10 @@ struct mddev {
int degraded; /* whether md should consider
* adding a spare
*/
+ int merge_check_needed; /* at least one
+ * member device
+ * has a
+ * merge_bvec_fn */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
@@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
/*
* iterates through the 'same array disks' ringlist
*/
-#define rdev_for_each(rdev, tmp, mddev) \
+#define rdev_for_each(rdev, mddev) \
+ list_for_each_entry(rdev, &((mddev)->disks), same_set)
+
+#define rdev_for_each_safe(rdev, tmp, mddev) \
list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
#define rdev_for_each_rcu(rdev, mddev) \
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index a222f516660..9339e67fcc7 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev)
}
working_disks = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx < 0 ||
disk_idx >= mddev->raid_disks)
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index d279c768f8f..5709bfeab1e 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -108,12 +108,9 @@ static inline void *value_base(struct node *n)
return &n->keys[le32_to_cpu(n->header.max_entries)];
}
-/*
- * FIXME: Now that value size is stored in node we don't need the third parm.
- */
-static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
+static inline void *value_ptr(struct node *n, uint32_t index)
{
- BUG_ON(value_size != le32_to_cpu(n->header.value_size));
+ uint32_t value_size = le32_to_cpu(n->header.value_size);
return value_base(n) + (value_size * index);
}
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 023fbc2d389..aa71e2359a0 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift)
if (shift < 0) {
shift = -shift;
BUG_ON(shift > nr_entries);
- BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
+ BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
memmove(key_ptr(n, 0),
key_ptr(n, shift),
(nr_entries - shift) * sizeof(__le64));
- memmove(value_ptr(n, 0, value_size),
- value_ptr(n, shift, value_size),
+ memmove(value_ptr(n, 0),
+ value_ptr(n, shift),
(nr_entries - shift) * value_size);
} else {
BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
memmove(key_ptr(n, shift),
key_ptr(n, 0),
nr_entries * sizeof(__le64));
- memmove(value_ptr(n, shift, value_size),
- value_ptr(n, 0, value_size),
+ memmove(value_ptr(n, shift),
+ value_ptr(n, 0),
nr_entries * value_size);
}
}
@@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift)
memcpy(key_ptr(left, nr_left),
key_ptr(right, 0),
shift * sizeof(__le64));
- memcpy(value_ptr(left, nr_left, value_size),
- value_ptr(right, 0, value_size),
+ memcpy(value_ptr(left, nr_left),
+ value_ptr(right, 0),
shift * value_size);
} else {
BUG_ON(shift > le32_to_cpu(right->header.max_entries));
memcpy(key_ptr(right, 0),
key_ptr(left, nr_left - shift),
shift * sizeof(__le64));
- memcpy(value_ptr(right, 0, value_size),
- value_ptr(left, nr_left - shift, value_size),
+ memcpy(value_ptr(right, 0),
+ value_ptr(left, nr_left - shift),
shift * value_size);
}
}
@@ -120,26 +120,17 @@ static void delete_at(struct node *n, unsigned index)
key_ptr(n, index + 1),
nr_to_copy * sizeof(__le64));
- memmove(value_ptr(n, index, value_size),
- value_ptr(n, index + 1, value_size),
+ memmove(value_ptr(n, index),
+ value_ptr(n, index + 1),
nr_to_copy * value_size);
}
n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}
-static unsigned del_threshold(struct node *n)
-{
- return le32_to_cpu(n->header.max_entries) / 3;
-}
-
static unsigned merge_threshold(struct node *n)
{
- /*
- * The extra one is because we know we're potentially going to
- * delete an entry.
- */
- return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
+ return le32_to_cpu(n->header.max_entries) / 3;
}
struct child {
@@ -175,7 +166,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent,
if (inc)
inc_children(info->tm, result->n, &le64_type);
- *((__le64 *) value_ptr(parent, index, sizeof(__le64))) =
+ *((__le64 *) value_ptr(parent, index)) =
cpu_to_le64(dm_block_location(result->block));
return 0;
@@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
static void shift(struct node *left, struct node *right, int count)
{
+ uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+ uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
+
+ BUG_ON(max_entries != r_max_entries);
+ BUG_ON(nr_left - count > max_entries);
+ BUG_ON(nr_right + count > max_entries);
+
if (!count)
return;
@@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count)
node_shift(right, count);
}
- left->header.nr_entries =
- cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
- BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
-
- right->header.nr_entries =
- cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
- BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
+ left->header.nr_entries = cpu_to_le32(nr_left - count);
+ right->header.nr_entries = cpu_to_le32(nr_right + count);
}
static void __rebalance2(struct dm_btree_info *info, struct node *parent,
@@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
struct node *right = r->n;
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ unsigned threshold = 2 * merge_threshold(left) + 1;
- if (nr_left + nr_right <= merge_threshold(left)) {
+ if (nr_left + nr_right < threshold) {
/*
* Merge
*/
@@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
* Rebalance.
*/
unsigned target_left = (nr_left + nr_right) / 2;
- unsigned shift_ = nr_left - target_left;
- BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
- BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
shift(left, right, nr_left - target_left);
*key_ptr(parent, r->index) = right->keys[0];
}
@@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
return exit_child(info, &right);
}
+/*
+ * We dump as many entries from center as possible into left, then the rest
+ * in right, then rebalance2. This wastes some cpu, but I want something
+ * simple atm.
+ */
+static void delete_center_node(struct dm_btree_info *info, struct node *parent,
+ struct child *l, struct child *c, struct child *r,
+ struct node *left, struct node *center, struct node *right,
+ uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ unsigned shift = min(max_entries - nr_left, nr_center);
+
+ BUG_ON(nr_left + shift > max_entries);
+ node_copy(left, center, -shift);
+ left->header.nr_entries = cpu_to_le32(nr_left + shift);
+
+ if (shift != nr_center) {
+ shift = nr_center - shift;
+ BUG_ON((nr_right + shift) > max_entries);
+ node_shift(right, shift);
+ node_copy(center, right, shift);
+ right->header.nr_entries = cpu_to_le32(nr_right + shift);
+ }
+ *key_ptr(parent, r->index) = right->keys[0];
+
+ delete_at(parent, c->index);
+ r->index--;
+
+ dm_tm_dec(info->tm, dm_block_location(c->block));
+ __rebalance2(info, parent, l, r);
+}
+
+/*
+ * Redistributes entries among 3 sibling nodes.
+ */
+static void redistribute3(struct dm_btree_info *info, struct node *parent,
+ struct child *l, struct child *c, struct child *r,
+ struct node *left, struct node *center, struct node *right,
+ uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+ int s;
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ unsigned target = (nr_left + nr_center + nr_right) / 3;
+ BUG_ON(target > max_entries);
+
+ if (nr_left < nr_right) {
+ s = nr_left - target;
+
+ if (s < 0 && nr_center < -s) {
+ /* not enough in central node */
+ shift(left, center, nr_center);
+ s = nr_center - target;
+ shift(left, right, s);
+ nr_right += s;
+ } else
+ shift(left, center, s);
+
+ shift(center, right, target - nr_right);
+
+ } else {
+ s = target - nr_right;
+ if (s > 0 && nr_center < s) {
+ /* not enough in central node */
+ shift(center, right, nr_center);
+ s = target - nr_center;
+ shift(left, right, s);
+ nr_left -= s;
+ } else
+ shift(center, right, s);
+
+ shift(left, center, nr_left - target);
+ }
+
+ *key_ptr(parent, c->index) = center->keys[0];
+ *key_ptr(parent, r->index) = right->keys[0];
+}
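+/*
+ * Worked example (illustrative): redistributing nr_left = 2, nr_center = 3,
+ * nr_right = 10 gives target = 15 / 3 = 5; as nr_left < nr_right the code
+ * shifts 3 entries from center into left and then 5 from right into center,
+ * ending with 5/5/5.
+ */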
+
static void __rebalance3(struct dm_btree_info *info, struct node *parent,
struct child *l, struct child *c, struct child *r)
{
@@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
- uint32_t max_entries = le32_to_cpu(left->header.max_entries);
- unsigned target;
+ unsigned threshold = merge_threshold(left) * 4 + 1;
BUG_ON(left->header.max_entries != center->header.max_entries);
BUG_ON(center->header.max_entries != right->header.max_entries);
- if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
- /*
- * Delete center node:
- *
- * We dump as many entries from center as possible into
- * left, then the rest in right, then rebalance2. This
- * wastes some cpu, but I want something simple atm.
- */
- unsigned shift = min(max_entries - nr_left, nr_center);
-
- BUG_ON(nr_left + shift > max_entries);
- node_copy(left, center, -shift);
- left->header.nr_entries = cpu_to_le32(nr_left + shift);
-
- if (shift != nr_center) {
- shift = nr_center - shift;
- BUG_ON((nr_right + shift) >= max_entries);
- node_shift(right, shift);
- node_copy(center, right, shift);
- right->header.nr_entries = cpu_to_le32(nr_right + shift);
- }
- *key_ptr(parent, r->index) = right->keys[0];
-
- delete_at(parent, c->index);
- r->index--;
-
- dm_tm_dec(info->tm, dm_block_location(c->block));
- __rebalance2(info, parent, l, r);
-
- return;
- }
-
- /*
- * Rebalance
- */
- target = (nr_left + nr_center + nr_right) / 3;
- BUG_ON(target > max_entries);
-
- /*
- * Adjust the left node
- */
- shift(left, center, nr_left - target);
-
- /*
- * Adjust the right node
- */
- shift(center, right, target - nr_right);
- *key_ptr(parent, c->index) = center->keys[0];
- *key_ptr(parent, r->index) = right->keys[0];
+ if ((nr_left + nr_center + nr_right) < threshold)
+ delete_center_node(info, parent, l, c, r, left, center, right,
+ nr_left, nr_center, nr_right);
+ else
+ redistribute3(info, parent, l, c, r, left, center, right,
+ nr_left, nr_center, nr_right);
}
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
@@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s,
if (r)
return r;
- if (child_entries > del_threshold(n))
- return 0;
-
has_left_sibling = i > 0;
has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
@@ -496,7 +520,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
*/
if (shadow_has_parent(s)) {
__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
- memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
+ memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
&location, sizeof(__le64));
}
@@ -553,7 +577,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
if (info->value_type.dec)
info->value_type.dec(info->value_type.context,
- value_ptr(n, index, info->value_type.size));
+ value_ptr(n, index));
delete_at(n, index);
}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index bd1e7ffbe26..d12b2cc51f1 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
dm_tm_inc(tm, value64(n, i));
else if (vt->inc)
for (i = 0; i < nr_entries; i++)
- vt->inc(vt->context,
- value_ptr(n, i, vt->size));
+ vt->inc(vt->context, value_ptr(n, i));
}
static int insert_at(size_t value_size, struct node *node, unsigned index,
@@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
for (i = 0; i < f->nr_children; i++)
info->value_type.dec(info->value_type.context,
- value_ptr(f->n, i, info->value_type.size));
+ value_ptr(f->n, i));
}
f->current_child = f->nr_children;
}
@@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
} while (!(flags & LEAF_NODE));
*result_key = le64_to_cpu(ro_node(s)->keys[i]);
- memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);
+ memcpy(v, value_ptr(ro_node(s), i), value_size);
return 0;
}
@@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
sizeof(uint64_t) : s->info->value_type.size;
- memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
+ memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
size * nr_right);
/*
@@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
pn = dm_block_data(parent);
location = cpu_to_le64(dm_block_location(left));
__dm_bless_for_disk(&location);
- memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
+ memcpy_disk(value_ptr(pn, parent_index),
&location, sizeof(__le64));
location = cpu_to_le64(dm_block_location(right));
@@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
sizeof(__le64) : s->info->value_type.size;
- memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
- memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
+ memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+ memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
nr_right * size);
/* new_parent should just point to l and r now */
@@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
val = cpu_to_le64(dm_block_location(left));
__dm_bless_for_disk(&val);
pn->keys[0] = ln->keys[0];
- memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));
+ memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
val = cpu_to_le64(dm_block_location(right));
__dm_bless_for_disk(&val);
pn->keys[1] = rn->keys[0];
- memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));
+ memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
/*
* rejig the spine. This is ugly, since it knows too
@@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
__dm_bless_for_disk(&location);
- memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
+ memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
&location, sizeof(__le64));
}
@@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
(!info->value_type.equal ||
!info->value_type.equal(
info->value_type.context,
- value_ptr(n, index, info->value_type.size),
+ value_ptr(n, index),
value))) {
info->value_type.dec(info->value_type.context,
- value_ptr(n, index, info->value_type.size));
+ value_ptr(n, index));
}
- memcpy_disk(value_ptr(n, index, info->value_type.size),
+ memcpy_disk(value_ptr(n, index),
value, info->value_type.size);
}
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index df2494c06cd..ff3beed6ad2 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
if (r < 0)
return r;
-#if 0
- /* FIXME: dm_btree_remove doesn't handle this yet */
if (old > 2) {
r = dm_btree_remove(&ll->ref_count_info,
ll->ref_count_root,
@@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
if (r)
return r;
}
-#endif
} else {
__le32 le_rc = cpu_to_le32(ref_count);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7294bd115e3..de63a1fc373 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
if (!conf)
return -ENOMEM;
- list_for_each_entry(rdev1, &mddev->disks, same_set) {
+ rdev_for_each(rdev1, mddev) {
pr_debug("md/raid0:%s: looking at %s\n",
mdname(mddev),
bdevname(rdev1->bdev, b));
@@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev2, mddev) {
pr_debug("md/raid0:%s: comparing %s(%llu)"
" with %s(%llu)\n",
mdname(mddev),
@@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
smallest = NULL;
dev = conf->devlist;
err = -EINVAL;
- list_for_each_entry(rdev1, &mddev->disks, same_set) {
+ rdev_for_each(rdev1, mddev) {
int j = rdev1->raid_disk;
if (mddev->level == 10) {
@@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
disk_stack_limits(mddev->gendisk, rdev1->bdev,
rdev1->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit ->max_segments to 1, lying within
- * a single page.
- */
- if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
+ if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
+ conf->has_merge_bvec = 1;
+
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
@@ -290,8 +284,64 @@ abort:
return err;
}
+/* Find the zone which holds a particular offset
+ * Update *sectorp to be an offset in that zone
+ */
+static struct strip_zone *find_zone(struct r0conf *conf,
+ sector_t *sectorp)
+{
+ int i;
+ struct strip_zone *z = conf->strip_zone;
+ sector_t sector = *sectorp;
+
+ for (i = 0; i < conf->nr_strip_zones; i++)
+ if (sector < z[i].zone_end) {
+ if (i)
+ *sectorp = sector - z[i-1].zone_end;
+ return z + i;
+ }
+ BUG();
+}
+
+/*
+ * Remaps the bio to the target device.  We separate two flows, a
+ * power-of-2 flow and a general flow, for the sake of performance.
+*/
+static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
+ sector_t sector, sector_t *sector_offset)
+{
+ unsigned int sect_in_chunk;
+ sector_t chunk;
+ struct r0conf *conf = mddev->private;
+ int raid_disks = conf->strip_zone[0].nb_dev;
+ unsigned int chunk_sects = mddev->chunk_sectors;
+
+ if (is_power_of_2(chunk_sects)) {
+ int chunksect_bits = ffz(~chunk_sects);
+ /* find the sector offset inside the chunk */
+ sect_in_chunk = sector & (chunk_sects - 1);
+ sector >>= chunksect_bits;
+ /* chunk in zone */
+ chunk = *sector_offset;
+ /* quotient is the chunk in real device*/
+ sector_div(chunk, zone->nb_dev << chunksect_bits);
+ } else{
+ sect_in_chunk = sector_div(sector, chunk_sects);
+ chunk = *sector_offset;
+ sector_div(chunk, chunk_sects * zone->nb_dev);
+ }
+ /*
+ * position the bio over the real device
+ * real sector = chunk in device + starting of zone
+ * + the position in the chunk
+ */
+ *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
+ return conf->devlist[(zone - conf->strip_zone)*raid_disks
+ + sector_div(sector, zone->nb_dev)];
+}
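+/*
+ * Worked example (illustrative): with a single zone of 2 devices and 64KiB
+ * chunks (chunk_sects = 128), array sector 1000 is chunk 7 (1000 >> 7) and
+ * so lands on device 7 % 2 = 1; the offset inside the chunk is
+ * 1000 & 127 = 104 and the chunk within the device is 1000 / 256 = 3,
+ * giving a device sector of 3 * 128 + 104 = 488.
+ */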
+
/**
- * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged
+ * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
* @q: request queue
* @bvm: properties of new bio
* @biovec: the request that could be merged to it.
@@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q,
struct bio_vec *biovec)
{
struct mddev *mddev = q->queuedata;
+ struct r0conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ sector_t sector_offset = sector;
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9;
+ struct strip_zone *zone;
+ struct md_rdev *rdev;
+ struct request_queue *subq;
if (is_power_of_2(chunk_sectors))
max = (chunk_sectors - ((sector & (chunk_sectors-1))
@@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q,
else
max = (chunk_sectors - (sector_div(sector, chunk_sectors)
+ bio_sectors)) << 9;
- if (max < 0) max = 0; /* bio_add cannot handle a negative return */
+ if (max < 0)
+ max = 0; /* bio_add cannot handle a negative return */
if (max <= biovec->bv_len && bio_sectors == 0)
return biovec->bv_len;
- else
+ if (max < biovec->bv_len)
+ /* too small already, no need to check further */
+ return max;
+ if (!conf->has_merge_bvec)
+ return max;
+
+ /* May need to check subordinate device */
+ sector = sector_offset;
+ zone = find_zone(mddev->private, &sector_offset);
+ rdev = map_sector(mddev, zone, sector, &sector_offset);
+ subq = bdev_get_queue(rdev->bdev);
+ if (subq->merge_bvec_fn) {
+ bvm->bi_bdev = rdev->bdev;
+ bvm->bi_sector = sector_offset + zone->dev_start +
+ rdev->data_offset;
+ return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
+ } else
return max;
}
@@ -329,12 +401,14 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__);
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
array_sectors += rdev->sectors;
return array_sectors;
}
+static int raid0_stop(struct mddev *mddev);
+
static int raid0_run(struct mddev *mddev)
{
struct r0conf *conf;
@@ -382,7 +456,12 @@ static int raid0_run(struct mddev *mddev)
blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
dump_zones(mddev);
- return md_integrity_register(mddev);
+
+ ret = md_integrity_register(mddev);
+ if (ret)
+ raid0_stop(mddev);
+
+ return ret;
}
static int raid0_stop(struct mddev *mddev)
@@ -397,62 +476,6 @@ static int raid0_stop(struct mddev *mddev)
return 0;
}
-/* Find the zone which holds a particular offset
- * Update *sectorp to be an offset in that zone
- */
-static struct strip_zone *find_zone(struct r0conf *conf,
- sector_t *sectorp)
-{
- int i;
- struct strip_zone *z = conf->strip_zone;
- sector_t sector = *sectorp;
-
- for (i = 0; i < conf->nr_strip_zones; i++)
- if (sector < z[i].zone_end) {
- if (i)
- *sectorp = sector - z[i-1].zone_end;
- return z + i;
- }
- BUG();
-}
-
-/*
- * remaps the bio to the target device. we separate two flows.
- * power 2 flow and a general flow for the sake of perfromance
-*/
-static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
- sector_t sector, sector_t *sector_offset)
-{
- unsigned int sect_in_chunk;
- sector_t chunk;
- struct r0conf *conf = mddev->private;
- int raid_disks = conf->strip_zone[0].nb_dev;
- unsigned int chunk_sects = mddev->chunk_sectors;
-
- if (is_power_of_2(chunk_sects)) {
- int chunksect_bits = ffz(~chunk_sects);
- /* find the sector offset inside the chunk */
- sect_in_chunk = sector & (chunk_sects - 1);
- sector >>= chunksect_bits;
- /* chunk in zone */
- chunk = *sector_offset;
- /* quotient is the chunk in real device*/
- sector_div(chunk, zone->nb_dev << chunksect_bits);
- } else{
- sect_in_chunk = sector_div(sector, chunk_sects);
- chunk = *sector_offset;
- sector_div(chunk, chunk_sects * zone->nb_dev);
- }
- /*
- * position the bio over the real device
- * real sector = chunk in device + starting of zone
- * + the position in the chunk
- */
- *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
- return conf->devlist[(zone - conf->strip_zone)*raid_disks
- + sector_div(sector, zone->nb_dev)];
-}
-
/*
* Is io distribute over 1 or more chunks ?
*/
@@ -505,7 +528,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
}
sector_offset = bio->bi_sector;
- zone = find_zone(mddev->private, &sector_offset);
+ zone = find_zone(mddev->private, &sector_offset);
tmp_dev = map_sector(mddev, zone, bio->bi_sector,
&sector_offset);
bio->bi_bdev = tmp_dev->bdev;
@@ -543,7 +566,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
return ERR_PTR(-EINVAL);
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
/* check slot number for a disk */
if (rdev->raid_disk == mddev->raid_disks-1) {
printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
@@ -609,6 +632,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
static void *raid0_takeover_raid1(struct mddev *mddev)
{
struct r0conf *priv_conf;
+ int chunksect;
/* Check layout:
* - (N - 1) mirror drives must be already faulty
@@ -619,10 +643,25 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
return ERR_PTR(-EINVAL);
}
+ /*
+ * a raid1 doesn't have the notion of chunk size, so
+ * figure out the largest suitable size we can use.
+ */
+ chunksect = 64 * 2; /* 64K by default */
+
+ /* The array must be an exact multiple of chunksize */
+ while (chunksect && (mddev->array_sectors & (chunksect - 1)))
+ chunksect >>= 1;
+
+ if ((chunksect << 9) < PAGE_SIZE)
+ /* array size does not allow a suitable chunk size */
+ return ERR_PTR(-EINVAL);
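+	/*
+	 * Illustrative example: an array of 2097160 sectors (1GiB plus
+	 * 4KiB) is divisible by 8 sectors but not by 16, so the loop
+	 * settles on chunksect = 8 (4KiB), which on a 4KiB-page system is
+	 * the smallest value still accepted by the check above.
+	 */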
+
/* Set new parameters */
mddev->new_level = 0;
mddev->new_layout = 0;
- mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */
+ mddev->new_chunk_sectors = chunksect;
+ mddev->chunk_sectors = chunksect;
mddev->delta_disks = 1 - mddev->raid_disks;
mddev->raid_disks = 1;
/* make sure it will be not marked as dirty */
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 0884bba8df4..05539d9c97f 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -4,13 +4,16 @@
struct strip_zone {
sector_t zone_end; /* Start of the next zone (in sectors) */
sector_t dev_start; /* Zone offset in real dev (in sectors) */
- int nb_dev; /* # of devices attached to the zone */
+ int nb_dev; /* # of devices attached to the zone */
};
struct r0conf {
- struct strip_zone *strip_zone;
- struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
- int nr_strip_zones;
+ struct strip_zone *strip_zone;
+ struct md_rdev **devlist; /* lists of rdevs, pointed to
+ * by strip_zone->dev */
+ int nr_strip_zones;
+ int has_merge_bvec; /* at least one member has
+ * a merge_bvec_fn */
};
#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a368db2431a..d35e4c991e3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
+ || test_bit(Unmerged, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
+static int raid1_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct mddev *mddev = q->queuedata;
+ struct r1conf *conf = mddev->private;
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ int max = biovec->bv_len;
+
+ if (mddev->merge_check_needed) {
+ int disk;
+ rcu_read_lock();
+ for (disk = 0; disk < conf->raid_disks * 2; disk++) {
+ struct md_rdev *rdev = rcu_dereference(
+ conf->mirrors[disk].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = sector +
+ rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ }
+ rcu_read_unlock();
+ }
+ return max;
+
+}
+
int md_raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
@@ -624,7 +658,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
return 1;
rcu_read_lock();
- for (i = 0; i < conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -737,9 +771,22 @@ static void wait_barrier(struct r1conf *conf)
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
conf->nr_waiting++;
- wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+ /* Wait for the barrier to drop.
+ * However if there are already pending
+ * requests (preventing the barrier from
+ * rising completely), and the
+ * pre-process bio queue isn't empty,
+ * then don't wait, as we need to empty
+ * that queue to get the nr_pending
+ * count down.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->barrier ||
+ (conf->nr_pending &&
+ current->bio_list &&
+ !bio_list_empty(current->bio_list)),
conf->resync_lock,
- );
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -1002,7 +1049,8 @@ read_again:
break;
}
r1_bio->bios[i] = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
+ if (!rdev || test_bit(Faulty, &rdev->flags)
+ || test_bit(Unmerged, &rdev->flags)) {
if (i < conf->raid_disks)
set_bit(R1BIO_Degraded, &r1_bio->state);
continue;
@@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct mirror_info *p;
int first = 0;
int last = conf->raid_disks - 1;
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
@@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
+ if (q->merge_bvec_fn) {
+ set_bit(Unmerged, &rdev->flags);
+ mddev->merge_check_needed = 1;
+ }
+
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
if (!p->rdev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must
- * never risk violating it, so limit
- * ->max_segments to one lying with a single
- * page, as a one page request is never in
- * violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
p->head_position = 0;
rdev->raid_disk = mirror;
@@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break;
}
}
+ if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+ /* Some requests might not have seen this new
+ * merge_bvec_fn. We must wait for them to complete
+ * before merging the device fully.
+ * First we make sure any code which has tested
+ * our function has submitted the request, then
+ * we wait for all outstanding requests to complete.
+ */
+ synchronize_sched();
+ raise_barrier(conf);
+ lower_barrier(conf);
+ clear_bit(Unmerged, &rdev->flags);
+ }
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
@@ -1682,7 +1738,7 @@ static int process_checks(struct r1bio *r1_bio)
s = sbio->bi_io_vec[j].bv_page;
if (memcmp(page_address(p),
page_address(s),
- PAGE_SIZE))
+ sbio->bi_io_vec[j].bv_len))
break;
}
} else
@@ -2330,8 +2386,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
int ok = 1;
for (i = 0 ; i < conf->raid_disks * 2 ; i++)
if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
- struct md_rdev *rdev =
- rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
ok = rdev_set_badblocks(rdev, sector_nr,
min_bad, 0
) && ok;
@@ -2491,7 +2546,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
err = -EINVAL;
spin_lock_init(&conf->device_lock);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
@@ -2580,11 +2635,13 @@ static struct r1conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
+static int stop(struct mddev *mddev);
static int run(struct mddev *mddev)
{
struct r1conf *conf;
int i;
struct md_rdev *rdev;
+ int ret;
if (mddev->level != 1) {
printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
@@ -2609,20 +2666,11 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
continue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit ->max_segments to 1 lying within
- * a single page, as a one page request is never in violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
}
mddev->degraded = 0;
@@ -2656,8 +2704,13 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
+ blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
}
- return md_integrity_register(mddev);
+
+ ret = md_integrity_register(mddev);
+ if (ret)
+ stop(mddev);
+ return ret;
}
static int stop(struct mddev *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e8aa213f0d..fff782189e4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -67,6 +67,7 @@ static int max_queued_requests = 1024;
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
+static int enough(struct r10conf *conf, int ignore);
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
@@ -347,6 +348,19 @@ static void raid10_end_read_request(struct bio *bio, int error)
* wait for the 'master' bio.
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
+ } else {
+ /* If all other devices that store this block have
+ * failed, we want to return the error upwards rather
+ * than fail the last device. Here we redefine
+ * "uptodate" to mean "Don't want to retry"
+ */
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!enough(conf, rdev->raid_disk))
+ uptodate = 1;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+ if (uptodate) {
raid_end_bio_io(r10_bio);
rdev_dec_pending(rdev, conf->mddev);
} else {
@@ -572,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* @biovec: the request that could be merged to it.
*
* Return amount of bytes we can accept at this offset
- * If near_copies == raid_disk, there are no striping issues,
- * but in that case, the function isn't called at all.
+ * This requires checking for end-of-chunk if near_copies != raid_disks,
+ * and for subordinate merge_bvec_fns if merge_check_needed.
*/
static int raid10_mergeable_bvec(struct request_queue *q,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
struct mddev *mddev = q->queuedata;
+ struct r10conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9;
- max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
- if (max < 0) max = 0; /* bio_add cannot handle a negative return */
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- else
- return max;
+ if (conf->near_copies < conf->raid_disks) {
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+ + bio_sectors)) << 9;
+ if (max < 0)
+ /* bio_add cannot handle a negative return */
+ max = 0;
+ if (max <= biovec->bv_len && bio_sectors == 0)
+ return biovec->bv_len;
+ } else
+ max = biovec->bv_len;
+
+ if (mddev->merge_check_needed) {
+ struct r10bio r10_bio;
+ int s;
+ r10_bio.sector = sector;
+ raid10_find_phys(conf, &r10_bio);
+ rcu_read_lock();
+ for (s = 0; s < conf->copies; s++) {
+ int disk = r10_bio.devs[s].devnum;
+ struct md_rdev *rdev = rcu_dereference(
+ conf->mirrors[disk].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = r10_bio.devs[s].addr
+ + rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ rdev = rcu_dereference(conf->mirrors[disk].replacement);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = r10_bio.devs[s].addr
+ + rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ }
+ rcu_read_unlock();
+ }
+ return max;
}
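
The rewritten raid10_mergeable_bvec() starts from the chunk-boundary limit and then lets every underlying device that exposes its own merge_bvec_fn shrink the answer, keeping the minimum. A simplified userspace model of that reduction is sketched below; the callback array and byte limits are invented for illustration and do not correspond to any real device-mapper target.

/* Simplified model of the reduction in raid10_mergeable_bvec():
 * begin with the chunk-boundary limit, then let each copy that has a
 * merge callback clamp it further.  The function-pointer array stands
 * in for the per-rdev q->merge_bvec_fn pointers. */
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

typedef int (*merge_fn)(int requested_bytes);

static int dm_limit_1k(int requested) { return MIN(requested, 1024); }
static int dm_limit_2k(int requested) { return MIN(requested, 2048); }

static int mergeable_bytes(int chunk_limit, merge_fn fns[], int nfns,
			   int requested)
{
	int max = MIN(chunk_limit, requested);
	for (int i = 0; i < nfns; i++)
		if (fns[i])             /* only copies with a callback */
			max = MIN(max, fns[i](requested));
	return max;
}

int main(void)
{
	merge_fn copies[] = { dm_limit_2k, NULL, dm_limit_1k };
	/* chunk boundary would allow 4096 bytes, strictest copy allows 1024 */
	printf("%d\n", mergeable_bytes(4096, copies, 3, 4096)); /* prints 1024 */
	return 0;
}
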
/*
@@ -654,11 +711,12 @@ retry:
disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
rdev = rcu_dereference(conf->mirrors[disk].rdev);
- if (rdev == NULL)
- continue;
- if (test_bit(Faulty, &rdev->flags))
+ if (rdev == NULL ||
+ test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -849,9 +907,22 @@ static void wait_barrier(struct r10conf *conf)
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
conf->nr_waiting++;
- wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+ /* Wait for the barrier to drop.
+ * However if there are already pending
+ * requests (preventing the barrier from
+ * rising completely), and the
+ * pre-process bio queue isn't empty,
+ * then don't wait, as we need to empty
+ * that queue to get the nr_pending
+ * count down.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->barrier ||
+ (conf->nr_pending &&
+ current->bio_list &&
+ !bio_list_empty(current->bio_list)),
conf->resync_lock,
- );
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -1107,12 +1178,14 @@ retry_write:
blocked_rdev = rrdev;
break;
}
- if (rrdev && test_bit(Faulty, &rrdev->flags))
+ if (rrdev && (test_bit(Faulty, &rrdev->flags)
+ || test_bit(Unmerged, &rrdev->flags)))
rrdev = NULL;
r10_bio->devs[i].bio = NULL;
r10_bio->devs[i].repl_bio = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
+ if (!rdev || test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags)) {
set_bit(R10BIO_Degraded, &r10_bio->state);
continue;
}
@@ -1463,18 +1536,24 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int mirror;
int first = 0;
int last = conf->raid_disks - 1;
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is
* very different from resync
*/
return -EBUSY;
- if (!enough(conf, -1))
+ if (rdev->saved_raid_disk < 0 && !enough(conf, -1))
return -EINVAL;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
+ if (q->merge_bvec_fn) {
+ set_bit(Unmerged, &rdev->flags);
+ mddev->merge_check_needed = 1;
+ }
+
if (rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk;
@@ -1494,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
err = 0;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
break;
@@ -1506,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must
- * never risk violating it, so limit
- * ->max_segments to one lying with a single
- * page, as a one page request is never in
- * violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1527,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
-
+ if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+ /* Some requests might not have seen this new
+ * merge_bvec_fn. We must wait for them to complete
+ * before merging the device fully.
+ * First we make sure any code which has tested
+ * our function has submitted the request, then
+ * we wait for all outstanding requests to complete.
+ */
+ synchronize_sched();
+ raise_barrier(conf, 0);
+ lower_barrier(conf);
+ clear_bit(Unmerged, &rdev->flags);
+ }
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
@@ -1668,10 +1743,8 @@ static void end_sync_write(struct bio *bio, int error)
d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[d].replacement;
- if (!rdev) {
- smp_mb();
+ else
rdev = conf->mirrors[d].rdev;
- }
if (!uptodate) {
if (repl)
@@ -1748,7 +1821,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
for (j = 0; j < vcnt; j++)
if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
page_address(tbio->bi_io_vec[j].bv_page),
- PAGE_SIZE))
+ fbio->bi_io_vec[j].bv_len))
break;
if (j == vcnt)
continue;
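
Both this hunk and the matching raid1 process_checks() change compare only bv_len bytes instead of a full PAGE_SIZE, since a bio_vec may cover less than a page. A small userspace illustration follows; the vec struct is a made-up stand-in for struct bio_vec.

/* Why the comparison length matters: bytes past bv_len are not data
 * and must not influence the comparison. */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

struct vec { unsigned char page[PAGE_SIZE]; unsigned int bv_len; };

int main(void)
{
	struct vec a = { .bv_len = 512 }, b = { .bv_len = 512 };

	memset(a.page, 0xAA, 512);          /* identical valid data     */
	memset(b.page, 0xAA, 512);
	a.page[1000] = 0x01;                /* garbage past bv_len      */

	printf("full page : %s\n",
	       memcmp(a.page, b.page, PAGE_SIZE) ? "differ" : "match");
	printf("bv_len    : %s\n",
	       memcmp(a.page, b.page, a.bv_len) ? "differ" : "match");
	return 0;
}
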
@@ -2052,6 +2125,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
"md/raid10:%s: %s: Failing raid device\n",
mdname(mddev), b);
md_error(mddev, conf->mirrors[d].rdev);
+ r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
return;
}
@@ -2072,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev &&
+ !test_bit(Unmerged, &rdev->flags) &&
test_bit(In_sync, &rdev->flags) &&
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
&first_bad, &bad_sectors) == 0) {
@@ -2105,8 +2180,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
rdev,
r10_bio->devs[r10_bio->read_slot].addr
+ sect,
- s, 0))
+ s, 0)) {
md_error(mddev, rdev);
+ r10_bio->devs[r10_bio->read_slot].bio
+ = IO_BLOCKED;
+ }
break;
}
@@ -2122,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (!rdev ||
+ test_bit(Unmerged, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;
@@ -2299,17 +2378,20 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
* This is all done synchronously while the array is
* frozen.
*/
+ bio = r10_bio->devs[slot].bio;
+ bdevname(bio->bi_bdev, b);
+ bio_put(bio);
+ r10_bio->devs[slot].bio = NULL;
+
if (mddev->ro == 0) {
freeze_array(conf);
fix_read_error(conf, mddev, r10_bio);
unfreeze_array(conf);
- }
+ } else
+ r10_bio->devs[slot].bio = IO_BLOCKED;
+
rdev_dec_pending(rdev, mddev);
- bio = r10_bio->devs[slot].bio;
- bdevname(bio->bi_bdev, b);
- r10_bio->devs[slot].bio =
- mddev->ro ? IO_BLOCKED : NULL;
read_more:
rdev = read_balance(conf, r10_bio, &max_sectors);
if (rdev == NULL) {
@@ -2318,13 +2400,10 @@ read_more:
mdname(mddev), b,
(unsigned long long)r10_bio->sector);
raid_end_bio_io(r10_bio);
- bio_put(bio);
return;
}
do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
- if (bio)
- bio_put(bio);
slot = r10_bio->read_slot;
printk_ratelimited(
KERN_ERR
@@ -2360,7 +2439,6 @@ read_more:
mbio->bi_phys_segments++;
spin_unlock_irq(&conf->device_lock);
generic_make_request(bio);
- bio = NULL;
r10_bio = mempool_alloc(conf->r10bio_pool,
GFP_NOIO);
@@ -3225,7 +3303,7 @@ static int run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks / conf->near_copies));
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx >= conf->raid_disks
@@ -3243,18 +3321,8 @@ static int run(struct mddev *mddev)
disk->rdev = rdev;
}
- disk->rdev = rdev;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit max_segments to 1 lying
- * within a single page.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
disk->head_position = 0;
}
@@ -3318,8 +3386,7 @@ static int run(struct mddev *mddev)
mddev->queue->backing_dev_info.ra_pages = 2* stripe;
}
- if (conf->near_copies < conf->raid_disks)
- blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
+ blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
if (md_integrity_register(mddev))
goto out_free_conf;
@@ -3369,6 +3436,43 @@ static void raid10_quiesce(struct mddev *mddev, int state)
}
}
+static int raid10_resize(struct mddev *mddev, sector_t sectors)
+{
+ /* Resize of 'far' arrays is not supported.
+ * For 'near' and 'offset' arrays we can set the
+ * number of sectors used to be an appropriate multiple
+ * of the chunk size.
+ * For 'offset', this is far_copies*chunksize.
+ * For 'near' the multiplier is the LCM of
+ * near_copies and raid_disks.
+ * So if far_copies > 1 && !far_offset, fail.
+ * Else find LCM(raid_disks, near_copies)*far_copies and
+ * multiply by chunk_size. Then round to this number.
+ * This is mostly done by raid10_size()
+ */
+ struct r10conf *conf = mddev->private;
+ sector_t oldsize, size;
+
+ if (conf->far_copies > 1 && !conf->far_offset)
+ return -EINVAL;
+
+ oldsize = raid10_size(mddev, 0, 0);
+ size = raid10_size(mddev, sectors, 0);
+ md_set_array_sectors(mddev, size);
+ if (mddev->array_sectors > size)
+ return -EINVAL;
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ if (sectors > mddev->dev_sectors &&
+ mddev->recovery_cp > oldsize) {
+ mddev->recovery_cp = oldsize;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ }
+ mddev->dev_sectors = sectors;
+ mddev->resync_max_sectors = size;
+ return 0;
+}
+
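
The comment block above describes the rounding rule in words; the arithmetic below sketches it with concrete, made-up numbers. In the kernel the real computation is performed by raid10_size(), so treat this purely as an illustration of the LCM-based step size.

/* Arithmetic sketch of the rounding described in the comment above.
 * All values are invented; only the step-size formula is taken from
 * the comment. */
#include <stdio.h>

static unsigned long gcd(unsigned long a, unsigned long b)
{
	while (b) { unsigned long t = a % b; a = b; b = t; }
	return a;
}

static unsigned long lcm(unsigned long a, unsigned long b)
{
	return a / gcd(a, b) * b;
}

int main(void)
{
	unsigned long chunk_sectors = 128;   /* 64KiB chunks            */
	unsigned long raid_disks = 4, near_copies = 2, far_copies = 1;
	unsigned long requested = 1000000;   /* sectors requested       */

	/* 'near'/'offset' layouts: round down to a multiple of
	 * LCM(raid_disks, near_copies) * far_copies * chunk_sectors    */
	unsigned long step = lcm(raid_disks, near_copies) *
			     far_copies * chunk_sectors;
	unsigned long usable = requested / step * step;

	printf("step %lu sectors, usable %lu of %lu\n",
	       step, usable, requested);
	return 0;
}
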
static void *raid10_takeover_raid0(struct mddev *mddev)
{
struct md_rdev *rdev;
@@ -3392,7 +3496,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
conf = setup_conf(mddev);
if (!IS_ERR(conf)) {
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
rdev->new_raid_disk = rdev->raid_disk * 2;
conf->barrier = 1;
@@ -3438,6 +3542,7 @@ static struct md_personality raid10_personality =
.sync_request = sync_request,
.quiesce = raid10_quiesce,
.size = raid10_size,
+ .resize = raid10_resize,
.takeover = raid10_takeover,
};
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 360f2b98f62..f351422938e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
md_wakeup_thread(conf->mddev->thread);
} else {
BUG_ON(stripe_operations_active(sh));
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ if (atomic_dec_return(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
- }
atomic_dec(&conf->active_stripes);
if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
list_add_tail(&sh->lru, &conf->inactive_list);
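
The hunk above folds a separate atomic_dec()/atomic_read() pair into a single atomic_dec_return()-style check, so the threshold decision uses the value this decrement produced rather than whatever another CPU left behind in between. A userspace C11-atomics sketch of the two shapes follows; the threshold and names are invented, and the demo is single-threaded, so it only shows the shape of the API, not an actual race.

/* Contrast of the two patterns changed in __release_stripe(). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define IO_THRESHOLD 2

static atomic_int preread_active = 5;

/* Old shape: another thread may change the counter between the
 * decrement and the read used for the threshold test. */
static bool should_wake_racy(void)
{
	atomic_fetch_sub(&preread_active, 1);
	return atomic_load(&preread_active) < IO_THRESHOLD;
}

/* New shape: the decision is based on the result of the decrement we
 * just performed (fetch_sub returns the old value, so subtract 1). */
static bool should_wake(void)
{
	return atomic_fetch_sub(&preread_active, 1) - 1 < IO_THRESHOLD;
}

int main(void)
{
	printf("racy variant      : %d\n", should_wake_racy());
	printf("dec_return variant: %d\n", should_wake());
	return 0;
}
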
@@ -2472,39 +2471,41 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
int abort = 0;
int i;
- md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
clear_bit(STRIPE_SYNCING, &sh->state);
s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair.
+ * Don't even need to abort as that is handled elsewhere
+ * if needed, and not always wanted e.g. if there is a known
+ * bad block here.
* For recover/replace we need to record a bad block on all
* non-sync devices, or abort the recovery
*/
- if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
- return;
- /* During recovery devices cannot be removed, so locking and
- * refcounting of rdevs is not needed
- */
- for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = conf->disks[i].rdev;
- if (rdev
- && !test_bit(Faulty, &rdev->flags)
- && !test_bit(In_sync, &rdev->flags)
- && !rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
- abort = 1;
- rdev = conf->disks[i].replacement;
- if (rdev
- && !test_bit(Faulty, &rdev->flags)
- && !test_bit(In_sync, &rdev->flags)
- && !rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
- abort = 1;
- }
- if (abort) {
- conf->recovery_disabled = conf->mddev->recovery_disabled;
- set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
+ if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
+ /* During recovery devices cannot be removed, so
+ * locking and refcounting of rdevs is not needed
+ */
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = conf->disks[i].rdev;
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_bit(In_sync, &rdev->flags)
+ && !rdev_set_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0))
+ abort = 1;
+ rdev = conf->disks[i].replacement;
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_bit(In_sync, &rdev->flags)
+ && !rdev_set_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0))
+ abort = 1;
+ }
+ if (abort)
+ conf->recovery_disabled =
+ conf->mddev->recovery_disabled;
}
+ md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
}
static int want_replace(struct stripe_head *sh, int disk_idx)
@@ -3204,7 +3205,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
/* Not in-sync */;
else if (is_bad) {
/* also not in-sync */
- if (!test_bit(WriteErrorSeen, &rdev->flags)) {
+ if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+ test_bit(R5_UPTODATE, &dev->flags)) {
/* treat as in-sync, but with a read error
* which we can now try to correct
*/
@@ -3277,12 +3279,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
/* If there is a failed device being replaced,
* we must be recovering.
* else if we are after recovery_cp, we must be syncing
+ * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
* else we can only be replacing
* sync and recovery both need to read all devices, and so
* use the same flag.
*/
if (do_recovery ||
- sh->sector >= conf->mddev->recovery_cp)
+ sh->sector >= conf->mddev->recovery_cp ||
+ test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
s->syncing = 1;
else
s->replacing = 1;
@@ -4843,7 +4847,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
pr_debug("raid456: run(%s) called.\n", mdname(mddev));
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
|| raid_disk < 0)
@@ -5178,7 +5182,7 @@ static int run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
}
@@ -5362,7 +5366,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
- if (has_failed(conf))
+ if (rdev->saved_raid_disk < 0 && has_failed(conf))
/* no point adding a device */
return -EINVAL;
@@ -5501,7 +5505,7 @@ static int raid5_start_reshape(struct mddev *mddev)
if (!check_stripe_cache(mddev))
return -ENOSPC;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (!test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags))
spares++;
@@ -5547,16 +5551,14 @@ static int raid5_start_reshape(struct mddev *mddev)
* such devices during the reshape and confusion could result.
*/
if (mddev->delta_disks >= 0) {
- int added_devices = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) {
if (raid5_add_disk(mddev, rdev) == 0) {
if (rdev->raid_disk
- >= conf->previous_raid_disks) {
+ >= conf->previous_raid_disks)
set_bit(In_sync, &rdev->flags);
- added_devices++;
- } else
+ else
rdev->recovery_offset = 0;
if (sysfs_link_rdev(mddev, rdev))
@@ -5566,7 +5568,6 @@ static int raid5_start_reshape(struct mddev *mddev)
&& !test_bit(Faulty, &rdev->flags)) {
/* This is a spare that was manually added */
set_bit(In_sync, &rdev->flags);
- added_devices++;
}
/* When a reshape changes the number of devices,
@@ -5592,6 +5593,7 @@ static int raid5_start_reshape(struct mddev *mddev)
spin_lock_irq(&conf->device_lock);
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
conf->reshape_progress = MaxSector;
+ mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
return -EAGAIN;
}