From 2c4193df379bb89114ff60d4b0fa66131abe6a75 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Jul 2011 17:38:43 +1000 Subject: md/raid10: get rid of duplicated conditional expression Variable 'first' is initialized to zero and updated to @rdev->raid_disk only if it is greater than 0. Thus condition '>= first' always implies '>= 0' so the latter is not needed. Signed-off-by: Namhyung Kim Signed-off-by: NeilBrown --- drivers/md/raid10.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6e846688962..d55ae12b2d8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1093,8 +1093,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; - if (rdev->saved_raid_disk >= 0 && - rdev->saved_raid_disk >= first && + if (rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else -- cgit v1.2.3-70-g09d2 From 778ca01852e6cc9ff335119b37a1938a978df384 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Jul 2011 17:38:47 +1000 Subject: md/raid10: factor out common bio handling code When normal-write and sync-read/write bio completes, we should find out the disk number the bio belongs to. Factor those common code out to a separate function. Signed-off-by: Namhyung Kim Signed-off-by: NeilBrown --- drivers/md/raid10.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d55ae12b2d8..e434f1e8d22 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -244,6 +244,23 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) r10_bio->devs[slot].addr + (r10_bio->sectors); } +/* + * Find the disk number which triggered given bio + */ +static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, struct bio *bio) +{ + int slot; + + for (slot = 0; slot < conf->copies; slot++) + if (r10_bio->devs[slot].bio == bio) + break; + + BUG_ON(slot == conf->copies); + update_head_pos(slot, r10_bio); + + return r10_bio->devs[slot].devnum; +} + static void raid10_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -289,13 +306,10 @@ static void raid10_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t *r10_bio = bio->bi_private; - int slot, dev; + int dev; conf_t *conf = r10_bio->mddev->private; - for (slot = 0; slot < conf->copies; slot++) - if (r10_bio->devs[slot].bio == bio) - break; - dev = r10_bio->devs[slot].devnum; + dev = find_bio_disk(conf, r10_bio, bio); /* * this branch is our 'one mirror IO has finished' event handler: @@ -316,8 +330,6 @@ static void raid10_end_write_request(struct bio *bio, int error) */ set_bit(R10BIO_Uptodate, &r10_bio->state); - update_head_pos(slot, r10_bio); - /* * * Let's see if all mirrored write operations have finished @@ -1173,14 +1185,9 @@ static void end_sync_read(struct bio *bio, int error) { r10bio_t *r10_bio = bio->bi_private; conf_t *conf = r10_bio->mddev->private; - int i,d; + int d; - for (i=0; icopies; i++) - if (r10_bio->devs[i].bio == bio) - break; - BUG_ON(i == conf->copies); - update_head_pos(i, r10_bio); - d = r10_bio->devs[i].devnum; + d = find_bio_disk(conf, r10_bio, bio); if (test_bit(BIO_UPTODATE, &bio->bi_flags)) set_bit(R10BIO_Uptodate, &r10_bio->state); @@ 
-1211,18 +1218,13 @@ static void end_sync_write(struct bio *bio, int error) r10bio_t *r10_bio = bio->bi_private; mddev_t *mddev = r10_bio->mddev; conf_t *conf = mddev->private; - int i,d; + int d; - for (i = 0; i < conf->copies; i++) - if (r10_bio->devs[i].bio == bio) - break; - d = r10_bio->devs[i].devnum; + d = find_bio_disk(conf, r10_bio, bio); if (!uptodate) md_error(mddev, conf->mirrors[d].rdev); - update_head_pos(i, r10_bio); - rdev_dec_pending(conf->mirrors[d].rdev, mddev); while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { -- cgit v1.2.3-70-g09d2 From c65060ad4274f70048d62e0a86332cd3fd23f28d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Jul 2011 17:38:49 +1000 Subject: md/raid10: share pages between read and write bio's during recovery When performing a recovery, only first 2 slots in r10_bio are in use, for read and write respectively. However all of pages in the write bio are never used and just replaced to read bio's when the read completes. Get rid of those unused pages and share read pages properly. Signed-off-by: Namhyung Kim Signed-off-by: NeilBrown --- drivers/md/raid10.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e434f1e8d22..3715e220e5e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -123,7 +123,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) for (j = 0 ; j < nalloc; j++) { bio = r10_bio->devs[j].bio; for (i = 0; i < RESYNC_PAGES; i++) { - page = alloc_page(gfp_flags); + if (j == 1 && !test_bit(MD_RECOVERY_SYNC, + &conf->mddev->recovery)) { + /* we can share bv_page's during recovery */ + struct bio *rbio = r10_bio->devs[0].bio; + page = rbio->bi_io_vec[i].bv_page; + get_page(page); + } else + page = alloc_page(gfp_flags); if (unlikely(!page)) goto out_free_pages; @@ -1360,20 +1367,14 @@ done: static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) { conf_t *conf = mddev->private; - int i, d; - struct bio *bio, *wbio; - + int d; + struct bio *wbio; - /* move the pages across to the second bio + /* + * share the pages with the first bio * and submit the write request */ - bio = r10_bio->devs[0].bio; wbio = r10_bio->devs[1].bio; - for (i=0; i < wbio->bi_vcnt; i++) { - struct page *p = bio->bi_io_vec[i].bv_page; - bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; - wbio->bi_io_vec[i].bv_page = p; - } d = r10_bio->devs[1].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); -- cgit v1.2.3-70-g09d2 From 8bda470e8ebde35f9349e98ecbce4dfb508a60fa Mon Sep 17 00:00:00 2001 From: Christian Dietrich Date: Wed, 27 Jul 2011 11:00:36 +1000 Subject: md/raid: use printk_ratelimited instead of printk_ratelimit As per printk_ratelimit comment, it should not be used. 
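The conversion is mechanical; a minimal before/after sketch using one of the
messages touched below (arguments condensed for brevity):

    /* old: test-then-print; all callers share one global ratelimit state */
    if (printk_ratelimit())
        printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
               mdname(conf->mddev), b, (unsigned long long)r1_bio->sector);

    /* new: one call; printk_ratelimited() keeps per-callsite state */
    printk_ratelimited(KERN_ERR
                       "md/raid1:%s: %s: rescheduling sector %llu\n",
                       mdname(conf->mddev), b,
                       (unsigned long long)r1_bio->sector);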
Signed-off-by: Christian Dietrich Signed-off-by: NeilBrown --- drivers/md/raid1.c | 25 +++++++++++++++---------- drivers/md/raid10.c | 23 +++++++++++++---------- drivers/md/raid5.c | 47 +++++++++++++++++++++++++---------------------- 3 files changed, 53 insertions(+), 42 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f7431b6d844..d3a8f4bb4fc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "md.h" #include "raid1.h" #include "bitmap.h" @@ -287,10 +288,13 @@ static void raid1_end_read_request(struct bio *bio, int error) * oops, read error: */ char b[BDEVNAME_SIZE]; - if (printk_ratelimit()) - printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", - mdname(conf->mddev), - bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); + printk_ratelimited( + KERN_ERR "md/raid1:%s: %s: " + "rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(conf->mirrors[mirror].rdev->bdev, + b), + (unsigned long long)r1_bio->sector); reschedule_retry(r1_bio); } @@ -1580,12 +1584,13 @@ static void raid1d(mddev_t *mddev) GFP_NOIO, mddev); r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; - if (printk_ratelimit()) - printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" - " other mirror: %s\n", - mdname(mddev), - (unsigned long long)r1_bio->sector, - bdevname(rdev->bdev,b)); + printk_ratelimited( + KERN_ERR + "md/raid1:%s: redirecting sector %llu" + " to other mirror: %s\n", + mdname(mddev), + (unsigned long long)r1_bio->sector, + bdevname(rdev->bdev, b)); bio->bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3715e220e5e..1725ec1e1e8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "md.h" #include "raid10.h" #include "raid0.h" @@ -301,10 +302,11 @@ static void raid10_end_read_request(struct bio *bio, int error) * oops, read error - keep the refcount on the rdev */ char b[BDEVNAME_SIZE]; - if (printk_ratelimit()) - printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", - mdname(conf->mddev), - bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); + printk_ratelimited(KERN_ERR + "md/raid10:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(conf->mirrors[dev].rdev->bdev, b), + (unsigned long long)r10_bio->sector); reschedule_retry(r10_bio); } } @@ -1669,12 +1671,13 @@ static void raid10d(mddev_t *mddev) bio_put(bio); slot = r10_bio->read_slot; rdev = conf->mirrors[mirror].rdev; - if (printk_ratelimit()) - printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" - " another mirror\n", - mdname(mddev), - bdevname(rdev->bdev,b), - (unsigned long long)r10_bio->sector); + printk_ratelimited( + KERN_ERR + "md/raid10:%s: %s: redirecting" + "sector %llu to another mirror\n", + mdname(mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); bio = bio_clone_mddev(r10_bio->master_bio, GFP_NOIO, mddev); r10_bio->devs[slot].bio = bio; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b321d6c3659..467e8e1cd3d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "md.h" #include "raid5.h" #include "raid0.h" @@ -96,8 +97,6 @@ #define __inline__ #endif -#define printk_rl(args...) 
((void) (printk_ratelimit() && printk(args))) - /* * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits @@ -1583,12 +1582,14 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk_rl(KERN_INFO "md/raid:%s: read error corrected" - " (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector - + rdev->data_offset), - bdevname(rdev->bdev, b)); + printk_ratelimited( + KERN_INFO + "md/raid:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -1602,22 +1603,24 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (conf->mddev->degraded >= conf->max_degraded) - printk_rl(KERN_WARNING - "md/raid:%s: read error not correctable " - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk_rl(KERN_WARNING - "md/raid:%s: read error NOT corrected!! " - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error NOT corrected!! " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING -- cgit v1.2.3-70-g09d2 From 2bb77736ae5dca0a189829fbb7379d43364a9dac Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 27 Jul 2011 11:00:36 +1000 Subject: md/raid10: Make use of new recovery_disabled handling When we get a read error during recovery, RAID10 previously arranged for the recovering device to appear to fail so that the recovery stops and doesn't restart. This is misleading and wrong. Instead, make use of the new recovery_disabled handling and mark the target device and having recovery disabled. Add appropriate checks in add_disk and remove_disk so that devices are removed and not re-added when recovery is disabled. 
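The mechanism is a per-mirror copy of mddev->recovery_disabled: when the two
values match, recovery to that slot has already been given up on. A condensed
sketch of the two key pieces (names as in the hunks below):

    /* recovery_request_write(): the rebuild source returned a read error */
    conf->mirrors[d].recovery_disabled = mddev->recovery_disabled;
    set_bit(MD_RECOVERY_INTR, &mddev->recovery);

    /* raid10_add_disk(): never place a spare into a slot we gave up on */
    if (p->recovery_disabled == mddev->recovery_disabled)
        continue;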
Signed-off-by: NeilBrown --- drivers/md/raid10.c | 62 +++++++++++++++++++++++++++++++---------------------- drivers/md/raid10.h | 5 +++++ 2 files changed, 41 insertions(+), 26 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1725ec1e1e8..5583201e5cd 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1099,7 +1099,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) conf_t *conf = mddev->private; int err = -EEXIST; int mirror; - mirror_info_t *p; int first = 0; int last = conf->raid_disks - 1; @@ -1119,32 +1118,36 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) mirror = rdev->saved_raid_disk; else mirror = first; - for ( ; mirror <= last ; mirror++) - if ( !(p=conf->mirrors+mirror)->rdev) { - - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must - * never risk violating it, so limit - * ->max_segments to one lying with a single - * page, as a one page request is never in - * violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } + for ( ; mirror <= last ; mirror++) { + mirror_info_t *p = &conf->mirrors[mirror]; + if (p->recovery_disabled == mddev->recovery_disabled) + continue; + if (!p->rdev) + continue; - p->head_position = 0; - rdev->raid_disk = mirror; - err = 0; - if (rdev->saved_raid_disk != mirror) - conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); - break; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + /* as we don't honour merge_bvec_fn, we must + * never risk violating it, so limit + * ->max_segments to one lying with a single + * page, as a one page request is never in + * violation. + */ + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); } + p->head_position = 0; + rdev->raid_disk = mirror; + err = 0; + if (rdev->saved_raid_disk != mirror) + conf->fullsync = 1; + rcu_assign_pointer(p->rdev, rdev); + break; + } + md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; @@ -1169,6 +1172,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number) * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != p->recovery_disabled && enough(conf)) { err = -EBUSY; goto abort; @@ -1383,8 +1387,14 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); if (test_bit(R10BIO_Uptodate, &r10_bio->state)) generic_make_request(wbio); - else - bio_endio(wbio, -EIO); + else { + printk(KERN_NOTICE + "md/raid10:%s: recovery aborted due to read error\n", + mdname(mddev)); + conf->mirrors[d].recovery_disabled = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + bio_endio(wbio, 0); + } } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 944b1104d3b..a485914c48c 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t; struct mirror_info { mdk_rdev_t *rdev; sector_t head_position; + int recovery_disabled; /* matches + * mddev->recovery_disabled + * when we shouldn't try + * recovering this device. 
+ */ }; typedef struct r10bio_s r10bio_t; -- cgit v1.2.3-70-g09d2 From 700c72138938cf428c74379806886c6b017d6295 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 27 Jul 2011 11:00:36 +1000 Subject: md/raid10: Improve decision on whether to fail a device with a read error. Normally we would fail a device with a READ error. However if doing so causes the array to fail, it is better to leave the device in place and just return the read error to the caller. The current test for decide if the array will fail is overly simplistic. We have a function 'enough' which can tell if the array is failed or not, so use it to guide the decision. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 57 ++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5583201e5cd..6721cb08035 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -970,6 +970,30 @@ static void status(struct seq_file *seq, mddev_t *mddev) seq_printf(seq, "]"); } +/* check if there are enough drives for + * every block to appear on atleast one. + * Don't consider the device numbered 'ignore' + * as we might be about to remove it. + */ +static int enough(conf_t *conf, int ignore) +{ + int first = 0; + + do { + int n = conf->copies; + int cnt = 0; + while (n--) { + if (conf->mirrors[first].rdev && + first != ignore) + cnt++; + first = (first+1) % conf->raid_disks; + } + if (cnt == 0) + return 0; + } while (first != 0); + return 1; +} + static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; @@ -982,13 +1006,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->raid_disks-mddev->degraded == 1) + && !enough(conf, rdev->raid_disk)) /* * Don't fail the drive, just return an IO error. - * The test should really be more sophisticated than - * "working_disks == 1", but it isn't critical, and - * can wait until we do more sophisticated "is the drive - * really dead" tests... 
*/ return; if (test_and_clear_bit(In_sync, &rdev->flags)) { @@ -1043,27 +1063,6 @@ static void close_sync(conf_t *conf) conf->r10buf_pool = NULL; } -/* check if there are enough drives for - * every block to appear on atleast one - */ -static int enough(conf_t *conf) -{ - int first = 0; - - do { - int n = conf->copies; - int cnt = 0; - while (n--) { - if (conf->mirrors[first].rdev) - cnt++; - first = (first+1) % conf->raid_disks; - } - if (cnt == 0) - return 0; - } while (first != 0); - return 1; -} - static int raid10_spare_active(mddev_t *mddev) { int i; @@ -1107,7 +1106,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * very different from resync */ return -EBUSY; - if (!enough(conf)) + if (!enough(conf, -1)) return -EINVAL; if (rdev->raid_disk >= 0) @@ -1173,7 +1172,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number) */ if (!test_bit(Faulty, &rdev->flags) && mddev->recovery_disabled != p->recovery_disabled && - enough(conf)) { + enough(conf, -1)) { err = -EBUSY; goto abort; } @@ -2286,7 +2285,7 @@ static int run(mddev_t *mddev) disk->head_position = 0; } /* need to check that every block has at least one working mirror */ - if (!enough(conf)) { + if (!enough(conf, -1)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", mdname(mddev)); goto out_free_conf; -- cgit v1.2.3-70-g09d2 From cbea21703b2484f83faef040ed1de30114794392 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Jul 2011 11:00:36 +1000 Subject: md/raid10: move rdev->corrected_errors counting Read errors are considered to corrected if write-back and re-read cycle is finished without further problems. Thus moving the rdev-> corrected_errors counting after the re-reading looks more reasonable IMHO. Signed-off-by: Namhyung Kim Signed-off-by: NeilBrown --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6721cb08035..5def27c28be 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1533,7 +1533,6 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) test_bit(In_sync, &rdev->flags)) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, r10_bio->devs[sl].addr + sect, @@ -1598,6 +1597,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) (unsigned long long)( sect + rdev->data_offset), bdevname(rdev->bdev, b)); + atomic_add(s, &rdev->corrected_errors); } rdev_dec_pending(rdev, mddev); -- cgit v1.2.3-70-g09d2 From 34b343cff4354ab9864be83be88405fd53d928a0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:31:47 +1000 Subject: md: don't allow arrays to contain devices with bad blocks. As no personality understand bad block lists yet, we must reject any device that is known to contain bad blocks. As the personalities get taught, these tests can be removed. This only applies to raid1/raid5/raid10. For linear/raid0/multipath/faulty the whole concept of bad blocks doesn't mean anything so there is no point adding the checks. 
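The check itself is the same in each personality, applied both at hot-add and
at assembly time; a condensed sketch of the raid10 case (as in the hunks
below):

    /* raid10_add_disk(): refuse to hot-add a device with recorded bad blocks */
    if (rdev->badblocks.count)
        return -EINVAL;

    /* run(): likewise refuse to assemble the array with such a device */
    if (rdev->badblocks.count) {
        printk(KERN_ERR "md/raid10: cannot handle bad blocks yet\n");
        goto out_free_conf;
    }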
Signed-off-by: NeilBrown Reviewed-by: Namhyung Kim --- drivers/md/raid1.c | 7 +++++++ drivers/md/raid10.c | 8 ++++++++ drivers/md/raid5.c | 7 +++++++ 3 files changed, 22 insertions(+) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3cbf0ac2aaa..8db311d7cdd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1055,6 +1055,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; + if (rdev->badblocks.count) + return -EINVAL; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1994,6 +1997,10 @@ static int run(mddev_t *mddev) blk_queue_segment_boundary(mddev->queue, PAGE_CACHE_SIZE - 1); } + if (rdev->badblocks.count) { + printk(KERN_ERR "md/raid1: Cannot handle bad blocks yet\n"); + return -EINVAL; + } } mddev->degraded = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5def27c28be..8aadd2f52dc 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1101,6 +1101,9 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = conf->raid_disks - 1; + if (rdev->badblocks.count) + return -EINVAL; + if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync @@ -2263,6 +2266,11 @@ static int run(mddev_t *mddev) (conf->raid_disks / conf->near_copies)); list_for_each_entry(rdev, &mddev->disks, same_set) { + + if (rdev->badblocks.count) { + printk(KERN_ERR "md/raid10: cannot handle bad blocks yet\n"); + goto out_free_conf; + } disk_idx = rdev->raid_disk; if (disk_idx >= conf->raid_disks || disk_idx < 0) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b874f42694e..719445004dd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4667,6 +4667,10 @@ static int run(mddev_t *mddev) * 0 for a fully functional array, 1 or 2 for a degraded array. */ list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->badblocks.count) { + printk(KERN_ERR "md/raid5: cannot handle bad blocks yet\n"); + goto abort; + } if (rdev->raid_disk < 0) continue; if (test_bit(In_sync, &rdev->flags)) { @@ -4975,6 +4979,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = conf->raid_disks - 1; + if (rdev->badblocks.count) + return -EINVAL; + if (has_failed(conf)) /* no point adding a device */ return -EINVAL; -- cgit v1.2.3-70-g09d2 From de393cdea66cbd63c90725663f400c76faf1b255 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:31:48 +1000 Subject: md: make it easier to wait for bad blocks to be acknowledged. It is only safe to choose not to write to a bad block if that bad block is safely recorded in metadata - i.e. if it has been 'acknowledged'. If it hasn't we need to wait for the acknowledgement. We support that using rdev->blocked wait and md_wait_for_blocked_rdev by introducing a new device flag 'BlockedBadBlock'. This flag is only advisory. It is cleared whenever we acknowledge a bad block, so that a waiter can re-check the particular bad blocks that it is interested it. It should be set by a caller when they find they need to wait. This (set after test) is inherently racy, but as md_wait_for_blocked_rdev already has a timeout, losing the race will have minimal impact. When we clear "Blocked" was also clear "BlockedBadBlocks" incase it was set incorrectly (see above race). 
We also modify the way we manage 'Blocked' to fit better with the new handling of 'BlockedBadBlocks' and to make it consistent between externally managed and internally managed metadata. This requires that each raidXd loop checks if the metadata needs to be written and triggers a write (md_check_recovery) if needed. Otherwise a queued write request might cause raidXd to wait for the metadata to write, and only that thread can write it. Before writing metadata, we set FaultRecorded for all devices that are Faulty, then after writing the metadata we clear Blocked for any device for which the Fault was certainly Recorded. The 'faulty' device flag now appears in sysfs if the device is faulty *or* it has unacknowledged bad blocks. So user-space which does not understand bad blocks can continue to function correctly. User space which does, should not assume a device is faulty until it sees the 'faulty' flag, and then sees the list of unacknowledged bad blocks is empty. Signed-off-by: NeilBrown --- drivers/md/md.c | 77 ++++++++++++++++++++++++++++++++++++----------------- drivers/md/md.h | 25 +++++++++++++++-- drivers/md/raid1.c | 3 +++ drivers/md/raid10.c | 3 +++ drivers/md/raid5.c | 4 +++ 5 files changed, 85 insertions(+), 27 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1520d18c5af..a6b6471da2b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2341,8 +2341,18 @@ repeat: if (!mddev->persistent) { clear_bit(MD_CHANGE_CLEAN, &mddev->flags); clear_bit(MD_CHANGE_DEVS, &mddev->flags); - if (!mddev->external) + if (!mddev->external) { clear_bit(MD_CHANGE_PENDING, &mddev->flags); + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->badblocks.changed) { + md_ack_all_badblocks(&rdev->badblocks); + md_error(mddev, rdev); + } + clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } + } wake_up(&mddev->sb_wait); return; } @@ -2399,9 +2409,12 @@ repeat: mddev->events --; } - list_for_each_entry(rdev, &mddev->disks, same_set) + list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->badblocks.changed) any_badblocks_changed++; + if (test_bit(Faulty, &rdev->flags)) + set_bit(FaultRecorded, &rdev->flags); + } sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); @@ -2458,9 +2471,15 @@ repeat: if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - if (any_badblocks_changed) - list_for_each_entry(rdev, &mddev->disks, same_set) + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (test_and_clear_bit(FaultRecorded, &rdev->flags)) + clear_bit(Blocked, &rdev->flags); + + if (any_badblocks_changed) md_ack_all_badblocks(&rdev->badblocks); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } } /* words written to sysfs files may, or may not, be \n terminated. 
@@ -2495,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page) char *sep = ""; size_t len = 0; - if (test_bit(Faulty, &rdev->flags)) { + if (test_bit(Faulty, &rdev->flags) || + rdev->badblocks.unacked_exist) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } @@ -2507,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } - if (test_bit(Blocked, &rdev->flags)) { + if (test_bit(Blocked, &rdev->flags) || + rdev->badblocks.unacked_exist) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } @@ -2527,12 +2548,12 @@ static ssize_t state_store(mdk_rdev_t *rdev, const char *buf, size_t len) { /* can write - * faulty - simulates and error + * faulty - simulates an error * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly - * blocked - sets the Blocked flag - * -blocked - clears the Blocked flag + * blocked - sets the Blocked flags + * -blocked - clears the Blocked and possibly simulates an error * insync - sets Insync providing device isn't active * write_error - sets WriteErrorSeen * -write_error - clears WriteErrorSeen @@ -2562,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) set_bit(Blocked, &rdev->flags); err = 0; } else if (cmd_match(buf, "-blocked")) { + if (!test_bit(Faulty, &rdev->flags) && + test_bit(BlockedBadBlocks, &rdev->flags)) { + /* metadata handler doesn't understand badblocks, + * so we need to fail the device + */ + md_error(rdev->mddev, rdev); + } clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); @@ -2881,7 +2910,11 @@ static ssize_t bb_show(mdk_rdev_t *rdev, char *page) } static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) { - return badblocks_store(&rdev->badblocks, page, len, 0); + int rv = badblocks_store(&rdev->badblocks, page, len, 0); + /* Maybe that ack was all we needed */ + if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) + wake_up(&rdev->blocked_wait); + return rv; } static struct rdev_sysfs_entry rdev_bad_blocks = __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); @@ -6398,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!rdev || test_bit(Faulty, &rdev->flags)) return; - if (mddev->external) - set_bit(Blocked, &rdev->flags); -/* - dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", - mdname(mddev), - MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), - __builtin_return_address(0),__builtin_return_address(1), - __builtin_return_address(2),__builtin_return_address(3)); -*/ - if (!mddev->pers) - return; - if (!mddev->pers->error_handler) + if (!mddev->pers || !mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) @@ -7286,8 +7308,7 @@ static int remove_and_add_spares(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(Blocked, &rdev->flags)) + !test_bit(Faulty, &rdev->flags)) spares++; if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { @@ -7533,7 +7554,8 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags), + !test_bit(Blocked, &rdev->flags) && + !test_bit(BlockedBadBlocks, &rdev->flags), 
msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } @@ -7779,6 +7801,8 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, } bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; write_sequnlock_irq(&bb->lock); return rv; @@ -7923,6 +7947,7 @@ void md_ack_all_badblocks(struct badblocks *bb) p[i] = BB_MAKE(start, len, 1); } } + bb->unacked_exist = 0; } write_sequnlock_irq(&bb->lock); } @@ -7970,6 +7995,8 @@ retry: (unsigned long long)s << bb->shift, length << bb->shift); } + if (unack && len == 0) + bb->unacked_exist = 0; if (read_seqretry(&bb->lock, seq)) goto retry; diff --git a/drivers/md/md.h b/drivers/md/md.h index fa4b607854a..1e586bb4452 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -81,12 +81,29 @@ struct mdk_rdev_s #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ #define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occurred on an externally - * managed array, don't allow writes +#define Blocked 8 /* An error occurred but has not yet + * been acknowledged by the metadata + * handler, so don't allow writes * until it is cleared */ #define WriteErrorSeen 9 /* A write error has been seen on this * device */ +#define FaultRecorded 10 /* Intermediate state for clearing + * Blocked. The Fault is/will-be + * recorded in the metadata, but that + * metadata hasn't been stored safely + * on disk yet. + */ +#define BlockedBadBlocks 11 /* A writer is blocked because they + * found an unacknowledged bad-block. + * This can safely be cleared at any + * time, and the writer will re-check. + * It may be set at any time, and at + * worst the writer will timeout and + * re-check. So setting it as + * accurately as possible is good, but + * not absolutely critical. + */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ @@ -124,6 +141,10 @@ struct mdk_rdev_s struct badblocks { int count; /* count of bad blocks */ + int unacked_exist; /* there probably are unacknowledged + * bad blocks. This is only cleared + * when a read discovers none + */ int shift; /* shift from sectors to block size * a -ve shift means badblocks are * disabled.*/ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8c31c39b6f8..4d40d9d54a2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1059,6 +1059,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) conf->recovery_disabled = mddev->recovery_disabled; return; } + set_bit(Blocked, &rdev->flags); if (test_and_clear_bit(In_sync, &rdev->flags)) { unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); @@ -1751,6 +1752,8 @@ read_more: generic_make_request(r1_bio->bios[r1_bio->read_disk]); } cond_resched(); + if (mddev->flags & ~(1<recovery); } + set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT @@ -1703,6 +1704,8 @@ static void raid10d(mddev_t *mddev) } } cond_resched(); + if (mddev->flags & ~(1<recovery); } + set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT @@ -4143,6 +4144,9 @@ static void raid5d(mddev_t *mddev) release_stripe(sh); cond_resched(); + if (mddev->flags & ~(1<device_lock); } pr_debug("%d stripes handled\n", handled); -- cgit v1.2.3-70-g09d2 From 1294b9c973251a5e68b62c9b40dd914517bda675 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:23 +1000 Subject: md/raid10: simplify/reindent some loops. 
When a loop ends with a large if, it can be neater to change the if to invert the condition and just 'continue'. Then the body of the if can be indented to a lower level. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 127 +++++++++++++++++++++++++++------------------------- 1 file changed, 65 insertions(+), 62 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index fe6692e6221..c489b5c6ed6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1533,80 +1533,83 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) sl--; d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - if (sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s<<9, conf->tmppage, WRITE, false) - == 0) { - /* Well, this device is dead */ - printk(KERN_NOTICE - "md/raid10:%s: read correction " - "write failed" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "md/raid10:%s: %s: failing " - "drive\n", - mdname(mddev), - bdevname(rdev->bdev, b)); - md_error(mddev, rdev); - } - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); + if (!rdev || + !test_bit(In_sync, &rdev->flags)) + continue; + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s<<9, conf->tmppage, WRITE, false) + == 0) { + /* Well, this device is dead */ + printk(KERN_NOTICE + "md/raid10:%s: read correction " + "write failed" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "md/raid10:%s: %s: failing " + "drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); + md_error(mddev, rdev); } + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); } sl = start; while (sl != r10_bio->read_slot) { + char b[BDEVNAME_SIZE]; if (sl==0) sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - if (sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s<<9, conf->tmppage, - READ, false) == 0) { - /* Well, this device is dead */ - printk(KERN_NOTICE - "md/raid10:%s: unable to read back " - "corrected sectors" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", - mdname(mddev), - bdevname(rdev->bdev, b)); - - md_error(mddev, rdev); - } else { - printk(KERN_INFO - "md/raid10:%s: read error corrected" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - atomic_add(s, &rdev->corrected_errors); - } + if (!rdev || + !test_bit(In_sync, &rdev->flags)) + continue; - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s<<9, conf->tmppage, + READ, false) == 0) { + /* Well, this device is dead */ + printk(KERN_NOTICE + "md/raid10:%s: unable to read back " + "corrected sectors" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + 
bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "md/raid10:%s: %s: failing " + "drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); + + md_error(mddev, rdev); + } else { + printk(KERN_INFO + "md/raid10:%s: read error corrected" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + atomic_add(s, &rdev->corrected_errors); } + + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); } rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From 560f8e5532d63a314271bfb99d3d1d53c938ed14 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:23 +1000 Subject: md/raid10: Split handle_read_error out from raid10d. raid10d() is too big and is about to get bigger, so split handle_read_error() out as a separate function. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 123 ++++++++++++++++++++++++++++------------------------ 1 file changed, 66 insertions(+), 57 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c489b5c6ed6..f1b749c2171 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1618,21 +1618,81 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) } } +static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) +{ + int slot = r10_bio->read_slot; + int mirror = r10_bio->devs[slot].devnum; + struct bio *bio; + conf_t *conf = mddev->private; + mdk_rdev_t *rdev; + char b[BDEVNAME_SIZE]; + unsigned long do_sync; + + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen. + */ + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, mddev, r10_bio); + unfreeze_array(conf); + } + rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); + + bio = r10_bio->devs[slot].bio; + r10_bio->devs[slot].bio = + mddev->ro ? 
IO_BLOCKED : NULL; + mirror = read_balance(conf, r10_bio); + if (mirror == -1) { + printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" + " read error for block %llu\n", + mdname(mddev), + bdevname(bio->bi_bdev, b), + (unsigned long long)r10_bio->sector); + raid_end_bio_io(r10_bio); + bio_put(bio); + return; + } + + do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); + bio_put(bio); + slot = r10_bio->read_slot; + rdev = conf->mirrors[mirror].rdev; + printk_ratelimited( + KERN_ERR + "md/raid10:%s: %s: redirecting" + "sector %llu to another mirror\n", + mdname(mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); + bio = bio_clone_mddev(r10_bio->master_bio, + GFP_NOIO, mddev); + r10_bio->devs[slot].bio = bio; + bio->bi_sector = r10_bio->devs[slot].addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_rw = READ | do_sync; + bio->bi_private = r10_bio; + bio->bi_end_io = raid10_end_read_request; + generic_make_request(bio); +} + static void raid10d(mddev_t *mddev) { r10bio_t *r10_bio; - struct bio *bio; unsigned long flags; conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; - mdk_rdev_t *rdev; struct blk_plug plug; md_check_recovery(mddev); blk_start_plug(&plug); for (;;) { - char b[BDEVNAME_SIZE]; flush_pending_writes(conf); @@ -1652,60 +1712,9 @@ static void raid10d(mddev_t *mddev) sync_request_write(mddev, r10_bio); else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) recovery_request_write(mddev, r10_bio); - else { - int slot = r10_bio->read_slot; - int mirror = r10_bio->devs[slot].devnum; - /* we got a read error. Maybe the drive is bad. Maybe just - * the block and we can fix it. - * We freeze all other IO, and try reading the block from - * other devices. When we find one, we re-write - * and check it that fixes the read error. - * This is all done synchronously while the array is - * frozen. - */ - if (mddev->ro == 0) { - freeze_array(conf); - fix_read_error(conf, mddev, r10_bio); - unfreeze_array(conf); - } - rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); - - bio = r10_bio->devs[slot].bio; - r10_bio->devs[slot].bio = - mddev->ro ? IO_BLOCKED : NULL; - mirror = read_balance(conf, r10_bio); - if (mirror == -1) { - printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" - " read error for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev,b), - (unsigned long long)r10_bio->sector); - raid_end_bio_io(r10_bio); - bio_put(bio); - } else { - const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); - bio_put(bio); - slot = r10_bio->read_slot; - rdev = conf->mirrors[mirror].rdev; - printk_ratelimited( - KERN_ERR - "md/raid10:%s: %s: redirecting" - "sector %llu to another mirror\n", - mdname(mddev), - bdevname(rdev->bdev, b), - (unsigned long long)r10_bio->sector); - bio = bio_clone_mddev(r10_bio->master_bio, - GFP_NOIO, mddev); - r10_bio->devs[slot].bio = bio; - bio->bi_sector = r10_bio->devs[slot].addr - + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ | do_sync; - bio->bi_private = r10_bio; - bio->bi_end_io = raid10_end_read_request; - generic_make_request(bio); - } - } + else + handle_read_error(mddev, r10_bio); + cond_resched(); if (mddev->flags & ~(1< Date: Thu, 28 Jul 2011 11:39:23 +1000 Subject: md/raid10: avoid reading from known bad blocks - part 1 This patch just covers the basic read path: 1/ read_balance needs to check for badblocks, and return not only the chosen slot, but also how many good blocks are available there. 
2/ read submission must be ready to issue multiple reads to different devices as different bad blocks on different devices could mean that a single large read cannot be served by any one device, but can still be served by the array. This requires keeping count of the number of outstanding requests per bio. This count is stored in 'bi_phys_segments' On read error we currently just fail the request if another target cannot handle the whole request. Next patch refines that a bit. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++------ drivers/md/raid10.h | 4 ++ 2 files changed, 129 insertions(+), 16 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f1b749c2171..872bf948f33 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -191,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio) { conf_t *conf = r10_bio->mddev->private; - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf); - put_all_bios(conf, r10_bio); mempool_free(r10_bio, conf->r10bio_pool); } @@ -235,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio) static void raid_end_bio_io(r10bio_t *r10_bio) { struct bio *bio = r10_bio->master_bio; + int done; + conf_t *conf = r10_bio->mddev->private; - bio_endio(bio, - test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); + if (bio->bi_phys_segments) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + bio->bi_phys_segments--; + done = (bio->bi_phys_segments == 0); + spin_unlock_irqrestore(&conf->device_lock, flags); + } else + done = 1; + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (done) { + bio_endio(bio, 0); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + allow_barrier(conf); + } free_r10bio(r10_bio); } @@ -307,6 +319,7 @@ static void raid10_end_read_request(struct bio *bio, int error) mdname(conf->mddev), bdevname(conf->mirrors[dev].rdev->bdev, b), (unsigned long long)r10_bio->sector); + set_bit(R10BIO_ReadError, &r10_bio->state); reschedule_retry(r10_bio); } } @@ -505,11 +518,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, * FIXME: possibly should rethink readbalancing and do it differently * depending on near_copies / far_copies geometry. */ -static int read_balance(conf_t *conf, r10bio_t *r10_bio) +static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) { const sector_t this_sector = r10_bio->sector; int disk, slot; - const int sectors = r10_bio->sectors; + int sectors = r10_bio->sectors; + int best_good_sectors; sector_t new_distance, best_dist; mdk_rdev_t *rdev; int do_balance; @@ -518,8 +532,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) raid10_find_phys(conf, r10_bio); rcu_read_lock(); retry: + sectors = r10_bio->sectors; best_slot = -1; best_dist = MaxSector; + best_good_sectors = 0; do_balance = 1; /* * Check if we can balance. 
We can balance on the whole @@ -532,6 +548,10 @@ retry: do_balance = 0; for (slot = 0; slot < conf->copies ; slot++) { + sector_t first_bad; + int bad_sectors; + sector_t dev_sector; + if (r10_bio->devs[slot].bio == IO_BLOCKED) continue; disk = r10_bio->devs[slot].devnum; @@ -541,6 +561,37 @@ retry: if (!test_bit(In_sync, &rdev->flags)) continue; + dev_sector = r10_bio->devs[slot].addr; + if (is_badblock(rdev, dev_sector, sectors, + &first_bad, &bad_sectors)) { + if (best_dist < MaxSector) + /* Already have a better slot */ + continue; + if (first_bad <= dev_sector) { + /* Cannot read here. If this is the + * 'primary' device, then we must not read + * beyond 'bad_sectors' from another device. + */ + bad_sectors -= (dev_sector - first_bad); + if (!do_balance && sectors > bad_sectors) + sectors = bad_sectors; + if (best_good_sectors > sectors) + best_good_sectors = sectors; + } else { + sector_t good_sectors = + first_bad - dev_sector; + if (good_sectors > best_good_sectors) { + best_good_sectors = good_sectors; + best_slot = slot; + } + if (!do_balance) + /* Must read from here */ + break; + } + continue; + } else + best_good_sectors = sectors; + if (!do_balance) break; @@ -582,6 +633,7 @@ retry: } else disk = -1; rcu_read_unlock(); + *max_sectors = best_good_sectors; return disk; } @@ -829,12 +881,27 @@ static int make_request(mddev_t *mddev, struct bio * bio) r10_bio->sector = bio->bi_sector; r10_bio->state = 0; + /* We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r10_bio and no locking + * will be needed when the request completes. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + if (rw == READ) { /* * read balancing logic: */ - int disk = read_balance(conf, r10_bio); - int slot = r10_bio->read_slot; + int max_sectors; + int disk; + int slot; + +read_again: + disk = read_balance(conf, r10_bio, &max_sectors); + slot = r10_bio->read_slot; if (disk < 0) { raid_end_bio_io(r10_bio); return 0; @@ -842,6 +909,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) mirror = conf->mirrors + disk; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, + max_sectors); r10_bio->devs[slot].bio = read_bio; @@ -852,7 +921,39 @@ static int make_request(mddev_t *mddev, struct bio * bio) read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio; - generic_make_request(read_bio); + if (max_sectors < r10_bio->sectors) { + /* Could not read all from this device, so we will + * need another r10_bio. + */ + int sectors_handled; + + sectors_handled = (r10_bio->sectors + max_sectors + - bio->bi_sector); + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock(&conf->device_lock); + /* Cannot call generic_make_request directly + * as that will be queued in __generic_make_request + * and subsequent mempool_alloc might block + * waiting for it. so hand bio over to raid10d. 
+ */ + reschedule_retry(r10_bio); + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = ((bio->bi_size >> 9) + - sectors_handled); + r10_bio->state = 0; + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_sector + sectors_handled; + goto read_again; + } else + generic_make_request(read_bio); return 0; } @@ -1627,6 +1728,7 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) mdk_rdev_t *rdev; char b[BDEVNAME_SIZE]; unsigned long do_sync; + int max_sectors; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. @@ -1646,8 +1748,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) bio = r10_bio->devs[slot].bio; r10_bio->devs[slot].bio = mddev->ro ? IO_BLOCKED : NULL; - mirror = read_balance(conf, r10_bio); - if (mirror == -1) { + mirror = read_balance(conf, r10_bio, &max_sectors); + if (mirror == -1 || max_sectors < r10_bio->sectors) { printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" " read error for block %llu\n", mdname(mddev), @@ -1712,8 +1814,15 @@ static void raid10d(mddev_t *mddev) sync_request_write(mddev, r10_bio); else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) recovery_request_write(mddev, r10_bio); - else + else if (test_bit(R10BIO_ReadError, &r10_bio->state)) handle_read_error(mddev, r10_bio); + else { + /* just a partial read to be scheduled from a + * separate context + */ + int slot = r10_bio->read_slot; + generic_make_request(r10_bio->devs[slot].bio); + } cond_resched(); if (mddev->flags & ~(1< Date: Thu, 28 Jul 2011 11:39:23 +1000 Subject: md/raid10: avoid reading from known bad blocks - part 2 When redirecting a read error to a different device, we must again avoid bad blocks and possibly split the request. Spin_lock typo fixed thanks to Dan Carpenter Signed-off-by: NeilBrown --- drivers/md/raid10.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 872bf948f33..37801d68a4c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1746,14 +1746,15 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); bio = r10_bio->devs[slot].bio; + bdevname(bio->bi_bdev, b); r10_bio->devs[slot].bio = mddev->ro ? 
IO_BLOCKED : NULL; +read_more: mirror = read_balance(conf, r10_bio, &max_sectors); - if (mirror == -1 || max_sectors < r10_bio->sectors) { + if (mirror == -1) { printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" " read error for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev, b), + mdname(mddev), b, (unsigned long long)r10_bio->sector); raid_end_bio_io(r10_bio); bio_put(bio); @@ -1761,7 +1762,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) } do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); - bio_put(bio); + if (bio) + bio_put(bio); slot = r10_bio->read_slot; rdev = conf->mirrors[mirror].rdev; printk_ratelimited( @@ -1773,6 +1775,9 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) (unsigned long long)r10_bio->sector); bio = bio_clone_mddev(r10_bio->master_bio, GFP_NOIO, mddev); + md_trim_bio(bio, + r10_bio->sector - bio->bi_sector, + max_sectors); r10_bio->devs[slot].bio = bio; bio->bi_sector = r10_bio->devs[slot].addr + rdev->data_offset; @@ -1780,7 +1785,37 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) bio->bi_rw = READ | do_sync; bio->bi_private = r10_bio; bio->bi_end_io = raid10_end_read_request; - generic_make_request(bio); + if (max_sectors < r10_bio->sectors) { + /* Drat - have to split this up more */ + struct bio *mbio = r10_bio->master_bio; + int sectors_handled = + r10_bio->sector + max_sectors + - mbio->bi_sector; + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (mbio->bi_phys_segments == 0) + mbio->bi_phys_segments = 2; + else + mbio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + generic_make_request(bio); + bio = NULL; + + r10_bio = mempool_alloc(conf->r10bio_pool, + GFP_NOIO); + r10_bio->master_bio = mbio; + r10_bio->sectors = (mbio->bi_size >> 9) + - sectors_handled; + r10_bio->state = 0; + set_bit(R10BIO_ReadError, + &r10_bio->state); + r10_bio->mddev = mddev; + r10_bio->sector = mbio->bi_sector + + sectors_handled; + + goto read_more; + } else + generic_make_request(bio); } static void raid10d(mddev_t *mddev) -- cgit v1.2.3-70-g09d2 From 8dbed5cebdf6796bf2618457b3653cf820934366 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:24 +1000 Subject: md/raid10 - avoid reading from known bad blocks - part 3 When attempting to repair a read error, don't read from devices with a known bad block. As we are only reading PAGE_SIZE blocks, we don't try to narrow down to smaller regions in the hope that only part of this page is bad - it isn't worth the effort. 
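The test folds into the existing source-selection condition in
fix_read_error(); a condensed sketch (names as in the hunk below,
nr_pending/RCU handling elided):

    d = r10_bio->devs[sl].devnum;
    rdev = rcu_dereference(conf->mirrors[d].rdev);
    if (rdev &&
        test_bit(In_sync, &rdev->flags) &&
        is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
                    &first_bad, &bad_sectors) == 0)
        /* the whole s-sector window is known good on this device */
        success = sync_page_io(rdev, r10_bio->devs[sl].addr + sect,
                               s << 9, conf->tmppage, READ, false);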
Signed-off-by: NeilBrown --- drivers/md/raid10.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 37801d68a4c..a5ecea2672b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1594,10 +1594,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_lock(); do { + sector_t first_bad; + int bad_sectors; + d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && - test_bit(In_sync, &rdev->flags)) { + test_bit(In_sync, &rdev->flags) && + is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, + &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); success = sync_page_io(rdev, -- cgit v1.2.3-70-g09d2 From 40c356ce5ad1a6be817825e1da1bc7494349cc6d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:24 +1000 Subject: md/raid10: avoid reading known bad blocks during resync/recovery. During resync/recovery limit the size of the request to avoid reading into a bad block that does not start at-or-before the current read address. Similarly if there is a bad block at this address, don't allow the current request to extend beyond the end of that bad block. Now that we don't ever read from known bad blocks, it is safe to allow devices with those blocks into the array. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a5ecea2672b..5f0355832b4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1203,9 +1203,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = conf->raid_disks - 1; - if (rdev->badblocks.count) - return -EINVAL; - if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync @@ -1927,7 +1924,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int i; int max_sync; sector_t sync_blocks; - sector_t sectors_skipped = 0; int chunks_skipped = 0; @@ -2070,10 +2066,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, for (j=0; jcopies;j++) { int d = r10_bio->devs[j].devnum; + mdk_rdev_t *rdev; + sector_t sector, first_bad; + int bad_sectors; if (!conf->mirrors[d].rdev || !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) continue; /* This is where we read from */ + rdev = conf->mirrors[d].rdev; + sector = r10_bio->devs[j].addr; + + if (is_badblock(rdev, sector, max_sync, + &first_bad, &bad_sectors)) { + if (first_bad > sector) + max_sync = first_bad - sector; + else { + bad_sectors -= (sector + - first_bad); + if (max_sync > bad_sectors) + max_sync = bad_sectors; + continue; + } + } bio = r10_bio->devs[0].bio; bio->bi_next = biolist; biolist = bio; @@ -2160,12 +2174,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, for (i=0; icopies; i++) { int d = r10_bio->devs[i].devnum; + sector_t first_bad, sector; + int bad_sectors; + bio = r10_bio->devs[i].bio; bio->bi_end_io = NULL; clear_bit(BIO_UPTODATE, &bio->bi_flags); if (conf->mirrors[d].rdev == NULL || test_bit(Faulty, &conf->mirrors[d].rdev->flags)) continue; + sector = r10_bio->devs[i].addr; + if (is_badblock(conf->mirrors[d].rdev, + sector, max_sync, + &first_bad, &bad_sectors)) { + if (first_bad > sector) + max_sync = first_bad - sector; + else { + bad_sectors -= (sector - 
first_bad); + if (max_sync > bad_sectors) + max_sync = bad_sectors; + continue; + } + } atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); bio->bi_next = biolist; @@ -2173,7 +2203,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio->bi_rw = READ; - bio->bi_sector = r10_bio->devs[i].addr + + bio->bi_sector = sector + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; count++; @@ -2431,10 +2461,6 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { - if (rdev->badblocks.count) { - printk(KERN_ERR "md/raid10: cannot handle bad blocks yet\n"); - goto out_free_conf; - } disk_idx = rdev->raid_disk; if (disk_idx >= conf->raid_disks || disk_idx < 0) -- cgit v1.2.3-70-g09d2 From e875ecea266a543e643b19e44cf472f1412708f9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:24 +1000 Subject: md/raid10 record bad blocks as needed during recovery. When recovering one or more devices, if all the good devices have bad blocks we should record a bad block on the device being rebuilt. If this fails, we need to abort the recovery. To ensure we don't think that we aborted later than we actually did, we need to move the check for MD_RECOVERY_INTR earlier in md_do_sync, in particular before mddev->curr_resync is updated. Signed-off-by: NeilBrown --- drivers/md/md.c | 9 ++++----- drivers/md/raid10.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 13 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index a6b6471da2b..d97a6253479 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7165,11 +7165,14 @@ void md_do_sync(mddev_t *mddev) atomic_add(sectors, &mddev->recovery_active); } + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + j += sectors; if (j>1) mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) - /* this is the earliers that rebuilt will be + /* this is the earliest that rebuild will be * visible in /proc/mdstat */ md_new_event(mddev); @@ -7178,10 +7181,6 @@ void md_do_sync(mddev_t *mddev) continue; last_check = io_sectors; - - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - break; - repeat: if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { /* step marks */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5f0355832b4..de608992627 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2005,7 +2005,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* recovery...
the complicated one */ - int j, k; + int j; r10_bio = NULL; for (i=0 ; iraid_disks; i++) { @@ -2013,6 +2013,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, r10bio_t *rb2; sector_t sect; int must_sync; + int any_working; if (conf->mirrors[i].rdev == NULL || test_bit(In_sync, &conf->mirrors[i].rdev->flags)) @@ -2064,7 +2065,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, must_sync = bitmap_start_sync(mddev->bitmap, sect, &sync_blocks, still_degraded); + any_working = 0; for (j=0; jcopies;j++) { + int k; int d = r10_bio->devs[j].devnum; mdk_rdev_t *rdev; sector_t sector, first_bad; @@ -2073,6 +2076,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) continue; /* This is where we read from */ + any_working = 1; rdev = conf->mirrors[d].rdev; sector = r10_bio->devs[j].addr; @@ -2121,16 +2125,35 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, break; } if (j == conf->copies) { - /* Cannot recover, so abort the recovery */ + /* Cannot recover, so abort the recovery or + * record a bad block */ put_buf(r10_bio); if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; - if (!test_and_set_bit(MD_RECOVERY_INTR, - &mddev->recovery)) - printk(KERN_INFO "md/raid10:%s: insufficient " - "working devices for recovery.\n", - mdname(mddev)); + if (any_working) { + /* problem is that there are bad blocks + * on other device(s) + */ + int k; + for (k = 0; k < conf->copies; k++) + if (r10_bio->devs[k].devnum == i) + break; + if (!rdev_set_badblocks( + conf->mirrors[i].rdev, + r10_bio->devs[k].addr, + max_sync, 0)) + any_working = 0; + } + if (!any_working) { + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) + printk(KERN_INFO "md/raid10:%s: insufficient " + "working devices for recovery.\n", + mdname(mddev)); + conf->mirrors[i].recovery_disabled + = mddev->recovery_disabled; + } break; } } @@ -2290,7 +2313,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, return sectors_skipped + nr_sectors; giveup: /* There is nowhere to write, so all non-sync - * drives must be failed, so try the next chunk... + * drives must be failed or in resync, all drives + * have a bad block, so try the next chunk... */ if (sector_nr + max_sync < max_sector) max_sector = sector_nr + max_sync; -- cgit v1.2.3-70-g09d2 From d4432c23be957ff061f7b23fd60e8506cb472a55 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:24 +1000 Subject: md/raid10: avoid writing to known bad blocks on known bad drives. Writing to known bad blocks on drives that have seen a write error is asking for trouble. So try to avoid these blocks. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 12 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index de608992627..13077a3fd7d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -807,6 +807,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) unsigned long flags; mdk_rdev_t *blocked_rdev; int plugged; + int sectors_handled; + int max_sectors; if (unlikely(bio->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bio); @@ -895,7 +897,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) /* * read balancing logic: */ - int max_sectors; int disk; int slot; @@ -925,8 +926,6 @@ read_again: /* Could not read all from this device, so we will * need another r10_bio. 
*/ - int sectors_handled; - sectors_handled = (r10_bio->sectors + max_sectors - bio->bi_sector); r10_bio->sectors = max_sectors; @@ -963,13 +962,22 @@ read_again: /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio + * If there are known/acknowledged bad blocks on any device + * on which we have seen a write error, we want to avoid + * writing to those blocks. This potentially requires several + * writes to write around the bad blocks. Each set of writes + * gets its own r10_bio with a set of bios attached. The number + * of r10_bios is recorded in bio->bi_phys_segments just as with + * the read case. */ plugged = mddev_check_plugged(mddev); raid10_find_phys(conf, r10_bio); - retry_write: +retry_write: blocked_rdev = NULL; rcu_read_lock(); + max_sectors = r10_bio->sectors; + for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); @@ -978,13 +986,55 @@ read_again: blocked_rdev = rdev; break; } - if (rdev && !test_bit(Faulty, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - r10_bio->devs[i].bio = bio; - } else { - r10_bio->devs[i].bio = NULL; + r10_bio->devs[i].bio = NULL; + if (!rdev || test_bit(Faulty, &rdev->flags)) { set_bit(R10BIO_Degraded, &r10_bio->state); + continue; + } + if (test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; + int is_bad; + + is_bad = is_badblock(rdev, dev_sector, + max_sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + if (is_bad && first_bad <= dev_sector) { + /* Cannot write here at all */ + bad_sectors -= (dev_sector - first_bad); + if (bad_sectors < max_sectors) + /* Mustn't write more than bad_sectors + * to other devices yet + */ + max_sectors = bad_sectors; + /* We don't set R10BIO_Degraded as that + * only applies if the disk is missing, + * so it might be re-added, and we want to + * know to recover this chunk. + * In this case the device is here, and the + * fact that this chunk is not in-sync is + * recorded in the bad block log. + */ + continue; + } + if (is_bad) { + int good_sectors = first_bad - dev_sector; + if (good_sectors < max_sectors) + max_sectors = good_sectors; + } } + r10_bio->devs[i].bio = bio; + atomic_inc(&rdev->nr_pending); } rcu_read_unlock(); @@ -1004,8 +1054,22 @@ read_again: goto retry_write; } + if (max_sectors < r10_bio->sectors) { + /* We are splitting this into multiple parts, so + * we need to prepare for allocating another r10_bio.
+ */ + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + } + sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; + atomic_set(&r10_bio->remaining, 1); - bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); + bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); for (i = 0; i < conf->copies; i++) { struct bio *mbio; @@ -1014,10 +1078,12 @@ read_again: continue; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); r10_bio->devs[i].bio = mbio; - mbio->bi_sector = r10_bio->devs[i].addr+ - conf->mirrors[d].rdev->data_offset; + mbio->bi_sector = (r10_bio->devs[i].addr+ + conf->mirrors[d].rdev->data_offset); mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; mbio->bi_rw = WRITE | do_sync | do_fua; @@ -1042,6 +1108,21 @@ read_again: /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); + if (sectors_handled < (bio->bi_size >> 9)) { + /* We need another r1_bio. It has already been counted + * in bio->bi_phys_segments. + */ + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_sector + sectors_handled; + r10_bio->state = 0; + goto retry_write; + } + if (do_sync || !mddev->bitmap || !plugged) md_wakeup_thread(mddev->thread); return 0; -- cgit v1.2.3-70-g09d2 From 749c55e942d91cb27045fe2eb313aa5afe68ae0b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:24 +1000 Subject: md/raid10: clear bad-block record when write succeeds. If we succeed in writing to a block that was recorded as being bad, we clear the bad-block record. This requires some delayed handling as the bad-block-list update has to happen in process-context. 
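Condensed from the patch below: in the write-completion handler (interrupt context) the slot is only marked with a sentinel, roughly:

/* Sketch: note that this write covered a region currently marked bad,
 * but do not touch the badblocks list here (wrong context for that).
 * IO_MADE_GOOD and R10BIO_MadeGood are introduced in raid10.h below.
 */
if (is_badblock(conf->mirrors[dev].rdev,
		r10_bio->devs[slot].addr, r10_bio->sectors,
		&first_bad, &bad_sectors)) {
	bio_put(bio);
	r10_bio->devs[slot].bio = IO_MADE_GOOD;	/* sentinel, not a real bio */
	set_bit(R10BIO_MadeGood, &r10_bio->state);
}

Once the last copy of the write completes, the r10_bio is handed to raid10d via reschedule_retry(), and raid10d calls rdev_clear_badblocks() from process context before ending the request.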
Signed-off-by: NeilBrown --- drivers/md/raid10.c | 101 +++++++++++++++++++++++++++++++++++++++++++++------- drivers/md/raid10.h | 11 ++++++ 2 files changed, 100 insertions(+), 12 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 13077a3fd7d..39b2058845f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -181,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) for (i = 0; i < conf->copies; i++) { struct bio **bio = & r10_bio->devs[i].bio; - if (*bio && *bio != IO_BLOCKED) + if (!BIO_SPECIAL(*bio)) bio_put(*bio); *bio = NULL; } @@ -267,7 +267,8 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) /* * Find the disk number which triggered given bio */ -static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, struct bio *bio) +static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, + struct bio *bio, int *slotp) { int slot; @@ -278,6 +279,8 @@ static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, struct bio *bio) BUG_ON(slot == conf->copies); update_head_pos(slot, r10_bio); + if (slotp) + *slotp = slot; return r10_bio->devs[slot].devnum; } @@ -329,9 +332,11 @@ static void raid10_end_write_request(struct bio *bio, int error) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t *r10_bio = bio->bi_private; int dev; + int dec_rdev = 1; conf_t *conf = r10_bio->mddev->private; + int slot; - dev = find_bio_disk(conf, r10_bio, bio); + dev = find_bio_disk(conf, r10_bio, bio, &slot); /* * this branch is our 'one mirror IO has finished' event handler: @@ -340,7 +345,7 @@ static void raid10_end_write_request(struct bio *bio, int error) md_error(r10_bio->mddev, conf->mirrors[dev].rdev); /* an I/O failed, we can't clear the bitmap */ set_bit(R10BIO_Degraded, &r10_bio->state); - } else + } else { /* * Set R10BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -350,8 +355,23 @@ static void raid10_end_write_request(struct bio *bio, int error) * user-side. So if something waits for IO, then it will * wait for the 'master' bio. */ + sector_t first_bad; + int bad_sectors; + set_bit(R10BIO_Uptodate, &r10_bio->state); + /* Maybe we can clear some bad blocks. 
*/ + if (is_badblock(conf->mirrors[dev].rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors)) { + bio_put(bio); + r10_bio->devs[slot].bio = IO_MADE_GOOD; + dec_rdev = 0; + set_bit(R10BIO_MadeGood, &r10_bio->state); + } + } + /* * * Let's see if all mirrored write operations have finished @@ -364,10 +384,13 @@ static void raid10_end_write_request(struct bio *bio, int error) !test_bit(R10BIO_Degraded, &r10_bio->state), 0); md_write_end(r10_bio->mddev); - raid_end_bio_io(r10_bio); + if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + reschedule_retry(r10_bio); + else + raid_end_bio_io(r10_bio); } - - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + if (dec_rdev) + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); } @@ -1382,7 +1405,7 @@ static void end_sync_read(struct bio *bio, int error) conf_t *conf = r10_bio->mddev->private; int d; - d = find_bio_disk(conf, r10_bio, bio); + d = find_bio_disk(conf, r10_bio, bio, NULL); if (test_bit(BIO_UPTODATE, &bio->bi_flags)) set_bit(R10BIO_Uptodate, &r10_bio->state); @@ -1414,23 +1437,37 @@ static void end_sync_write(struct bio *bio, int error) mddev_t *mddev = r10_bio->mddev; conf_t *conf = mddev->private; int d; + sector_t first_bad; + int bad_sectors; + int slot; - d = find_bio_disk(conf, r10_bio, bio); + d = find_bio_disk(conf, r10_bio, bio, &slot); if (!uptodate) md_error(mddev, conf->mirrors[d].rdev); + else if (is_badblock(conf->mirrors[d].rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors)) + set_bit(R10BIO_MadeGood, &r10_bio->state); rdev_dec_pending(conf->mirrors[d].rdev, mddev); while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */ sector_t s = r10_bio->sectors; - put_buf(r10_bio); + if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + reschedule_retry(r10_bio); + else + put_buf(r10_bio); md_done_sync(mddev, s, 1); break; } else { r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; - put_buf(r10_bio); + if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + reschedule_retry(r10_bio); + else + put_buf(r10_bio); r10_bio = r10_bio2; } } @@ -1901,6 +1938,44 @@ read_more: generic_make_request(bio); } +static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio) +{ + /* Some sort of write request has finished and it + * succeeded in writing where we thought there was a + * bad block. So forget the bad block. 
+ */ + int m; + mdk_rdev_t *rdev; + + if (test_bit(R10BIO_IsSync, &r10_bio->state) || + test_bit(R10BIO_IsRecover, &r10_bio->state)) { + for (m = 0; m < conf->copies; m++) + if (r10_bio->devs[m].bio && + test_bit(BIO_UPTODATE, + &r10_bio->devs[m].bio->bi_flags)) { + int dev = r10_bio->devs[m].devnum; + rdev = conf->mirrors[dev].rdev; + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors); + } + put_buf(r10_bio); + } else { + for (m = 0; m < conf->copies; m++) + if (r10_bio->devs[m].bio == IO_MADE_GOOD) { + int dev = r10_bio->devs[m].devnum; + rdev = conf->mirrors[dev].rdev; + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors); + rdev_dec_pending(rdev, conf->mddev); + } + raid_end_bio_io(r10_bio); + } +} + static void raid10d(mddev_t *mddev) { r10bio_t *r10_bio; @@ -1928,7 +2003,9 @@ static void raid10d(mddev_t *mddev) mddev = r10_bio->mddev; conf = mddev->private; - if (test_bit(R10BIO_IsSync, &r10_bio->state)) + if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + handle_write_completed(conf, r10_bio); + else if (test_bit(R10BIO_IsSync, &r10_bio->state)) sync_request_write(mddev, r10_bio); else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) recovery_request_write(mddev, r10_bio); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index c646152ba4e..d8b7f9af92d 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -118,6 +118,13 @@ struct r10bio_s { * level, we store IO_BLOCKED in the appropriate 'bios' pointer */ #define IO_BLOCKED ((struct bio*)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) /* bits for r10bio.state */ #define R10BIO_Uptodate 0 @@ -128,4 +135,8 @@ struct r10bio_s { * so that raid10d knows what to do with them. */ #define R10BIO_ReadError 4 +/* If a write for this request means we can clear some + * known-bad-block records, we set this flag. + */ +#define R10BIO_MadeGood 5 #endif -- cgit v1.2.3-70-g09d2 From bd870a16c5946d86126f7203db3c73b71de0a1d8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:24 +1000 Subject: md/raid10: Handle write errors by updating badblock log. When we get a write error (in the data area, not in metadata), update the badblock log rather than failing the whole device. As the write may well be many blocks, we try writing each block individually and only log the ones which fail.
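The retry loop in the new narrow_write_error() (below) has essentially this shape; write_one_block() is a hypothetical stand-in for the clone/md_trim_bio/submit_bio_wait sequence in the real function:

/* Sketch: walk the failed write range in badblock-unit pieces and
 * record a bad block for each piece that still fails.
 */
int block_sectors = 1 << rdev->badblocks.shift;
int ok = 1;

while (sect_to_write) {
	int s = min(sectors, sect_to_write);

	if (!write_one_block(rdev, sector, s))	/* hypothetical helper */
		ok = rdev_set_badblocks(rdev, sector, s, 0) && ok;
	sect_to_write -= s;
	sector += s;
	sectors = block_sectors;	/* later pieces are block-aligned */
}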
Signed-off-by: NeilBrown --- drivers/md/raid10.c | 133 +++++++++++++++++++++++++++++++++++++++++++++------- drivers/md/raid10.h | 1 + 2 files changed, 117 insertions(+), 17 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 39b2058845f..909450414c6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -327,6 +327,16 @@ static void raid10_end_read_request(struct bio *bio, int error) } } +static void close_write(r10bio_t *r10_bio) +{ + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, + r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + 0); + md_write_end(r10_bio->mddev); +} + static void raid10_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -342,9 +352,9 @@ static void raid10_end_write_request(struct bio *bio, int error) * this branch is our 'one mirror IO has finished' event handler: */ if (!uptodate) { - md_error(r10_bio->mddev, conf->mirrors[dev].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R10BIO_Degraded, &r10_bio->state); + set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); + set_bit(R10BIO_WriteError, &r10_bio->state); + dec_rdev = 0; } else { /* * Set R10BIO_Uptodate in our master bio, so that @@ -378,16 +388,15 @@ static void raid10_end_write_request(struct bio *bio, int error) * already. */ if (atomic_dec_and_test(&r10_bio->remaining)) { - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, - r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - 0); - md_write_end(r10_bio->mddev); - if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + if (test_bit(R10BIO_WriteError, &r10_bio->state)) reschedule_retry(r10_bio); - else - raid_end_bio_io(r10_bio); + else { + close_write(r10_bio); + if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + reschedule_retry(r10_bio); + else + raid_end_bio_io(r10_bio); + } } if (dec_rdev) rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); @@ -1839,6 +1848,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) } } +static void bi_complete(struct bio *bio, int error) +{ + complete((struct completion *)bio->bi_private); +} + +static int submit_bio_wait(int rw, struct bio *bio) +{ + struct completion event; + rw |= REQ_SYNC; + + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(rw, bio); + wait_for_completion(&event); + + return test_bit(BIO_UPTODATE, &bio->bi_flags); +} + +static int narrow_write_error(r10bio_t *r10_bio, int i) +{ + struct bio *bio = r10_bio->master_bio; + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev->private; + mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; + /* bio has the data to be written to slot 'i' where + * we just recently had a write error. + * We repeatedly clone the bio and trim down to one block, + * then try the write. Where the write fails we record + * a bad block. + * It is conceivable that the bio doesn't exactly align with + * blocks. We must handle this. + * + * We currently own a reference to the rdev. 
+ */ + + int block_sectors; + sector_t sector; + int sectors; + int sect_to_write = r10_bio->sectors; + int ok = 1; + + if (rdev->badblocks.shift < 0) + return 0; + + block_sectors = 1 << rdev->badblocks.shift; + sector = r10_bio->sector; + sectors = ((r10_bio->sector + block_sectors) + & ~(sector_t)(block_sectors - 1)) + - sector; + + while (sect_to_write) { + struct bio *wbio; + if (sectors > sect_to_write) + sectors = sect_to_write; + /* Write at 'sector' for 'sectors' */ + wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(wbio, sector - bio->bi_sector, sectors); + wbio->bi_sector = (r10_bio->devs[i].addr+ + rdev->data_offset+ + (sector - r10_bio->sector)); + wbio->bi_bdev = rdev->bdev; + if (submit_bio_wait(WRITE, wbio) == 0) + /* Failure! */ + ok = rdev_set_badblocks(rdev, sector, + sectors, 0) + && ok; + + bio_put(wbio); + sect_to_write -= sectors; + sector += sectors; + sectors = block_sectors; + } + return ok; +} + static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) { int slot = r10_bio->read_slot; @@ -1962,16 +2047,29 @@ static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio) } put_buf(r10_bio); } else { - for (m = 0; m < conf->copies; m++) - if (r10_bio->devs[m].bio == IO_MADE_GOOD) { - int dev = r10_bio->devs[m].devnum; - rdev = conf->mirrors[dev].rdev; + for (m = 0; m < conf->copies; m++) { + int dev = r10_bio->devs[m].devnum; + struct bio *bio = r10_bio->devs[m].bio; + rdev = conf->mirrors[dev].rdev; + if (bio == IO_MADE_GOOD) { rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, r10_bio->sectors); rdev_dec_pending(rdev, conf->mddev); + } else if (bio != NULL && + !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (!narrow_write_error(r10_bio, m)) { + md_error(conf->mddev, rdev); + set_bit(R10BIO_Degraded, + &r10_bio->state); + } + rdev_dec_pending(rdev, conf->mddev); } + } + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); raid_end_bio_io(r10_bio); } } @@ -2003,7 +2101,8 @@ static void raid10d(mddev_t *mddev) mddev = r10_bio->mddev; conf = mddev->private; - if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) handle_write_completed(conf, r10_bio); else if (test_bit(R10BIO_IsSync, &r10_bio->state)) sync_request_write(mddev, r10_bio); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index d8b7f9af92d..79cb52a0d4a 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -139,4 +139,5 @@ struct r10bio_s { * known-bad-block records, we set this flag. */ #define R10BIO_MadeGood 5 +#define R10BIO_WriteError 6 #endif -- cgit v1.2.3-70-g09d2 From f84ee364dd15af11cada1e673f94128f62db189e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:25 +1000 Subject: md/raid10: attempt to fix read errors during resync/check We already attempt to fix read errors found during normal IO and a 'repair' process. It is best to try to repair them at any time they are found, so move a test so that during sync and check a read error will be corrected by over-writing with good data. If both (all) devices have known bad blocks in the sync section we won't try to fix even though the bad blocks might not overlap. That should be considered later. Also if we hit a read error during recovery we don't try to fix it. It would only be possible to fix if there were at least three copies of data, which is not very common with RAID10. But it should still be considered later. 
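Concretely, the MD_RECOVERY_CHECK test moves inside the 'read was successful' branch of sync_request_write(), so the per-copy decision becomes roughly the following (pages_match() is a hypothetical stand-in for the existing memcmp loop over the bio pages):

/* Sketch: a copy is rewritten either because it differs from the first
 * good copy, or because reading it failed in the first place.
 */
if (test_bit(BIO_UPTODATE, &tbio->bi_flags)) {
	if (pages_match(fbio, tbio))	/* hypothetical helper */
		continue;		/* readable and identical: nothing to do */
	mddev->resync_mismatches += r10_bio->sectors;
	if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
		continue;		/* 'check' only counts mismatches */
}
/* fall through: over-write this copy with the good data */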
Signed-off-by: NeilBrown --- drivers/md/raid10.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 909450414c6..10415ddfcb4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1541,11 +1541,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) if (j == vcnt) continue; mddev->resync_mismatches += r10_bio->sectors; + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + /* Don't fix anything. */ + continue; } - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) - /* Don't fix anything. */ - continue; - /* Ok, we need to write this bio + /* Ok, we need to write this bio, either to correct an + * inconsistency or to correct an unreadable block. * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these */ -- cgit v1.2.3-70-g09d2 From 1a0b7cd82657a590f163b090bd9123a3a6b9aae4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:25 +1000 Subject: md/raid10: record bad blocks due to write errors during resync/recovery. If we get a write error during resync/recovery don't fail the device but instead record a bad block. If that fails we can then fail the device. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 10415ddfcb4..e54ff3274ed 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1452,9 +1452,10 @@ static void end_sync_write(struct bio *bio, int error) d = find_bio_disk(conf, r10_bio, bio, &slot); - if (!uptodate) - md_error(mddev, conf->mirrors[d].rdev); - else if (is_badblock(conf->mirrors[d].rdev, + if (!uptodate) { + set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); + set_bit(R10BIO_WriteError, &r10_bio->state); + } else if (is_badblock(conf->mirrors[d].rdev, r10_bio->devs[slot].addr, r10_bio->sectors, &first_bad, &bad_sectors)) @@ -1465,7 +1466,8 @@ static void end_sync_write(struct bio *bio, int error) if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */ sector_t s = r10_bio->sectors; - if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) reschedule_retry(r10_bio); else put_buf(r10_bio); @@ -1473,7 +1475,8 @@ static void end_sync_write(struct bio *bio, int error) break; } else { r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; - if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) reschedule_retry(r10_bio); else put_buf(r10_bio); @@ -2029,23 +2032,33 @@ static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio) /* Some sort of write request has finished and it * succeeded in writing where we thought there was a * bad block. So forget the bad block. + * Or possibly if failed and we need to record + * a bad block. 
*/ int m; mdk_rdev_t *rdev; if (test_bit(R10BIO_IsSync, &r10_bio->state) || test_bit(R10BIO_IsRecover, &r10_bio->state)) { - for (m = 0; m < conf->copies; m++) - if (r10_bio->devs[m].bio && - test_bit(BIO_UPTODATE, + for (m = 0; m < conf->copies; m++) { + int dev = r10_bio->devs[m].devnum; + rdev = conf->mirrors[dev].rdev; + if (r10_bio->devs[m].bio == NULL) + continue; + if (test_bit(BIO_UPTODATE, &r10_bio->devs[m].bio->bi_flags)) { - int dev = r10_bio->devs[m].devnum; - rdev = conf->mirrors[dev].rdev; rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, r10_bio->sectors); + } else { + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0)) + md_error(conf->mddev, rdev); } + } put_buf(r10_bio); } else { for (m = 0; m < conf->copies; m++) { -- cgit v1.2.3-70-g09d2 From e684e41db3bad44f1262341300b827c0d94ae220 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:25 +1000 Subject: md/raid10: simplify read error handling during recovery. If a read error is detected during recovery the code currently fails the read device. This isn't really necessary. recovery_request_write will signal a write error to end_sync_write and it will record a write error on the destination device which will record a bad block there or kick it from the array. So just remove this call to do md_error. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e54ff3274ed..6022e2aaf3a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1418,13 +1418,12 @@ static void end_sync_read(struct bio *bio, int error) if (test_bit(BIO_UPTODATE, &bio->bi_flags)) set_bit(R10BIO_Uptodate, &r10_bio->state); - else { + else + /* The write handler will notice the lack of + * R10BIO_Uptodate and record any errors etc + */ atomic_add(r10_bio->sectors, &conf->mirrors[d].rdev->corrected_errors); - if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) - md_error(r10_bio->mddev, - conf->mirrors[d].rdev); - } /* for reconstruct, we always reschedule after a read. * for resync, only after all reads -- cgit v1.2.3-70-g09d2 From 5e5702898e93eee7d69b6efde109609a89a61001 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:25 +1000 Subject: md/raid10: Handle read errors during recovery better. Currently when we get a read error during recovery, we simply abort the recovery. Instead, repeat the read in page-sized blocks. On successful reads, write to the target. On read errors, record a bad block on the destination, and only if that fails do we abort the recovery. As we now retry reads we need to know where we read from. This was in bi_sector but that can be changed during a read attempt. So store the correct from_addr and to_addr in the r10_bio for later access. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 154 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 121 insertions(+), 33 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6022e2aaf3a..fc9ebbab3f6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1141,7 +1141,7 @@ retry_write: wake_up(&conf->wait_barrier); if (sectors_handled < (bio->bi_size >> 9)) { - /* We need another r1_bio. It has already been counted + /* We need another r10_bio. It has already been counted * in bio->bi_phys_segments. 
*/ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); @@ -1438,29 +1438,10 @@ static void end_sync_read(struct bio *bio, int error) } } -static void end_sync_write(struct bio *bio, int error) +static void end_sync_request(r10bio_t *r10_bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t *r10_bio = bio->bi_private; mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev->private; - int d; - sector_t first_bad; - int bad_sectors; - int slot; - - d = find_bio_disk(conf, r10_bio, bio, &slot); - - if (!uptodate) { - set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); - set_bit(R10BIO_WriteError, &r10_bio->state); - } else if (is_badblock(conf->mirrors[d].rdev, - r10_bio->devs[slot].addr, - r10_bio->sectors, - &first_bad, &bad_sectors)) - set_bit(R10BIO_MadeGood, &r10_bio->state); - rdev_dec_pending(conf->mirrors[d].rdev, mddev); while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */ @@ -1484,6 +1465,33 @@ static void end_sync_write(struct bio *bio, int error) } } +static void end_sync_write(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t *r10_bio = bio->bi_private; + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev->private; + int d; + sector_t first_bad; + int bad_sectors; + int slot; + + d = find_bio_disk(conf, r10_bio, bio, &slot); + + if (!uptodate) { + set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); + set_bit(R10BIO_WriteError, &r10_bio->state); + } else if (is_badblock(conf->mirrors[d].rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors)) + set_bit(R10BIO_MadeGood, &r10_bio->state); + + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + + end_sync_request(r10_bio); +} + /* * Note: sync and recover and handled very differently for raid10 * This code is for resync. @@ -1600,6 +1608,84 @@ done: * The second for writing. * */ +static void fix_recovery_read_error(r10bio_t *r10_bio) +{ + /* We got a read error during recovery. + * We repeat the read in smaller page-sized sections. + * If a read succeeds, write it to the new device or record + * a bad block if we cannot. + * If a read fails, record a bad block on both old and + * new devices. 
+ */ + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev->private; + struct bio *bio = r10_bio->devs[0].bio; + sector_t sect = 0; + int sectors = r10_bio->sectors; + int idx = 0; + int dr = r10_bio->devs[0].devnum; + int dw = r10_bio->devs[1].devnum; + + while (sectors) { + int s = sectors; + mdk_rdev_t *rdev; + sector_t addr; + int ok; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rdev = conf->mirrors[dr].rdev; + addr = r10_bio->devs[0].addr + sect, + ok = sync_page_io(rdev, + addr, + s << 9, + bio->bi_io_vec[idx].bv_page, + READ, false); + if (ok) { + rdev = conf->mirrors[dw].rdev; + addr = r10_bio->devs[1].addr + sect; + ok = sync_page_io(rdev, + addr, + s << 9, + bio->bi_io_vec[idx].bv_page, + WRITE, false); + if (!ok) + set_bit(WriteErrorSeen, &rdev->flags); + } + if (!ok) { + /* We don't worry if we cannot set a bad block - + * it really is bad so there is no loss in not + * recording it yet + */ + rdev_set_badblocks(rdev, addr, s, 0); + + if (rdev != conf->mirrors[dw].rdev) { + /* need bad block on destination too */ + mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev; + addr = r10_bio->devs[1].addr + sect; + ok = rdev_set_badblocks(rdev2, addr, s, 0); + if (!ok) { + /* just abort the recovery */ + printk(KERN_NOTICE + "md/raid10:%s: recovery aborted" + " due to read error\n", + mdname(mddev)); + + conf->mirrors[dw].recovery_disabled + = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, + &mddev->recovery); + break; + } + } + } + + sectors -= s; + sect += s; + idx++; + } +} static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) { @@ -1607,6 +1693,12 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) int d; struct bio *wbio; + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { + fix_recovery_read_error(r10_bio); + end_sync_request(r10_bio); + return; + } + /* * share the pages with the first bio * and submit the write request @@ -1616,16 +1708,7 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) atomic_inc(&conf->mirrors[d].rdev->nr_pending); md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); - if (test_bit(R10BIO_Uptodate, &r10_bio->state)) - generic_make_request(wbio); - else { - printk(KERN_NOTICE - "md/raid10:%s: recovery aborted due to read error\n", - mdname(mddev)); - conf->mirrors[d].recovery_disabled = mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - bio_endio(wbio, 0); - } + generic_make_request(wbio); } @@ -2339,6 +2422,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, for (j=0; jcopies;j++) { int k; int d = r10_bio->devs[j].devnum; + sector_t from_addr, to_addr; mdk_rdev_t *rdev; sector_t sector, first_bad; int bad_sectors; @@ -2368,7 +2452,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio->bi_rw = READ; - bio->bi_sector = r10_bio->devs[j].addr + + from_addr = r10_bio->devs[j].addr; + bio->bi_sector = from_addr + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; atomic_inc(&conf->mirrors[d].rdev->nr_pending); @@ -2385,12 +2470,15 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio->bi_rw = WRITE; - bio->bi_sector = r10_bio->devs[k].addr + + to_addr = r10_bio->devs[k].addr; + bio->bi_sector = to_addr + conf->mirrors[i].rdev->data_offset; bio->bi_bdev = conf->mirrors[i].rdev->bdev; r10_bio->devs[0].devnum = d; + r10_bio->devs[0].addr = from_addr; 
r10_bio->devs[1].devnum = i; + r10_bio->devs[1].addr = to_addr; break; } -- cgit v1.2.3-70-g09d2 From 58c54fcca3bac5bf9290cfed31c76e4c4bfbabaf Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Jul 2011 11:39:25 +1000 Subject: md/raid10: handle further errors during fix_read_error better. If we find more read/write errors we should record a bad block before failing the device. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 59 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 15 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index fc9ebbab3f6..8b29cd4f01c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1749,6 +1749,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) atomic_set(&rdev->read_errors, read_errors >> hours_since_last); } +static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector, + int sectors, struct page *page, int rw) +{ + sector_t first_bad; + int bad_sectors; + + if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) + && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) + return -1; + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + /* success */ + return 1; + if (rw == WRITE) + set_bit(WriteErrorSeen, &rdev->flags); + /* need to record an error - either for the block or the device */ + if (!rdev_set_badblocks(rdev, sector, sectors, 0)) + md_error(rdev->mddev, rdev); + return 0; +} + /* * This is a kernel thread which: * @@ -1832,9 +1852,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); if (!success) { - /* Cannot read from anywhere -- bye bye array */ + /* Cannot read from anywhere, just mark the block + * as bad on the first device to discourage future + * reads. + */ int dn = r10_bio->devs[r10_bio->read_slot].devnum; - md_error(mddev, conf->mirrors[dn].rdev); + rdev = conf->mirrors[dn].rdev; + + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[r10_bio->read_slot].addr + + sect, + s, 0)) + md_error(mddev, rdev); break; } @@ -1855,10 +1885,10 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - if (sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s<<9, conf->tmppage, WRITE, false) + if (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s<<9, conf->tmppage, WRITE) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE @@ -1873,7 +1903,6 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "drive\n", mdname(mddev), bdevname(rdev->bdev, b)); - md_error(mddev, rdev); } rdev_dec_pending(rdev, mddev); rcu_read_lock(); @@ -1893,11 +1922,12 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - if (sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s<<9, conf->tmppage, - READ, false) == 0) { + switch (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s<<9, conf->tmppage, + READ)) { + case 0: /* Well, this device is dead */ printk(KERN_NOTICE "md/raid10:%s: unable to read back " @@ -1911,9 +1941,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "drive\n", mdname(mddev), bdevname(rdev->bdev, b)); - - md_error(mddev, rdev); - } else { + break; + case 1: printk(KERN_INFO "md/raid10:%s: read error corrected" " (%d sectors at %llu on %s)\n", -- cgit v1.2.3-70-g09d2
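Taken together, fix_read_error() now funnels its re-read and re-write attempts through r10_sync_page_io(), whose three-way result the callers above interpret roughly as follows:

switch (r10_sync_page_io(rdev,
			 r10_bio->devs[sl].addr + sect,
			 s<<9, conf->tmppage, READ)) {
case -1:	/* range overlaps a known bad block: the I/O was not attempted */
	break;
case 0:		/* the I/O failed: a bad block was recorded, or failing that
		 * the device was failed via md_error() */
	break;
case 1:		/* success */
	break;
}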