raid: improve MD/raid10 handling of correctable read errors.

We've noticed severe lasting performance degradation of our raid arrays when we have drives that yield large amounts of media errors. The raid10 module will queue each failed read for retry, and also will attempt call fix_read_error() to perform the read recovery. Read recovery is performed while the array is frozen, so repeated recovery attempts can degrade the performance of the array for extended periods of time. With this patch I propose adding a per md device max number of corrected read attempts. Each rdev will maintain a count of read correction attempts in the rdev->read_errors field (not used currently for raid10). When we enter fix_read_error() we'll check to see when the last read error occurred, and divide the read error count by 2 for every hour since the last read error. If at that point our read error count exceeds the read error threshold, we'll fail the raid device. In addition in this patch I add sysfs nodes (get/set) for the per md max_read_errors attribute, the rdev->read_errors attribute, and added some printk's to indicate when fix_read_error fails to repair an rdev. For testing I used debugfs->fail_make_request to inject IO errors to the rdev while doing IO to the raid array. Signed-off-by: Robert Becker <Rob.Becker@riverbed.com> Signed-off-by: NeilBrown <neilb@suse.de>
author: Robert Becker <Rob.Becker@riverbed.com> 2009-12-14 12:49:58 +1100
committer: NeilBrown <neilb@suse.de> 2009-12-14 12:51:41 +1100
commit: 1e50915fe0bbf7a46db0fa7e1e604d3fc95f057d (patch)
tree: 7a722ad6f56c61a6173493f1cd44d809c8b1bd8d /drivers/md/raid10.c
parent: 67b8dc4b06b0e97df55fd76e209f34f9a52e820e (diff)
1 files changed, 74 insertions, 0 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 670449f7411..5c71a462c12 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1432,6 +1432,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 
 
 /*
+ * Used by fix_read_error() to decay the per rdev read_errors.
+ * We halve the read error count for every hour that has elapsed
+ * since the last recorded read error.
+ *
+ */
+static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct timespec cur_time_mon;
+	unsigned long hours_since_last;
+	unsigned int read_errors = atomic_read(&rdev->read_errors);
+
+	ktime_get_ts(&cur_time_mon);
+
+	if (rdev->last_read_error.tv_sec == 0 &&
+	    rdev->last_read_error.tv_nsec == 0) {
+		/* first time we've seen a read error */
+		rdev->last_read_error = cur_time_mon;
+		return;
+	}
+
+	hours_since_last = (cur_time_mon.tv_sec -
+			    rdev->last_read_error.tv_sec) / 3600;
+
+	rdev->last_read_error = cur_time_mon;
+
+	/*
+	 * if hours_since_last is > the number of bits in read_errors
+	 * just set read errors to 0. We do this to avoid
+	 * overflowing the shift of read_errors by hours_since_last.
+	 */
+	if (hours_since_last >= 8 * sizeof(read_errors))
+		atomic_set(&rdev->read_errors, 0);
+	else
+		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
+}
+
+/*
  * This is a kernel thread which:
  *
  *	1.	Retries failed read operations on working mirrors.
@@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int sect = 0; /* Offset from r10_bio->sector */
 	int sectors = r10_bio->sectors;
 	mdk_rdev_t*rdev;
+	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+
+	rcu_read_lock();
+	{
+		int d = r10_bio->devs[r10_bio->read_slot].devnum;
+		char b[BDEVNAME_SIZE];
+		int cur_read_error_count = 0;
+
+		rdev = rcu_dereference(conf->mirrors[d].rdev);
+		bdevname(rdev->bdev, b);
+
+		if (test_bit(Faulty, &rdev->flags)) {
+			rcu_read_unlock();
+			/* drive has already been failed, just ignore any
+			   more fix_read_error() attempts */
+			return;
+		}
+
+		check_decay_read_errors(mddev, rdev);
+		atomic_inc(&rdev->read_errors);
+		cur_read_error_count = atomic_read(&rdev->read_errors);
+		if (cur_read_error_count > max_read_errors) {
+			rcu_read_unlock();
+			printk(KERN_NOTICE
+			       "raid10: %s: Raid device exceeded "
+			       "read_error threshold "
+			       "[cur %d:max %d]\n",
+			       b, cur_read_error_count, max_read_errors);
+			printk(KERN_NOTICE
+			       "raid10: %s: Failing raid "
+			       "device\n", b);
+			md_error(mddev, conf->mirrors[d].rdev);
+			return;
+		}
+	}
+	rcu_read_unlock();
+
 	while(sectors) {
 		int s = sectors;
 		int sl = r10_bio->read_slot;
author	Robert Becker <Rob.Becker@riverbed.com>	2009-12-14 12:49:58 +1100
committer	NeilBrown <neilb@suse.de>	2009-12-14 12:51:41 +1100
commit	1e50915fe0bbf7a46db0fa7e1e604d3fc95f057d (patch)
tree	7a722ad6f56c61a6173493f1cd44d809c8b1bd8d /drivers/md/raid10.c
parent	67b8dc4b06b0e97df55fd76e209f34f9a52e820e (diff)