md: Change handling of save_raid_disk and metadata update during recovery.

Since commit d70ed2e4fafdbef0800e739 MD: Allow restarting an interrupted incremental recovery. we don't write out the metadata to devices while they are recovering. This had a good reason, but has unfortunate consequences. This patch changes things to make them work better. At issue is what happens if the array is shut down while a recovery is happening, particularly a bitmap-guided recovery. Ideally the recovery should pick up where it left off. However the metadata cannot represent the state "A recovery is in process which is guided by the bitmap". Before the above mentioned commit, we wrote metadata to the device which said "this is being recovered and it is up to <here>". So after a restart, a full recovery (not bitmap-guided) would happen from where-ever it was up to. After the commit the metadata wasn't updated so it still said "This device is fully in sync with <this> event count". That leads to a bitmap-based recovery following the whole bitmap, which should be a lot less work than a full recovery from some starting point. So this was an improvement. However updates some metadata but not all leads to other problems. In particular, the metadata written to the fully-up-to-date device record that the array has all devices present (even though some are recovering). So on restart, mdadm wants to find all devices and expects them to have current event counts. Obviously it doesn't (some have old event counts) so (when assembling with --incremental) it waits indefinitely for the rest of the expected devices. It really is wrong to not update all the metadata together. Do that is bound to cause confusion. Instead, we should make it possible to record the truth in the metadata. i.e. we need to be able to record that a device is being recovered based on the bitmap. We already have a Feature flag to say that recovery is happening. We now add another one to say that it is a bitmap-based recovery. With this we can remove the code that disables the write-out of metadata on some devices. So this patch: - moves the setting of 'saved_raid_disk' from add_new_disk to the validate_super methods. This makes sure it is always set properly, both when adding a new device to an array, and when assembling an array from a collection of devices. - Adds a metadata flag MD_FEATURE_RECOVERY_BITMAP which is only used if MD_FEATURE_RECOVERY_OFFSET is set, and record that a bitmap-based recovery is allowed. This is only present in v1.x metadata. v0.90 doesn't support devices which are in the middle of recovery at all. - Only skips writing metadata to Faulty devices. - Also allows rdev state to be set to "-insync" via sysfs. This can be used for external-metadata arrays. When the 'role' is set the device is assumed to be in-sync. If, after setting the role, we set the state to "-insync", the role is moved to saved_raid_disk which effectively says the device is partly in-sync with that slot and needs a bitmap recovery. Cc: Andrei Warkentin <andreiw@vmware.com> Signed-off-by: NeilBrown <neilb@suse.de>
author: NeilBrown <neilb@suse.de> 2013-12-09 12:04:56 +1100
committer: NeilBrown <neilb@suse.de> 2014-01-14 16:44:21 +1100
commit: f466722ca614edcd14f3337373f33132117c7612 (patch)
tree: 7089db34f31e5fb991a0727216c358520c23c4b5 /drivers/md
parent: 8313b8e57f55b15e5b7f7fc5d1630bbf686a9a97 (diff)
1 files changed, 23 insertions, 19 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2a456a5d59a..539f08885e7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1183,6 +1183,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 			    desc->raid_disk < mddev->raid_disks */) {
 			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = desc->raid_disk;
+			rdev->saved_raid_disk = desc->raid_disk;
 		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
 			/* active but not in sync implies recovery up to
 			 * reshape position.  We don't know exactly where
@@ -1681,10 +1682,14 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 			set_bit(Faulty, &rdev->flags);
 			break;
 		default:
+			rdev->saved_raid_disk = role;
 			if ((le32_to_cpu(sb->feature_map) &
-			     MD_FEATURE_RECOVERY_OFFSET))
+			     MD_FEATURE_RECOVERY_OFFSET)) {
 				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
-			else
+				if (!(le32_to_cpu(sb->feature_map) &
+				      MD_FEATURE_RECOVERY_BITMAP))
+					rdev->saved_raid_disk = -1;
+			} else
 				set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = role;
 			break;
@@ -1746,6 +1751,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
 		sb->recovery_offset =
 			cpu_to_le64(rdev->recovery_offset);
+		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
+			sb->feature_map |=
+				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
 	}
 	if (test_bit(Replacement, &rdev->flags))
 		sb->feature_map |=
@@ -2487,8 +2495,7 @@ repeat:
 		if (rdev->sb_loaded != 1)
 			continue; /* no noise on spare devices */
 
-		if (!test_bit(Faulty, &rdev->flags) &&
-		    rdev->saved_raid_disk == -1) {
+		if (!test_bit(Faulty, &rdev->flags)) {
 			md_super_write(mddev,rdev,
 				       rdev->sb_start, rdev->sb_size,
 				       rdev->sb_page);
@@ -2504,11 +2511,9 @@ repeat:
 				rdev->badblocks.size = 0;
 			}
 
-		} else if (test_bit(Faulty, &rdev->flags))
+		} else
 			pr_debug("md: %s (skipping faulty)\n",
 				 bdevname(rdev->bdev, b));
-		else
-			pr_debug("(skipping incremental s/r ");
 
 		if (mddev->level == LEVEL_MULTIPATH)
 			/* only need to write one superblock... */
@@ -2624,6 +2629,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 	 *  blocked - sets the Blocked flags
 	 *  -blocked - clears the Blocked and possibly simulates an error
 	 *  insync - sets Insync providing device isn't active
+	 *  -insync - clear Insync for a device with a slot assigned,
+	 *            so that it gets rebuilt based on bitmap
 	 *  write_error - sets WriteErrorSeen
 	 *  -write_error - clears WriteErrorSeen
 	 */
@@ -2672,6 +2679,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
 		set_bit(In_sync, &rdev->flags);
 		err = 0;
+	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
+		clear_bit(In_sync, &rdev->flags);
+		rdev->saved_raid_disk = rdev->raid_disk;
+		rdev->raid_disk = -1;
+		err = 0;
 	} else if (cmd_match(buf, "write_error")) {
 		set_bit(WriteErrorSeen, &rdev->flags);
 		err = 0;
@@ -5780,6 +5792,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
 				clear_bit(Bitmap_sync, &rdev->flags);
 			} else
 				rdev->raid_disk = -1;
+			rdev->saved_raid_disk = rdev->raid_disk;
 		} else
 			super_types[mddev->major_version].
 				validate_super(mddev, rdev);
@@ -5792,11 +5805,6 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
 			return -EINVAL;
 		}
 
-		if (test_bit(In_sync, &rdev->flags))
-			rdev->saved_raid_disk = rdev->raid_disk;
-		else
-			rdev->saved_raid_disk = -1;
-
 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
@@ -7948,14 +7956,10 @@ void md_reap_sync_thread(struct mddev *mddev)
 		mddev->pers->finish_reshape(mddev);
 
 	/* If array is no-longer degraded, then any saved_raid_disk
-	 * information must be scrapped.  Also if any device is now
-	 * In_sync we must scrape the saved_raid_disk for that device
-	 * do the superblock for an incrementally recovered device
-	 * written out.
+	 * information must be scrapped.
 	 */
-	rdev_for_each(rdev, mddev)
-		if (!mddev->degraded ||
-		    test_bit(In_sync, &rdev->flags))
+	if (!mddev->degraded)
+		rdev_for_each(rdev, mddev)
 			rdev->saved_raid_disk = -1;
 
 	md_update_sb(mddev, 1);
author	NeilBrown <neilb@suse.de>	2013-12-09 12:04:56 +1100
committer	NeilBrown <neilb@suse.de>	2014-01-14 16:44:21 +1100
commit	f466722ca614edcd14f3337373f33132117c7612 (patch)
tree	7089db34f31e5fb991a0727216c358520c23c4b5 /drivers/md
parent	8313b8e57f55b15e5b7f7fc5d1630bbf686a9a97 (diff)