From 60f355ead31e2be8d06ac8acb163df91a1c64e3b Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:05 +0000
Subject: dm raid1: hold write bios when errors are handled

Hold all write bios when errors are handled.

Previously the failures list was used only when handling errors with
a userspace daemon such as dmeventd.  Now, it is always used for all bios.
The regions where some writes failed must be marked as nosync. This can only
be done in process context (i.e. in raid1 workqueue), not in the
write_callback function.

Previously the write would succeed if writing to at least one leg
succeeded.  This is wrong because data from the failed leg may be
replicated to the correct leg.  Now, if using a userspace daemon, the
write with some failures will be held until the daemon has done its job
and reconfigured the array.  If not using a daemon, the write still
succeeds if at least one leg succeeds. This is bad, but it is consistent
with current behavior.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 63 ++++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 4f466ad7568..e363335e8d8 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -578,7 +578,6 @@ static void write_callback(unsigned long error, void *context)
 	unsigned i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
-	int uptodate = 0;
 	int should_wake = 0;
 	unsigned long flags;
 
@@ -591,36 +590,27 @@ static void write_callback(unsigned long error, void *context)
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
-	if (likely(!error))
-		goto out;
+	if (likely(!error)) {
+		bio_endio(bio, ret);
+		return;
+	}
 
 	for (i = 0; i < ms->nr_mirrors; i++)
 		if (test_bit(i, &error))
 			fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
-		else
-			uptodate = 1;
 
-	if (unlikely(!uptodate)) {
-		DMERR("All replicated volumes dead, failing I/O");
-		/* None of the writes succeeded, fail the I/O. */
-		ret = -EIO;
-	} else if (errors_handled(ms)) {
-		/*
-		 * Need to raise event.  Since raising
-		 * events can block, we need to do it in
-		 * the main thread.
-		 */
-		spin_lock_irqsave(&ms->lock, flags);
-		if (!ms->failures.head)
-			should_wake = 1;
-		bio_list_add(&ms->failures, bio);
-		spin_unlock_irqrestore(&ms->lock, flags);
-		if (should_wake)
-			wakeup_mirrord(ms);
-		return;
-	}
-out:
-	bio_endio(bio, ret);
+	/*
+	 * Need to raise event.  Since raising
+	 * events can block, we need to do it in
+	 * the main thread.
+	 */
+	spin_lock_irqsave(&ms->lock, flags);
+	if (!ms->failures.head)
+		should_wake = 1;
+	bio_list_add(&ms->failures, bio);
+	spin_unlock_irqrestore(&ms->lock, flags);
+	if (should_wake)
+		wakeup_mirrord(ms);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -773,15 +763,26 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 	 * for us to treat them the same and requeue them
 	 * as well.
 	 */
-
 	while ((bio = bio_list_pop(failures))) {
-		if (ms->log_failure)
-			hold_bio(ms, bio);
-		else {
+		if (!ms->log_failure) {
 			ms->in_sync = 0;
 			dm_rh_mark_nosync(ms->rh, bio);
-			bio_endio(bio, 0);
 		}
+
+		/*
+		 * If all the legs are dead, fail the I/O.
+		 * If we have been told to handle errors, hold the bio
+		 * and wait for userspace to deal with the problem.
+		 * Otherwise pretend that the I/O succeeded. (This would
+		 * be wrong if the failed leg returned after reboot and
+		 * got replicated back to the good legs.)
+		 */
+		if (!get_valid_mirror(ms))
+			bio_endio(bio, -EIO);
+		else if (errors_handled(ms))
+			hold_bio(ms, bio);
+		else
+			bio_endio(bio, 0);
 	}
 }
 
-- 
cgit v1.2.3-70-g09d2