From 6ce328462c1145a217ba1f27b882743be1407759 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 18 Jul 2011 17:38:50 +1000
Subject: md/raid5: use kmem_cache_zalloc()

Replace kmem_cache_alloc + memset(,0,) to kmem_cache_zalloc.
I think it's not harmful since @conf->slab_cache already knows
actual size of struct stripe_head.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b72edf35ec5..0f71aa9a07c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1315,10 +1315,10 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
-	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
+	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
 	if (!sh)
 		return 0;
-	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
+
 	sh->raid_conf = conf;
 	spin_lock_init(&sh->lock);
 	#ifdef CONFIG_MULTICORE_RAID456
@@ -1435,12 +1435,10 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 		return -ENOMEM;
 
 	for (i = conf->max_nr_stripes; i; i--) {
-		nsh = kmem_cache_alloc(sc, GFP_KERNEL);
+		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
 		if (!nsh)
 			break;
 
-		memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
-
 		nsh->raid_conf = conf;
 		spin_lock_init(&nsh->lock);
 		#ifdef CONFIG_MULTICORE_RAID456
-- 
cgit v1.2.3-70-g09d2


From ffd96e35c16a99fdb490cc5723b8e32135ae5883 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 18 Jul 2011 17:38:51 +1000
Subject: md/raid5: get rid of duplicated call to bio_data_dir()

In raid5::make_request(), once bio_data_dir(@bi) is detected
it never (and couldn't) be changed. Use the result always.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0f71aa9a07c..71480646a45 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4014,7 +4014,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 				}
 			}
 
-			if (bio_data_dir(bi) == WRITE &&
+			if (rw == WRITE &&
 			    logical_sector >= mddev->suspend_lo &&
 			    logical_sector < mddev->suspend_hi) {
 				release_stripe(sh);
@@ -4032,7 +4032,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 			}
 
 			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-			    !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+			    !add_stripe_bio(sh, bi, dd_idx, rw)) {
 				/* Stripe is busy expanding or
 				 * add failed due to overlap.  Flush everything
 				 * and wait a while
-- 
cgit v1.2.3-70-g09d2


From 83206d66b65118d995c38746f21edc2bb8564b49 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 Jul 2011 11:19:49 +1000
Subject: md/raid5: Remove use of sh->lock in sync_request

This is the start of a series of patches to remove sh->lock.

sync_request takes sh->lock before setting STRIPE_SYNCING to ensure
there is no race with testing it in handle_stripe[56].

Instead, use a new flag STRIPE_SYNC_REQUESTED and test it early
in handle_stripe[56] (after getting the same lock) and perform the
same set/clear operations if it was set.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 13 +++++++++----
 drivers/md/raid5.h | 33 ++++++++++++++++++---------------
 2 files changed, 27 insertions(+), 19 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 71480646a45..f2f2ab32969 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3022,6 +3022,10 @@ static void handle_stripe5(struct stripe_head *sh)
 		 sh->reconstruct_state);
 
 	spin_lock(&sh->lock);
+	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+		set_bit(STRIPE_SYNCING, &sh->state);
+		clear_bit(STRIPE_INSYNC, &sh->state);
+	}
 	clear_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
@@ -3313,6 +3317,10 @@ static void handle_stripe6(struct stripe_head *sh)
 	memset(&s, 0, sizeof(s));
 
 	spin_lock(&sh->lock);
+	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+		set_bit(STRIPE_SYNCING, &sh->state);
+		clear_bit(STRIPE_INSYNC, &sh->state);
+	}
 	clear_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
@@ -4373,10 +4381,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 
 	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
 
-	spin_lock(&sh->lock);
-	set_bit(STRIPE_SYNCING, &sh->state);
-	clear_bit(STRIPE_INSYNC, &sh->state);
-	spin_unlock(&sh->lock);
+	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
 
 	handle_stripe(sh);
 	release_stripe(sh);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3ca77a2613b..a33001137bf 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -289,21 +289,24 @@ struct r6_state {
 /*
  * Stripe state
  */
-#define STRIPE_HANDLE		2
-#define	STRIPE_SYNCING		3
-#define	STRIPE_INSYNC		4
-#define	STRIPE_PREREAD_ACTIVE	5
-#define	STRIPE_DELAYED		6
-#define	STRIPE_DEGRADED		7
-#define	STRIPE_BIT_DELAY	8
-#define	STRIPE_EXPANDING	9
-#define	STRIPE_EXPAND_SOURCE	10
-#define	STRIPE_EXPAND_READY	11
-#define	STRIPE_IO_STARTED	12 /* do not count towards 'bypass_count' */
-#define	STRIPE_FULL_WRITE	13 /* all blocks are set to be overwritten */
-#define	STRIPE_BIOFILL_RUN	14
-#define	STRIPE_COMPUTE_RUN	15
-#define	STRIPE_OPS_REQ_PENDING	16
+enum {
+	STRIPE_HANDLE,
+	STRIPE_SYNC_REQUESTED,
+	STRIPE_SYNCING,
+	STRIPE_INSYNC,
+	STRIPE_PREREAD_ACTIVE,
+	STRIPE_DELAYED,
+	STRIPE_DEGRADED,
+	STRIPE_BIT_DELAY,
+	STRIPE_EXPANDING,
+	STRIPE_EXPAND_SOURCE,
+	STRIPE_EXPAND_READY,
+	STRIPE_IO_STARTED,	/* do not count towards 'bypass_count' */
+	STRIPE_FULL_WRITE,	/* all blocks are set to be overwritten */
+	STRIPE_BIOFILL_RUN,
+	STRIPE_COMPUTE_RUN,
+	STRIPE_OPS_REQ_PENDING,
+};
 
 /*
  * Operation request flags
-- 
cgit v1.2.3-70-g09d2


From cbe47ec559c33a68b5ee002051b848d1531a8adb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 Jul 2011 11:20:35 +1000
Subject: md/raid5: Protect some more code with ->device_lock.

Other places that change or follow dev->towrite and dev->written take
the device_lock as well as the sh->lock.
So it should really be held in these places too.
Also, doing so will allow sh->lock to be discarded.

with merged fixes by: Namhyung Kim <namhyung@gmail.com>


Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f2f2ab32969..9985138f4c0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1021,10 +1021,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			struct bio *wbi;
 
 			spin_lock(&sh->lock);
+			spin_lock_irq(&sh->raid_conf->device_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
+			spin_unlock_irq(&sh->raid_conf->device_lock);
 			spin_unlock(&sh->lock);
 
 			while (wbi && wbi->bi_sector <
@@ -2141,7 +2143,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	raid5_conf_t *conf = sh->raid_conf;
 	int firstwrite=0;
 
-	pr_debug("adding bh b#%llu to stripe s#%llu\n",
+	pr_debug("adding bi b#%llu to stripe s#%llu\n",
 		(unsigned long long)bi->bi_sector,
 		(unsigned long long)sh->sector);
 
@@ -2167,19 +2169,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		bi->bi_next = *bip;
 	*bip = bi;
 	bi->bi_phys_segments++;
-	spin_unlock_irq(&conf->device_lock);
-	spin_unlock(&sh->lock);
-
-	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
-		(unsigned long long)bi->bi_sector,
-		(unsigned long long)sh->sector, dd_idx);
-
-	if (conf->mddev->bitmap && firstwrite) {
-		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
-				  STRIPE_SECTORS, 0);
-		sh->bm_seq = conf->seq_flush+1;
-		set_bit(STRIPE_BIT_DELAY, &sh->state);
-	}
 
 	if (forwrite) {
 		/* check if page is covered */
@@ -2194,6 +2183,19 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
+	spin_unlock_irq(&conf->device_lock);
+	spin_unlock(&sh->lock);
+
+	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
+		(unsigned long long)(*bip)->bi_sector,
+		(unsigned long long)sh->sector, dd_idx);
+
+	if (conf->mddev->bitmap && firstwrite) {
+		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+				  STRIPE_SECTORS, 0);
+		sh->bm_seq = conf->seq_flush+1;
+		set_bit(STRIPE_BIT_DELAY, &sh->state);
+	}
 	return 1;
 
  overlap:
-- 
cgit v1.2.3-70-g09d2


From c4c1663be46b2ab94e59d3e0c583a8f6b188ff0c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 Jul 2011 11:34:20 +1000
Subject: md/raid5: replace sh->lock with an 'active' flag.

sh->lock is now mainly used to ensure that two threads aren't running
in the locked part of handle_stripe[56] at the same time.

That can more neatly be achieved with an 'active' flag which we set
while running handle_stripe.  If we find the flag is set, we simply
requeue the stripe for later by setting STRIPE_HANDLE.

For safety we take ->device_lock while examining the state of the
stripe and creating a summary in 'stripe_head_state / r6_state'.
This possibly isn't needed but as shared fields like ->toread,
->towrite are checked it is safer for now at least.

We leave the label after the old 'unlock' called "unlock" because it
will disappear in a few patches, so renaming seems pointless.

This leaves the stripe 'locked' for longer as we clear STRIPE_ACTIVE
later, but that is not a problem.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 26 +++++++++++++-------------
 drivers/md/raid5.h | 35 ++++++++++++++++-------------------
 2 files changed, 29 insertions(+), 32 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9985138f4c0..f8275b5a6fb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1020,14 +1020,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
 			struct bio *wbi;
 
-			spin_lock(&sh->lock);
 			spin_lock_irq(&sh->raid_conf->device_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
 			spin_unlock_irq(&sh->raid_conf->device_lock);
-			spin_unlock(&sh->lock);
 
 			while (wbi && wbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
@@ -1322,7 +1320,6 @@ static int grow_one_stripe(raid5_conf_t *conf)
 		return 0;
 
 	sh->raid_conf = conf;
-	spin_lock_init(&sh->lock);
 	#ifdef CONFIG_MULTICORE_RAID456
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 	#endif
@@ -1442,7 +1439,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 			break;
 
 		nsh->raid_conf = conf;
-		spin_lock_init(&nsh->lock);
 		#ifdef CONFIG_MULTICORE_RAID456
 		init_waitqueue_head(&nsh->ops.wait_for_ops);
 		#endif
@@ -2148,7 +2144,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		(unsigned long long)sh->sector);
 
 
-	spin_lock(&sh->lock);
 	spin_lock_irq(&conf->device_lock);
 	if (forwrite) {
 		bip = &sh->dev[dd_idx].towrite;
@@ -2184,7 +2179,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
 	spin_unlock_irq(&conf->device_lock);
-	spin_unlock(&sh->lock);
 
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
 		(unsigned long long)(*bip)->bi_sector,
@@ -2201,7 +2195,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
  overlap:
 	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
 	spin_unlock_irq(&conf->device_lock);
-	spin_unlock(&sh->lock);
 	return 0;
 }
 
@@ -3023,12 +3016,10 @@ static void handle_stripe5(struct stripe_head *sh)
 		 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
 		 sh->reconstruct_state);
 
-	spin_lock(&sh->lock);
 	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 		set_bit(STRIPE_SYNCING, &sh->state);
 		clear_bit(STRIPE_INSYNC, &sh->state);
 	}
-	clear_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
@@ -3037,6 +3028,7 @@ static void handle_stripe5(struct stripe_head *sh)
 
 	/* Now to look around and see what can be done */
 	rcu_read_lock();
+	spin_lock_irq(&conf->device_lock);
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 
@@ -3099,6 +3091,7 @@ static void handle_stripe5(struct stripe_head *sh)
 			s.failed_num = i;
 		}
 	}
+	spin_unlock_irq(&conf->device_lock);
 	rcu_read_unlock();
 
 	if (unlikely(blocked_rdev)) {
@@ -3275,7 +3268,6 @@ static void handle_stripe5(struct stripe_head *sh)
 		handle_stripe_expansion(conf, sh, NULL);
 
  unlock:
-	spin_unlock(&sh->lock);
 
 	/* wait for this device to become unblocked */
 	if (unlikely(blocked_rdev))
@@ -3318,12 +3310,10 @@ static void handle_stripe6(struct stripe_head *sh)
 	       sh->check_state, sh->reconstruct_state);
 	memset(&s, 0, sizeof(s));
 
-	spin_lock(&sh->lock);
 	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 		set_bit(STRIPE_SYNCING, &sh->state);
 		clear_bit(STRIPE_INSYNC, &sh->state);
 	}
-	clear_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
@@ -3332,6 +3322,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	/* Now to look around and see what can be done */
 
 	rcu_read_lock();
+	spin_lock_irq(&conf->device_lock);
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
@@ -3395,6 +3386,7 @@ static void handle_stripe6(struct stripe_head *sh)
 			s.failed++;
 		}
 	}
+	spin_unlock_irq(&conf->device_lock);
 	rcu_read_unlock();
 
 	if (unlikely(blocked_rdev)) {
@@ -3580,7 +3572,6 @@ static void handle_stripe6(struct stripe_head *sh)
 		handle_stripe_expansion(conf, sh, &r6s);
 
  unlock:
-	spin_unlock(&sh->lock);
 
 	/* wait for this device to become unblocked */
 	if (unlikely(blocked_rdev))
@@ -3608,10 +3599,19 @@ static void handle_stripe6(struct stripe_head *sh)
 
 static void handle_stripe(struct stripe_head *sh)
 {
+	clear_bit(STRIPE_HANDLE, &sh->state);
+	if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
+		/* already being handled, ensure it gets handled
+		 * again when current action finishes */
+		set_bit(STRIPE_HANDLE, &sh->state);
+		return;
+	}
+
 	if (sh->raid_conf->level == 6)
 		handle_stripe6(sh);
 	else
 		handle_stripe5(sh);
+	clear_bit(STRIPE_ACTIVE, &sh->state);
 }
 
 static void raid5_activate_delayed(raid5_conf_t *conf)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a33001137bf..bb246d9e054 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -6,11 +6,11 @@
 
 /*
  *
- * Each stripe contains one buffer per disc.  Each buffer can be in
+ * Each stripe contains one buffer per device.  Each buffer can be in
  * one of a number of states stored in "flags".  Changes between
- * these states happen *almost* exclusively under a per-stripe
- * spinlock.  Some very specific changes can happen in bi_end_io, and
- * these are not protected by the spin lock.
+ * these states happen *almost* exclusively under the protection of the
+ * STRIPE_ACTIVE flag.  Some very specific changes can happen in bi_end_io, and
+ * these are not protected by STRIPE_ACTIVE.
  *
  * The flag bits that are used to represent these states are:
  *   R5_UPTODATE and R5_LOCKED
@@ -76,12 +76,10 @@
  * block and the cached buffer are successfully written, any buffer on
  * a written list can be returned with b_end_io.
  *
- * The write list and read list both act as fifos.  The read list is
- * protected by the device_lock.  The write and written lists are
- * protected by the stripe lock.  The device_lock, which can be
- * claimed while the stipe lock is held, is only for list
- * manipulations and will only be held for a very short time.  It can
- * be claimed from interrupts.
+ * The write list and read list both act as fifos.  The read list,
+ * write list and written list are protected by the device_lock.
+ * The device_lock is only for list manipulations and will only be
+ * held for a very short time.  It can be claimed from interrupts.
  *
  *
  * Stripes in the stripe cache can be on one of two lists (or on
@@ -96,7 +94,6 @@
  *
  * The inactive_list, handle_list and hash bucket lists are all protected by the
  * device_lock.
- *  - stripes on the inactive_list never have their stripe_lock held.
  *  - stripes have a reference counter. If count==0, they are on a list.
  *  - If a stripe might need handling, STRIPE_HANDLE is set.
  *  - When refcount reaches zero, then if STRIPE_HANDLE it is put on
@@ -116,10 +113,10 @@
  *  attach a request to an active stripe (add_stripe_bh())
  *     lockdev attach-buffer unlockdev
  *  handle a stripe (handle_stripe())
- *     lockstripe clrSTRIPE_HANDLE ...
+ *     setSTRIPE_ACTIVE,  clrSTRIPE_HANDLE ...
  *		(lockdev check-buffers unlockdev) ..
  *		change-state ..
- *		record io/ops needed unlockstripe schedule io/ops
+ *		record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
  *  release an active stripe (release_stripe())
  *     lockdev if (!--cnt) { if  STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
  *
@@ -128,8 +125,7 @@
  * on a cached buffer, and plus one if the stripe is undergoing stripe
  * operations.
  *
- * Stripe operations are performed outside the stripe lock,
- * the stripe operations are:
+ * The stripe operations are:
  * -copying data between the stripe cache and user application buffers
  * -computing blocks to save a disk access, or to recover a missing block
  * -updating the parity on a write operation (reconstruct write and
@@ -159,7 +155,8 @@
  */
 
 /*
- * Operations state - intermediate states that are visible outside of sh->lock
+ * Operations state - intermediate states that are visible outside of 
+ *   STRIPE_ACTIVE.
  * In general _idle indicates nothing is running, _run indicates a data
  * processing operation is active, and _result means the data processing result
  * is stable and can be acted upon.  For simple operations like biofill and
@@ -209,7 +206,6 @@ struct stripe_head {
 	short			ddf_layout;/* use DDF ordering to calculate Q */
 	unsigned long		state;		/* state flags */
 	atomic_t		count;	      /* nr of active thread/requests */
-	spinlock_t		lock;
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;		/* disks in stripe */
 	enum check_states	check_state;
@@ -240,7 +236,7 @@ struct stripe_head {
 };
 
 /* stripe_head_state - collects and tracks the dynamic state of a stripe_head
- *     for handle_stripe.  It is only valid under spin_lock(sh->lock);
+ *     for handle_stripe.
  */
 struct stripe_head_state {
 	int syncing, expanding, expanded;
@@ -290,6 +286,7 @@ struct r6_state {
  * Stripe state
  */
 enum {
+	STRIPE_ACTIVE,
 	STRIPE_HANDLE,
 	STRIPE_SYNC_REQUESTED,
 	STRIPE_SYNCING,
@@ -339,7 +336,7 @@ enum {
  * PREREAD_ACTIVE.
  * In stripe_handle, if we find pre-reading is necessary, we do it if
  * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
- * HANDLE gets cleared if stripe_handle leave nothing locked.
+ * HANDLE gets cleared if stripe_handle leaves nothing locked.
  */
 
 
-- 
cgit v1.2.3-70-g09d2


From 82e5a1718b9d0401b826341b9023766d04cb82f2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 Jul 2011 11:35:15 +1000
Subject: md/raid5: move common code into handle_stripe

There is common code at the start of handle_stripe5 and
handle_stripe6.  Move it into handle_stripe.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f8275b5a6fb..dfb3d9f80a3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3016,12 +3016,6 @@ static void handle_stripe5(struct stripe_head *sh)
 		 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
 		 sh->reconstruct_state);
 
-	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
-		set_bit(STRIPE_SYNCING, &sh->state);
-		clear_bit(STRIPE_INSYNC, &sh->state);
-	}
-	clear_bit(STRIPE_DELAYED, &sh->state);
-
 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
 	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -3310,12 +3304,6 @@ static void handle_stripe6(struct stripe_head *sh)
 	       sh->check_state, sh->reconstruct_state);
 	memset(&s, 0, sizeof(s));
 
-	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
-		set_bit(STRIPE_SYNCING, &sh->state);
-		clear_bit(STRIPE_INSYNC, &sh->state);
-	}
-	clear_bit(STRIPE_DELAYED, &sh->state);
-
 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
 	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -3607,6 +3595,12 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
+	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+		set_bit(STRIPE_SYNCING, &sh->state);
+		clear_bit(STRIPE_INSYNC, &sh->state);
+	}
+	clear_bit(STRIPE_DELAYED, &sh->state);
+
 	if (sh->raid_conf->level == 6)
 		handle_stripe6(sh);
 	else
-- 
cgit v1.2.3-70-g09d2


From f2b3b44deee1524ca4f006048e0569f47eefdb74 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 Jul 2011 11:35:19 +1000
Subject: md/raid5: unify stripe_head_state and r6_state

'struct stripe_head_state' stores state about the 'current' stripe
that is passed around while handling the stripe.
For RAID6 there is an extension structure: r6_state, which is also
passed around.
There is no value in keeping these separate, so move the fields from
the latter into the former.

This means that all code now needs to treat s->failed_num as an small
array, but this is a small cost.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 77 +++++++++++++++++++++++++++---------------------------
 drivers/md/raid5.h |  8 ++----
 2 files changed, 40 insertions(+), 45 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dfb3d9f80a3..bbc7792f013 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2318,7 +2318,7 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
 			int disk_idx, int disks)
 {
 	struct r5dev *dev = &sh->dev[disk_idx];
-	struct r5dev *failed_dev = &sh->dev[s->failed_num];
+	struct r5dev *failed_dev = &sh->dev[s->failed_num[0]];
 
 	/* is the data in this block needed, and can we get it? */
 	if (!test_bit(R5_LOCKED, &dev->flags) &&
@@ -2334,7 +2334,7 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
 		 * otherwise read it if the backing disk is insync
 		 */
 		if ((s->uptodate == disks - 1) &&
-		    (s->failed && disk_idx == s->failed_num)) {
+		    (s->failed && disk_idx == s->failed_num[0])) {
 			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
 			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
 			set_bit(R5_Wantcompute, &dev->flags);
@@ -2388,11 +2388,11 @@ static void handle_stripe_fill5(struct stripe_head *sh,
  * 0 to tell the loop in handle_stripe_fill6 to continue
  */
 static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
-			 struct r6_state *r6s, int disk_idx, int disks)
+			int disk_idx, int disks)
 {
 	struct r5dev *dev = &sh->dev[disk_idx];
-	struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
-				  &sh->dev[r6s->failed_num[1]] };
+	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
+				  &sh->dev[s->failed_num[1]] };
 
 	if (!test_bit(R5_LOCKED, &dev->flags) &&
 	    !test_bit(R5_UPTODATE, &dev->flags) &&
@@ -2409,8 +2409,8 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
 		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
 		BUG_ON(test_bit(R5_Wantread, &dev->flags));
 		if ((s->uptodate == disks - 1) &&
-		    (s->failed && (disk_idx == r6s->failed_num[0] ||
-				   disk_idx == r6s->failed_num[1]))) {
+		    (s->failed && (disk_idx == s->failed_num[0] ||
+				   disk_idx == s->failed_num[1]))) {
 			/* have disk failed, and we're requested to fetch it;
 			 * do compute it
 			 */
@@ -2465,7 +2465,7 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
  * handle_stripe_fill6 - read or compute data to satisfy pending requests.
  */
 static void handle_stripe_fill6(struct stripe_head *sh,
-			struct stripe_head_state *s, struct r6_state *r6s,
+			struct stripe_head_state *s,
 			int disks)
 {
 	int i;
@@ -2477,7 +2477,7 @@ static void handle_stripe_fill6(struct stripe_head *sh,
 	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
 	    !sh->reconstruct_state)
 		for (i = disks; i--; )
-			if (fetch_block6(sh, s, r6s, i, disks))
+			if (fetch_block6(sh, s, i, disks))
 				break;
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
@@ -2625,7 +2625,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
 
 static void handle_stripe_dirtying6(raid5_conf_t *conf,
 		struct stripe_head *sh,	struct stripe_head_state *s,
-		struct r6_state *r6s, int disks)
+		int disks)
 {
 	int rcw = 0, pd_idx = sh->pd_idx, i;
 	int qd_idx = sh->qd_idx;
@@ -2688,7 +2688,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 			s->uptodate--;
 			break;
 		}
-		dev = &sh->dev[s->failed_num];
+		dev = &sh->dev[s->failed_num[0]];
 		/* fall through */
 	case check_state_compute_result:
 		sh->check_state = check_state_idle;
@@ -2760,7 +2760,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 
 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 				  struct stripe_head_state *s,
-				  struct r6_state *r6s, int disks)
+				  int disks)
 {
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
@@ -2779,14 +2779,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 	switch (sh->check_state) {
 	case check_state_idle:
 		/* start a new check operation if there are < 2 failures */
-		if (s->failed == r6s->q_failed) {
+		if (s->failed == s->q_failed) {
 			/* The only possible failed device holds Q, so it
 			 * makes sense to check P (If anything else were failed,
 			 * we would have used P to recreate it).
 			 */
 			sh->check_state = check_state_run;
 		}
-		if (!r6s->q_failed && s->failed < 2) {
+		if (!s->q_failed && s->failed < 2) {
 			/* Q is not failed, and we didn't use it to generate
 			 * anything, so it makes sense to check it
 			 */
@@ -2828,13 +2828,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 		 */
 		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
 		if (s->failed == 2) {
-			dev = &sh->dev[r6s->failed_num[1]];
+			dev = &sh->dev[s->failed_num[1]];
 			s->locked++;
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantwrite, &dev->flags);
 		}
 		if (s->failed >= 1) {
-			dev = &sh->dev[r6s->failed_num[0]];
+			dev = &sh->dev[s->failed_num[0]];
 			s->locked++;
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantwrite, &dev->flags);
@@ -2922,7 +2922,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 }
 
 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
-				struct r6_state *r6s)
+				    struct stripe_head_state *r6s)
 {
 	int i;
 
@@ -3082,7 +3082,7 @@ static void handle_stripe5(struct stripe_head *sh)
 			clear_bit(R5_Insync, &dev->flags);
 		if (!test_bit(R5_Insync, &dev->flags)) {
 			s.failed++;
-			s.failed_num = i;
+			s.failed_num[0] = i;
 		}
 	}
 	spin_unlock_irq(&conf->device_lock);
@@ -3107,7 +3107,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	pr_debug("locked=%d uptodate=%d to_read=%d"
 		" to_write=%d failed=%d failed_num=%d\n",
 		s.locked, s.uptodate, s.to_read, s.to_write,
-		s.failed, s.failed_num);
+		s.failed, s.failed_num[0]);
 	/* check if the array has lost two devices and, if so, some requests might
 	 * need to be failed
 	 */
@@ -3127,7 +3127,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	     ((test_bit(R5_Insync, &dev->flags) &&
 	       !test_bit(R5_LOCKED, &dev->flags) &&
 	       test_bit(R5_UPTODATE, &dev->flags)) ||
-	       (s.failed == 1 && s.failed_num == sh->pd_idx)))
+	       (s.failed == 1 && s.failed_num[0] == sh->pd_idx)))
 		handle_stripe_clean_event(conf, sh, disks, &return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
@@ -3198,11 +3198,11 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * the repair/check process
 	 */
 	if (s.failed == 1 && !conf->mddev->ro &&
-	    test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
-	    && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
-	    && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
+	    test_bit(R5_ReadError, &sh->dev[s.failed_num[0]].flags)
+	    && !test_bit(R5_LOCKED, &sh->dev[s.failed_num[0]].flags)
+	    && test_bit(R5_UPTODATE, &sh->dev[s.failed_num[0]].flags)
 		) {
-		dev = &sh->dev[s.failed_num];
+		dev = &sh->dev[s.failed_num[0]];
 		if (!test_bit(R5_ReWrite, &dev->flags)) {
 			set_bit(R5_Wantwrite, &dev->flags);
 			set_bit(R5_ReWrite, &dev->flags);
@@ -3292,7 +3292,6 @@ static void handle_stripe6(struct stripe_head *sh)
 	struct bio *return_bi = NULL;
 	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
 	struct stripe_head_state s;
-	struct r6_state r6s;
 	struct r5dev *dev, *pdev, *qdev;
 	mdk_rdev_t *blocked_rdev = NULL;
 	int dec_preread_active = 0;
@@ -3370,7 +3369,7 @@ static void handle_stripe6(struct stripe_head *sh)
 			clear_bit(R5_Insync, &dev->flags);
 		if (!test_bit(R5_Insync, &dev->flags)) {
 			if (s.failed < 2)
-				r6s.failed_num[s.failed] = i;
+				s.failed_num[s.failed] = i;
 			s.failed++;
 		}
 	}
@@ -3396,7 +3395,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	pr_debug("locked=%d uptodate=%d to_read=%d"
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
-	       r6s.failed_num[0], r6s.failed_num[1]);
+	       s.failed_num[0], s.failed_num[1]);
 	/* check if the array has lost >2 devices and, if so, some requests
 	 * might need to be failed
 	 */
@@ -3413,17 +3412,17 @@ static void handle_stripe6(struct stripe_head *sh)
 	 * are safe, or on a failed drive
 	 */
 	pdev = &sh->dev[pd_idx];
-	r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
-		|| (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
+	s.p_failed = (s.failed >= 1 && s.failed_num[0] == pd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == pd_idx);
 	qdev = &sh->dev[qd_idx];
-	r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
-		|| (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
+	s.q_failed = (s.failed >= 1 && s.failed_num[0] == qd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == qd_idx);
 
-	if ( s.written &&
-	     ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+	if (s.written &&
+	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
 			     && !test_bit(R5_LOCKED, &pdev->flags)
 			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
-	     ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
 			     && test_bit(R5_UPTODATE, &qdev->flags)))))
 		handle_stripe_clean_event(conf, sh, disks, &return_bi);
@@ -3434,7 +3433,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	 */
 	if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
 	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
-		handle_stripe_fill6(sh, &s, &r6s, disks);
+		handle_stripe_fill6(sh, &s, disks);
 
 	/* Now we check to see if any write operations have recently
 	 * completed
@@ -3472,7 +3471,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	 *    block.
 	 */
 	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
+		handle_stripe_dirtying6(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
@@ -3483,7 +3482,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	    (s.syncing && s.locked == 0 &&
 	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
 	     !test_bit(STRIPE_INSYNC, &sh->state)))
-		handle_parity_checks6(conf, sh, &s, &r6s, disks);
+		handle_parity_checks6(conf, sh, &s, disks);
 
 	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
@@ -3495,7 +3494,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	 */
 	if (s.failed <= 2 && !conf->mddev->ro)
 		for (i = 0; i < s.failed; i++) {
-			dev = &sh->dev[r6s.failed_num[i]];
+			dev = &sh->dev[s.failed_num[i]];
 			if (test_bit(R5_ReadError, &dev->flags)
 			    && !test_bit(R5_LOCKED, &dev->flags)
 			    && test_bit(R5_UPTODATE, &dev->flags)
@@ -3557,7 +3556,7 @@ static void handle_stripe6(struct stripe_head *sh)
 
 	if (s.expanding && s.locked == 0 &&
 	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
-		handle_stripe_expansion(conf, sh, &r6s);
+		handle_stripe_expansion(conf, sh, &s);
 
  unlock:
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index bb246d9e054..05ac5cde370 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -242,13 +242,9 @@ struct stripe_head_state {
 	int syncing, expanding, expanded;
 	int locked, uptodate, to_read, to_write, failed, written;
 	int to_fill, compute, req_compute, non_overwrite;
-	int failed_num;
+	int failed_num[2];
 	unsigned long ops_request;
-};
-
-/* r6_state - extra state data only relevant to r6 */
-struct r6_state {
-	int p_failed, q_failed, failed_num[2];
+	int p_failed, q_failed;
 };
 
 /* Flags */
-- 
cgit v1.2.3-70-g09d2


From c5709ef6a094c72b56355590bfa55cc107e98376 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 Jul 2011 11:35:20 +1000
Subject: md/raid5:  add some more fields to stripe_head_state

Adding these three fields will allow more common code to be moved
to handle_stripe()

struct field rearrangement by Namhyung Kim.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 54 ++++++++++++++++++++++++------------------------------
 drivers/md/raid5.h |  6 +++++-
 2 files changed, 29 insertions(+), 31 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bbc7792f013..bc15f48be78 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3003,12 +3003,9 @@ static void handle_stripe5(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
-	struct bio *return_bi = NULL;
 	struct stripe_head_state s;
 	struct r5dev *dev;
-	mdk_rdev_t *blocked_rdev = NULL;
 	int prexor;
-	int dec_preread_active = 0;
 
 	memset(&s, 0, sizeof(s));
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
@@ -3058,9 +3055,9 @@ static void handle_stripe5(struct stripe_head *sh)
 		if (dev->written)
 			s.written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (blocked_rdev == NULL &&
+		if (s.blocked_rdev == NULL &&
 		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			blocked_rdev = rdev;
+			s.blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
 		clear_bit(R5_Insync, &dev->flags);
@@ -3088,15 +3085,15 @@ static void handle_stripe5(struct stripe_head *sh)
 	spin_unlock_irq(&conf->device_lock);
 	rcu_read_unlock();
 
-	if (unlikely(blocked_rdev)) {
+	if (unlikely(s.blocked_rdev)) {
 		if (s.syncing || s.expanding || s.expanded ||
 		    s.to_write || s.written) {
 			set_bit(STRIPE_HANDLE, &sh->state);
 			goto unlock;
 		}
 		/* There is nothing for the blocked_rdev to block */
-		rdev_dec_pending(blocked_rdev, conf->mddev);
-		blocked_rdev = NULL;
+		rdev_dec_pending(s.blocked_rdev, conf->mddev);
+		s.blocked_rdev = NULL;
 	}
 
 	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
@@ -3112,7 +3109,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * need to be failed
 	 */
 	if (s.failed > 1 && s.to_read+s.to_write+s.written)
-		handle_failed_stripe(conf, sh, &s, disks, &return_bi);
+		handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
 	if (s.failed > 1 && s.syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
@@ -3128,7 +3125,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	       !test_bit(R5_LOCKED, &dev->flags) &&
 	       test_bit(R5_UPTODATE, &dev->flags)) ||
 	       (s.failed == 1 && s.failed_num[0] == sh->pd_idx)))
-		handle_stripe_clean_event(conf, sh, disks, &return_bi);
+		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
@@ -3166,7 +3163,7 @@ static void handle_stripe5(struct stripe_head *sh)
 			}
 		}
 		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-			dec_preread_active = 1;
+			s.dec_preread_active = 1;
 	}
 
 	/* Now to consider new write requests and what else, if anything
@@ -3264,15 +3261,15 @@ static void handle_stripe5(struct stripe_head *sh)
  unlock:
 
 	/* wait for this device to become unblocked */
-	if (unlikely(blocked_rdev))
-		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
+	if (unlikely(s.blocked_rdev))
+		md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
 
 	if (s.ops_request)
 		raid_run_ops(sh, s.ops_request);
 
 	ops_run_io(sh, &s);
 
-	if (dec_preread_active) {
+	if (s.dec_preread_active) {
 		/* We delay this until after ops_run_io so that if make_request
 		 * is waiting on a flush, it won't continue until the writes
 		 * have actually been submitted.
@@ -3282,19 +3279,16 @@ static void handle_stripe5(struct stripe_head *sh)
 		    IO_THRESHOLD)
 			md_wakeup_thread(conf->mddev->thread);
 	}
-	return_io(return_bi);
+	return_io(s.return_bi);
 }
 
 static void handle_stripe6(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
-	struct bio *return_bi = NULL;
 	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
 	struct stripe_head_state s;
 	struct r5dev *dev, *pdev, *qdev;
-	mdk_rdev_t *blocked_rdev = NULL;
-	int dec_preread_active = 0;
 
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
 		"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
@@ -3345,9 +3339,9 @@ static void handle_stripe6(struct stripe_head *sh)
 		if (dev->written)
 			s.written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (blocked_rdev == NULL &&
+		if (s.blocked_rdev == NULL &&
 		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			blocked_rdev = rdev;
+			s.blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
 		clear_bit(R5_Insync, &dev->flags);
@@ -3376,15 +3370,15 @@ static void handle_stripe6(struct stripe_head *sh)
 	spin_unlock_irq(&conf->device_lock);
 	rcu_read_unlock();
 
-	if (unlikely(blocked_rdev)) {
+	if (unlikely(s.blocked_rdev)) {
 		if (s.syncing || s.expanding || s.expanded ||
 		    s.to_write || s.written) {
 			set_bit(STRIPE_HANDLE, &sh->state);
 			goto unlock;
 		}
 		/* There is nothing for the blocked_rdev to block */
-		rdev_dec_pending(blocked_rdev, conf->mddev);
-		blocked_rdev = NULL;
+		rdev_dec_pending(s.blocked_rdev, conf->mddev);
+		s.blocked_rdev = NULL;
 	}
 
 	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
@@ -3400,7 +3394,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	 * might need to be failed
 	 */
 	if (s.failed > 2 && s.to_read+s.to_write+s.written)
-		handle_failed_stripe(conf, sh, &s, disks, &return_bi);
+		handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
 	if (s.failed > 2 && s.syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
@@ -3425,7 +3419,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
 			     && test_bit(R5_UPTODATE, &qdev->flags)))))
-		handle_stripe_clean_event(conf, sh, disks, &return_bi);
+		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
@@ -3461,7 +3455,7 @@ static void handle_stripe6(struct stripe_head *sh)
 			}
 		}
 		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-			dec_preread_active = 1;
+			s.dec_preread_active = 1;
 	}
 
 	/* Now to consider new write requests and what else, if anything
@@ -3561,8 +3555,8 @@ static void handle_stripe6(struct stripe_head *sh)
  unlock:
 
 	/* wait for this device to become unblocked */
-	if (unlikely(blocked_rdev))
-		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
+	if (unlikely(s.blocked_rdev))
+		md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
 
 	if (s.ops_request)
 		raid_run_ops(sh, s.ops_request);
@@ -3570,7 +3564,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	ops_run_io(sh, &s);
 
 
-	if (dec_preread_active) {
+	if (s.dec_preread_active) {
 		/* We delay this until after ops_run_io so that if make_request
 		 * is waiting on a flush, it won't continue until the writes
 		 * have actually been submitted.
@@ -3581,7 +3575,7 @@ static void handle_stripe6(struct stripe_head *sh)
 			md_wakeup_thread(conf->mddev->thread);
 	}
 
-	return_io(return_bi);
+	return_io(s.return_bi);
 }
 
 static void handle_stripe(struct stripe_head *sh)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 05ac5cde370..68c500af110 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -243,8 +243,12 @@ struct stripe_head_state {
 	int locked, uptodate, to_read, to_write, failed, written;
 	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num[2];
-	unsigned long ops_request;
 	int p_failed, q_failed;
+	int dec_preread_active;
+	unsigned long ops_request;
+
+	struct bio *return_bi;
+	mdk_rdev_t *blocked_rdev;
 };
 
 /* Flags */
-- 
cgit v1.2.3-70-g09d2


From cc94015a9eac5d511fe9b716624d8fdf9c6e64b2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 Jul 2011 11:35:35 +1000
Subject: md/raid5: move stripe_head_state and more code into handle_stripe.

By defining the 'stripe_head_state' in 'handle_stripe', we can move
some common code out of handle_stripe[56]() and into handle_stripe.

The means that all accesses for stripe_head_state in handle_stripe[56]
need to be 's->' instead of 's.', but the compiler should inline
those functions and just use a direct stack reference, and future
patches while hoist most of this code up into handle_stripe()
so we will revert to "s.".

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 340 +++++++++++++++++++++++++----------------------------
 1 file changed, 158 insertions(+), 182 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bc15f48be78..36873cfc329 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2999,24 +2999,13 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
  *
  */
 
-static void handle_stripe5(struct stripe_head *sh)
+static void handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
-	struct stripe_head_state s;
 	struct r5dev *dev;
 	int prexor;
 
-	memset(&s, 0, sizeof(s));
-	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
-		 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
-		 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
-		 sh->reconstruct_state);
-
-	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
-	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
-
 	/* Now to look around and see what can be done */
 	rcu_read_lock();
 	spin_lock_irq(&conf->device_lock);
@@ -3039,25 +3028,28 @@ static void handle_stripe5(struct stripe_head *sh)
 			set_bit(R5_Wantfill, &dev->flags);
 
 		/* now count some things */
-		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
-		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
-		if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
+		if (test_bit(R5_LOCKED, &dev->flags))
+			s->locked++;
+		if (test_bit(R5_UPTODATE, &dev->flags))
+			s->uptodate++;
+		if (test_bit(R5_Wantcompute, &dev->flags))
+			s->compute++;
 
 		if (test_bit(R5_Wantfill, &dev->flags))
-			s.to_fill++;
+			s->to_fill++;
 		else if (dev->toread)
-			s.to_read++;
+			s->to_read++;
 		if (dev->towrite) {
-			s.to_write++;
+			s->to_write++;
 			if (!test_bit(R5_OVERWRITE, &dev->flags))
-				s.non_overwrite++;
+				s->non_overwrite++;
 		}
 		if (dev->written)
-			s.written++;
+			s->written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (s.blocked_rdev == NULL &&
+		if (s->blocked_rdev == NULL &&
 		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			s.blocked_rdev = rdev;
+			s->blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
 		clear_bit(R5_Insync, &dev->flags);
@@ -3078,62 +3070,62 @@ static void handle_stripe5(struct stripe_head *sh)
 		if (test_bit(R5_ReadError, &dev->flags))
 			clear_bit(R5_Insync, &dev->flags);
 		if (!test_bit(R5_Insync, &dev->flags)) {
-			s.failed++;
-			s.failed_num[0] = i;
+			s->failed++;
+			s->failed_num[0] = i;
 		}
 	}
 	spin_unlock_irq(&conf->device_lock);
 	rcu_read_unlock();
 
-	if (unlikely(s.blocked_rdev)) {
-		if (s.syncing || s.expanding || s.expanded ||
-		    s.to_write || s.written) {
+	if (unlikely(s->blocked_rdev)) {
+		if (s->syncing || s->expanding || s->expanded ||
+		    s->to_write || s->written) {
 			set_bit(STRIPE_HANDLE, &sh->state);
-			goto unlock;
+			return;
 		}
 		/* There is nothing for the blocked_rdev to block */
-		rdev_dec_pending(s.blocked_rdev, conf->mddev);
-		s.blocked_rdev = NULL;
+		rdev_dec_pending(s->blocked_rdev, conf->mddev);
+		s->blocked_rdev = NULL;
 	}
 
-	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
-		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
+	if (s->to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
+		set_bit(STRIPE_OP_BIOFILL, &s->ops_request);
 		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
 	}
 
 	pr_debug("locked=%d uptodate=%d to_read=%d"
 		" to_write=%d failed=%d failed_num=%d\n",
-		s.locked, s.uptodate, s.to_read, s.to_write,
-		s.failed, s.failed_num[0]);
+		s->locked, s->uptodate, s->to_read, s->to_write,
+		s->failed, s->failed_num[0]);
 	/* check if the array has lost two devices and, if so, some requests might
 	 * need to be failed
 	 */
-	if (s.failed > 1 && s.to_read+s.to_write+s.written)
-		handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
-	if (s.failed > 1 && s.syncing) {
+	if (s->failed > 1 && s->to_read+s->to_write+s->written)
+		handle_failed_stripe(conf, sh, s, disks, &s->return_bi);
+	if (s->failed > 1 && s->syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
-		s.syncing = 0;
+		s->syncing = 0;
 	}
 
 	/* might be able to return some write requests if the parity block
 	 * is safe, or on a failed drive
 	 */
 	dev = &sh->dev[sh->pd_idx];
-	if ( s.written &&
-	     ((test_bit(R5_Insync, &dev->flags) &&
-	       !test_bit(R5_LOCKED, &dev->flags) &&
-	       test_bit(R5_UPTODATE, &dev->flags)) ||
-	       (s.failed == 1 && s.failed_num[0] == sh->pd_idx)))
-		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+	if (s->written &&
+	    ((test_bit(R5_Insync, &dev->flags) &&
+	      !test_bit(R5_LOCKED, &dev->flags) &&
+	      test_bit(R5_UPTODATE, &dev->flags)) ||
+	     (s->failed == 1 && s->failed_num[0] == sh->pd_idx)))
+		handle_stripe_clean_event(conf, sh, disks, &s->return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
-	if (s.to_read || s.non_overwrite ||
-	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
-		handle_stripe_fill5(sh, &s, disks);
+	if (s->to_read || s->non_overwrite ||
+	    (s->syncing && (s->uptodate + s->compute < disks)) || s->expanding)
+		handle_stripe_fill5(sh, s, disks);
 
 	/* Now we check to see if any write operations have recently
 	 * completed
@@ -3158,12 +3150,12 @@ static void handle_stripe5(struct stripe_head *sh)
 				if (prexor)
 					continue;
 				if (!test_bit(R5_Insync, &dev->flags) ||
-				    (i == sh->pd_idx && s.failed == 0))
+				    (i == sh->pd_idx && s->failed == 0))
 					set_bit(STRIPE_INSYNC, &sh->state);
 			}
 		}
 		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-			s.dec_preread_active = 1;
+			s->dec_preread_active = 1;
 	}
 
 	/* Now to consider new write requests and what else, if anything
@@ -3172,8 +3164,8 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
 	 *    block.
 	 */
-	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying5(conf, sh, &s, disks);
+	if (s->to_write && !sh->reconstruct_state && !sh->check_state)
+		handle_stripe_dirtying5(conf, sh, s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
@@ -3181,12 +3173,13 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * dependent operations are in flight.
 	 */
 	if (sh->check_state ||
-	    (s.syncing && s.locked == 0 &&
+	    (s->syncing && s->locked == 0 &&
 	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
 	     !test_bit(STRIPE_INSYNC, &sh->state)))
-		handle_parity_checks5(conf, sh, &s, disks);
+		handle_parity_checks5(conf, sh, s, disks);
 
-	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+	if (s->syncing && s->locked == 0
+	    && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
 	}
@@ -3194,22 +3187,22 @@ static void handle_stripe5(struct stripe_head *sh)
 	/* If the failed drive is just a ReadError, then we might need to progress
 	 * the repair/check process
 	 */
-	if (s.failed == 1 && !conf->mddev->ro &&
-	    test_bit(R5_ReadError, &sh->dev[s.failed_num[0]].flags)
-	    && !test_bit(R5_LOCKED, &sh->dev[s.failed_num[0]].flags)
-	    && test_bit(R5_UPTODATE, &sh->dev[s.failed_num[0]].flags)
+	if (s->failed == 1 && !conf->mddev->ro &&
+	    test_bit(R5_ReadError, &sh->dev[s->failed_num[0]].flags)
+	    && !test_bit(R5_LOCKED, &sh->dev[s->failed_num[0]].flags)
+	    && test_bit(R5_UPTODATE, &sh->dev[s->failed_num[0]].flags)
 		) {
-		dev = &sh->dev[s.failed_num[0]];
+		dev = &sh->dev[s->failed_num[0]];
 		if (!test_bit(R5_ReWrite, &dev->flags)) {
 			set_bit(R5_Wantwrite, &dev->flags);
 			set_bit(R5_ReWrite, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
-			s.locked++;
+			s->locked++;
 		} else {
 			/* let's read it back */
 			set_bit(R5_Wantread, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
-			s.locked++;
+			s->locked++;
 		}
 	}
 
@@ -3227,7 +3220,7 @@ static void handle_stripe5(struct stripe_head *sh)
 					      &sh2->state))
 				atomic_inc(&conf->preread_active_stripes);
 			release_stripe(sh2);
-			goto unlock;
+			return;
 		}
 		if (sh2)
 			release_stripe(sh2);
@@ -3237,69 +3230,35 @@ static void handle_stripe5(struct stripe_head *sh)
 		for (i = conf->raid_disks; i--; ) {
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			s.locked++;
+			s->locked++;
 		}
 	}
 
-	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+	if (s->expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
 	    !sh->reconstruct_state) {
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
 		stripe_set_idx(sh->sector, conf, 0, sh);
-		schedule_reconstruction(sh, &s, 1, 1);
-	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
+		schedule_reconstruction(sh, s, 1, 1);
+	} else if (s->expanded && !sh->reconstruct_state && s->locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 	}
 
-	if (s.expanding && s.locked == 0 &&
+	if (s->expanding && s->locked == 0 &&
 	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
 		handle_stripe_expansion(conf, sh, NULL);
-
- unlock:
-
-	/* wait for this device to become unblocked */
-	if (unlikely(s.blocked_rdev))
-		md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
-
-	if (s.ops_request)
-		raid_run_ops(sh, s.ops_request);
-
-	ops_run_io(sh, &s);
-
-	if (s.dec_preread_active) {
-		/* We delay this until after ops_run_io so that if make_request
-		 * is waiting on a flush, it won't continue until the writes
-		 * have actually been submitted.
-		 */
-		atomic_dec(&conf->preread_active_stripes);
-		if (atomic_read(&conf->preread_active_stripes) <
-		    IO_THRESHOLD)
-			md_wakeup_thread(conf->mddev->thread);
-	}
-	return_io(s.return_bi);
 }
 
-static void handle_stripe6(struct stripe_head *sh)
+static void handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
 	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
-	struct stripe_head_state s;
 	struct r5dev *dev, *pdev, *qdev;
 
-	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
-		"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
-	       (unsigned long long)sh->sector, sh->state,
-	       atomic_read(&sh->count), pd_idx, qd_idx,
-	       sh->check_state, sh->reconstruct_state);
-	memset(&s, 0, sizeof(s));
-
-	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
-	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
 	/* Now to look around and see what can be done */
 
 	rcu_read_lock();
@@ -3320,28 +3279,30 @@ static void handle_stripe6(struct stripe_head *sh)
 			set_bit(R5_Wantfill, &dev->flags);
 
 		/* now count some things */
-		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
-		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
+		if (test_bit(R5_LOCKED, &dev->flags))
+			s->locked++;
+		if (test_bit(R5_UPTODATE, &dev->flags))
+			s->uptodate++;
 		if (test_bit(R5_Wantcompute, &dev->flags)) {
-			s.compute++;
-			BUG_ON(s.compute > 2);
+			s->compute++;
+			BUG_ON(s->compute > 2);
 		}
 
 		if (test_bit(R5_Wantfill, &dev->flags)) {
-			s.to_fill++;
+			s->to_fill++;
 		} else if (dev->toread)
-			s.to_read++;
+			s->to_read++;
 		if (dev->towrite) {
-			s.to_write++;
+			s->to_write++;
 			if (!test_bit(R5_OVERWRITE, &dev->flags))
-				s.non_overwrite++;
+				s->non_overwrite++;
 		}
 		if (dev->written)
-			s.written++;
+			s->written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (s.blocked_rdev == NULL &&
+		if (s->blocked_rdev == NULL &&
 		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			s.blocked_rdev = rdev;
+			s->blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
 		clear_bit(R5_Insync, &dev->flags);
@@ -3362,43 +3323,43 @@ static void handle_stripe6(struct stripe_head *sh)
 		if (test_bit(R5_ReadError, &dev->flags))
 			clear_bit(R5_Insync, &dev->flags);
 		if (!test_bit(R5_Insync, &dev->flags)) {
-			if (s.failed < 2)
-				s.failed_num[s.failed] = i;
-			s.failed++;
+			if (s->failed < 2)
+				s->failed_num[s->failed] = i;
+			s->failed++;
 		}
 	}
 	spin_unlock_irq(&conf->device_lock);
 	rcu_read_unlock();
 
-	if (unlikely(s.blocked_rdev)) {
-		if (s.syncing || s.expanding || s.expanded ||
-		    s.to_write || s.written) {
+	if (unlikely(s->blocked_rdev)) {
+		if (s->syncing || s->expanding || s->expanded ||
+		    s->to_write || s->written) {
 			set_bit(STRIPE_HANDLE, &sh->state);
-			goto unlock;
+			return;
 		}
 		/* There is nothing for the blocked_rdev to block */
-		rdev_dec_pending(s.blocked_rdev, conf->mddev);
-		s.blocked_rdev = NULL;
+		rdev_dec_pending(s->blocked_rdev, conf->mddev);
+		s->blocked_rdev = NULL;
 	}
 
-	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
-		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
+	if (s->to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
+		set_bit(STRIPE_OP_BIOFILL, &s->ops_request);
 		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
 	}
 
 	pr_debug("locked=%d uptodate=%d to_read=%d"
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
-	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
-	       s.failed_num[0], s.failed_num[1]);
+	       s->locked, s->uptodate, s->to_read, s->to_write, s->failed,
+	       s->failed_num[0], s->failed_num[1]);
 	/* check if the array has lost >2 devices and, if so, some requests
 	 * might need to be failed
 	 */
-	if (s.failed > 2 && s.to_read+s.to_write+s.written)
-		handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
-	if (s.failed > 2 && s.syncing) {
+	if (s->failed > 2 && s->to_read+s->to_write+s->written)
+		handle_failed_stripe(conf, sh, s, disks, &s->return_bi);
+	if (s->failed > 2 && s->syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
-		s.syncing = 0;
+		s->syncing = 0;
 	}
 
 	/*
@@ -3406,28 +3367,28 @@ static void handle_stripe6(struct stripe_head *sh)
 	 * are safe, or on a failed drive
 	 */
 	pdev = &sh->dev[pd_idx];
-	s.p_failed = (s.failed >= 1 && s.failed_num[0] == pd_idx)
-		|| (s.failed >= 2 && s.failed_num[1] == pd_idx);
+	s->p_failed = (s->failed >= 1 && s->failed_num[0] == pd_idx)
+		|| (s->failed >= 2 && s->failed_num[1] == pd_idx);
 	qdev = &sh->dev[qd_idx];
-	s.q_failed = (s.failed >= 1 && s.failed_num[0] == qd_idx)
-		|| (s.failed >= 2 && s.failed_num[1] == qd_idx);
+	s->q_failed = (s->failed >= 1 && s->failed_num[0] == qd_idx)
+		|| (s->failed >= 2 && s->failed_num[1] == qd_idx);
 
-	if (s.written &&
-	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+	if (s->written &&
+	    (s->p_failed || ((test_bit(R5_Insync, &pdev->flags)
 			     && !test_bit(R5_LOCKED, &pdev->flags)
 			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
-	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+	    (s->q_failed || ((test_bit(R5_Insync, &qdev->flags)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
 			     && test_bit(R5_UPTODATE, &qdev->flags)))))
-		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+		handle_stripe_clean_event(conf, sh, disks, &s->return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
-	if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
-	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
-		handle_stripe_fill6(sh, &s, disks);
+	if (s->to_read || s->non_overwrite || (s->to_write && s->failed) ||
+	    (s->syncing && (s->uptodate + s->compute < disks)) || s->expanding)
+		handle_stripe_fill6(sh, s, disks);
 
 	/* Now we check to see if any write operations have recently
 	 * completed
@@ -3450,12 +3411,12 @@ static void handle_stripe6(struct stripe_head *sh)
 				set_bit(R5_Wantwrite, &dev->flags);
 				if (!test_bit(R5_Insync, &dev->flags) ||
 				    ((i == sh->pd_idx || i == qd_idx) &&
-				      s.failed == 0))
+				      s->failed == 0))
 					set_bit(STRIPE_INSYNC, &sh->state);
 			}
 		}
 		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-			s.dec_preread_active = 1;
+			s->dec_preread_active = 1;
 	}
 
 	/* Now to consider new write requests and what else, if anything
@@ -3464,8 +3425,8 @@ static void handle_stripe6(struct stripe_head *sh)
 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
 	 *    block.
 	 */
-	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying6(conf, sh, &s, disks);
+	if (s->to_write && !sh->reconstruct_state && !sh->check_state)
+		handle_stripe_dirtying6(conf, sh, s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
@@ -3473,12 +3434,13 @@ static void handle_stripe6(struct stripe_head *sh)
 	 * dependent operations are in flight.
 	 */
 	if (sh->check_state ||
-	    (s.syncing && s.locked == 0 &&
+	    (s->syncing && s->locked == 0 &&
 	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
 	     !test_bit(STRIPE_INSYNC, &sh->state)))
-		handle_parity_checks6(conf, sh, &s, disks);
+		handle_parity_checks6(conf, sh, s, disks);
 
-	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+	if (s->syncing && s->locked == 0
+	    && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
 	}
@@ -3486,9 +3448,9 @@ static void handle_stripe6(struct stripe_head *sh)
 	/* If the failed drives are just a ReadError, then we might need
 	 * to progress the repair/check process
 	 */
-	if (s.failed <= 2 && !conf->mddev->ro)
-		for (i = 0; i < s.failed; i++) {
-			dev = &sh->dev[s.failed_num[i]];
+	if (s->failed <= 2 && !conf->mddev->ro)
+		for (i = 0; i < s->failed; i++) {
+			dev = &sh->dev[s->failed_num[i]];
 			if (test_bit(R5_ReadError, &dev->flags)
 			    && !test_bit(R5_LOCKED, &dev->flags)
 			    && test_bit(R5_UPTODATE, &dev->flags)
@@ -3497,12 +3459,12 @@ static void handle_stripe6(struct stripe_head *sh)
 					set_bit(R5_Wantwrite, &dev->flags);
 					set_bit(R5_ReWrite, &dev->flags);
 					set_bit(R5_LOCKED, &dev->flags);
-					s.locked++;
+					s->locked++;
 				} else {
 					/* let's read it back */
 					set_bit(R5_Wantread, &dev->flags);
 					set_bit(R5_LOCKED, &dev->flags);
-					s.locked++;
+					s->locked++;
 				}
 			}
 		}
@@ -3514,11 +3476,11 @@ static void handle_stripe6(struct stripe_head *sh)
 		for (i = conf->raid_disks; i--; ) {
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			s.locked++;
+			s->locked++;
 		}
 	}
 
-	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+	if (s->expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
 	    !sh->reconstruct_state) {
 		struct stripe_head *sh2
 			= get_active_stripe(conf, sh->sector, 1, 1, 1);
@@ -3532,7 +3494,7 @@ static void handle_stripe6(struct stripe_head *sh)
 					      &sh2->state))
 				atomic_inc(&conf->preread_active_stripes);
 			release_stripe(sh2);
-			goto unlock;
+			return;
 		}
 		if (sh2)
 			release_stripe(sh2);
@@ -3540,19 +3502,54 @@ static void handle_stripe6(struct stripe_head *sh)
 		/* Need to write out all blocks after computing P&Q */
 		sh->disks = conf->raid_disks;
 		stripe_set_idx(sh->sector, conf, 0, sh);
-		schedule_reconstruction(sh, &s, 1, 1);
-	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
+		schedule_reconstruction(sh, s, 1, 1);
+	} else if (s->expanded && !sh->reconstruct_state && s->locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 	}
 
-	if (s.expanding && s.locked == 0 &&
+	if (s->expanding && s->locked == 0 &&
 	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
-		handle_stripe_expansion(conf, sh, &s);
+		handle_stripe_expansion(conf, sh, s);
+}
+
+static void handle_stripe(struct stripe_head *sh)
+{
+	struct stripe_head_state s;
+	raid5_conf_t *conf = sh->raid_conf;
+
+	clear_bit(STRIPE_HANDLE, &sh->state);
+	if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
+		/* already being handled, ensure it gets handled
+		 * again when current action finishes */
+		set_bit(STRIPE_HANDLE, &sh->state);
+		return;
+	}
+
+	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+		set_bit(STRIPE_SYNCING, &sh->state);
+		clear_bit(STRIPE_INSYNC, &sh->state);
+	}
+	clear_bit(STRIPE_DELAYED, &sh->state);
+
+	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
+		"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
+	       (unsigned long long)sh->sector, sh->state,
+	       atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
+	       sh->check_state, sh->reconstruct_state);
+	memset(&s, 0, sizeof(s));
+
+	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+
+	if (conf->level == 6)
+		handle_stripe6(sh, &s);
+	else
+		handle_stripe5(sh, &s);
 
- unlock:
 
 	/* wait for this device to become unblocked */
 	if (unlikely(s.blocked_rdev))
@@ -3576,28 +3573,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	}
 
 	return_io(s.return_bi);
-}
 
-static void handle_stripe(struct stripe_head *sh)
-{
-	clear_bit(STRIPE_HANDLE, &sh->state);
-	if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
-		/* already being handled, ensure it gets handled
-		 * again when current action finishes */
-		set_bit(STRIPE_HANDLE, &sh->state);
-		return;
-	}
-
-	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
-		set_bit(STRIPE_SYNCING, &sh->state);
-		clear_bit(STRIPE_INSYNC, &sh->state);
-	}
-	clear_bit(STRIPE_DELAYED, &sh->state);
-
-	if (sh->raid_conf->level == 6)
-		handle_stripe6(sh);
-	else
-		handle_stripe5(sh);
 	clear_bit(STRIPE_ACTIVE, &sh->state);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 86c374ba9f6726a79a032ede741dc66d219b166e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: Remove stripe_head_state arg from handle_stripe_expansion.

This arg is only used to differentiate between RAID5 and RAID6 but
that is not needed.  For RAID5, raid5_compute_sector will set qd_idx
to "~0" so j with certainly not equals qd_idx, so there is no need
for a guard on that condition.

So remove the guard and remove the arg from the declaration and
callers of handle_stripe_expansion.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 36873cfc329..793dd76aeae 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2921,8 +2921,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 	}
 }
 
-static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
-				    struct stripe_head_state *r6s)
+static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
 {
 	int i;
 
@@ -2964,7 +2963,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
 			for (j = 0; j < conf->raid_disks; j++)
 				if (j != sh2->pd_idx &&
-				    (!r6s || j != sh2->qd_idx) &&
+				    j != sh2->qd_idx &&
 				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
 					break;
 			if (j == conf->raid_disks) {
@@ -3249,7 +3248,7 @@ static void handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 
 	if (s->expanding && s->locked == 0 &&
 	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
-		handle_stripe_expansion(conf, sh, NULL);
+		handle_stripe_expansion(conf, sh);
 }
 
 static void handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
@@ -3512,7 +3511,7 @@ static void handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 
 	if (s->expanding && s->locked == 0 &&
 	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
-		handle_stripe_expansion(conf, sh, s);
+		handle_stripe_expansion(conf, sh);
 }
 
 static void handle_stripe(struct stripe_head *sh)
-- 
cgit v1.2.3-70-g09d2


From 3687c061886dd0bfec07e131ad12f916ef0abc62 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: Move code for finishing a reconstruction into
 handle_stripe.

Prior to commit ab69ae12ceef7 the code in handle_stripe5 and
handle_stripe6 to "Finish reconstruct operations initiated by the
expansion process" was identical.
That commit added an identical stanza of code to each function, but in
different places.  That was careless.

The raid5 code was correct, so move that out into handle_stripe and
remove raid6 version.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 153 ++++++++++++++++++++---------------------------------
 1 file changed, 57 insertions(+), 96 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 793dd76aeae..cd6f04f145e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2998,7 +2998,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
  *
  */
 
-static void handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
+static int handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
@@ -3080,7 +3080,7 @@ static void handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 		if (s->syncing || s->expanding || s->expanded ||
 		    s->to_write || s->written) {
 			set_bit(STRIPE_HANDLE, &sh->state);
-			return;
+			return 1;
 		}
 		/* There is nothing for the blocked_rdev to block */
 		rdev_dec_pending(s->blocked_rdev, conf->mddev);
@@ -3204,54 +3204,10 @@ static void handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 			s->locked++;
 		}
 	}
-
-	/* Finish reconstruct operations initiated by the expansion process */
-	if (sh->reconstruct_state == reconstruct_state_result) {
-		struct stripe_head *sh2
-			= get_active_stripe(conf, sh->sector, 1, 1, 1);
-		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
-			/* sh cannot be written until sh2 has been read.
-			 * so arrange for sh to be delayed a little
-			 */
-			set_bit(STRIPE_DELAYED, &sh->state);
-			set_bit(STRIPE_HANDLE, &sh->state);
-			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
-					      &sh2->state))
-				atomic_inc(&conf->preread_active_stripes);
-			release_stripe(sh2);
-			return;
-		}
-		if (sh2)
-			release_stripe(sh2);
-
-		sh->reconstruct_state = reconstruct_state_idle;
-		clear_bit(STRIPE_EXPANDING, &sh->state);
-		for (i = conf->raid_disks; i--; ) {
-			set_bit(R5_Wantwrite, &sh->dev[i].flags);
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			s->locked++;
-		}
-	}
-
-	if (s->expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
-	    !sh->reconstruct_state) {
-		/* Need to write out all blocks after computing parity */
-		sh->disks = conf->raid_disks;
-		stripe_set_idx(sh->sector, conf, 0, sh);
-		schedule_reconstruction(sh, s, 1, 1);
-	} else if (s->expanded && !sh->reconstruct_state && s->locked == 0) {
-		clear_bit(STRIPE_EXPAND_READY, &sh->state);
-		atomic_dec(&conf->reshape_stripes);
-		wake_up(&conf->wait_for_overlap);
-		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
-	}
-
-	if (s->expanding && s->locked == 0 &&
-	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
-		handle_stripe_expansion(conf, sh);
+	return 0;
 }
 
-static void handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
+static int handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
@@ -3334,7 +3290,7 @@ static void handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 		if (s->syncing || s->expanding || s->expanded ||
 		    s->to_write || s->written) {
 			set_bit(STRIPE_HANDLE, &sh->state);
-			return;
+			return 1;
 		}
 		/* There is nothing for the blocked_rdev to block */
 		rdev_dec_pending(s->blocked_rdev, conf->mddev);
@@ -3467,56 +3423,14 @@ static void handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 				}
 			}
 		}
-
-	/* Finish reconstruct operations initiated by the expansion process */
-	if (sh->reconstruct_state == reconstruct_state_result) {
-		sh->reconstruct_state = reconstruct_state_idle;
-		clear_bit(STRIPE_EXPANDING, &sh->state);
-		for (i = conf->raid_disks; i--; ) {
-			set_bit(R5_Wantwrite, &sh->dev[i].flags);
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			s->locked++;
-		}
-	}
-
-	if (s->expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
-	    !sh->reconstruct_state) {
-		struct stripe_head *sh2
-			= get_active_stripe(conf, sh->sector, 1, 1, 1);
-		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
-			/* sh cannot be written until sh2 has been read.
-			 * so arrange for sh to be delayed a little
-			 */
-			set_bit(STRIPE_DELAYED, &sh->state);
-			set_bit(STRIPE_HANDLE, &sh->state);
-			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
-					      &sh2->state))
-				atomic_inc(&conf->preread_active_stripes);
-			release_stripe(sh2);
-			return;
-		}
-		if (sh2)
-			release_stripe(sh2);
-
-		/* Need to write out all blocks after computing P&Q */
-		sh->disks = conf->raid_disks;
-		stripe_set_idx(sh->sector, conf, 0, sh);
-		schedule_reconstruction(sh, s, 1, 1);
-	} else if (s->expanded && !sh->reconstruct_state && s->locked == 0) {
-		clear_bit(STRIPE_EXPAND_READY, &sh->state);
-		atomic_dec(&conf->reshape_stripes);
-		wake_up(&conf->wait_for_overlap);
-		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
-	}
-
-	if (s->expanding && s->locked == 0 &&
-	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
-		handle_stripe_expansion(conf, sh);
+	return 0;
 }
 
 static void handle_stripe(struct stripe_head *sh)
 {
 	struct stripe_head_state s;
+	int done;
+	int i;
 	raid5_conf_t *conf = sh->raid_conf;
 
 	clear_bit(STRIPE_HANDLE, &sh->state);
@@ -3545,11 +3459,58 @@ static void handle_stripe(struct stripe_head *sh)
 	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
 
 	if (conf->level == 6)
-		handle_stripe6(sh, &s);
+		done = handle_stripe6(sh, &s);
 	else
-		handle_stripe5(sh, &s);
+		done = handle_stripe5(sh, &s);
+
+	if (done)
+		goto finish;
+	/* Finish reconstruct operations initiated by the expansion process */
+	if (sh->reconstruct_state == reconstruct_state_result) {
+		struct stripe_head *sh_src
+			= get_active_stripe(conf, sh->sector, 1, 1, 1);
+		if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
+			/* sh cannot be written until sh_src has been read.
+			 * so arrange for sh to be delayed a little
+			 */
+			set_bit(STRIPE_DELAYED, &sh->state);
+			set_bit(STRIPE_HANDLE, &sh->state);
+			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+					      &sh_src->state))
+				atomic_inc(&conf->preread_active_stripes);
+			release_stripe(sh_src);
+			goto finish;
+		}
+		if (sh_src)
+			release_stripe(sh_src);
+
+		sh->reconstruct_state = reconstruct_state_idle;
+		clear_bit(STRIPE_EXPANDING, &sh->state);
+		for (i = conf->raid_disks; i--; ) {
+			set_bit(R5_Wantwrite, &sh->dev[i].flags);
+			set_bit(R5_LOCKED, &sh->dev[i].flags);
+			s.locked++;
+		}
+	}
 
+	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+	    !sh->reconstruct_state) {
+		/* Need to write out all blocks after computing parity */
+		sh->disks = conf->raid_disks;
+		stripe_set_idx(sh->sector, conf, 0, sh);
+		schedule_reconstruction(sh, &s, 1, 1);
+	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
+		clear_bit(STRIPE_EXPAND_READY, &sh->state);
+		atomic_dec(&conf->reshape_stripes);
+		wake_up(&conf->wait_for_overlap);
+		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+	}
+
+	if (s.expanding && s.locked == 0 &&
+	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
+		handle_stripe_expansion(conf, sh);
 
+finish:
 	/* wait for this device to become unblocked */
 	if (unlikely(s.blocked_rdev))
 		md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
-- 
cgit v1.2.3-70-g09d2


From c5a3100062cf277d3edd4e6f4a1f1e403524b464 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: move more code into common handle_stripe

The difference between the RAID5 and RAID6 code here is easily
resolved using conf->max_degraded.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 90 +++++++++++++++++++-----------------------------------
 1 file changed, 32 insertions(+), 58 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cd6f04f145e..74a575bbd25 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3176,34 +3176,6 @@ static int handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
 	     !test_bit(STRIPE_INSYNC, &sh->state)))
 		handle_parity_checks5(conf, sh, s, disks);
-
-	if (s->syncing && s->locked == 0
-	    && test_bit(STRIPE_INSYNC, &sh->state)) {
-		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
-		clear_bit(STRIPE_SYNCING, &sh->state);
-	}
-
-	/* If the failed drive is just a ReadError, then we might need to progress
-	 * the repair/check process
-	 */
-	if (s->failed == 1 && !conf->mddev->ro &&
-	    test_bit(R5_ReadError, &sh->dev[s->failed_num[0]].flags)
-	    && !test_bit(R5_LOCKED, &sh->dev[s->failed_num[0]].flags)
-	    && test_bit(R5_UPTODATE, &sh->dev[s->failed_num[0]].flags)
-		) {
-		dev = &sh->dev[s->failed_num[0]];
-		if (!test_bit(R5_ReWrite, &dev->flags)) {
-			set_bit(R5_Wantwrite, &dev->flags);
-			set_bit(R5_ReWrite, &dev->flags);
-			set_bit(R5_LOCKED, &dev->flags);
-			s->locked++;
-		} else {
-			/* let's read it back */
-			set_bit(R5_Wantread, &dev->flags);
-			set_bit(R5_LOCKED, &dev->flags);
-			s->locked++;
-		}
-	}
 	return 0;
 }
 
@@ -3393,36 +3365,6 @@ static int handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
 	     !test_bit(STRIPE_INSYNC, &sh->state)))
 		handle_parity_checks6(conf, sh, s, disks);
-
-	if (s->syncing && s->locked == 0
-	    && test_bit(STRIPE_INSYNC, &sh->state)) {
-		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
-		clear_bit(STRIPE_SYNCING, &sh->state);
-	}
-
-	/* If the failed drives are just a ReadError, then we might need
-	 * to progress the repair/check process
-	 */
-	if (s->failed <= 2 && !conf->mddev->ro)
-		for (i = 0; i < s->failed; i++) {
-			dev = &sh->dev[s->failed_num[i]];
-			if (test_bit(R5_ReadError, &dev->flags)
-			    && !test_bit(R5_LOCKED, &dev->flags)
-			    && test_bit(R5_UPTODATE, &dev->flags)
-				) {
-				if (!test_bit(R5_ReWrite, &dev->flags)) {
-					set_bit(R5_Wantwrite, &dev->flags);
-					set_bit(R5_ReWrite, &dev->flags);
-					set_bit(R5_LOCKED, &dev->flags);
-					s->locked++;
-				} else {
-					/* let's read it back */
-					set_bit(R5_Wantread, &dev->flags);
-					set_bit(R5_LOCKED, &dev->flags);
-					s->locked++;
-				}
-			}
-		}
 	return 0;
 }
 
@@ -3465,6 +3407,38 @@ static void handle_stripe(struct stripe_head *sh)
 
 	if (done)
 		goto finish;
+
+
+	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+		clear_bit(STRIPE_SYNCING, &sh->state);
+	}
+
+	/* If the failed drives are just a ReadError, then we might need
+	 * to progress the repair/check process
+	 */
+	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
+		for (i = 0; i < s.failed; i++) {
+			struct r5dev *dev = &sh->dev[s.failed_num[i]];
+			if (test_bit(R5_ReadError, &dev->flags)
+			    && !test_bit(R5_LOCKED, &dev->flags)
+			    && test_bit(R5_UPTODATE, &dev->flags)
+				) {
+				if (!test_bit(R5_ReWrite, &dev->flags)) {
+					set_bit(R5_Wantwrite, &dev->flags);
+					set_bit(R5_ReWrite, &dev->flags);
+					set_bit(R5_LOCKED, &dev->flags);
+					s.locked++;
+				} else {
+					/* let's read it back */
+					set_bit(R5_Wantread, &dev->flags);
+					set_bit(R5_LOCKED, &dev->flags);
+					s.locked++;
+				}
+			}
+		}
+
+
 	/* Finish reconstruct operations initiated by the expansion process */
 	if (sh->reconstruct_state == reconstruct_state_result) {
 		struct stripe_head *sh_src
-- 
cgit v1.2.3-70-g09d2


From 5d35e09cae47bbae2739f432658860680de21866 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: rearrange a test in fetch_block6.

Next patch will unite fetch_block5 and fetch_block6.
First I want to make the differences a little more clear.

For RAID6 if we are writing at all and there is a failed device, then
we need to load or compute every block so we can do a
reconstruct-write.
This case isn't needed for RAID5 - we will do a read-modify-write in
that case.
So make that test a separate test in fetch_block6 rather than merged
with two other tests.

Make a similar change in fetch_block5 so the one bit that is not
needed for RAID6 is clearly separate.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 74a575bbd25..a3d7cd96cfb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2326,9 +2326,8 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
 	    (dev->toread ||
 	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
 	     s->syncing || s->expanding ||
-	     (s->failed &&
-	      (failed_dev->toread ||
-	       (failed_dev->towrite &&
+	     (s->failed && failed_dev->toread) ||
+	     (s->failed && failed_dev->towrite &&
 		!test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
 		/* We would like to get this block, possibly by computing it,
 		 * otherwise read it if the backing disk is insync
@@ -2399,10 +2398,9 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
 	    (dev->toread ||
 	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
 	     s->syncing || s->expanding ||
-	     (s->failed >= 1 &&
-	      (fdev[0]->toread || s->to_write)) ||
-	     (s->failed >= 2 &&
-	      (fdev[1]->toread || s->to_write)))) {
+	     (s->failed >= 1 && fdev[0]->toread) ||
+	     (s->failed >= 2 && fdev[1]->toread) ||
+	     (s->failed && s->to_write)) {
 		/* we would like to get this block, possibly by computing it,
 		 * otherwise read it if the backing disk is insync
 		 */
-- 
cgit v1.2.3-70-g09d2


From 93b3dbce6456a79c545b45e86ccc2244e923cc99 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: unite fetch_block5 and fetch_block6

Provided that ->failed_num[1] is not a valid device number (which is
easily achieved) fetch_block6 provides all the functionality of
fetch_block5.

So remove the latter and rename the former to simply "fetch_block".

Then handle_stripe_fill5 and handle_stripe_fill6 become the same and
can similarly be united.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 107 ++++++++++++-----------------------------------------
 1 file changed, 23 insertions(+), 84 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a3d7cd96cfb..a63a679105c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2308,91 +2308,20 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 			md_wakeup_thread(conf->mddev->thread);
 }
 
-/* fetch_block5 - checks the given member device to see if its data needs
+/* fetch_block - checks the given member device to see if its data needs
  * to be read or computed to satisfy a request.
  *
  * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill5 to continue
+ * 0 to tell the loop in handle_stripe_fill to continue
  */
-static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
-			int disk_idx, int disks)
-{
-	struct r5dev *dev = &sh->dev[disk_idx];
-	struct r5dev *failed_dev = &sh->dev[s->failed_num[0]];
-
-	/* is the data in this block needed, and can we get it? */
-	if (!test_bit(R5_LOCKED, &dev->flags) &&
-	    !test_bit(R5_UPTODATE, &dev->flags) &&
-	    (dev->toread ||
-	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-	     s->syncing || s->expanding ||
-	     (s->failed && failed_dev->toread) ||
-	     (s->failed && failed_dev->towrite &&
-		!test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
-		/* We would like to get this block, possibly by computing it,
-		 * otherwise read it if the backing disk is insync
-		 */
-		if ((s->uptodate == disks - 1) &&
-		    (s->failed && disk_idx == s->failed_num[0])) {
-			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
-			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
-			set_bit(R5_Wantcompute, &dev->flags);
-			sh->ops.target = disk_idx;
-			sh->ops.target2 = -1;
-			s->req_compute = 1;
-			/* Careful: from this point on 'uptodate' is in the eye
-			 * of raid_run_ops which services 'compute' operations
-			 * before writes. R5_Wantcompute flags a block that will
-			 * be R5_UPTODATE by the time it is needed for a
-			 * subsequent operation.
-			 */
-			s->uptodate++;
-			return 1; /* uptodate + compute == disks */
-		} else if (test_bit(R5_Insync, &dev->flags)) {
-			set_bit(R5_LOCKED, &dev->flags);
-			set_bit(R5_Wantread, &dev->flags);
-			s->locked++;
-			pr_debug("Reading block %d (sync=%d)\n", disk_idx,
-				s->syncing);
-		}
-	}
-
-	return 0;
-}
-
-/**
- * handle_stripe_fill5 - read or compute data to satisfy pending requests.
- */
-static void handle_stripe_fill5(struct stripe_head *sh,
-			struct stripe_head_state *s, int disks)
-{
-	int i;
-
-	/* look for blocks to read/compute, skip this if a compute
-	 * is already in flight, or if the stripe contents are in the
-	 * midst of changing due to a write
-	 */
-	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
-	    !sh->reconstruct_state)
-		for (i = disks; i--; )
-			if (fetch_block5(sh, s, i, disks))
-				break;
-	set_bit(STRIPE_HANDLE, &sh->state);
-}
-
-/* fetch_block6 - checks the given member device to see if its data needs
- * to be read or computed to satisfy a request.
- *
- * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill6 to continue
- */
-static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
-			int disk_idx, int disks)
+static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
+		       int disk_idx, int disks)
 {
 	struct r5dev *dev = &sh->dev[disk_idx];
 	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
 				  &sh->dev[s->failed_num[1]] };
 
+	/* is the data in this block needed, and can we get it? */
 	if (!test_bit(R5_LOCKED, &dev->flags) &&
 	    !test_bit(R5_UPTODATE, &dev->flags) &&
 	    (dev->toread ||
@@ -2400,7 +2329,9 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
 	     s->syncing || s->expanding ||
 	     (s->failed >= 1 && fdev[0]->toread) ||
 	     (s->failed >= 2 && fdev[1]->toread) ||
-	     (s->failed && s->to_write)) {
+	     (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
+	      !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
+	     (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
 		/* we would like to get this block, possibly by computing it,
 		 * otherwise read it if the backing disk is insync
 		 */
@@ -2420,6 +2351,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
 			sh->ops.target = disk_idx;
 			sh->ops.target2 = -1; /* no 2nd target */
 			s->req_compute = 1;
+			/* Careful: from this point on 'uptodate' is in the eye
+			 * of raid_run_ops which services 'compute' operations
+			 * before writes. R5_Wantcompute flags a block that will
+			 * be R5_UPTODATE by the time it is needed for a
+			 * subsequent operation.
+			 */
 			s->uptodate++;
 			return 1;
 		} else if (s->uptodate == disks-2 && s->failed >= 2) {
@@ -2460,11 +2397,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
 }
 
 /**
- * handle_stripe_fill6 - read or compute data to satisfy pending requests.
+ * handle_stripe_fill - read or compute data to satisfy pending requests.
  */
-static void handle_stripe_fill6(struct stripe_head *sh,
-			struct stripe_head_state *s,
-			int disks)
+static void handle_stripe_fill(struct stripe_head *sh,
+			       struct stripe_head_state *s,
+			       int disks)
 {
 	int i;
 
@@ -2475,7 +2412,7 @@ static void handle_stripe_fill6(struct stripe_head *sh,
 	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
 	    !sh->reconstruct_state)
 		for (i = disks; i--; )
-			if (fetch_block6(sh, s, i, disks))
+			if (fetch_block(sh, s, i, disks))
 				break;
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
@@ -3122,7 +3059,7 @@ static int handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 	 */
 	if (s->to_read || s->non_overwrite ||
 	    (s->syncing && (s->uptodate + s->compute < disks)) || s->expanding)
-		handle_stripe_fill5(sh, s, disks);
+		handle_stripe_fill(sh, s, disks);
 
 	/* Now we check to see if any write operations have recently
 	 * completed
@@ -3313,7 +3250,7 @@ static int handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 	 */
 	if (s->to_read || s->non_overwrite || (s->to_write && s->failed) ||
 	    (s->syncing && (s->uptodate + s->compute < disks)) || s->expanding)
-		handle_stripe_fill6(sh, s, disks);
+		handle_stripe_fill(sh, s, disks);
 
 	/* Now we check to see if any write operations have recently
 	 * completed
@@ -3397,6 +3334,8 @@ static void handle_stripe(struct stripe_head *sh)
 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
 	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+	s.failed_num[0] = -1;
+	s.failed_num[1] = -1;
 
 	if (conf->level == 6)
 		done = handle_stripe6(sh, &s);
-- 
cgit v1.2.3-70-g09d2


From c8ac1803ff0af5aa614587ac0c66d46b7a3bdfcc Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: unite handle_stripe_dirtying5 and handle_stripe_dirtying6

RAID6 is only allowed to choose 'reconstruct-write' while RAID5 is
also allow 'read-modify-write'
Apart from this difference, handle_stripe_dirtying[56] are nearly
identical.  So resolve these differences and create just one function.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 77 +++++++++++++++---------------------------------------
 1 file changed, 21 insertions(+), 56 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a63a679105c..a3018970d6a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2468,11 +2468,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
 			md_wakeup_thread(conf->mddev->thread);
 }
 
-static void handle_stripe_dirtying5(raid5_conf_t *conf,
-		struct stripe_head *sh,	struct stripe_head_state *s, int disks)
+static void handle_stripe_dirtying(raid5_conf_t *conf,
+				   struct stripe_head *sh,
+				   struct stripe_head_state *s,
+				   int disks)
 {
 	int rmw = 0, rcw = 0, i;
-	for (i = disks; i--; ) {
+	if (conf->max_degraded == 2) {
+		/* RAID6 requires 'rcw' in current implementation
+		 * Calculate the real rcw later - for now fake it
+		 * look like rcw is cheaper
+		 */
+		rcw = 1; rmw = 2;
+	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
 		if ((dev->towrite || i == sh->pd_idx) &&
@@ -2519,16 +2527,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
 				}
 			}
 		}
-	if (rcw <= rmw && rcw > 0)
+	if (rcw <= rmw && rcw > 0) {
 		/* want reconstruct write, but need to get some data */
+		rcw = 0;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
-			    i != sh->pd_idx &&
+			    i != sh->pd_idx && i != sh->qd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
-			    test_bit(R5_Wantcompute, &dev->flags)) &&
-			    test_bit(R5_Insync, &dev->flags)) {
+			      test_bit(R5_Wantcompute, &dev->flags))) {
+				rcw++;
+				if (!test_bit(R5_Insync, &dev->flags))
+					continue; /* it's a failed drive */
 				if (
 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 					pr_debug("Read_old block "
@@ -2542,6 +2553,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
 				}
 			}
 		}
+	}
 	/* now if nothing is locked, and if we have enough data,
 	 * we can start a write request
 	 */
@@ -2558,53 +2570,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
 		schedule_reconstruction(sh, s, rcw == 0, 0);
 }
 
-static void handle_stripe_dirtying6(raid5_conf_t *conf,
-		struct stripe_head *sh,	struct stripe_head_state *s,
-		int disks)
-{
-	int rcw = 0, pd_idx = sh->pd_idx, i;
-	int qd_idx = sh->qd_idx;
-
-	set_bit(STRIPE_HANDLE, &sh->state);
-	for (i = disks; i--; ) {
-		struct r5dev *dev = &sh->dev[i];
-		/* check if we haven't enough data */
-		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
-		    i != pd_idx && i != qd_idx &&
-		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		      test_bit(R5_Wantcompute, &dev->flags))) {
-			rcw++;
-			if (!test_bit(R5_Insync, &dev->flags))
-				continue; /* it's a failed drive */
-
-			if (
-			  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				pr_debug("Read_old stripe %llu "
-					"block %d for Reconstruct\n",
-				     (unsigned long long)sh->sector, i);
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantread, &dev->flags);
-				s->locked++;
-			} else {
-				pr_debug("Request delayed stripe %llu "
-					"block %d for Reconstruct\n",
-				     (unsigned long long)sh->sector, i);
-				set_bit(STRIPE_DELAYED, &sh->state);
-				set_bit(STRIPE_HANDLE, &sh->state);
-			}
-		}
-	}
-	/* now if nothing is locked, and if we have enough data, we can start a
-	 * write request
-	 */
-	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
-	    s->locked == 0 && rcw == 0 &&
-	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-		schedule_reconstruction(sh, s, 1, 0);
-	}
-}
-
 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 				struct stripe_head_state *s, int disks)
 {
@@ -3099,7 +3064,7 @@ static int handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 	 *    block.
 	 */
 	if (s->to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying5(conf, sh, s, disks);
+		handle_stripe_dirtying(conf, sh, s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
@@ -3288,7 +3253,7 @@ static int handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 	 *    block.
 	 */
 	if (s->to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying6(conf, sh, s, disks);
+		handle_stripe_dirtying(conf, sh, s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
-- 
cgit v1.2.3-70-g09d2


From 84789554e96c0263ad8aa9be91397ece1f88c768 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: move more common code into handle_stripe

Apart from 'prexor' which can only be set for RAID5, and
'qd_idx' which can only be meaningful for RAID6, these two
chunks of code are nearly the same.

So combine them into one adding a test to call either
handle_parity_checks5 or handle_parity_checks6 as appropriate.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 161 ++++++++++++++++++++---------------------------------
 1 file changed, 61 insertions(+), 100 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a3018970d6a..cf60b15b4e3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1756,7 +1756,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 	/*
 	 * Select the parity disk based on the user selected algorithm.
 	 */
-	pd_idx = qd_idx = ~0;
+	pd_idx = qd_idx = -1;
 	switch(conf->level) {
 	case 4:
 		pd_idx = data_disks;
@@ -2903,7 +2903,6 @@ static int handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	struct r5dev *dev;
-	int prexor;
 
 	/* Now to look around and see what can be done */
 	rcu_read_lock();
@@ -3026,56 +3025,6 @@ static int handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 	    (s->syncing && (s->uptodate + s->compute < disks)) || s->expanding)
 		handle_stripe_fill(sh, s, disks);
 
-	/* Now we check to see if any write operations have recently
-	 * completed
-	 */
-	prexor = 0;
-	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
-		prexor = 1;
-	if (sh->reconstruct_state == reconstruct_state_drain_result ||
-	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
-		sh->reconstruct_state = reconstruct_state_idle;
-
-		/* All the 'written' buffers and the parity block are ready to
-		 * be written back to disk
-		 */
-		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
-		for (i = disks; i--; ) {
-			dev = &sh->dev[i];
-			if (test_bit(R5_LOCKED, &dev->flags) &&
-				(i == sh->pd_idx || dev->written)) {
-				pr_debug("Writing block %d\n", i);
-				set_bit(R5_Wantwrite, &dev->flags);
-				if (prexor)
-					continue;
-				if (!test_bit(R5_Insync, &dev->flags) ||
-				    (i == sh->pd_idx && s->failed == 0))
-					set_bit(STRIPE_INSYNC, &sh->state);
-			}
-		}
-		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-			s->dec_preread_active = 1;
-	}
-
-	/* Now to consider new write requests and what else, if anything
-	 * should be read.  We do not handle new writes when:
-	 * 1/ A 'write' operation (copy+xor) is already in flight.
-	 * 2/ A 'check' operation is in flight, as it may clobber the parity
-	 *    block.
-	 */
-	if (s->to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying(conf, sh, s, disks);
-
-	/* maybe we need to check and possibly fix the parity for this stripe
-	 * Any reads will already have been scheduled, so we just see if enough
-	 * data is available.  The parity check is held off while parity
-	 * dependent operations are in flight.
-	 */
-	if (sh->check_state ||
-	    (s->syncing && s->locked == 0 &&
-	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
-	     !test_bit(STRIPE_INSYNC, &sh->state)))
-		handle_parity_checks5(conf, sh, s, disks);
 	return 0;
 }
 
@@ -3217,54 +3166,6 @@ static int handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 	    (s->syncing && (s->uptodate + s->compute < disks)) || s->expanding)
 		handle_stripe_fill(sh, s, disks);
 
-	/* Now we check to see if any write operations have recently
-	 * completed
-	 */
-	if (sh->reconstruct_state == reconstruct_state_drain_result) {
-
-		sh->reconstruct_state = reconstruct_state_idle;
-		/* All the 'written' buffers and the parity blocks are ready to
-		 * be written back to disk
-		 */
-		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
-		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
-		for (i = disks; i--; ) {
-			dev = &sh->dev[i];
-			if (test_bit(R5_LOCKED, &dev->flags) &&
-			    (i == sh->pd_idx || i == qd_idx ||
-			     dev->written)) {
-				pr_debug("Writing block %d\n", i);
-				BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
-				set_bit(R5_Wantwrite, &dev->flags);
-				if (!test_bit(R5_Insync, &dev->flags) ||
-				    ((i == sh->pd_idx || i == qd_idx) &&
-				      s->failed == 0))
-					set_bit(STRIPE_INSYNC, &sh->state);
-			}
-		}
-		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-			s->dec_preread_active = 1;
-	}
-
-	/* Now to consider new write requests and what else, if anything
-	 * should be read.  We do not handle new writes when:
-	 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
-	 * 2/ A 'check' operation is in flight, as it may clobber the parity
-	 *    block.
-	 */
-	if (s->to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying(conf, sh, s, disks);
-
-	/* maybe we need to check and possibly fix the parity for this stripe
-	 * Any reads will already have been scheduled, so we just see if enough
-	 * data is available.  The parity check is held off while parity
-	 * dependent operations are in flight.
-	 */
-	if (sh->check_state ||
-	    (s->syncing && s->locked == 0 &&
-	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
-	     !test_bit(STRIPE_INSYNC, &sh->state)))
-		handle_parity_checks6(conf, sh, s, disks);
 	return 0;
 }
 
@@ -3273,6 +3174,8 @@ static void handle_stripe(struct stripe_head *sh)
 	struct stripe_head_state s;
 	int done;
 	int i;
+	int prexor;
+	int disks = sh->disks;
 	raid5_conf_t *conf = sh->raid_conf;
 
 	clear_bit(STRIPE_HANDLE, &sh->state);
@@ -3310,6 +3213,64 @@ static void handle_stripe(struct stripe_head *sh)
 	if (done)
 		goto finish;
 
+	/* Now we check to see if any write operations have recently
+	 * completed
+	 */
+	prexor = 0;
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
+		prexor = 1;
+	if (sh->reconstruct_state == reconstruct_state_drain_result ||
+	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
+		sh->reconstruct_state = reconstruct_state_idle;
+
+		/* All the 'written' buffers and the parity block are ready to
+		 * be written back to disk
+		 */
+		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+		BUG_ON(sh->qd_idx >= 0 &&
+		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_LOCKED, &dev->flags) &&
+				(i == sh->pd_idx || i == sh->qd_idx ||
+				 dev->written)) {
+				pr_debug("Writing block %d\n", i);
+				set_bit(R5_Wantwrite, &dev->flags);
+				if (prexor)
+					continue;
+				if (!test_bit(R5_Insync, &dev->flags) ||
+				    ((i == sh->pd_idx || i == sh->qd_idx)  &&
+				     s.failed == 0))
+					set_bit(STRIPE_INSYNC, &sh->state);
+			}
+		}
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			s.dec_preread_active = 1;
+	}
+
+	/* Now to consider new write requests and what else, if anything
+	 * should be read.  We do not handle new writes when:
+	 * 1/ A 'write' operation (copy+xor) is already in flight.
+	 * 2/ A 'check' operation is in flight, as it may clobber the parity
+	 *    block.
+	 */
+	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
+		handle_stripe_dirtying(conf, sh, &s, disks);
+
+	/* maybe we need to check and possibly fix the parity for this stripe
+	 * Any reads will already have been scheduled, so we just see if enough
+	 * data is available.  The parity check is held off while parity
+	 * dependent operations are in flight.
+	 */
+	if (sh->check_state ||
+	    (s.syncing && s.locked == 0 &&
+	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
+	     !test_bit(STRIPE_INSYNC, &sh->state))) {
+		if (conf->level == 6)
+			handle_parity_checks6(conf, sh, &s, disks);
+		else
+			handle_parity_checks5(conf, sh, &s, disks);
+	}
 
 	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
-- 
cgit v1.2.3-70-g09d2


From 474af965fe0005b334cabdb2904a7d712c21489b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: move some more common code into handle_stripe

The RAID6 version of this code is usable for RAID5 providing:
  - we test "conf->max_degraded" rather than "2" as appropriate
  - we make sure s->failed_num[1] is meaningful (and not '-1')
    when s->failed > 1

The 'return 1' must become 'goto finish' in the new location.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 180 ++++++++++++++++++++---------------------------------
 1 file changed, 67 insertions(+), 113 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cf60b15b4e3..63acc51e840 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2968,63 +2968,14 @@ static int handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
 		if (test_bit(R5_ReadError, &dev->flags))
 			clear_bit(R5_Insync, &dev->flags);
 		if (!test_bit(R5_Insync, &dev->flags)) {
+			if (s->failed < 2)
+				s->failed_num[s->failed] = i;
 			s->failed++;
-			s->failed_num[0] = i;
 		}
 	}
 	spin_unlock_irq(&conf->device_lock);
 	rcu_read_unlock();
 
-	if (unlikely(s->blocked_rdev)) {
-		if (s->syncing || s->expanding || s->expanded ||
-		    s->to_write || s->written) {
-			set_bit(STRIPE_HANDLE, &sh->state);
-			return 1;
-		}
-		/* There is nothing for the blocked_rdev to block */
-		rdev_dec_pending(s->blocked_rdev, conf->mddev);
-		s->blocked_rdev = NULL;
-	}
-
-	if (s->to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
-		set_bit(STRIPE_OP_BIOFILL, &s->ops_request);
-		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
-	}
-
-	pr_debug("locked=%d uptodate=%d to_read=%d"
-		" to_write=%d failed=%d failed_num=%d\n",
-		s->locked, s->uptodate, s->to_read, s->to_write,
-		s->failed, s->failed_num[0]);
-	/* check if the array has lost two devices and, if so, some requests might
-	 * need to be failed
-	 */
-	if (s->failed > 1 && s->to_read+s->to_write+s->written)
-		handle_failed_stripe(conf, sh, s, disks, &s->return_bi);
-	if (s->failed > 1 && s->syncing) {
-		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
-		clear_bit(STRIPE_SYNCING, &sh->state);
-		s->syncing = 0;
-	}
-
-	/* might be able to return some write requests if the parity block
-	 * is safe, or on a failed drive
-	 */
-	dev = &sh->dev[sh->pd_idx];
-	if (s->written &&
-	    ((test_bit(R5_Insync, &dev->flags) &&
-	      !test_bit(R5_LOCKED, &dev->flags) &&
-	      test_bit(R5_UPTODATE, &dev->flags)) ||
-	     (s->failed == 1 && s->failed_num[0] == sh->pd_idx)))
-		handle_stripe_clean_event(conf, sh, disks, &s->return_bi);
-
-	/* Now we might consider reading some blocks, either to check/generate
-	 * parity, or to satisfy requests
-	 * or to load a block that is being partially written.
-	 */
-	if (s->to_read || s->non_overwrite ||
-	    (s->syncing && (s->uptodate + s->compute < disks)) || s->expanding)
-		handle_stripe_fill(sh, s, disks);
-
 	return 0;
 }
 
@@ -3032,8 +2983,8 @@ static int handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
-	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
-	struct r5dev *dev, *pdev, *qdev;
+	struct r5dev *dev;
+	int i;
 
 	/* Now to look around and see what can be done */
 
@@ -3107,65 +3058,6 @@ static int handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 	spin_unlock_irq(&conf->device_lock);
 	rcu_read_unlock();
 
-	if (unlikely(s->blocked_rdev)) {
-		if (s->syncing || s->expanding || s->expanded ||
-		    s->to_write || s->written) {
-			set_bit(STRIPE_HANDLE, &sh->state);
-			return 1;
-		}
-		/* There is nothing for the blocked_rdev to block */
-		rdev_dec_pending(s->blocked_rdev, conf->mddev);
-		s->blocked_rdev = NULL;
-	}
-
-	if (s->to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
-		set_bit(STRIPE_OP_BIOFILL, &s->ops_request);
-		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
-	}
-
-	pr_debug("locked=%d uptodate=%d to_read=%d"
-	       " to_write=%d failed=%d failed_num=%d,%d\n",
-	       s->locked, s->uptodate, s->to_read, s->to_write, s->failed,
-	       s->failed_num[0], s->failed_num[1]);
-	/* check if the array has lost >2 devices and, if so, some requests
-	 * might need to be failed
-	 */
-	if (s->failed > 2 && s->to_read+s->to_write+s->written)
-		handle_failed_stripe(conf, sh, s, disks, &s->return_bi);
-	if (s->failed > 2 && s->syncing) {
-		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
-		clear_bit(STRIPE_SYNCING, &sh->state);
-		s->syncing = 0;
-	}
-
-	/*
-	 * might be able to return some write requests if the parity blocks
-	 * are safe, or on a failed drive
-	 */
-	pdev = &sh->dev[pd_idx];
-	s->p_failed = (s->failed >= 1 && s->failed_num[0] == pd_idx)
-		|| (s->failed >= 2 && s->failed_num[1] == pd_idx);
-	qdev = &sh->dev[qd_idx];
-	s->q_failed = (s->failed >= 1 && s->failed_num[0] == qd_idx)
-		|| (s->failed >= 2 && s->failed_num[1] == qd_idx);
-
-	if (s->written &&
-	    (s->p_failed || ((test_bit(R5_Insync, &pdev->flags)
-			     && !test_bit(R5_LOCKED, &pdev->flags)
-			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
-	    (s->q_failed || ((test_bit(R5_Insync, &qdev->flags)
-			     && !test_bit(R5_LOCKED, &qdev->flags)
-			     && test_bit(R5_UPTODATE, &qdev->flags)))))
-		handle_stripe_clean_event(conf, sh, disks, &s->return_bi);
-
-	/* Now we might consider reading some blocks, either to check/generate
-	 * parity, or to satisfy requests
-	 * or to load a block that is being partially written.
-	 */
-	if (s->to_read || s->non_overwrite || (s->to_write && s->failed) ||
-	    (s->syncing && (s->uptodate + s->compute < disks)) || s->expanding)
-		handle_stripe_fill(sh, s, disks);
-
 	return 0;
 }
 
@@ -3173,10 +3065,11 @@ static void handle_stripe(struct stripe_head *sh)
 {
 	struct stripe_head_state s;
 	int done;
+	raid5_conf_t *conf = sh->raid_conf;
 	int i;
 	int prexor;
 	int disks = sh->disks;
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5dev *pdev, *qdev;
 
 	clear_bit(STRIPE_HANDLE, &sh->state);
 	if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
@@ -3213,6 +3106,67 @@ static void handle_stripe(struct stripe_head *sh)
 	if (done)
 		goto finish;
 
+	if (unlikely(s.blocked_rdev)) {
+		if (s.syncing || s.expanding || s.expanded ||
+		    s.to_write || s.written) {
+			set_bit(STRIPE_HANDLE, &sh->state);
+			goto finish;
+		}
+		/* There is nothing for the blocked_rdev to block */
+		rdev_dec_pending(s.blocked_rdev, conf->mddev);
+		s.blocked_rdev = NULL;
+	}
+
+	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
+		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
+		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
+	}
+
+	pr_debug("locked=%d uptodate=%d to_read=%d"
+	       " to_write=%d failed=%d failed_num=%d,%d\n",
+	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
+	       s.failed_num[0], s.failed_num[1]);
+	/* check if the array has lost more than max_degraded devices and,
+	 * if so, some requests might need to be failed.
+	 */
+	if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written)
+		handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+	if (s.failed > conf->max_degraded && s.syncing) {
+		md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
+		clear_bit(STRIPE_SYNCING, &sh->state);
+		s.syncing = 0;
+	}
+
+	/*
+	 * might be able to return some write requests if the parity blocks
+	 * are safe, or on a failed drive
+	 */
+	pdev = &sh->dev[sh->pd_idx];
+	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
+	qdev = &sh->dev[sh->qd_idx];
+	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
+		|| conf->level < 6;
+
+	if (s.written &&
+	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+			     && !test_bit(R5_LOCKED, &pdev->flags)
+			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
+	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+			     && !test_bit(R5_LOCKED, &qdev->flags)
+			     && test_bit(R5_UPTODATE, &qdev->flags)))))
+		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+
+	/* Now we might consider reading some blocks, either to check/generate
+	 * parity, or to satisfy requests
+	 * or to load a block that is being partially written.
+	 */
+	if (s.to_read || s.non_overwrite
+	    || (conf->level == 6 && s.to_write && s.failed)
+	    || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
+		handle_stripe_fill(sh, &s, disks);
+
 	/* Now we check to see if any write operations have recently
 	 * completed
 	 */
-- 
cgit v1.2.3-70-g09d2


From acfe726bdd0000a9be1b308b29fad1e9ae62178c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: finalise new merged handle_stripe.

handle_stripe5() and handle_stripe6() are now virtually identical.
So discard one and rename the other to 'analyse_stripe()'.

It always returns 0, so change it to 'void' and remove the 'done'
variable in handle_stripe().

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid5.c | 115 ++++++-----------------------------------------------
 1 file changed, 13 insertions(+), 102 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 63acc51e840..b321d6c3659 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2898,100 +2898,27 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
  *
  */
 
-static int handle_stripe5(struct stripe_head *sh, struct stripe_head_state *s)
-{
-	raid5_conf_t *conf = sh->raid_conf;
-	int disks = sh->disks, i;
-	struct r5dev *dev;
-
-	/* Now to look around and see what can be done */
-	rcu_read_lock();
-	spin_lock_irq(&conf->device_lock);
-	for (i=disks; i--; ) {
-		mdk_rdev_t *rdev;
-
-		dev = &sh->dev[i];
-
-		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
-			"written %p\n",	i, dev->flags, dev->toread, dev->read,
-			dev->towrite, dev->written);
-
-		/* maybe we can request a biofill operation
-		 *
-		 * new wantfill requests are only permitted while
-		 * ops_complete_biofill is guaranteed to be inactive
-		 */
-		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
-		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
-			set_bit(R5_Wantfill, &dev->flags);
-
-		/* now count some things */
-		if (test_bit(R5_LOCKED, &dev->flags))
-			s->locked++;
-		if (test_bit(R5_UPTODATE, &dev->flags))
-			s->uptodate++;
-		if (test_bit(R5_Wantcompute, &dev->flags))
-			s->compute++;
-
-		if (test_bit(R5_Wantfill, &dev->flags))
-			s->to_fill++;
-		else if (dev->toread)
-			s->to_read++;
-		if (dev->towrite) {
-			s->to_write++;
-			if (!test_bit(R5_OVERWRITE, &dev->flags))
-				s->non_overwrite++;
-		}
-		if (dev->written)
-			s->written++;
-		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (s->blocked_rdev == NULL &&
-		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			s->blocked_rdev = rdev;
-			atomic_inc(&rdev->nr_pending);
-		}
-		clear_bit(R5_Insync, &dev->flags);
-		if (!rdev)
-			/* Not in-sync */;
-		else if (test_bit(In_sync, &rdev->flags))
-			set_bit(R5_Insync, &dev->flags);
-		else {
-			/* could be in-sync depending on recovery/reshape status */
-			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
-				set_bit(R5_Insync, &dev->flags);
-		}
-		if (!test_bit(R5_Insync, &dev->flags)) {
-			/* The ReadError flag will just be confusing now */
-			clear_bit(R5_ReadError, &dev->flags);
-			clear_bit(R5_ReWrite, &dev->flags);
-		}
-		if (test_bit(R5_ReadError, &dev->flags))
-			clear_bit(R5_Insync, &dev->flags);
-		if (!test_bit(R5_Insync, &dev->flags)) {
-			if (s->failed < 2)
-				s->failed_num[s->failed] = i;
-			s->failed++;
-		}
-	}
-	spin_unlock_irq(&conf->device_lock);
-	rcu_read_unlock();
-
-	return 0;
-}
-
-static int handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
+static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
 	struct r5dev *dev;
 	int i;
 
-	/* Now to look around and see what can be done */
+	memset(s, 0, sizeof(*s));
 
+	s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+	s->failed_num[0] = -1;
+	s->failed_num[1] = -1;
+
+	/* Now to look around and see what can be done */
 	rcu_read_lock();
 	spin_lock_irq(&conf->device_lock);
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
+
 		dev = &sh->dev[i];
 
 		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
@@ -3015,9 +2942,9 @@ static int handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 			BUG_ON(s->compute > 2);
 		}
 
-		if (test_bit(R5_Wantfill, &dev->flags)) {
+		if (test_bit(R5_Wantfill, &dev->flags))
 			s->to_fill++;
-		} else if (dev->toread)
+		else if (dev->toread)
 			s->to_read++;
 		if (dev->towrite) {
 			s->to_write++;
@@ -3057,14 +2984,11 @@ static int handle_stripe6(struct stripe_head *sh, struct stripe_head_state *s)
 	}
 	spin_unlock_irq(&conf->device_lock);
 	rcu_read_unlock();
-
-	return 0;
 }
 
 static void handle_stripe(struct stripe_head *sh)
 {
 	struct stripe_head_state s;
-	int done;
 	raid5_conf_t *conf = sh->raid_conf;
 	int i;
 	int prexor;
@@ -3090,21 +3014,8 @@ static void handle_stripe(struct stripe_head *sh)
 	       (unsigned long long)sh->sector, sh->state,
 	       atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
 	       sh->check_state, sh->reconstruct_state);
-	memset(&s, 0, sizeof(s));
-
-	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
-	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
-	s.failed_num[0] = -1;
-	s.failed_num[1] = -1;
-
-	if (conf->level == 6)
-		done = handle_stripe6(sh, &s);
-	else
-		done = handle_stripe5(sh, &s);
 
-	if (done)
-		goto finish;
+	analyse_stripe(sh, &s);
 
 	if (unlikely(s.blocked_rdev)) {
 		if (s.syncing || s.expanding || s.expanded ||
-- 
cgit v1.2.3-70-g09d2


From 8bda470e8ebde35f9349e98ecbce4dfb508a60fa Mon Sep 17 00:00:00 2001
From: Christian Dietrich <christian.dietrich@informatik.uni-erlangen.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid: use printk_ratelimited instead of printk_ratelimit

As per printk_ratelimit comment, it should not be used.

Signed-off-by: Christian Dietrich <christian.dietrich@informatik.uni-erlangen.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c  | 25 +++++++++++++++----------
 drivers/md/raid10.c | 23 +++++++++++++----------
 drivers/md/raid5.c  | 47 +++++++++++++++++++++++++----------------------
 3 files changed, 53 insertions(+), 42 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f7431b6d844..d3a8f4bb4fc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -35,6 +35,7 @@
 #include <linux/delay.h>
 #include <linux/blkdev.h>
 #include <linux/seq_file.h>
+#include <linux/ratelimit.h>
 #include "md.h"
 #include "raid1.h"
 #include "bitmap.h"
@@ -287,10 +288,13 @@ static void raid1_end_read_request(struct bio *bio, int error)
 		 * oops, read error:
 		 */
 		char b[BDEVNAME_SIZE];
-		if (printk_ratelimit())
-			printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
-			       mdname(conf->mddev),
-			       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
+		printk_ratelimited(
+			KERN_ERR "md/raid1:%s: %s: "
+			"rescheduling sector %llu\n",
+			mdname(conf->mddev),
+			bdevname(conf->mirrors[mirror].rdev->bdev,
+				 b),
+			(unsigned long long)r1_bio->sector);
 		reschedule_retry(r1_bio);
 	}
 
@@ -1580,12 +1584,13 @@ static void raid1d(mddev_t *mddev)
 						      GFP_NOIO, mddev);
 				r1_bio->bios[r1_bio->read_disk] = bio;
 				rdev = conf->mirrors[disk].rdev;
-				if (printk_ratelimit())
-					printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
-					       " other mirror: %s\n",
-					       mdname(mddev),
-					       (unsigned long long)r1_bio->sector,
-					       bdevname(rdev->bdev,b));
+				printk_ratelimited(
+					KERN_ERR
+					"md/raid1:%s: redirecting sector %llu"
+					" to other mirror: %s\n",
+					mdname(mddev),
+					(unsigned long long)r1_bio->sector,
+					bdevname(rdev->bdev, b));
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_end_io = raid1_end_read_request;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3715e220e5e..1725ec1e1e8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,6 +22,7 @@
 #include <linux/delay.h>
 #include <linux/blkdev.h>
 #include <linux/seq_file.h>
+#include <linux/ratelimit.h>
 #include "md.h"
 #include "raid10.h"
 #include "raid0.h"
@@ -301,10 +302,11 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 * oops, read error - keep the refcount on the rdev
 		 */
 		char b[BDEVNAME_SIZE];
-		if (printk_ratelimit())
-			printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n",
-			       mdname(conf->mddev),
-			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
+		printk_ratelimited(KERN_ERR
+				   "md/raid10:%s: %s: rescheduling sector %llu\n",
+				   mdname(conf->mddev),
+				   bdevname(conf->mirrors[dev].rdev->bdev, b),
+				   (unsigned long long)r10_bio->sector);
 		reschedule_retry(r10_bio);
 	}
 }
@@ -1669,12 +1671,13 @@ static void raid10d(mddev_t *mddev)
 				bio_put(bio);
 				slot = r10_bio->read_slot;
 				rdev = conf->mirrors[mirror].rdev;
-				if (printk_ratelimit())
-					printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
-					       " another mirror\n",
-					       mdname(mddev),
-					       bdevname(rdev->bdev,b),
-					       (unsigned long long)r10_bio->sector);
+				printk_ratelimited(
+					KERN_ERR
+					"md/raid10:%s: %s: redirecting"
+					"sector %llu to another mirror\n",
+					mdname(mddev),
+					bdevname(rdev->bdev, b),
+					(unsigned long long)r10_bio->sector);
 				bio = bio_clone_mddev(r10_bio->master_bio,
 						      GFP_NOIO, mddev);
 				r10_bio->devs[slot].bio = bio;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b321d6c3659..467e8e1cd3d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -51,6 +51,7 @@
 #include <linux/seq_file.h>
 #include <linux/cpu.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include "md.h"
 #include "raid5.h"
 #include "raid0.h"
@@ -96,8 +97,6 @@
 #define __inline__
 #endif
 
-#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
-
 /*
  * We maintain a biased count of active stripes in the bottom 16 bits of
  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -1583,12 +1582,14 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			rdev = conf->disks[i].rdev;
-			printk_rl(KERN_INFO "md/raid:%s: read error corrected"
-				  " (%lu sectors at %llu on %s)\n",
-				  mdname(conf->mddev), STRIPE_SECTORS,
-				  (unsigned long long)(sh->sector
-						       + rdev->data_offset),
-				  bdevname(rdev->bdev, b));
+			printk_ratelimited(
+				KERN_INFO
+				"md/raid:%s: read error corrected"
+				" (%lu sectors at %llu on %s)\n",
+				mdname(conf->mddev), STRIPE_SECTORS,
+				(unsigned long long)(sh->sector
+						     + rdev->data_offset),
+				bdevname(rdev->bdev, b));
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
 		}
@@ -1602,22 +1603,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&rdev->read_errors);
 		if (conf->mddev->degraded >= conf->max_degraded)
-			printk_rl(KERN_WARNING
-				  "md/raid:%s: read error not correctable "
-				  "(sector %llu on %s).\n",
-				  mdname(conf->mddev),
-				  (unsigned long long)(sh->sector
-						       + rdev->data_offset),
-				  bdn);
+			printk_ratelimited(
+				KERN_WARNING
+				"md/raid:%s: read error not correctable "
+				"(sector %llu on %s).\n",
+				mdname(conf->mddev),
+				(unsigned long long)(sh->sector
+						     + rdev->data_offset),
+				bdn);
 		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
 			/* Oh, no!!! */
-			printk_rl(KERN_WARNING
-				  "md/raid:%s: read error NOT corrected!! "
-				  "(sector %llu on %s).\n",
-				  mdname(conf->mddev),
-				  (unsigned long long)(sh->sector
-						       + rdev->data_offset),
-				  bdn);
+			printk_ratelimited(
+				KERN_WARNING
+				"md/raid:%s: read error NOT corrected!! "
+				"(sector %llu on %s).\n",
+				mdname(conf->mddev),
+				(unsigned long long)(sh->sector
+						     + rdev->data_offset),
+				bdn);
 		else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
 			printk(KERN_WARNING
-- 
cgit v1.2.3-70-g09d2


From 36fad858a7404a9656122a9e560a224ae2a00979 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md: introduce link/unlink_rdev() helpers

There are places where sysfs links to rdev are handled
in a same way. Add the helper functions to consolidate
them.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    | 47 ++++++++++++++---------------------------------
 drivers/md/md.h    | 14 ++++++++++++++
 drivers/md/raid1.c | 15 +++++----------
 drivers/md/raid5.c | 10 +++-------
 4 files changed, 36 insertions(+), 50 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 91e31e260b4..0398dc42a95 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2459,7 +2459,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
 	char *e;
 	int err;
-	char nm[20];
 	int slot = simple_strtoul(buf, &e, 10);
 	if (strncmp(buf, "none", 4)==0)
 		slot = -1;
@@ -2482,8 +2481,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			hot_remove_disk(rdev->mddev, rdev->raid_disk);
 		if (err)
 			return err;
-		sprintf(nm, "rd%d", rdev->raid_disk);
-		sysfs_remove_link(&rdev->mddev->kobj, nm);
+		sysfs_unlink_rdev(rdev->mddev, rdev);
 		rdev->raid_disk = -1;
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
@@ -2522,8 +2520,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			return err;
 		} else
 			sysfs_notify_dirent_safe(rdev->sysfs_state);
-		sprintf(nm, "rd%d", rdev->raid_disk);
-		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+		if (sysfs_link_rdev(rdev->mddev, rdev))
 			/* failure here is OK */;
 		/* don't wakeup anyone, leave that to userspace. */
 	} else {
@@ -3149,15 +3146,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	}
 
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		char nm[20];
 		if (rdev->raid_disk < 0)
 			continue;
 		if (rdev->new_raid_disk >= mddev->raid_disks)
 			rdev->new_raid_disk = -1;
 		if (rdev->new_raid_disk == rdev->raid_disk)
 			continue;
-		sprintf(nm, "rd%d", rdev->raid_disk);
-		sysfs_remove_link(&mddev->kobj, nm);
+		sysfs_unlink_rdev(mddev, rdev);
 	}
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk < 0)
@@ -3168,11 +3163,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		if (rdev->raid_disk < 0)
 			clear_bit(In_sync, &rdev->flags);
 		else {
-			char nm[20];
-			sprintf(nm, "rd%d", rdev->raid_disk);
-			if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
-				printk("md: cannot register %s for %s after level change\n",
-				       nm, mdname(mddev));
+			if (sysfs_link_rdev(mddev, rdev))
+				printk(KERN_WARNING "md: cannot register rd%d"
+				       " for %s after level change\n",
+				       rdev->raid_disk, mdname(mddev));
 		}
 	}
 
@@ -4621,12 +4615,9 @@ int md_run(mddev_t *mddev)
 	smp_wmb();
 	mddev->ready = 1;
 	list_for_each_entry(rdev, &mddev->disks, same_set)
-		if (rdev->raid_disk >= 0) {
-			char nm[20];
-			sprintf(nm, "rd%d", rdev->raid_disk);
-			if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
+		if (rdev->raid_disk >= 0)
+			if (sysfs_link_rdev(mddev, rdev))
 				/* failure here is OK */;
-		}
 	
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	
@@ -4854,11 +4845,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 
 		list_for_each_entry(rdev, &mddev->disks, same_set)
-			if (rdev->raid_disk >= 0) {
-				char nm[20];
-				sprintf(nm, "rd%d", rdev->raid_disk);
-				sysfs_remove_link(&mddev->kobj, nm);
-			}
+			if (rdev->raid_disk >= 0)
+				sysfs_unlink_rdev(mddev, rdev);
 
 		set_capacity(disk, 0);
 		mutex_unlock(&mddev->open_mutex);
@@ -7077,9 +7065,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 		    atomic_read(&rdev->nr_pending)==0) {
 			if (mddev->pers->hot_remove_disk(
 				    mddev, rdev->raid_disk)==0) {
-				char nm[20];
-				sprintf(nm,"rd%d", rdev->raid_disk);
-				sysfs_remove_link(&mddev->kobj, nm);
+				sysfs_unlink_rdev(mddev, rdev);
 				rdev->raid_disk = -1;
 			}
 		}
@@ -7096,10 +7082,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 				rdev->recovery_offset = 0;
 				if (mddev->pers->
 				    hot_add_disk(mddev, rdev) == 0) {
-					char nm[20];
-					sprintf(nm, "rd%d", rdev->raid_disk);
-					if (sysfs_create_link(&mddev->kobj,
-							      &rdev->kobj, nm))
+					if (sysfs_link_rdev(mddev, rdev))
 						/* failure here is OK */;
 					spares++;
 					md_new_event(mddev);
@@ -7219,9 +7202,7 @@ void md_check_recovery(mddev_t *mddev)
 				    atomic_read(&rdev->nr_pending)==0) {
 					if (mddev->pers->hot_remove_disk(
 						    mddev, rdev->raid_disk)==0) {
-						char nm[20];
-						sprintf(nm,"rd%d", rdev->raid_disk);
-						sysfs_remove_link(&mddev->kobj, nm);
+						sysfs_unlink_rdev(mddev, rdev);
 						rdev->raid_disk = -1;
 					}
 				}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c26c7a08ae..6863f722cd2 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -413,6 +413,20 @@ static inline char * mdname (mddev_t * mddev)
 	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
 }
 
+static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	char nm[20];
+	sprintf(nm, "rd%d", rdev->raid_disk);
+	return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+}
+
+static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	char nm[20];
+	sprintf(nm, "rd%d", rdev->raid_disk);
+	sysfs_remove_link(&mddev->kobj, nm);
+}
+
 /*
  * iterates through some rdev ringlist. It's safe to remove the
  * current 'rdev'. Dont touch 'tmp' though.
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d3a8f4bb4fc..1d79a041db0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2159,18 +2159,13 @@ static int raid1_reshape(mddev_t *mddev)
 	for (d = d2 = 0; d < conf->raid_disks; d++) {
 		mdk_rdev_t *rdev = conf->mirrors[d].rdev;
 		if (rdev && rdev->raid_disk != d2) {
-			char nm[20];
-			sprintf(nm, "rd%d", rdev->raid_disk);
-			sysfs_remove_link(&mddev->kobj, nm);
+			sysfs_unlink_rdev(mddev, rdev);
 			rdev->raid_disk = d2;
-			sprintf(nm, "rd%d", rdev->raid_disk);
-			sysfs_remove_link(&mddev->kobj, nm);
-			if (sysfs_create_link(&mddev->kobj,
-					      &rdev->kobj, nm))
+			sysfs_unlink_rdev(mddev, rdev);
+			if (sysfs_link_rdev(mddev, rdev))
 				printk(KERN_WARNING
-				       "md/raid1:%s: cannot register "
-				       "%s\n",
-				       mdname(mddev), nm);
+				       "md/raid1:%s: cannot register rd%d\n",
+				       mdname(mddev), rdev->raid_disk);
 		}
 		if (rdev)
 			newmirrors[d2++].rdev = rdev;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 467e8e1cd3d..0cd591472e1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5152,16 +5152,14 @@ static int raid5_start_reshape(mddev_t *mddev)
 			if (rdev->raid_disk < 0 &&
 			    !test_bit(Faulty, &rdev->flags)) {
 				if (raid5_add_disk(mddev, rdev) == 0) {
-					char nm[20];
 					if (rdev->raid_disk
 					    >= conf->previous_raid_disks) {
 						set_bit(In_sync, &rdev->flags);
 						added_devices++;
 					} else
 						rdev->recovery_offset = 0;
-					sprintf(nm, "rd%d", rdev->raid_disk);
-					if (sysfs_create_link(&mddev->kobj,
-							      &rdev->kobj, nm))
+
+					if (sysfs_link_rdev(mddev, rdev))
 						/* Failure here is OK */;
 				}
 			} else if (rdev->raid_disk >= conf->previous_raid_disks
@@ -5257,9 +5255,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
 			     d++) {
 				mdk_rdev_t *rdev = conf->disks[d].rdev;
 				if (rdev && raid5_remove_disk(mddev, d) == 0) {
-					char nm[20];
-					sprintf(nm, "rd%d", rdev->raid_disk);
-					sysfs_remove_link(&mddev->kobj, nm);
+					sysfs_unlink_rdev(mddev, rdev);
 					rdev->raid_disk = -1;
 				}
 			}
-- 
cgit v1.2.3-70-g09d2


From ddd5115fe5594f5aae3c7f0008a5327bb1d19397 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: move rdev->corrected_errors counting

Read errors are considered to corrected if write-back and re-read
cycle is finished without further problems. Thus moving the rdev->
corrected_errors counting after the re-reading looks more reasonable
IMHO.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0cd591472e1..a81eca6434d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -547,10 +547,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
 			bi->bi_next = NULL;
-			if ((rw & WRITE) &&
-			    test_bit(R5_ReWrite, &sh->dev[i].flags))
-				atomic_add(STRIPE_SECTORS,
-					&rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
 			if (rw & WRITE)
@@ -1590,6 +1586,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				(unsigned long long)(sh->sector
 						     + rdev->data_offset),
 				bdevname(rdev->bdev, b));
+			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
 		}
-- 
cgit v1.2.3-70-g09d2


From 8cfa7b0f67b4d899efc7f39eb7e172fd79237811 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid5: Avoid BUG caused by multiple failures.

While preparing to write a stripe we keep the parity block or blocks
locked (R5_LOCKED) - towards the end of schedule_reconstruction.

If the array is discovered to have failed before this write completes
we can leave those blocks LOCKED, and init_stripe will notice that a
free stripe still has a locked block and will complain.

So clear the R5_LOCKED flag in handle_failed_stripe, and demote the
'BUG' to a 'WARN_ON'.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a81eca6434d..b874f42694e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -340,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 			       (unsigned long long)sh->sector, i, dev->toread,
 			       dev->read, dev->towrite, dev->written,
 			       test_bit(R5_LOCKED, &dev->flags));
-			BUG();
+			WARN_ON(1);
 		}
 		dev->flags = 0;
 		raid5_build_block(sh, i, previous);
@@ -2301,6 +2301,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 		if (bitmap_end)
 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 					STRIPE_SECTORS, 0, 0);
+		/* If we were in the middle of a write the parity block might
+		 * still be locked - so just clear all R5_LOCKED flags
+		 */
+		clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	}
 
 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
-- 
cgit v1.2.3-70-g09d2


From 34b343cff4354ab9864be83be88405fd53d928a0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:31:47 +1000
Subject: md: don't allow arrays to contain devices with bad blocks.

As no personality understand bad block lists yet, we must
reject any device that is known to contain bad blocks.
As the personalities get taught, these tests can be removed.

This only applies to raid1/raid5/raid10.
For linear/raid0/multipath/faulty the whole concept of bad blocks
doesn't mean anything so there is no point adding the checks.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid1.c  | 7 +++++++
 drivers/md/raid10.c | 8 ++++++++
 drivers/md/raid5.c  | 7 +++++++
 3 files changed, 22 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3cbf0ac2aaa..8db311d7cdd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1055,6 +1055,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
 
+	if (rdev->badblocks.count)
+		return -EINVAL;
+
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
@@ -1994,6 +1997,10 @@ static int run(mddev_t *mddev)
 			blk_queue_segment_boundary(mddev->queue,
 						   PAGE_CACHE_SIZE - 1);
 		}
+		if (rdev->badblocks.count) {
+			printk(KERN_ERR "md/raid1: Cannot handle bad blocks yet\n");
+			return -EINVAL;
+		}
 	}
 
 	mddev->degraded = 0;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5def27c28be..8aadd2f52dc 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1101,6 +1101,9 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
+	if (rdev->badblocks.count)
+		return -EINVAL;
+
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
 		 * very different from resync
@@ -2263,6 +2266,11 @@ static int run(mddev_t *mddev)
 				 (conf->raid_disks / conf->near_copies));
 
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
+
+		if (rdev->badblocks.count) {
+			printk(KERN_ERR "md/raid10: cannot handle bad blocks yet\n");
+			goto out_free_conf;
+		}
 		disk_idx = rdev->raid_disk;
 		if (disk_idx >= conf->raid_disks
 		    || disk_idx < 0)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b874f42694e..719445004dd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4667,6 +4667,10 @@ static int run(mddev_t *mddev)
 	 * 0 for a fully functional array, 1 or 2 for a degraded array.
 	 */
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->badblocks.count) {
+			printk(KERN_ERR "md/raid5: cannot handle bad blocks yet\n");
+			goto abort;
+		}
 		if (rdev->raid_disk < 0)
 			continue;
 		if (test_bit(In_sync, &rdev->flags)) {
@@ -4975,6 +4979,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
+	if (rdev->badblocks.count)
+		return -EINVAL;
+
 	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
-- 
cgit v1.2.3-70-g09d2


From de393cdea66cbd63c90725663f400c76faf1b255 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:31:48 +1000
Subject: md: make it easier to wait for bad blocks to be acknowledged.

It is only safe to choose not to write to a bad block if that bad
block is safely recorded in metadata - i.e. if it has been
'acknowledged'.

If it hasn't we need to wait for the acknowledgement.

We support that using rdev->blocked wait and
md_wait_for_blocked_rdev by introducing a new device flag
'BlockedBadBlock'.

This flag is only advisory.
It is cleared whenever we acknowledge a bad block, so that a waiter
can re-check the particular bad blocks that it is interested it.

It should be set by a caller when they find they need to wait.
This (set after test) is inherently racy, but as
md_wait_for_blocked_rdev already has a timeout, losing the race will
have minimal impact.

When we clear "Blocked" was also clear "BlockedBadBlocks" incase it
was set incorrectly (see above race).

We also modify the way we manage 'Blocked' to fit better with the new
handling of 'BlockedBadBlocks' and to make it consistent between
externally managed and internally managed metadata.   This requires
that each raidXd loop checks if the metadata needs to be written and
triggers a write (md_check_recovery) if needed.  Otherwise a queued
write request might cause raidXd to wait for the metadata to write,
and only that thread can write it.

Before writing metadata, we set FaultRecorded for all devices that
are Faulty, then after writing the metadata we clear Blocked for any
device for which the Fault was certainly Recorded.

The 'faulty' device flag now appears in sysfs if the device is faulty
*or* it has unacknowledged bad blocks.  So user-space which does not
understand bad blocks can continue to function correctly.
User space which does, should not assume a device is faulty until it
sees the 'faulty' flag, and then sees the list of unacknowledged bad
blocks is empty.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c     | 77 ++++++++++++++++++++++++++++++++++++-----------------
 drivers/md/md.h     | 25 +++++++++++++++--
 drivers/md/raid1.c  |  3 +++
 drivers/md/raid10.c |  3 +++
 drivers/md/raid5.c  |  4 +++
 5 files changed, 85 insertions(+), 27 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1520d18c5af..a6b6471da2b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2341,8 +2341,18 @@ repeat:
 	if (!mddev->persistent) {
 		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
 		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
-		if (!mddev->external)
+		if (!mddev->external) {
 			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+			list_for_each_entry(rdev, &mddev->disks, same_set) {
+				if (rdev->badblocks.changed) {
+					md_ack_all_badblocks(&rdev->badblocks);
+					md_error(mddev, rdev);
+				}
+				clear_bit(Blocked, &rdev->flags);
+				clear_bit(BlockedBadBlocks, &rdev->flags);
+				wake_up(&rdev->blocked_wait);
+			}
+		}
 		wake_up(&mddev->sb_wait);
 		return;
 	}
@@ -2399,9 +2409,12 @@ repeat:
 		mddev->events --;
 	}
 
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->badblocks.changed)
 			any_badblocks_changed++;
+		if (test_bit(Faulty, &rdev->flags))
+			set_bit(FaultRecorded, &rdev->flags);
+	}
 
 	sync_sbs(mddev, nospares);
 	spin_unlock_irq(&mddev->write_lock);
@@ -2458,9 +2471,15 @@ repeat:
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 
-	if (any_badblocks_changed)
-		list_for_each_entry(rdev, &mddev->disks, same_set)
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
+			clear_bit(Blocked, &rdev->flags);
+
+		if (any_badblocks_changed)
 			md_ack_all_badblocks(&rdev->badblocks);
+		clear_bit(BlockedBadBlocks, &rdev->flags);
+		wake_up(&rdev->blocked_wait);
+	}
 }
 
 /* words written to sysfs files may, or may not, be \n terminated.
@@ -2495,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page)
 	char *sep = "";
 	size_t len = 0;
 
-	if (test_bit(Faulty, &rdev->flags)) {
+	if (test_bit(Faulty, &rdev->flags) ||
+	    rdev->badblocks.unacked_exist) {
 		len+= sprintf(page+len, "%sfaulty",sep);
 		sep = ",";
 	}
@@ -2507,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)
 		len += sprintf(page+len, "%swrite_mostly",sep);
 		sep = ",";
 	}
-	if (test_bit(Blocked, &rdev->flags)) {
+	if (test_bit(Blocked, &rdev->flags) ||
+	    rdev->badblocks.unacked_exist) {
 		len += sprintf(page+len, "%sblocked", sep);
 		sep = ",";
 	}
@@ -2527,12 +2548,12 @@ static ssize_t
 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
 	/* can write
-	 *  faulty  - simulates and error
+	 *  faulty  - simulates an error
 	 *  remove  - disconnects the device
 	 *  writemostly - sets write_mostly
 	 *  -writemostly - clears write_mostly
-	 *  blocked - sets the Blocked flag
-	 *  -blocked - clears the Blocked flag
+	 *  blocked - sets the Blocked flags
+	 *  -blocked - clears the Blocked and possibly simulates an error
 	 *  insync - sets Insync providing device isn't active
 	 *  write_error - sets WriteErrorSeen
 	 *  -write_error - clears WriteErrorSeen
@@ -2562,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		set_bit(Blocked, &rdev->flags);
 		err = 0;
 	} else if (cmd_match(buf, "-blocked")) {
+		if (!test_bit(Faulty, &rdev->flags) &&
+		    test_bit(BlockedBadBlocks, &rdev->flags)) {
+			/* metadata handler doesn't understand badblocks,
+			 * so we need to fail the device
+			 */
+			md_error(rdev->mddev, rdev);
+		}
 		clear_bit(Blocked, &rdev->flags);
+		clear_bit(BlockedBadBlocks, &rdev->flags);
 		wake_up(&rdev->blocked_wait);
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
@@ -2881,7 +2910,11 @@ static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
 }
 static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
 {
-	return badblocks_store(&rdev->badblocks, page, len, 0);
+	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
+	/* Maybe that ack was all we needed */
+	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
+		wake_up(&rdev->blocked_wait);
+	return rv;
 }
 static struct rdev_sysfs_entry rdev_bad_blocks =
 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
@@ -6398,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!rdev || test_bit(Faulty, &rdev->flags))
 		return;
 
-	if (mddev->external)
-		set_bit(Blocked, &rdev->flags);
-/*
-	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
-		mdname(mddev),
-		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
-		__builtin_return_address(0),__builtin_return_address(1),
-		__builtin_return_address(2),__builtin_return_address(3));
-*/
-	if (!mddev->pers)
-		return;
-	if (!mddev->pers->error_handler)
+	if (!mddev->pers || !mddev->pers->error_handler)
 		return;
 	mddev->pers->error_handler(mddev,rdev);
 	if (mddev->degraded)
@@ -7286,8 +7308,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
 			if (rdev->raid_disk >= 0 &&
 			    !test_bit(In_sync, &rdev->flags) &&
-			    !test_bit(Faulty, &rdev->flags) &&
-			    !test_bit(Blocked, &rdev->flags))
+			    !test_bit(Faulty, &rdev->flags))
 				spares++;
 			if (rdev->raid_disk < 0
 			    && !test_bit(Faulty, &rdev->flags)) {
@@ -7533,7 +7554,8 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 {
 	sysfs_notify_dirent_safe(rdev->sysfs_state);
 	wait_event_timeout(rdev->blocked_wait,
-			   !test_bit(Blocked, &rdev->flags),
+			   !test_bit(Blocked, &rdev->flags) &&
+			   !test_bit(BlockedBadBlocks, &rdev->flags),
 			   msecs_to_jiffies(5000));
 	rdev_dec_pending(rdev, mddev);
 }
@@ -7779,6 +7801,8 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
 	}
 
 	bb->changed = 1;
+	if (!acknowledged)
+		bb->unacked_exist = 1;
 	write_sequnlock_irq(&bb->lock);
 
 	return rv;
@@ -7923,6 +7947,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
 				p[i] = BB_MAKE(start, len, 1);
 			}
 		}
+		bb->unacked_exist = 0;
 	}
 	write_sequnlock_irq(&bb->lock);
 }
@@ -7970,6 +7995,8 @@ retry:
 				(unsigned long long)s << bb->shift,
 				length << bb->shift);
 	}
+	if (unack && len == 0)
+		bb->unacked_exist = 0;
 
 	if (read_seqretry(&bb->lock, seq))
 		goto retry;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fa4b607854a..1e586bb4452 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -81,12 +81,29 @@ struct mdk_rdev_s
 #define	In_sync		2		/* device is in_sync with rest of array */
 #define	WriteMostly	4		/* Avoid reading if at all possible */
 #define	AutoDetected	7		/* added by auto-detect */
-#define Blocked		8		/* An error occurred on an externally
-					 * managed array, don't allow writes
+#define Blocked		8		/* An error occurred but has not yet
+					 * been acknowledged by the metadata
+					 * handler, so don't allow writes
 					 * until it is cleared */
 #define WriteErrorSeen	9		/* A write error has been seen on this
 					 * device
 					 */
+#define FaultRecorded	10		/* Intermediate state for clearing
+					 * Blocked.  The Fault is/will-be
+					 * recorded in the metadata, but that
+					 * metadata hasn't been stored safely
+					 * on disk yet.
+					 */
+#define BlockedBadBlocks 11		/* A writer is blocked because they
+					 * found an unacknowledged bad-block.
+					 * This can safely be cleared at any
+					 * time, and the writer will re-check.
+					 * It may be set at any time, and at
+					 * worst the writer will timeout and
+					 * re-check.  So setting it as
+					 * accurately as possible is good, but
+					 * not absolutely critical.
+					 */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;			/* descriptor index in the superblock */
@@ -124,6 +141,10 @@ struct mdk_rdev_s
 
 	struct badblocks {
 		int	count;		/* count of bad blocks */
+		int	unacked_exist;	/* there probably are unacknowledged
+					 * bad blocks.  This is only cleared
+					 * when a read discovers none
+					 */
 		int	shift;		/* shift from sectors to block size
 					 * a -ve shift means badblocks are
 					 * disabled.*/
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8c31c39b6f8..4d40d9d54a2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1059,6 +1059,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		conf->recovery_disabled = mddev->recovery_disabled;
 		return;
 	}
+	set_bit(Blocked, &rdev->flags);
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
 		unsigned long flags;
 		spin_lock_irqsave(&conf->device_lock, flags);
@@ -1751,6 +1752,8 @@ read_more:
 			generic_make_request(r1_bio->bios[r1_bio->read_disk]);
 		}
 		cond_resched();
+		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+			md_check_recovery(mddev);
 	}
 	blk_finish_plug(&plug);
 }
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8aadd2f52dc..fe6692e6221 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1021,6 +1021,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		 */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	}
+	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	printk(KERN_ALERT
@@ -1703,6 +1704,8 @@ static void raid10d(mddev_t *mddev)
 			}
 		}
 		cond_resched();
+		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+			md_check_recovery(mddev);
 	}
 	blk_finish_plug(&plug);
 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 719445004dd..304389ba5e2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1706,6 +1706,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		 */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	}
+	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	printk(KERN_ALERT
@@ -4143,6 +4144,9 @@ static void raid5d(mddev_t *mddev)
 		release_stripe(sh);
 		cond_resched();
 
+		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+			md_check_recovery(mddev);
+
 		spin_lock_irq(&conf->device_lock);
 	}
 	pr_debug("%d stripes handled\n", handled);
-- 
cgit v1.2.3-70-g09d2


From 31c176ecdf3563140e6395249eda51a18130d9f6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:39:22 +1000
Subject: md/raid5: avoid reading from known bad blocks.

There are two times that we might read in raid5:
1/ when a read request fits within a chunk on a single
   working device.
   In this case, if there is any bad block in the range of
   the read, we simply fail the cache-bypass read and
   perform the read though the stripe cache.

2/ when reading into the stripe cache.  In this case we
   mark as failed any device which has a bad block in that
   strip (1 page wide).
   Note that we will both avoid reading and avoid writing.
   This is correct (as we will never read from the block, there
   is no point writing), but not optimal (as writing could 'fix'
   the error) - that will be addressed later.

If we have not seen any write errors on the device yet, we treat a bad
block like a recent read error.  This will encourage an attempt to fix
the read error which will either generate a write error, or will
ensure good data is stored there.  We don't yet forget the bad block
in that case.  That comes later.

Now that we honour bad blocks when reading we can allow devices with
bad blocks into the array.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 46 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 14 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 304389ba5e2..a2d68389ee7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2923,6 +2923,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 	spin_lock_irq(&conf->device_lock);
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
+		sector_t first_bad;
+		int bad_sectors;
+		int is_bad = 0;
 
 		dev = &sh->dev[i];
 
@@ -2959,15 +2962,32 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 		if (dev->written)
 			s->written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (s->blocked_rdev == NULL &&
-		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			s->blocked_rdev = rdev;
-			atomic_inc(&rdev->nr_pending);
+		if (rdev) {
+			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+					     &first_bad, &bad_sectors);
+			if (s->blocked_rdev == NULL
+			    && (test_bit(Blocked, &rdev->flags)
+				|| is_bad < 0)) {
+				if (is_bad < 0)
+					set_bit(BlockedBadBlocks,
+						&rdev->flags);
+				s->blocked_rdev = rdev;
+				atomic_inc(&rdev->nr_pending);
+			}
 		}
 		clear_bit(R5_Insync, &dev->flags);
 		if (!rdev)
 			/* Not in-sync */;
-		else if (test_bit(In_sync, &rdev->flags))
+		else if (is_bad) {
+			/* also not in-sync */
+			if (!test_bit(WriteErrorSeen, &rdev->flags)) {
+				/* treat as in-sync, but with a read error
+				 * which we can now try to correct
+				 */
+				set_bit(R5_Insync, &dev->flags);
+				set_bit(R5_ReadError, &dev->flags);
+			}
+		} else if (test_bit(In_sync, &rdev->flags))
 			set_bit(R5_Insync, &dev->flags);
 		else {
 			/* in sync if before recovery_offset */
@@ -3471,6 +3491,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
 	if (rdev && test_bit(In_sync, &rdev->flags)) {
+		sector_t first_bad;
+		int bad_sectors;
+
 		atomic_inc(&rdev->nr_pending);
 		rcu_read_unlock();
 		raid_bio->bi_next = (void*)rdev;
@@ -3478,8 +3501,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
 		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
 		align_bi->bi_sector += rdev->data_offset;
 
-		if (!bio_fits_rdev(align_bi)) {
-			/* too big in some way */
+		if (!bio_fits_rdev(align_bi) ||
+		    is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+				&first_bad, &bad_sectors)) {
+			/* too big in some way, or has a known bad block */
 			bio_put(align_bi);
 			rdev_dec_pending(rdev, mddev);
 			return 0;
@@ -4671,10 +4696,6 @@ static int run(mddev_t *mddev)
 	 * 0 for a fully functional array, 1 or 2 for a degraded array.
 	 */
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		if (rdev->badblocks.count) {
-			printk(KERN_ERR "md/raid5: cannot handle bad blocks yet\n");
-			goto abort;
-		}
 		if (rdev->raid_disk < 0)
 			continue;
 		if (test_bit(In_sync, &rdev->flags)) {
@@ -4983,9 +5004,6 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (rdev->badblocks.count)
-		return -EINVAL;
-
 	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
-- 
cgit v1.2.3-70-g09d2


From 7f0da59bdc2f65795a57009d78f7753d3aea1de3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:39:22 +1000
Subject: md/raid5: use bad-block log to improve handling of uncorrectable read
 errors.

If we get an uncorrectable read error - record a bad block rather than
failing the device.
And if these errors (which may be due to known bad blocks) cause
recovery to be impossible, record a bad block on the recovering
devices, or abort the recovery.

As we might abort a recovery without failing a device we need to teach
RAID5 about recovery_disabled handling.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++-------
 drivers/md/raid5.h |  2 +-
 2 files changed, 53 insertions(+), 8 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a2d68389ee7..5fc621673e6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2232,9 +2232,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 			rcu_read_lock();
 			rdev = rcu_dereference(conf->disks[i].rdev);
 			if (rdev && test_bit(In_sync, &rdev->flags))
-				/* multiple read failures in one stripe */
-				md_error(conf->mddev, rdev);
+				atomic_inc(&rdev->nr_pending);
+			else
+				rdev = NULL;
 			rcu_read_unlock();
+			if (rdev) {
+				if (!rdev_set_badblocks(
+					    rdev,
+					    sh->sector,
+					    STRIPE_SECTORS, 0))
+					md_error(conf->mddev, rdev);
+				rdev_dec_pending(rdev, conf->mddev);
+			}
 		}
 		spin_lock_irq(&conf->device_lock);
 		/* fail all writes first */
@@ -2313,6 +2322,41 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 			md_wakeup_thread(conf->mddev->thread);
 }
 
+static void
+handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
+		   struct stripe_head_state *s)
+{
+	int abort = 0;
+	int i;
+
+	md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
+	clear_bit(STRIPE_SYNCING, &sh->state);
+	s->syncing = 0;
+	/* There is nothing more to do for sync/check/repair.
+	 * For recover we need to record a bad block on all
+	 * non-sync devices, or abort the recovery
+	 */
+	if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
+		return;
+	/* During recovery devices cannot be removed, so locking and
+	 * refcounting of rdevs is not needed
+	 */
+	for (i = 0; i < conf->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (!rdev
+		    || test_bit(Faulty, &rdev->flags)
+		    || test_bit(In_sync, &rdev->flags))
+			continue;
+		if (!rdev_set_badblocks(rdev, sh->sector,
+					STRIPE_SECTORS, 0))
+			abort = 1;
+	}
+	if (abort) {
+		conf->recovery_disabled = conf->mddev->recovery_disabled;
+		set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
+	}
+}
+
 /* fetch_block - checks the given member device to see if its data needs
  * to be read or computed to satisfy a request.
  *
@@ -3067,11 +3111,8 @@ static void handle_stripe(struct stripe_head *sh)
 	 */
 	if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written)
 		handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
-	if (s.failed > conf->max_degraded && s.syncing) {
-		md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
-		clear_bit(STRIPE_SYNCING, &sh->state);
-		s.syncing = 0;
-	}
+	if (s.failed > conf->max_degraded && s.syncing)
+		handle_failed_sync(conf, sh, &s);
 
 	/*
 	 * might be able to return some write requests if the parity blocks
@@ -4976,6 +5017,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 		 * isn't possible.
 		 */
 		if (!test_bit(Faulty, &rdev->flags) &&
+		    mddev->recovery_disabled != conf->recovery_disabled &&
 		    !has_failed(conf) &&
 		    number < conf->raid_disks) {
 			err = -EBUSY;
@@ -5004,6 +5046,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
+	if (mddev->recovery_disabled == conf->recovery_disabled)
+		return -EBUSY;
+
 	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 68c500af110..c5429d12363 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -399,7 +399,7 @@ struct raid5_private_data {
 					    * (fresh device added).
 					    * Cleared when a sync completes.
 					    */
-
+	int			recovery_disabled;
 	/* per cpu variables */
 	struct raid5_percpu {
 		struct page	*spare_page; /* Used when checking P/Q in raid6 */
-- 
cgit v1.2.3-70-g09d2


From bc2607f393bd4fb844c1886a02af929ca0372056 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:39:22 +1000
Subject: md/raid5: write errors should be recorded as bad blocks if possible.

When a write error is detected, don't mark the device as failed
immediately but rather record the fact for handle_stripe to deal with.

Handle_stripe then attempts to record a bad block.  Only if that fails
does the device get marked as faulty.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 33 +++++++++++++++++++++++++++++++--
 drivers/md/raid5.h | 18 ++++++++++--------
 2 files changed, 41 insertions(+), 10 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5fc621673e6..9768a7d6714 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1658,8 +1658,10 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		return;
 	}
 
-	if (!uptodate)
-		md_error(conf->mddev, conf->disks[i].rdev);
+	if (!uptodate) {
+		set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
+		set_bit(R5_WriteError, &sh->dev[i].flags);
+	}
 
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 	
@@ -3038,6 +3040,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
 				set_bit(R5_Insync, &dev->flags);
 		}
+		if (test_bit(R5_WriteError, &dev->flags)) {
+			clear_bit(R5_Insync, &dev->flags);
+			if (!test_bit(Faulty, &rdev->flags)) {
+				s->handle_bad_blocks = 1;
+				atomic_inc(&rdev->nr_pending);
+			} else
+				clear_bit(R5_WriteError, &dev->flags);
+		}
 		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
@@ -3086,6 +3096,11 @@ static void handle_stripe(struct stripe_head *sh)
 
 	analyse_stripe(sh, &s);
 
+	if (s.handle_bad_blocks) {
+		set_bit(STRIPE_HANDLE, &sh->state);
+		goto finish;
+	}
+
 	if (unlikely(s.blocked_rdev)) {
 		if (s.syncing || s.expanding || s.expanded ||
 		    s.to_write || s.written) {
@@ -3283,6 +3298,20 @@ finish:
 	if (unlikely(s.blocked_rdev))
 		md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
 
+	if (s.handle_bad_blocks)
+		for (i = disks; i--; ) {
+			mdk_rdev_t *rdev;
+			struct r5dev *dev = &sh->dev[i];
+			if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
+				/* We own a safe reference to the rdev */
+				rdev = conf->disks[i].rdev;
+				if (!rdev_set_badblocks(rdev, sh->sector,
+							STRIPE_SECTORS, 0))
+					md_error(conf->mddev, rdev);
+				rdev_dec_pending(rdev, conf->mddev);
+			}
+		}
+
 	if (s.ops_request)
 		raid_run_ops(sh, s.ops_request);
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index c5429d12363..8620cb67ae3 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -249,6 +249,7 @@ struct stripe_head_state {
 
 	struct bio *return_bi;
 	mdk_rdev_t *blocked_rdev;
+	int handle_bad_blocks;
 };
 
 /* Flags */
@@ -264,14 +265,15 @@ struct stripe_head_state {
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
 
 #define	R5_Expanded	10	/* This block now has post-expand data */
-#define	R5_Wantcompute	11 /* compute_block in progress treat as
-				    * uptodate
-				    */
-#define	R5_Wantfill	12 /* dev->toread contains a bio that needs
-				    * filling
-				    */
-#define R5_Wantdrain	13 /* dev->towrite needs to be drained */
-#define R5_WantFUA	14	/* Write should be FUA */
+#define	R5_Wantcompute	11	/* compute_block in progress treat as
+				 * uptodate
+				 */
+#define	R5_Wantfill	12	/* dev->toread contains a bio that needs
+				 * filling
+				 */
+#define	R5_Wantdrain	13	/* dev->towrite needs to be drained */
+#define	R5_WantFUA	14	/* Write should be FUA */
+#define	R5_WriteError	15	/* got a write error - need to record it */
 /*
  * Write method
  */
-- 
cgit v1.2.3-70-g09d2


From 73e92e51b7969ef5477dd28fe2ae4d77675896f4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:39:22 +1000
Subject: md/raid5.  Don't write to known bad block on doubtful devices.

If a device has seen write errors, don't write to any known
bad blocks on that device.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9768a7d6714..3fa3f20dc5f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -526,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			atomic_inc(&rdev->nr_pending);
 		rcu_read_unlock();
 
+		/* We have already checked bad blocks for reads.  Now
+		 * need to check for writes.
+		 */
+		while ((rw & WRITE) && rdev &&
+		       test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			int bad_sectors;
+			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+					      &first_bad, &bad_sectors);
+			if (!bad)
+				break;
+
+			if (bad < 0) {
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				if (!conf->mddev->external &&
+				    conf->mddev->flags) {
+					/* It is very unlikely, but we might
+					 * still need to write out the
+					 * bad block log - better give it
+					 * a chance*/
+					md_check_recovery(conf->mddev);
+				}
+				md_wait_for_blocked_rdev(rdev, conf->mddev);
+			} else {
+				/* Acknowledged bad block - skip the write */
+				rdev_dec_pending(rdev, conf->mddev);
+				rdev = NULL;
+			}
+		}
+
 		if (rdev) {
 			if (s->syncing || s->expanding || s->expanded)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
@@ -3317,7 +3347,6 @@ finish:
 
 	ops_run_io(sh, &s);
 
-
 	if (s.dec_preread_active) {
 		/* We delay this until after ops_run_io so that if make_request
 		 * is waiting on a flush, it won't continue until the writes
-- 
cgit v1.2.3-70-g09d2


From b84db560ead5417b5594349512baf8837959df4f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:39:23 +1000
Subject: md/raid5: Clear bad blocks on successful write.

On a successful write to a known bad block, flag the sh
so that raid5d can remove the known bad block from the list.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 19 ++++++++++++++++++-
 drivers/md/raid5.h |  1 +
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3fa3f20dc5f..dbae459fb02 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1675,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error)
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+	sector_t first_bad;
+	int bad_sectors;
 
 	for (i=0 ; i<disks; i++)
 		if (bi == &sh->dev[i].req)
@@ -1691,7 +1693,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
 	if (!uptodate) {
 		set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
 		set_bit(R5_WriteError, &sh->dev[i].flags);
-	}
+	} else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
+			       &first_bad, &bad_sectors))
+		set_bit(R5_MadeGood, &sh->dev[i].flags);
 
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 	
@@ -3078,6 +3082,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			} else
 				clear_bit(R5_WriteError, &dev->flags);
 		}
+		if (test_bit(R5_MadeGood, &dev->flags)) {
+			if (!test_bit(Faulty, &rdev->flags)) {
+				s->handle_bad_blocks = 1;
+				atomic_inc(&rdev->nr_pending);
+			} else
+				clear_bit(R5_MadeGood, &dev->flags);
+		}
 		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
@@ -3340,6 +3351,12 @@ finish:
 					md_error(conf->mddev, rdev);
 				rdev_dec_pending(rdev, conf->mddev);
 			}
+			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
+				rdev = conf->disks[i].rdev;
+				rdev_clear_badblocks(rdev, sh->sector,
+						     STRIPE_SECTORS);
+				rdev_dec_pending(rdev, conf->mddev);
+			}
 		}
 
 	if (s.ops_request)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 8620cb67ae3..11b9566184b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -274,6 +274,7 @@ struct stripe_head_state {
 #define	R5_Wantdrain	13	/* dev->towrite needs to be drained */
 #define	R5_WantFUA	14	/* Write should be FUA */
 #define	R5_WriteError	15	/* got a write error - need to record it */
+#define	R5_MadeGood	16	/* A bad block has been fixed by writing to it*/
 /*
  * Write method
  */
-- 
cgit v1.2.3-70-g09d2