From 63fe08177f92ce21929df8af6361491b98ad12cd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 14 Apr 2009 12:01:53 +1000
Subject: md: tiny md.h cleanups

- update inclusion guard and make sure it covers the whole file
 - remove superflous #ifdef CONFIG_BLOCK
 - make sure all required headers are included so that new users aren't
   required to include others before

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.h | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.h b/drivers/md/md.h
index e9b7f54c24d..8227ab909d4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -12,10 +12,17 @@
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
 */
 
-#ifndef _MD_K_H
-#define _MD_K_H
-
-#ifdef CONFIG_BLOCK
+#ifndef _MD_MD_H
+#define _MD_MD_H
+
+#include <linux/blkdev.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
 
 #define MaxSector (~(sector_t)0)
 
@@ -408,10 +415,6 @@ static inline void safe_put_page(struct page *p)
 	if (p) put_page(p);
 }
 
-#endif /* CONFIG_BLOCK */
-#endif
-
-
 extern int register_md_personality(struct mdk_personality *p);
 extern int unregister_md_personality(struct mdk_personality *p);
 extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
@@ -434,3 +437,5 @@ extern void md_new_event(mddev_t *mddev);
 extern int md_allow_write(mddev_t *mddev);
 extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
+
+#endif /* _MD_MD_H */
-- 
cgit v1.2.3-70-g09d2


From 6d56e278444bc8323b1bcfcada126b8e4dbba0f4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 14 Apr 2009 12:01:57 +1000
Subject: md: allow setting newly added device to 'in_sync' via sysfs.

When adding devices to an active array via sysfs, there is currently
no way to mark a device as 'in-sync' which is useful when
incrementally assembling an array.

So add that option.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ed5727c089a..298731b8e95 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2086,6 +2086,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	 *  -writemostly - clears write_mostly
 	 *  blocked - sets the Blocked flag
 	 *  -blocked - clears the Blocked flag
+	 *  insync - sets Insync providing device isn't active
 	 */
 	int err = -EINVAL;
 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -2117,6 +2118,9 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
 
+		err = 0;
+	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
+		set_bit(In_sync, &rdev->flags);
 		err = 0;
 	}
 	if (!err && rdev->sysfs_state)
@@ -2190,7 +2194,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	} else if (rdev->mddev->pers) {
 		mdk_rdev_t *rdev2;
 		/* Activating a spare .. or possibly reactivating
-		 * if we every get bitmaps working here.
+		 * if we ever get bitmaps working here.
 		 */
 
 		if (rdev->raid_disk != -1)
-- 
cgit v1.2.3-70-g09d2


From acb180b0e335ad88acfed6c8a33e39c05b95dc49 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 14 Apr 2009 16:28:34 +1000
Subject: md: improve usefulness and accuracy of sysfs file md/sync_completed.

The sync_completed file reports how much of a resync (or recovery or
reshape) has been completed.
However due to the possibility of out-of-order completion of writes,
it is not certain to be accurate.

We have an internal value - mddev->curr_resync_completed - which is an
accurate value (though it might not always be quite so uptodate).

So:
 - make curr_resync_completed be uptodate a little more often,
   particularly when raid5 reshape updates status in the metadata
 - report curr_resync_completed in the sysfs file
 - allow poll/select to report all updates to md/sync_completed.

This makes sync_completed completed usable by any external metadata
handler that wants to record this status information in its metadata.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c |  1 +
 drivers/md/md.c     | 34 ++++++++++++++++++++++------------
 drivers/md/raid5.c  |  4 ++++
 3 files changed, 27 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index f8a9f7ab2cb..e4510c976b3 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1479,6 +1479,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 		s += blocks;
 	}
 	bitmap->last_end_sync = jiffies;
+	sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed");
 }
 
 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 298731b8e95..7af64f3846a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2017,6 +2017,8 @@ repeat:
 	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
 	spin_unlock_irq(&mddev->write_lock);
 	wake_up(&mddev->sb_wait);
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 
 }
 
@@ -3486,12 +3488,15 @@ sync_completed_show(mddev_t *mddev, char *page)
 {
 	unsigned long max_sectors, resync;
 
+	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return sprintf(page, "none\n");
+
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_sectors = mddev->resync_max_sectors;
 	else
 		max_sectors = mddev->dev_sectors;
 
-	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	resync = mddev->curr_resync_completed;
 	return sprintf(page, "%lu / %lu\n", resync, max_sectors);
 }
 
@@ -6338,18 +6343,12 @@ void md_do_sync(mddev_t *mddev)
 		sector_t sectors;
 
 		skipped = 0;
-		if (j >= mddev->resync_max) {
-			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
-			wait_event(mddev->recovery_wait,
-				   mddev->resync_max > j
-				   || kthread_should_stop());
-		}
-		if (kthread_should_stop())
-			goto interrupted;
 
-		if (mddev->curr_resync > mddev->curr_resync_completed &&
-		    (mddev->curr_resync - mddev->curr_resync_completed)
-		    > (max_sectors >> 4)) {
+		if ((mddev->curr_resync > mddev->curr_resync_completed &&
+		     (mddev->curr_resync - mddev->curr_resync_completed)
+		    > (max_sectors >> 4)) ||
+		    j >= mddev->resync_max
+			) {
 			/* time to update curr_resync_completed */
 			blk_unplug(mddev->queue);
 			wait_event(mddev->recovery_wait,
@@ -6357,7 +6356,17 @@ void md_do_sync(mddev_t *mddev)
 			mddev->curr_resync_completed =
 				mddev->curr_resync;
 			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 		}
+
+		if (j >= mddev->resync_max)
+			wait_event(mddev->recovery_wait,
+				   mddev->resync_max > j
+				   || kthread_should_stop());
+
+		if (kthread_should_stop())
+			goto interrupted;
+
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
 						  currspeed < speed_min(mddev));
 		if (sectors == 0) {
@@ -6465,6 +6474,7 @@ void md_do_sync(mddev_t *mddev)
 
  skip:
 	mddev->curr_resync = 0;
+	mddev->curr_resync_completed = 0;
 	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3bbc6d64704..76892ac7254 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3845,6 +3845,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes)==0);
 		mddev->reshape_position = conf->reshape_progress;
+		mddev->curr_resync_completed = mddev->curr_resync;
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
@@ -3854,6 +3855,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		conf->reshape_safe = mddev->reshape_position;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
+		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
 
 	if (mddev->delta_disks < 0) {
@@ -3943,6 +3945,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes) == 0);
 		mddev->reshape_position = conf->reshape_progress;
+		mddev->curr_resync_completed = mddev->curr_resync;
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
@@ -3953,6 +3956,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		conf->reshape_safe = mddev->reshape_position;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
+		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
 	return reshape_sectors;
 }
-- 
cgit v1.2.3-70-g09d2


From c03f6a19699fce4389888a45db8245969311d868 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 17 Apr 2009 11:06:30 +1000
Subject: md: update sync_completed and reshape_position even more often.

There are circumstances when a user-space process might need to
"oversee" a resync/reshape process.  For example when doing an
in-place reshape of a raid5, it is prudent to take a backup of each
section before reshaping it as this is the only way to provide
safety against an unplanned shutdown (i.e. crash/power failure).

The sync_max sysfs value can be used to stop the resync from
advancing beyond a particular point.
So user-space can:
  suspend IO to the first section and back it up
  set 'sync_max' to the end of the section
  wait for 'sync_completed' to reach that point
  resume IO on the first section and move on to the next section.

However this process requires the kernel and user-space to run in
lock-step which could introduce unnecessary delays.

It would be better if a 'double buffered' approach could be used with
userspace and kernel space working on different sections with the
'next' section always ready when the 'current' section is finished.

One problem with implementing this is that sync_completed is only
guaranteed to be updated when the sync process reaches sync_max.
(it is updated on a time basis at other times, but it is hard to rely
on that).  This defeats some of the double buffering.

With this patch, sync_completed (and reshape_position) get updated as
the current position approaches sync_max, so there is room for
userspace to advance sync_max early without losing updates.

To be precise, sync_completed is updated when the current sync
position reaches half way between the current value of sync_completed
and the value of sync_max.  This will usually be a good time for user
space to update sync_max.

If sync_max does not get updated, the updates to sync_completed
(together with associated metadata updates) will occur at an
exponentially increasing frequency which will get unreasonably fast
(one update every page) immediately before the process hits sync_max
and stops.  So the update rate will be unreasonably fast only for an
insignificant period of time.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    | 3 ++-
 drivers/md/raid5.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7af64f3846a..612343fdde9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6347,7 +6347,8 @@ void md_do_sync(mddev_t *mddev)
 		if ((mddev->curr_resync > mddev->curr_resync_completed &&
 		     (mddev->curr_resync - mddev->curr_resync_completed)
 		    > (max_sectors >> 4)) ||
-		    j >= mddev->resync_max
+		    (j - mddev->curr_resync_completed)*2
+		    >= mddev->resync_max - mddev->curr_resync_completed
 			) {
 			/* time to update curr_resync_completed */
 			blk_unplug(mddev->queue);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 76892ac7254..4616bc3a6e7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3940,7 +3940,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 * then we need to write out the superblock.
 	 */
 	sector_nr += reshape_sectors;
-	if (sector_nr >= mddev->resync_max) {
+	if ((sector_nr - mddev->curr_resync_completed) * 2
+	    >= mddev->resync_max - mddev->curr_resync_completed) {
 		/* Cannot proceed until we've updated the superblock... */
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes) == 0);
-- 
cgit v1.2.3-70-g09d2


From 1f59390339f263c0fe7908fbe54466dbf3a64b58 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 20 Apr 2009 11:50:24 +1000
Subject: md: support bitmaps on RAID10 arrays larger then 2 terabytes

.. and other arrays with components larger than 2 terabytes.

We use a "long" rather than a "sector_t" in part of the bitmap
size calculations, which is sad.

Reported-by: "Mario 'BitKoenig' Holbe" <Mario.Holbe@TU-Ilmenau.DE>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e4510c976b3..1fb91edc7de 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1590,7 +1590,7 @@ void bitmap_destroy(mddev_t *mddev)
 int bitmap_create(mddev_t *mddev)
 {
 	struct bitmap *bitmap;
-	unsigned long blocks = mddev->resync_max_sectors;
+	sector_t blocks = mddev->resync_max_sectors;
 	unsigned long chunks;
 	unsigned long pages;
 	struct file *file = mddev->bitmap_file;
@@ -1632,8 +1632,8 @@ int bitmap_create(mddev_t *mddev)
 	bitmap->chunkshift = ffz(~bitmap->chunksize);
 
 	/* now that chunksize and chunkshift are set, we can use these macros */
- 	chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) /
-			CHUNK_BLOCK_RATIO(bitmap);
+ 	chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
+			CHUNK_BLOCK_SHIFT(bitmap);
  	pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
 
 	BUG_ON(!pages);
-- 
cgit v1.2.3-70-g09d2