summaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-09-07 08:19:51 +0200
committerIngo Molnar <mingo@elte.hu>2009-09-07 08:19:51 +0200
commita1922ed661ab2c1637d0b10cde933bd9cd33d965 (patch)
tree0f1777542b385ebefd30b3586d830fd8ed6fda5b /drivers/md/raid5.c
parent75e33751ca8bbb72dd6f1a74d2810ddc8cbe4bdf (diff)
parentd28daf923ac5e4a0d7cecebae56f3e339189366b (diff)
Merge branch 'tracing/core' into tracing/hw-breakpoints
Conflicts: arch/Kconfig kernel/trace/trace.h Merge reason: resolve the conflicts, plus adopt to the new ring-buffer APIs. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c331
1 files changed, 189 insertions, 142 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bef87669823..b8a2c5dc67b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1274,8 +1274,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
sector_t new_sector;
int algorithm = previous ? conf->prev_algo
: conf->algorithm;
- int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
- : (conf->chunk_size >> 9);
+ int sectors_per_chunk = previous ? conf->prev_chunk_sectors
+ : conf->chunk_sectors;
int raid_disks = previous ? conf->previous_raid_disks
: conf->raid_disks;
int data_disks = raid_disks - conf->max_degraded;
@@ -1480,8 +1480,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
int raid_disks = sh->disks;
int data_disks = raid_disks - conf->max_degraded;
sector_t new_sector = sh->sector, check;
- int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
- : (conf->chunk_size >> 9);
+ int sectors_per_chunk = previous ? conf->prev_chunk_sectors
+ : conf->chunk_sectors;
int algorithm = previous ? conf->prev_algo
: conf->algorithm;
sector_t stripe;
@@ -1997,8 +1997,7 @@ static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
struct stripe_head *sh)
{
int sectors_per_chunk =
- previous ? (conf->prev_chunk >> 9)
- : (conf->chunk_size >> 9);
+ previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
int dd_idx;
int chunk_offset = sector_div(stripe, sectors_per_chunk);
int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
@@ -3284,7 +3283,7 @@ static void activate_bit_delay(raid5_conf_t *conf)
static void unplug_slaves(mddev_t *mddev)
{
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
int i;
rcu_read_lock();
@@ -3308,7 +3307,7 @@ static void unplug_slaves(mddev_t *mddev)
static void raid5_unplug_device(struct request_queue *q)
{
mddev_t *mddev = q->queuedata;
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
@@ -3327,7 +3326,7 @@ static void raid5_unplug_device(struct request_queue *q)
static int raid5_congested(void *data, int bits)
{
mddev_t *mddev = data;
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
/* No difference between reads and writes. Just check
* how busy the stripe_cache is
@@ -3352,14 +3351,14 @@ static int raid5_mergeable_bvec(struct request_queue *q,
mddev_t *mddev = q->queuedata;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
- unsigned int chunk_sectors = mddev->chunk_size >> 9;
+ unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9;
if ((bvm->bi_rw & 1) == WRITE)
return biovec->bv_len; /* always allow writes to be mergeable */
- if (mddev->new_chunk < mddev->chunk_size)
- chunk_sectors = mddev->new_chunk >> 9;
+ if (mddev->new_chunk_sectors < mddev->chunk_sectors)
+ chunk_sectors = mddev->new_chunk_sectors;
max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
if (max < 0) max = 0;
if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3372,11 +3371,11 @@ static int raid5_mergeable_bvec(struct request_queue *q,
static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
{
sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
- unsigned int chunk_sectors = mddev->chunk_size >> 9;
+ unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bio->bi_size >> 9;
- if (mddev->new_chunk < mddev->chunk_size)
- chunk_sectors = mddev->new_chunk >> 9;
+ if (mddev->new_chunk_sectors < mddev->chunk_sectors)
+ chunk_sectors = mddev->new_chunk_sectors;
return chunk_sectors >=
((sector & (chunk_sectors - 1)) + bio_sectors);
}
@@ -3440,7 +3439,7 @@ static void raid5_align_endio(struct bio *bi, int error)
bio_put(bi);
mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
- conf = mddev_to_conf(mddev);
+ conf = mddev->private;
rdev = (void*)raid_bi->bi_next;
raid_bi->bi_next = NULL;
@@ -3482,7 +3481,7 @@ static int bio_fits_rdev(struct bio *bi)
static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
{
mddev_t *mddev = q->queuedata;
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
unsigned int dd_idx;
struct bio* align_bi;
mdk_rdev_t *rdev;
@@ -3599,7 +3598,7 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
static int make_request(struct request_queue *q, struct bio * bi)
{
mddev_t *mddev = q->queuedata;
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
int dd_idx;
sector_t new_sector;
sector_t logical_sector, last_sector;
@@ -3696,16 +3695,25 @@ static int make_request(struct request_queue *q, struct bio * bi)
spin_unlock_irq(&conf->device_lock);
if (must_retry) {
release_stripe(sh);
+ schedule();
goto retry;
}
}
- /* FIXME what if we get a false positive because these
- * are being updated.
- */
- if (logical_sector >= mddev->suspend_lo &&
+
+ if (bio_data_dir(bi) == WRITE &&
+ logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
release_stripe(sh);
- schedule();
+ /* As the suspend_* range is controlled by
+ * userspace, we want an interruptible
+ * wait.
+ */
+ flush_signals(current);
+ prepare_to_wait(&conf->wait_for_overlap,
+ &w, TASK_INTERRUPTIBLE);
+ if (logical_sector >= mddev->suspend_lo &&
+ logical_sector < mddev->suspend_hi)
+ schedule();
goto retry;
}
@@ -3777,7 +3785,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
- } else if (mddev->delta_disks > 0 &&
+ } else if (mddev->delta_disks >= 0 &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
sector_div(sector_nr, new_data_disks);
@@ -3791,10 +3799,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
* If old and new chunk sizes differ, we need to process the
* largest of these
*/
- if (mddev->new_chunk > mddev->chunk_size)
- reshape_sectors = mddev->new_chunk / 512;
+ if (mddev->new_chunk_sectors > mddev->chunk_sectors)
+ reshape_sectors = mddev->new_chunk_sectors;
else
- reshape_sectors = mddev->chunk_size / 512;
+ reshape_sectors = mddev->chunk_sectors;
/* we update the metadata when there is more than 3Meg
* in the block range (that is rather arbitrary, should
@@ -3917,7 +3925,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
1, &dd_idx, NULL);
last_sector =
raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
- *(new_data_disks) - 1),
+ * new_data_disks - 1),
1, &dd_idx, NULL);
if (last_sector >= mddev->dev_sectors)
last_sector = mddev->dev_sectors - 1;
@@ -3946,7 +3954,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes) == 0);
mddev->reshape_position = conf->reshape_progress;
- mddev->curr_resync_completed = mddev->curr_resync;
+ mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
@@ -3991,6 +3999,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
return 0;
}
+ /* Allow raid5_quiesce to complete */
+ wait_event(conf->wait_for_overlap, conf->quiesce != 2);
+
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
@@ -4129,7 +4140,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
static void raid5d(mddev_t *mddev)
{
struct stripe_head *sh;
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
int handled;
pr_debug("+++ raid5d active\n");
@@ -4185,7 +4196,7 @@ static void raid5d(mddev_t *mddev)
static ssize_t
raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
{
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
if (conf)
return sprintf(page, "%d\n", conf->max_nr_stripes);
else
@@ -4195,7 +4206,7 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
static ssize_t
raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
{
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
unsigned long new;
int err;
@@ -4233,7 +4244,7 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
static ssize_t
raid5_show_preread_threshold(mddev_t *mddev, char *page)
{
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
if (conf)
return sprintf(page, "%d\n", conf->bypass_threshold);
else
@@ -4243,7 +4254,7 @@ raid5_show_preread_threshold(mddev_t *mddev, char *page)
static ssize_t
raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
{
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
unsigned long new;
if (len >= PAGE_SIZE)
return -EINVAL;
@@ -4267,7 +4278,7 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
static ssize_t
stripe_cache_active_show(mddev_t *mddev, char *page)
{
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
if (conf)
return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
else
@@ -4291,7 +4302,7 @@ static struct attribute_group raid5_attrs_group = {
static sector_t
raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
if (!sectors)
sectors = mddev->dev_sectors;
@@ -4303,11 +4314,20 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
raid_disks = conf->previous_raid_disks;
}
- sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
- sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
+ sectors &= ~((sector_t)mddev->chunk_sectors - 1);
+ sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
return sectors * (raid_disks - conf->max_degraded);
}
+static void free_conf(raid5_conf_t *conf)
+{
+ shrink_stripes(conf);
+ safe_put_page(conf->spare_page);
+ kfree(conf->disks);
+ kfree(conf->stripe_hashtbl);
+ kfree(conf);
+}
+
static raid5_conf_t *setup_conf(mddev_t *mddev)
{
raid5_conf_t *conf;
@@ -4336,9 +4356,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
return ERR_PTR(-EINVAL);
}
- if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) {
+ if (!mddev->new_chunk_sectors ||
+ (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
+ !is_power_of_2(mddev->new_chunk_sectors)) {
printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
- mddev->new_chunk, mdname(mddev));
+ mddev->new_chunk_sectors << 9, mdname(mddev));
return ERR_PTR(-EINVAL);
}
@@ -4401,7 +4423,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
conf->fullsync = 1;
}
- conf->chunk_size = mddev->new_chunk;
+ conf->chunk_sectors = mddev->new_chunk_sectors;
conf->level = mddev->new_level;
if (conf->level == 6)
conf->max_degraded = 2;
@@ -4411,7 +4433,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
conf->max_nr_stripes = NR_STRIPES;
conf->reshape_progress = mddev->reshape_position;
if (conf->reshape_progress != MaxSector) {
- conf->prev_chunk = mddev->chunk_size;
+ conf->prev_chunk_sectors = mddev->chunk_sectors;
conf->prev_algo = mddev->layout;
}
@@ -4437,11 +4459,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
abort:
if (conf) {
- shrink_stripes(conf);
- safe_put_page(conf->spare_page);
- kfree(conf->disks);
- kfree(conf->stripe_hashtbl);
- kfree(conf);
+ free_conf(conf);
return ERR_PTR(-EIO);
} else
return ERR_PTR(-ENOMEM);
@@ -4450,9 +4468,13 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
static int run(mddev_t *mddev)
{
raid5_conf_t *conf;
- int working_disks = 0;
+ int working_disks = 0, chunk_size;
mdk_rdev_t *rdev;
+ if (mddev->recovery_cp != MaxSector)
+ printk(KERN_NOTICE "raid5: %s is not clean"
+ " -- starting background reconstruction\n",
+ mdname(mddev));
if (mddev->reshape_position != MaxSector) {
/* Check that we can continue the reshape.
* Currently only disks can change, it must
@@ -4475,7 +4497,7 @@ static int run(mddev_t *mddev)
* geometry.
*/
here_new = mddev->reshape_position;
- if (sector_div(here_new, (mddev->new_chunk>>9)*
+ if (sector_div(here_new, mddev->new_chunk_sectors *
(mddev->raid_disks - max_degraded))) {
printk(KERN_ERR "raid5: reshape_position not "
"on a stripe boundary\n");
@@ -4483,11 +4505,30 @@ static int run(mddev_t *mddev)
}
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
- sector_div(here_old, (mddev->chunk_size>>9)*
+ sector_div(here_old, mddev->chunk_sectors *
(old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
* from */
- if (here_new >= here_old) {
+ if (mddev->delta_disks == 0) {
+ /* We cannot be sure it is safe to start an in-place
+ * reshape. It is only safe if user-space if monitoring
+ * and taking constant backups.
+ * mdadm always starts a situation like this in
+ * readonly mode so it can take control before
+ * allowing any writes. So just check for that.
+ */
+ if ((here_new * mddev->new_chunk_sectors !=
+ here_old * mddev->chunk_sectors) ||
+ mddev->ro == 0) {
+ printk(KERN_ERR "raid5: in-place reshape must be started"
+ " in read-only mode - aborting\n");
+ return -EINVAL;
+ }
+ } else if (mddev->delta_disks < 0
+ ? (here_new * mddev->new_chunk_sectors <=
+ here_old * mddev->chunk_sectors)
+ : (here_new * mddev->new_chunk_sectors >=
+ here_old * mddev->chunk_sectors)) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "raid5: reshape_position too early for "
"auto-recovery - aborting.\n");
@@ -4498,7 +4539,7 @@ static int run(mddev_t *mddev)
} else {
BUG_ON(mddev->level != mddev->new_level);
BUG_ON(mddev->layout != mddev->new_layout);
- BUG_ON(mddev->chunk_size != mddev->new_chunk);
+ BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
BUG_ON(mddev->delta_disks != 0);
}
@@ -4532,7 +4573,7 @@ static int run(mddev_t *mddev)
}
/* device size must be a multiple of chunk size */
- mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1);
+ mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
mddev->resync_max_sectors = mddev->dev_sectors;
if (mddev->degraded > 0 &&
@@ -4581,7 +4622,7 @@ static int run(mddev_t *mddev)
{
int data_disks = conf->previous_raid_disks - conf->max_degraded;
int stripe = data_disks *
- (mddev->chunk_size / PAGE_SIZE);
+ ((mddev->chunk_sectors << 9) / PAGE_SIZE);
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
}
@@ -4601,18 +4642,22 @@ static int run(mddev_t *mddev)
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
+ chunk_size = mddev->chunk_sectors << 9;
+ blk_queue_io_min(mddev->queue, chunk_size);
+ blk_queue_io_opt(mddev->queue, chunk_size *
+ (conf->raid_disks - conf->max_degraded));
+
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
return 0;
abort:
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
if (conf) {
- shrink_stripes(conf);
print_raid5_conf(conf);
- safe_put_page(conf->spare_page);
- kfree(conf->disks);
- kfree(conf->stripe_hashtbl);
- kfree(conf);
+ free_conf(conf);
}
mddev->private = NULL;
printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
@@ -4627,13 +4672,10 @@ static int stop(mddev_t *mddev)
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
- shrink_stripes(conf);
- kfree(conf->stripe_hashtbl);
mddev->queue->backing_dev_info.congested_fn = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
- kfree(conf->disks);
- kfree(conf);
+ free_conf(conf);
mddev->private = NULL;
return 0;
}
@@ -4678,7 +4720,8 @@ static void status(struct seq_file *seq, mddev_t *mddev)
raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
int i;
- seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
+ seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
+ mddev->chunk_sectors / 2, mddev->layout);
seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
@@ -4826,7 +4869,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems
* worth it.
*/
- sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
+ sectors &= ~((sector_t)mddev->chunk_sectors - 1);
md_set_array_sectors(mddev, raid5_size(mddev, sectors,
mddev->raid_disks));
if (mddev->array_sectors >
@@ -4834,6 +4877,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
+ revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -4843,14 +4887,37 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return 0;
}
-static int raid5_check_reshape(mddev_t *mddev)
+static int check_stripe_cache(mddev_t *mddev)
+{
+ /* Can only proceed if there are plenty of stripe_heads.
+ * We need a minimum of one full stripe,, and for sensible progress
+ * it is best to have about 4 times that.
+ * If we require 4 times, then the default 256 4K stripe_heads will
+ * allow for chunk sizes up to 256K, which is probably OK.
+ * If the chunk size is greater, user-space should request more
+ * stripe_heads first.
+ */
+ raid5_conf_t *conf = mddev->private;
+ if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
+ > conf->max_nr_stripes ||
+ ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
+ > conf->max_nr_stripes) {
+ printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
+ ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
+ / STRIPE_SIZE)*4);
+ return 0;
+ }
+ return 1;
+}
+
+static int check_reshape(mddev_t *mddev)
{
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
- mddev->new_chunk == mddev->chunk_size)
- return -EINVAL; /* nothing to do */
+ mddev->new_chunk_sectors == mddev->chunk_sectors)
+ return 0; /* nothing to do */
if (mddev->bitmap)
/* Cannot grow a bitmap yet */
return -EBUSY;
@@ -4869,28 +4936,15 @@ static int raid5_check_reshape(mddev_t *mddev)
return -EINVAL;
}
- /* Can only proceed if there are plenty of stripe_heads.
- * We need a minimum of one full stripe,, and for sensible progress
- * it is best to have about 4 times that.
- * If we require 4 times, then the default 256 4K stripe_heads will
- * allow for chunk sizes up to 256K, which is probably OK.
- * If the chunk size is greater, user-space should request more
- * stripe_heads first.
- */
- if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
- (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
- printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
- (max(mddev->chunk_size, mddev->new_chunk)
- / STRIPE_SIZE)*4);
+ if (!check_stripe_cache(mddev))
return -ENOSPC;
- }
return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
}
static int raid5_start_reshape(mddev_t *mddev)
{
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
mdk_rdev_t *rdev;
int spares = 0;
int added_devices = 0;
@@ -4899,6 +4953,9 @@ static int raid5_start_reshape(mddev_t *mddev)
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
+ if (!check_stripe_cache(mddev))
+ return -ENOSPC;
+
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags))
@@ -4925,8 +4982,8 @@ static int raid5_start_reshape(mddev_t *mddev)
spin_lock_irq(&conf->device_lock);
conf->previous_raid_disks = conf->raid_disks;
conf->raid_disks += mddev->delta_disks;
- conf->prev_chunk = conf->chunk_size;
- conf->chunk_size = mddev->new_chunk;
+ conf->prev_chunk_sectors = conf->chunk_sectors;
+ conf->chunk_sectors = mddev->new_chunk_sectors;
conf->prev_algo = conf->algorithm;
conf->algorithm = mddev->new_layout;
if (mddev->delta_disks < 0)
@@ -4966,7 +5023,7 @@ static int raid5_start_reshape(mddev_t *mddev)
spin_unlock_irqrestore(&conf->device_lock, flags);
}
mddev->raid_disks = conf->raid_disks;
- mddev->reshape_position = 0;
+ mddev->reshape_position = conf->reshape_progress;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -5008,7 +5065,7 @@ static void end_reshape(raid5_conf_t *conf)
*/
{
int data_disks = conf->raid_disks - conf->max_degraded;
- int stripe = data_disks * (conf->chunk_size
+ int stripe = data_disks * ((conf->chunk_sectors << 9)
/ PAGE_SIZE);
if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
@@ -5021,8 +5078,7 @@ static void end_reshape(raid5_conf_t *conf)
*/
static void raid5_finish_reshape(mddev_t *mddev)
{
- struct block_device *bdev;
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -5030,15 +5086,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
-
- bdev = bdget_disk(mddev->gendisk, 0);
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
+ revalidate_disk(mddev->gendisk);
} else {
int d;
mddev->degraded = conf->raid_disks;
@@ -5049,11 +5097,18 @@ static void raid5_finish_reshape(mddev_t *mddev)
mddev->degraded--;
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
- d++)
- raid5_remove_disk(mddev, d);
+ d++) {
+ mdk_rdev_t *rdev = conf->disks[d].rdev;
+ if (rdev && raid5_remove_disk(mddev, d) == 0) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ rdev->raid_disk = -1;
+ }
+ }
}
mddev->layout = conf->algorithm;
- mddev->chunk_size = conf->chunk_size;
+ mddev->chunk_sectors = conf->chunk_sectors;
mddev->reshape_position = MaxSector;
mddev->delta_disks = 0;
}
@@ -5061,7 +5116,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
static void raid5_quiesce(mddev_t *mddev, int state)
{
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
switch(state) {
case 2: /* resume for a suspend */
@@ -5070,12 +5125,18 @@ static void raid5_quiesce(mddev_t *mddev, int state)
case 1: /* stop all writes */
spin_lock_irq(&conf->device_lock);
- conf->quiesce = 1;
+ /* '2' tells resync/reshape to pause so that all
+ * active stripes can drain
+ */
+ conf->quiesce = 2;
wait_event_lock_irq(conf->wait_for_stripe,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
conf->device_lock, /* nothing */);
+ conf->quiesce = 1;
spin_unlock_irq(&conf->device_lock);
+ /* allow reshape to continue */
+ wake_up(&conf->wait_for_overlap);
break;
case 0: /* re-enable writes */
@@ -5111,7 +5172,7 @@ static void *raid5_takeover_raid1(mddev_t *mddev)
mddev->new_level = 5;
mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
- mddev->new_chunk = chunksect << 9;
+ mddev->new_chunk_sectors = chunksect;
return setup_conf(mddev);
}
@@ -5150,24 +5211,24 @@ static void *raid5_takeover_raid6(mddev_t *mddev)
}
-static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+static int raid5_check_reshape(mddev_t *mddev)
{
/* For a 2-drive array, the layout and chunk size can be changed
* immediately as not restriping is needed.
* For larger arrays we record the new value - after validation
* to be used by a reshape pass.
*/
- raid5_conf_t *conf = mddev_to_conf(mddev);
+ raid5_conf_t *conf = mddev->private;
+ int new_chunk = mddev->new_chunk_sectors;
- if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
+ if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
return -EINVAL;
if (new_chunk > 0) {
- if (new_chunk & (new_chunk-1))
- /* not a power of 2 */
+ if (!is_power_of_2(new_chunk))
return -EINVAL;
- if (new_chunk < PAGE_SIZE)
+ if (new_chunk < (PAGE_SIZE>>9))
return -EINVAL;
- if (mddev->array_sectors & ((new_chunk>>9)-1))
+ if (mddev->array_sectors & (new_chunk-1))
/* not factor of array size */
return -EINVAL;
}
@@ -5175,49 +5236,39 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
/* They look valid */
if (mddev->raid_disks == 2) {
-
- if (new_layout >= 0) {
- conf->algorithm = new_layout;
- mddev->layout = mddev->new_layout = new_layout;
+ /* can make the change immediately */
+ if (mddev->new_layout >= 0) {
+ conf->algorithm = mddev->new_layout;
+ mddev->layout = mddev->new_layout;
}
if (new_chunk > 0) {
- conf->chunk_size = new_chunk;
- mddev->chunk_size = mddev->new_chunk = new_chunk;
+ conf->chunk_sectors = new_chunk ;
+ mddev->chunk_sectors = new_chunk;
}
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
- } else {
- if (new_layout >= 0)
- mddev->new_layout = new_layout;
- if (new_chunk > 0)
- mddev->new_chunk = new_chunk;
}
- return 0;
+ return check_reshape(mddev);
}
-static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+static int raid6_check_reshape(mddev_t *mddev)
{
- if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
+ int new_chunk = mddev->new_chunk_sectors;
+
+ if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
return -EINVAL;
if (new_chunk > 0) {
- if (new_chunk & (new_chunk-1))
- /* not a power of 2 */
+ if (!is_power_of_2(new_chunk))
return -EINVAL;
- if (new_chunk < PAGE_SIZE)
+ if (new_chunk < (PAGE_SIZE >> 9))
return -EINVAL;
- if (mddev->array_sectors & ((new_chunk>>9)-1))
+ if (mddev->array_sectors & (new_chunk-1))
/* not factor of array size */
return -EINVAL;
}
/* They look valid */
-
- if (new_layout >= 0)
- mddev->new_layout = new_layout;
- if (new_chunk > 0)
- mddev->new_chunk = new_chunk;
-
- return 0;
+ return check_reshape(mddev);
}
static void *raid5_takeover(mddev_t *mddev)
@@ -5227,8 +5278,6 @@ static void *raid5_takeover(mddev_t *mddev)
* raid1 - if there are two drives. We need to know the chunk size
* raid4 - trivial - just use a raid4 layout.
* raid6 - Providing it is a *_6 layout
- *
- * For now, just do raid1
*/
if (mddev->level == 1)
@@ -5310,12 +5359,11 @@ static struct mdk_personality raid6_personality =
.sync_request = sync_request,
.resize = raid5_resize,
.size = raid5_size,
- .check_reshape = raid5_check_reshape,
+ .check_reshape = raid6_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
- .reconfig = raid6_reconfig,
};
static struct mdk_personality raid5_personality =
{
@@ -5338,7 +5386,6 @@ static struct mdk_personality raid5_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
- .reconfig = raid5_reconfig,
};
static struct mdk_personality raid4_personality =