diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 383 |
1 files changed, 250 insertions, 133 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 15348c393b5..96c690279fc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -53,6 +53,7 @@ #include <linux/slab.h> #include "md.h" #include "raid5.h" +#include "raid0.h" #include "bitmap.h" /* @@ -276,12 +277,13 @@ out: return sh; } -static void shrink_buffers(struct stripe_head *sh, int num) +static void shrink_buffers(struct stripe_head *sh) { struct page *p; int i; + int num = sh->raid_conf->pool_size; - for (i=0; i<num ; i++) { + for (i = 0; i < num ; i++) { p = sh->dev[i].page; if (!p) continue; @@ -290,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num) } } -static int grow_buffers(struct stripe_head *sh, int num) +static int grow_buffers(struct stripe_head *sh) { int i; + int num = sh->raid_conf->pool_size; - for (i=0; i<num; i++) { + for (i = 0; i < num; i++) { struct page *page; if (!(page = alloc_page(GFP_KERNEL))) { @@ -363,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, return NULL; } +/* + * Need to check if array has failed when deciding whether to: + * - start an array + * - remove non-faulty devices + * - add a spare + * - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + */ +static int has_failed(raid5_conf_t *conf) +{ + int degraded; + int i; + if (conf->mddev->reshape_position == MaxSector) + return conf->mddev->degraded > conf->max_degraded; + + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->previous_raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If the reshape increases the number of devices, + * this is being recovered by the reshape, so + * this 'previous' section is not in_sync. + * If the number of devices is being reduced however, + * the device can only be part of the array if + * we are reverting a reshape, so this section will + * be in-sync. + */ + if (conf->raid_disks >= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (degraded > conf->max_degraded) + return 1; + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If reshape increases the number of devices, this + * section has already been recovered, else it + * almost certainly hasn't. + */ + if (conf->raid_disks <= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (degraded > conf->max_degraded) + return 1; + return 0; +} + static void unplug_slaves(mddev_t *mddev); static void raid5_unplug_device(struct request_queue *q); @@ -1239,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) static int grow_one_stripe(raid5_conf_t *conf) { struct stripe_head *sh; - int disks = max(conf->raid_disks, conf->previous_raid_disks); sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); if (!sh) return 0; - memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev)); + memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&sh->ops.wait_for_ops); #endif - if (grow_buffers(sh, disks)) { - shrink_buffers(sh, disks); + if (grow_buffers(sh)) { + shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); return 0; } @@ -1467,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf) if (!sh) return 0; BUG_ON(atomic_read(&sh->count)); - shrink_buffers(sh, conf->pool_size); + shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); return 1; @@ -1509,7 +1578,7 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk_rl(KERN_INFO "raid5:%s: read error corrected" + printk_rl(KERN_INFO "md/raid:%s: read error corrected" " (%lu sectors at %llu on %s)\n", mdname(conf->mddev), STRIPE_SECTORS, (unsigned long long)(sh->sector @@ -1529,7 +1598,7 @@ static void raid5_end_read_request(struct bio * bi, int error) atomic_inc(&rdev->read_errors); if (conf->mddev->degraded >= conf->max_degraded) printk_rl(KERN_WARNING - "raid5:%s: read error not correctable " + "md/raid:%s: read error not correctable " "(sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)(sh->sector @@ -1538,7 +1607,7 @@ static void raid5_end_read_request(struct bio * bi, int error) else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ printk_rl(KERN_WARNING - "raid5:%s: read error NOT corrected!! " + "md/raid:%s: read error NOT corrected!! " "(sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)(sh->sector @@ -1547,7 +1616,7 @@ static void raid5_end_read_request(struct bio * bi, int error) else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING - "raid5:%s: Too many read errors, failing device %s.\n", + "md/raid:%s: Too many read errors, failing device %s.\n", mdname(conf->mddev), bdn); else retry = 1; @@ -1619,8 +1688,8 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - pr_debug("raid5: error called\n"); + raid5_conf_t *conf = mddev->private; + pr_debug("raid456: error called\n"); if (!test_bit(Faulty, &rdev->flags)) { set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1636,9 +1705,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); printk(KERN_ALERT - "raid5: Disk failure on %s, disabling device.\n" - "raid5: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + "md/raid:%s: Disk failure on %s, disabling device.\n" + KERN_ALERT + "md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), + bdevname(rdev->bdev, b), + mdname(mddev), + conf->raid_disks - mddev->degraded); } } @@ -1714,8 +1787,6 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, pd_idx = data_disks; break; default: - printk(KERN_ERR "raid5: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1832,10 +1903,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, qd_idx = raid_disks - 1; break; - default: - printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1898,8 +1966,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) case ALGORITHM_PARITY_N: break; default: - printk(KERN_ERR "raid5: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1958,8 +2024,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) i -= 1; break; default: - printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1972,7 +2036,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) previous, &dummy1, &sh2); if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx || sh2.qd_idx != sh->qd_idx) { - printk(KERN_ERR "compute_blocknr: map not correct\n"); + printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", + mdname(conf->mddev)); return 0; } return r_sector; @@ -2966,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh) mdk_rdev_t *rdev; dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); pr_debug("check %d: state 0x%lx toread %p read %p write %p " "written %p\n", i, dev->flags, dev->toread, dev->read, @@ -3003,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh) blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); } - if (!rdev || !test_bit(In_sync, &rdev->flags)) { + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else { + /* could be in-sync depending on recovery/reshape status */ + if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + set_bit(R5_Insync, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { s.failed++; s.failed_num = i; - } else - set_bit(R5_Insync, &dev->flags); + } } rcu_read_unlock(); @@ -3247,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh) for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); pr_debug("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); @@ -3285,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh) blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); } - if (!rdev || !test_bit(In_sync, &rdev->flags)) { + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else { + /* in sync if before recovery_offset */ + if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + set_bit(R5_Insync, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { if (s.failed < 2) r6s.failed_num[s.failed] = i; s.failed++; - } else - set_bit(R5_Insync, &dev->flags); + } } rcu_read_unlock(); @@ -3709,10 +3792,10 @@ static void raid5_align_endio(struct bio *bi, int error) bio_put(bi); - mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; - conf = mddev->private; rdev = (void*)raid_bi->bi_next; raid_bi->bi_next = NULL; + mddev = rdev->mddev; + conf = mddev->private; rdev_dec_pending(rdev, conf->mddev); @@ -3749,9 +3832,8 @@ static int bio_fits_rdev(struct bio *bi) } -static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) +static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) { - mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev->private; int dd_idx; struct bio* align_bi; @@ -3866,16 +3948,15 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) return sh; } -static int make_request(struct request_queue *q, struct bio * bi) +static int make_request(mddev_t *mddev, struct bio * bi) { - mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev->private; int dd_idx; sector_t new_sector; sector_t logical_sector, last_sector; struct stripe_head *sh; const int rw = bio_data_dir(bi); - int cpu, remaining; + int remaining; if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { /* Drain all pending writes. We only really need @@ -3890,15 +3971,9 @@ static int make_request(struct request_queue *q, struct bio * bi) md_write_start(mddev, bi); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bi)); - part_stat_unlock(); - if (rw == READ && mddev->reshape_position == MaxSector && - chunk_aligned_read(q,bi)) + chunk_aligned_read(mddev,bi)) return 0; logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); @@ -3946,7 +4021,7 @@ static int make_request(struct request_queue *q, struct bio * bi) new_sector = raid5_compute_sector(conf, logical_sector, previous, &dd_idx, NULL); - pr_debug("raid5: make_request, sector %llu logical %llu\n", + pr_debug("raid456: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); @@ -4054,7 +4129,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * As the reads complete, handle_stripe will copy the data * into the destination stripe and release that stripe. */ - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; struct stripe_head *sh; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; @@ -4263,7 +4338,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped /* FIXME go_faster isn't used */ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; struct stripe_head *sh; sector_t max_sector = mddev->dev_sectors; int sync_blocks; @@ -4656,7 +4731,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, kfree(percpu->scribble); pr_err("%s: failed memory allocation for cpu%ld\n", __func__, cpu); - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); } break; case CPU_DEAD: @@ -4725,7 +4800,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (mddev->new_level != 5 && mddev->new_level != 4 && mddev->new_level != 6) { - printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", + printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", mdname(mddev), mddev->new_level); return ERR_PTR(-EIO); } @@ -4733,12 +4808,12 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) && !algorithm_valid_raid5(mddev->new_layout)) || (mddev->new_level == 6 && !algorithm_valid_raid6(mddev->new_layout))) { - printk(KERN_ERR "raid5: %s: layout %d not supported\n", + printk(KERN_ERR "md/raid:%s: layout %d not supported\n", mdname(mddev), mddev->new_layout); return ERR_PTR(-EIO); } if (mddev->new_level == 6 && mddev->raid_disks < 4) { - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", + printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", mdname(mddev), mddev->raid_disks); return ERR_PTR(-EINVAL); } @@ -4746,8 +4821,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (!mddev->new_chunk_sectors || (mddev->new_chunk_sectors << 9) % PAGE_SIZE || !is_power_of_2(mddev->new_chunk_sectors)) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - mddev->new_chunk_sectors << 9, mdname(mddev)); + printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", + mdname(mddev), mddev->new_chunk_sectors << 9); return ERR_PTR(-EINVAL); } @@ -4789,7 +4864,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (raid5_alloc_percpu(conf) != 0) goto abort; - pr_debug("raid5: run(%s) called.\n", mdname(mddev)); + pr_debug("raid456: run(%s) called.\n", mdname(mddev)); list_for_each_entry(rdev, &mddev->disks, same_set) { raid_disk = rdev->raid_disk; @@ -4802,9 +4877,9 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (test_bit(In_sync, &rdev->flags)) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "raid5: device %s operational as raid" - " disk %d\n", bdevname(rdev->bdev,b), - raid_disk); + printk(KERN_INFO "md/raid:%s: device %s operational as raid" + " disk %d\n", + mdname(mddev), bdevname(rdev->bdev, b), raid_disk); } else /* Cannot rely on bitmap to complete recovery */ conf->fullsync = 1; @@ -4828,16 +4903,17 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; if (grow_stripes(conf, conf->max_nr_stripes)) { printk(KERN_ERR - "raid5: couldn't allocate %dkB for buffers\n", memory); + "md/raid:%s: couldn't allocate %dkB for buffers\n", + mdname(mddev), memory); goto abort; } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", - memory, mdname(mddev)); + printk(KERN_INFO "md/raid:%s: allocated %dkB\n", + mdname(mddev), memory); conf->thread = md_register_thread(raid5d, mddev, NULL); if (!conf->thread) { printk(KERN_ERR - "raid5: couldn't allocate thread for %s\n", + "md/raid:%s: couldn't allocate thread.\n", mdname(mddev)); goto abort; } @@ -4888,7 +4964,7 @@ static int run(mddev_t *mddev) sector_t reshape_offset = 0; if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "raid5: %s is not clean" + printk(KERN_NOTICE "md/raid:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); if (mddev->reshape_position != MaxSector) { @@ -4902,7 +4978,7 @@ static int run(mddev_t *mddev) int max_degraded = (mddev->level == 6 ? 2 : 1); if (mddev->new_level != mddev->level) { - printk(KERN_ERR "raid5: %s: unsupported reshape " + printk(KERN_ERR "md/raid:%s: unsupported reshape " "required - aborting.\n", mdname(mddev)); return -EINVAL; @@ -4915,8 +4991,8 @@ static int run(mddev_t *mddev) here_new = mddev->reshape_position; if (sector_div(here_new, mddev->new_chunk_sectors * (mddev->raid_disks - max_degraded))) { - printk(KERN_ERR "raid5: reshape_position not " - "on a stripe boundary\n"); + printk(KERN_ERR "md/raid:%s: reshape_position not " + "on a stripe boundary\n", mdname(mddev)); return -EINVAL; } reshape_offset = here_new * mddev->new_chunk_sectors; @@ -4937,8 +5013,9 @@ static int run(mddev_t *mddev) if ((here_new * mddev->new_chunk_sectors != here_old * mddev->chunk_sectors) || mddev->ro == 0) { - printk(KERN_ERR "raid5: in-place reshape must be started" - " in read-only mode - aborting\n"); + printk(KERN_ERR "md/raid:%s: in-place reshape must be started" + " in read-only mode - aborting\n", + mdname(mddev)); return -EINVAL; } } else if (mddev->delta_disks < 0 @@ -4947,11 +5024,13 @@ static int run(mddev_t *mddev) : (here_new * mddev->new_chunk_sectors >= here_old * mddev->chunk_sectors)) { /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "raid5: reshape_position too early for " - "auto-recovery - aborting.\n"); + printk(KERN_ERR "md/raid:%s: reshape_position too early for " + "auto-recovery - aborting.\n", + mdname(mddev)); return -EINVAL; } - printk(KERN_INFO "raid5: reshape will continue\n"); + printk(KERN_INFO "md/raid:%s: reshape will continue\n", + mdname(mddev)); /* OK, we should be able to continue; */ } else { BUG_ON(mddev->level != mddev->new_level); @@ -4978,8 +5057,10 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk < 0) continue; - if (test_bit(In_sync, &rdev->flags)) + if (test_bit(In_sync, &rdev->flags)) { working_disks++; + continue; + } /* This disc is not fully in-sync. However if it * just stored parity (beyond the recovery_offset), * when we don't need to be concerned about the @@ -4993,18 +5074,6 @@ static int run(mddev_t *mddev) mddev->minor_version > 90) rdev->recovery_offset = reshape_offset; - printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n", - rdev->raid_disk, working_disks, conf->prev_algo, - conf->previous_raid_disks, conf->max_degraded, - conf->algorithm, conf->raid_disks, - only_parity(rdev->raid_disk, - conf->prev_algo, - conf->previous_raid_disks, - conf->max_degraded), - only_parity(rdev->raid_disk, - conf->algorithm, - conf->raid_disks, - conf->max_degraded)); if (rdev->recovery_offset < reshape_offset) { /* We need to check old and new layout */ if (!only_parity(rdev->raid_disk, @@ -5024,8 +5093,8 @@ static int run(mddev_t *mddev) mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) - working_disks); - if (mddev->degraded > conf->max_degraded) { - printk(KERN_ERR "raid5: not enough operational devices for %s" + if (has_failed(conf)) { + printk(KERN_ERR "md/raid:%s: not enough operational devices" " (%d/%d failed)\n", mdname(mddev), mddev->degraded, conf->raid_disks); goto abort; @@ -5039,32 +5108,32 @@ static int run(mddev_t *mddev) mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) printk(KERN_WARNING - "raid5: starting dirty degraded array: %s" - "- data corruption possible.\n", + "md/raid:%s: starting dirty degraded array" + " - data corruption possible.\n", mdname(mddev)); else { printk(KERN_ERR - "raid5: cannot start dirty degraded array for %s\n", + "md/raid:%s: cannot start dirty degraded array.\n", mdname(mddev)); goto abort; } } if (mddev->degraded == 0) - printk("raid5: raid level %d set %s active with %d out of %d" - " devices, algorithm %d\n", conf->level, mdname(mddev), + printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" + " devices, algorithm %d\n", mdname(mddev), conf->level, mddev->raid_disks-mddev->degraded, mddev->raid_disks, mddev->new_layout); else - printk(KERN_ALERT "raid5: raid level %d set %s active with %d" - " out of %d devices, algorithm %d\n", conf->level, - mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks, mddev->new_layout); + printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" + " out of %d devices, algorithm %d\n", + mdname(mddev), conf->level, + mddev->raid_disks - mddev->degraded, + mddev->raid_disks, mddev->new_layout); print_raid5_conf(conf); if (conf->reshape_progress != MaxSector) { - printk("...ok start reshape thread\n"); conf->reshape_safe = conf->reshape_progress; atomic_set(&conf->reshape_stripes, 0); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -5087,9 +5156,11 @@ static int run(mddev_t *mddev) } /* Ok, everything is just fine now */ - if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) + if (mddev->to_remove == &raid5_attrs_group) + mddev->to_remove = NULL; + else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) printk(KERN_WARNING - "raid5: failed to create sysfs attributes for %s\n", + "md/raid:%s: failed to create sysfs attributes.\n", mdname(mddev)); mddev->queue->queue_lock = &conf->device_lock; @@ -5119,22 +5190,21 @@ abort: free_conf(conf); } mddev->private = NULL; - printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); + printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); return -EIO; } - - static int stop(mddev_t *mddev) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; md_unregister_thread(mddev->thread); mddev->thread = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ free_conf(conf); - mddev->private = &raid5_attrs_group; + mddev->private = NULL; + mddev->to_remove = &raid5_attrs_group; return 0; } @@ -5175,7 +5245,7 @@ static void printall(struct seq_file *seq, raid5_conf_t *conf) static void status(struct seq_file *seq, mddev_t *mddev) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; int i; seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, @@ -5197,21 +5267,22 @@ static void print_raid5_conf (raid5_conf_t *conf) int i; struct disk_info *tmp; - printk("RAID5 conf printout:\n"); + printk(KERN_DEBUG "RAID conf printout:\n"); if (!conf) { printk("(conf==NULL)\n"); return; } - printk(" --- rd:%d wd:%d\n", conf->raid_disks, - conf->raid_disks - conf->mddev->degraded); + printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, + conf->raid_disks, + conf->raid_disks - conf->mddev->degraded); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; tmp = conf->disks + i; if (tmp->rdev) - printk(" disk %d, o:%d, dev:%s\n", - i, !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", + i, !test_bit(Faulty, &tmp->rdev->flags), + bdevname(tmp->rdev->bdev, b)); } } @@ -5224,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) { tmp = conf->disks + i; if (tmp->rdev + && tmp->rdev->recovery_offset == MaxSector && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { unsigned long flags; @@ -5259,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) * isn't possible. */ if (!test_bit(Faulty, &rdev->flags) && - mddev->degraded <= conf->max_degraded && + !has_failed(conf) && number < conf->raid_disks) { err = -EBUSY; goto abort; @@ -5287,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = conf->raid_disks - 1; - if (mddev->degraded > conf->max_degraded) + if (has_failed(conf)) /* no point adding a device */ return -EINVAL; @@ -5334,7 +5406,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) raid5_size(mddev, sectors, mddev->raid_disks)) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->dev_sectors; @@ -5360,7 +5431,8 @@ static int check_stripe_cache(mddev_t *mddev) > conf->max_nr_stripes || ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { - printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", + printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", + mdname(mddev), ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) / STRIPE_SIZE)*4); return 0; @@ -5379,7 +5451,7 @@ static int check_reshape(mddev_t *mddev) if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; - if (mddev->degraded > conf->max_degraded) + if (has_failed(conf)) return -EINVAL; if (mddev->delta_disks < 0) { /* We might be able to shrink, but the devices must @@ -5431,7 +5503,7 @@ static int raid5_start_reshape(mddev_t *mddev) */ if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) < mddev->array_sectors) { - printk(KERN_ERR "md: %s: array size must be reduced " + printk(KERN_ERR "md/raid:%s: array size must be reduced " "before number of disks\n", mdname(mddev)); return -EINVAL; } @@ -5454,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev) /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. + * Don't add devices if we are reducing the number of + * devices in the array. This is because it is not possible + * to correctly record the "partially reconstructed" state of + * such devices during the reshape and confusion could result. */ - list_for_each_entry(rdev, &mddev->disks, same_set) + if (mddev->delta_disks >= 0) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { @@ -5469,15 +5546,15 @@ static int raid5_start_reshape(mddev_t *mddev) if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) printk(KERN_WARNING - "raid5: failed to create " - " link %s for %s\n", - nm, mdname(mddev)); + "md/raid:%s: failed to create " + " link %s\n", + mdname(mddev), nm); } else break; } /* When a reshape changes the number of devices, ->degraded - * is measured against the large of the pre and post number of + * is measured against the larger of the pre and post number of * devices.*/ if (mddev->delta_disks > 0) { spin_lock_irqsave(&conf->device_lock, flags); @@ -5548,7 +5625,6 @@ static void raid5_finish_reshape(mddev_t *mddev) if (mddev->delta_disks > 0) { md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; revalidate_disk(mddev->gendisk); } else { int d; @@ -5613,6 +5689,29 @@ static void raid5_quiesce(mddev_t *mddev, int state) } +static void *raid45_takeover_raid0(mddev_t *mddev, int level) +{ + struct raid0_private_data *raid0_priv = mddev->private; + + /* for raid0 takeover only one zone is supported */ + if (raid0_priv->nr_strip_zones > 1) { + printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + mddev->new_level = level; + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks += 1; + mddev->delta_disks = 1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + return setup_conf(mddev); +} + + static void *raid5_takeover_raid1(mddev_t *mddev) { int chunksect; @@ -5737,12 +5836,13 @@ static int raid6_check_reshape(mddev_t *mddev) static void *raid5_takeover(mddev_t *mddev) { /* raid5 can take over: - * raid0 - if all devices are the same - make it a raid4 layout + * raid0 - if there is only one strip zone - make it a raid4 layout * raid1 - if there are two drives. We need to know the chunk size * raid4 - trivial - just use a raid4 layout. * raid6 - Providing it is a *_6 layout */ - + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 5); if (mddev->level == 1) return raid5_takeover_raid1(mddev); if (mddev->level == 4) { @@ -5756,6 +5856,22 @@ static void *raid5_takeover(mddev_t *mddev) return ERR_PTR(-EINVAL); } +static void *raid4_takeover(mddev_t *mddev) +{ + /* raid4 can take over: + * raid0 - if there is only one strip zone + * raid5 - if layout is right + */ + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 4); + if (mddev->level == 5 && + mddev->layout == ALGORITHM_PARITY_N) { + mddev->new_layout = 0; + mddev->new_level = 4; + return setup_conf(mddev); + } + return ERR_PTR(-EINVAL); +} static struct mdk_personality raid5_personality; @@ -5871,6 +5987,7 @@ static struct mdk_personality raid4_personality = .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, + .takeover = raid4_takeover, }; static int __init raid5_init(void) |