diff options
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 395 |
1 files changed, 297 insertions, 98 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index c28a120b416..5fc326d3970 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -195,7 +195,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * Any code which breaks out of this loop while own * a reference to the current mddev and must mddev_put it. */ -#define ITERATE_MDDEV(mddev,tmp) \ +#define for_each_mddev(mddev,tmp) \ \ for (({ spin_lock(&all_mddevs_lock); \ tmp = all_mddevs.next; \ @@ -275,6 +275,7 @@ static mddev_t * mddev_find(dev_t unit) spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); new->reshape_position = MaxSector; + new->resync_max = MaxSector; new->queue = blk_alloc_queue(GFP_KERNEL); if (!new->queue) { @@ -310,7 +311,7 @@ static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) mdk_rdev_t * rdev; struct list_head *tmp; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev->desc_nr == nr) return rdev; } @@ -322,7 +323,7 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) struct list_head *tmp; mdk_rdev_t *rdev; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev->bdev->bd_dev == dev) return rdev; } @@ -773,12 +774,16 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) __u64 ev1 = md_event(sb); rdev->raid_disk = -1; - rdev->flags = 0; + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + clear_bit(BarriersNotsupp, &rdev->flags); + if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; - mddev->persistent = ! sb->not_persistent; + mddev->external = 0; mddev->chunk_size = sb->chunk_size; mddev->ctime = sb->ctime; mddev->utime = sb->utime; @@ -904,7 +909,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->size = mddev->size; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; - sb->not_persistent = !mddev->persistent; + sb->not_persistent = 0; sb->utime = mddev->utime; sb->state = 0; sb->events_hi = (mddev->events>>32); @@ -938,7 +943,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); - ITERATE_RDEV(mddev,rdev2,tmp) { + rdev_for_each(rdev2, tmp, mddev) { mdp_disk_t *d; int desc_nr; if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) @@ -1153,11 +1158,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; - rdev->flags = 0; + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + clear_bit(BarriersNotsupp, &rdev->flags); + if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; - mddev->persistent = 1; + mddev->external = 0; mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); @@ -1286,7 +1295,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } max_dev = 0; - ITERATE_RDEV(mddev,rdev2,tmp) + rdev_for_each(rdev2, tmp, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; @@ -1295,7 +1304,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); - ITERATE_RDEV(mddev,rdev2,tmp) { + rdev_for_each(rdev2, tmp, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1333,8 +1342,8 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) struct list_head *tmp, *tmp2; mdk_rdev_t *rdev, *rdev2; - ITERATE_RDEV(mddev1,rdev,tmp) - ITERATE_RDEV(mddev2, rdev2, tmp2) + rdev_for_each(rdev, tmp, mddev1) + rdev_for_each(rdev2, tmp2, mddev2) if (rdev->bdev->bd_contains == rdev2->bdev->bd_contains) return 1; @@ -1401,7 +1410,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) goto fail; } list_add(&rdev->same_set, &mddev->disks); - bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); + bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); return 0; fail: @@ -1410,10 +1419,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) return err; } -static void delayed_delete(struct work_struct *ws) +static void md_delayed_delete(struct work_struct *ws) { mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); kobject_del(&rdev->kobj); + kobject_put(&rdev->kobj); } static void unbind_rdev_from_array(mdk_rdev_t * rdev) @@ -1432,7 +1442,8 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state" */ - INIT_WORK(&rdev->del_work, delayed_delete); + INIT_WORK(&rdev->del_work, md_delayed_delete); + kobject_get(&rdev->kobj); schedule_work(&rdev->del_work); } @@ -1441,7 +1452,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) * otherwise reused by a RAID array (or any other kernel * subsystem), by bd_claiming the device. */ -static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; @@ -1453,13 +1464,15 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) __bdevname(dev, b)); return PTR_ERR(bdev); } - err = bd_claim(bdev, rdev); + err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); if (err) { printk(KERN_ERR "md: could not bd_claim %s.\n", bdevname(bdev, b)); blkdev_put(bdev); return err; } + if (!shared) + set_bit(AllReserved, &rdev->flags); rdev->bdev = bdev; return err; } @@ -1503,7 +1516,7 @@ static void export_array(mddev_t *mddev) struct list_head *tmp; mdk_rdev_t *rdev; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (!rdev->mddev) { MD_BUG(); continue; @@ -1581,17 +1594,17 @@ static void md_print_devices(void) printk("md: **********************************\n"); printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); printk("md: **********************************\n"); - ITERATE_MDDEV(mddev,tmp) { + for_each_mddev(mddev, tmp) { if (mddev->bitmap) bitmap_print_sb(mddev->bitmap); else printk("%s: ", mdname(mddev)); - ITERATE_RDEV(mddev,rdev,tmp2) + rdev_for_each(rdev, tmp2, mddev) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); - ITERATE_RDEV(mddev,rdev,tmp2) + rdev_for_each(rdev, tmp2, mddev) print_rdev(rdev); } printk("md: **********************************\n"); @@ -1610,7 +1623,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) mdk_rdev_t *rdev; struct list_head *tmp; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && @@ -1696,18 +1709,20 @@ repeat: MD_BUG(); mddev->events --; } - sync_sbs(mddev, nospares); /* * do not write anything to disk if using * nonpersistent superblocks */ if (!mddev->persistent) { - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + if (!mddev->external) + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); return; } + sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); dprintk(KERN_INFO @@ -1715,7 +1730,7 @@ repeat: mdname(mddev),mddev->in_sync); bitmap_update_sb(mddev->bitmap); - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { char b[BDEVNAME_SIZE]; dprintk(KERN_INFO "md: "); if (rdev->sb_loaded != 1) @@ -1785,7 +1800,7 @@ static ssize_t state_show(mdk_rdev_t *rdev, char *page) { char *sep = ""; - int len=0; + size_t len = 0; if (test_bit(Faulty, &rdev->flags)) { len+= sprintf(page+len, "%sfaulty",sep); @@ -1887,20 +1902,45 @@ static ssize_t slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; + int err; + char nm[20]; int slot = simple_strtoul(buf, &e, 10); if (strncmp(buf, "none", 4)==0) slot = -1; else if (e==buf || (*e && *e!= '\n')) return -EINVAL; - if (rdev->mddev->pers) - /* Cannot set slot in active array (yet) */ - return -EBUSY; - if (slot >= rdev->mddev->raid_disks) - return -ENOSPC; - rdev->raid_disk = slot; - /* assume it is working */ - rdev->flags = 0; - set_bit(In_sync, &rdev->flags); + if (rdev->mddev->pers) { + /* Setting 'slot' on an active array requires also + * updating the 'rd%d' link, and communicating + * with the personality with ->hot_*_disk. + * For now we only support removing + * failed/spare devices. This normally happens automatically, + * but not when the metadata is externally managed. + */ + if (slot != -1) + return -EBUSY; + if (rdev->raid_disk == -1) + return -EEXIST; + /* personality does all needed checks */ + if (rdev->mddev->pers->hot_add_disk == NULL) + return -EINVAL; + err = rdev->mddev->pers-> + hot_remove_disk(rdev->mddev, rdev->raid_disk); + if (err) + return err; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&rdev->mddev->kobj, nm); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + } else { + if (slot >= rdev->mddev->raid_disks) + return -ENOSPC; + rdev->raid_disk = slot; + /* assume it is working */ + clear_bit(Faulty, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + set_bit(In_sync, &rdev->flags); + } return len; } @@ -1923,6 +1963,10 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) return -EINVAL; if (rdev->mddev->pers) return -EBUSY; + if (rdev->size && rdev->mddev->external) + /* Must set offset before size, so overlap checks + * can be sane */ + return -EBUSY; rdev->data_offset = offset; return len; } @@ -1936,16 +1980,69 @@ rdev_size_show(mdk_rdev_t *rdev, char *page) return sprintf(page, "%llu\n", (unsigned long long)rdev->size); } +static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) +{ + /* check if two start/length pairs overlap */ + if (s1+l1 <= s2) + return 0; + if (s2+l2 <= s1) + return 0; + return 1; +} + static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; unsigned long long size = simple_strtoull(buf, &e, 10); + unsigned long long oldsize = rdev->size; if (e==buf || (*e && *e != '\n')) return -EINVAL; if (rdev->mddev->pers) return -EBUSY; rdev->size = size; + if (size > oldsize && rdev->mddev->external) { + /* need to check that all other rdevs with the same ->bdev + * do not overlap. We need to unlock the mddev to avoid + * a deadlock. We have already changed rdev->size, and if + * we have to change it back, we will have the lock again. + */ + mddev_t *mddev; + int overlap = 0; + struct list_head *tmp, *tmp2; + + mddev_unlock(rdev->mddev); + for_each_mddev(mddev, tmp) { + mdk_rdev_t *rdev2; + + mddev_lock(mddev); + rdev_for_each(rdev2, tmp2, mddev) + if (test_bit(AllReserved, &rdev2->flags) || + (rdev->bdev == rdev2->bdev && + rdev != rdev2 && + overlaps(rdev->data_offset, rdev->size, + rdev2->data_offset, rdev2->size))) { + overlap = 1; + break; + } + mddev_unlock(mddev); + if (overlap) { + mddev_put(mddev); + break; + } + } + mddev_lock(rdev->mddev); + if (overlap) { + /* Someone else could have slipped in a size + * change here, but doing so is just silly. + * We put oldsize back because we *know* it is + * safe, and trust userspace not to race with + * itself + */ + rdev->size = oldsize; + return -EBUSY; + } + } if (size < rdev->mddev->size || rdev->mddev->size == 0) rdev->mddev->size = size; return len; @@ -1980,12 +2077,18 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); + int rv; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - return entry->store(rdev, page, length); + rv = mddev_lock(rdev->mddev); + if (!rv) { + rv = entry->store(rdev, page, length); + mddev_unlock(rdev->mddev); + } + return rv; } static void rdev_free(struct kobject *ko) @@ -2029,7 +2132,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi if ((err = alloc_disk_sb(rdev))) goto abort_free; - err = lock_rdev(rdev, newdev); + err = lock_rdev(rdev, newdev, super_format == -2); if (err) goto abort_free; @@ -2099,7 +2202,7 @@ static void analyze_sbs(mddev_t * mddev) char b[BDEVNAME_SIZE]; freshest = NULL; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: @@ -2120,7 +2223,7 @@ static void analyze_sbs(mddev_t * mddev) validate_super(mddev, freshest); i = 0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev != freshest) if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { @@ -2215,7 +2318,7 @@ level_show(mddev_t *mddev, char *page) static ssize_t level_store(mddev_t *mddev, const char *buf, size_t len) { - int rv = len; + ssize_t rv = len; if (mddev->pers) return -EBUSY; if (len == 0) @@ -2425,6 +2528,8 @@ array_state_show(mddev_t *mddev, char *page) case 0: if (mddev->in_sync) st = clean; + else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + st = write_pending; else if (mddev->safemode) st = active_idle; else @@ -2455,11 +2560,9 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; case clear: /* stopping an active array */ - if (mddev->pers) { - if (atomic_read(&mddev->active) > 1) - return -EBUSY; - err = do_md_stop(mddev, 0); - } + if (atomic_read(&mddev->active) > 1) + return -EBUSY; + err = do_md_stop(mddev, 0); break; case inactive: /* stopping an active array */ @@ -2467,7 +2570,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (atomic_read(&mddev->active) > 1) return -EBUSY; err = do_md_stop(mddev, 2); - } + } else + err = 0; /* already inactive */ break; case suspended: break; /* not supported yet */ @@ -2495,9 +2599,15 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) restart_array(mddev); spin_lock_irq(&mddev->write_lock); if (atomic_read(&mddev->writes_pending) == 0) { - mddev->in_sync = 1; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - } + if (mddev->in_sync == 0) { + mddev->in_sync = 1; + if (mddev->persistent) + set_bit(MD_CHANGE_CLEAN, + &mddev->flags); + } + err = 0; + } else + err = -EBUSY; spin_unlock_irq(&mddev->write_lock); } else { mddev->ro = 0; @@ -2508,7 +2618,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case active: if (mddev->pers) { restart_array(mddev); - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (mddev->external) + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -2574,7 +2685,9 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len) if (err < 0) goto out; } - } else + } else if (mddev->external) + rdev = md_import_device(dev, -2, -1); + else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) @@ -2659,7 +2772,9 @@ __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); /* Metdata version. - * This is either 'none' for arrays with externally managed metadata, + * This is one of + * 'none' for arrays with no metadata (good luck...) + * 'external' for arrays with externally managed metadata, * or N.M for internally known formats */ static ssize_t @@ -2668,6 +2783,8 @@ metadata_show(mddev_t *mddev, char *page) if (mddev->persistent) return sprintf(page, "%d.%d\n", mddev->major_version, mddev->minor_version); + else if (mddev->external) + return sprintf(page, "external:%s\n", mddev->metadata_type); else return sprintf(page, "none\n"); } @@ -2682,6 +2799,21 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len) if (cmd_match(buf, "none")) { mddev->persistent = 0; + mddev->external = 0; + mddev->major_version = 0; + mddev->minor_version = 90; + return len; + } + if (strncmp(buf, "external:", 9) == 0) { + size_t namelen = len-9; + if (namelen >= sizeof(mddev->metadata_type)) + namelen = sizeof(mddev->metadata_type)-1; + strncpy(mddev->metadata_type, buf+9, namelen); + mddev->metadata_type[namelen] = 0; + if (namelen && mddev->metadata_type[namelen-1] == '\n') + mddev->metadata_type[--namelen] = 0; + mddev->persistent = 0; + mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; return len; @@ -2698,6 +2830,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len) mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; + mddev->external = 0; return len; } @@ -2865,6 +2998,43 @@ sync_completed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); static ssize_t +max_sync_show(mddev_t *mddev, char *page) +{ + if (mddev->resync_max == MaxSector) + return sprintf(page, "max\n"); + else + return sprintf(page, "%llu\n", + (unsigned long long)mddev->resync_max); +} +static ssize_t +max_sync_store(mddev_t *mddev, const char *buf, size_t len) +{ + if (strncmp(buf, "max", 3) == 0) + mddev->resync_max = MaxSector; + else { + char *ep; + unsigned long long max = simple_strtoull(buf, &ep, 10); + if (ep == buf || (*ep != 0 && *ep != '\n')) + return -EINVAL; + if (max < mddev->resync_max && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + /* Must be a multiple of chunk_size */ + if (mddev->chunk_size) { + if (max & (sector_t)((mddev->chunk_size>>9)-1)) + return -EINVAL; + } + mddev->resync_max = max; + } + wake_up(&mddev->recovery_wait); + return len; +} + +static struct md_sysfs_entry md_max_sync = +__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); + +static ssize_t suspend_lo_show(mddev_t *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); @@ -2974,6 +3144,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_max.attr, &md_sync_speed.attr, &md_sync_completed.attr, + &md_max_sync.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, &md_bitmap.attr, @@ -3118,8 +3289,11 @@ static int do_md_run(mddev_t * mddev) /* * Analyze all RAID superblock(s) */ - if (!mddev->raid_disks) + if (!mddev->raid_disks) { + if (!mddev->persistent) + return -EINVAL; analyze_sbs(mddev); + } chunk_size = mddev->chunk_size; @@ -3143,7 +3317,7 @@ static int do_md_run(mddev_t * mddev) } /* devices must have minimum size of one chunk */ - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; if (rdev->size < chunk_size / 1024) { @@ -3170,7 +3344,7 @@ static int do_md_run(mddev_t * mddev) * the only valid external interface is through the md * device. */ - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); @@ -3236,8 +3410,8 @@ static int do_md_run(mddev_t * mddev) mdk_rdev_t *rdev2; struct list_head *tmp2; int warned = 0; - ITERATE_RDEV(mddev, rdev, tmp) { - ITERATE_RDEV(mddev, rdev2, tmp2) { + rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev2, tmp2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { @@ -3297,7 +3471,7 @@ static int do_md_run(mddev_t * mddev) mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) if (rdev->raid_disk >= 0) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); @@ -3330,7 +3504,7 @@ static int do_md_run(mddev_t * mddev) if (mddev->degraded && !mddev->sync_thread) { struct list_head *rtmp; int spares = 0; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -3507,14 +3681,14 @@ static int do_md_stop(mddev_t * mddev, int mode) } mddev->bitmap_offset = 0; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) if (rdev->raid_disk >= 0) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } - /* make sure all delayed_delete calls have finished */ + /* make sure all md_delayed_delete calls have finished */ flush_scheduled_work(); export_array(mddev); @@ -3523,7 +3697,10 @@ static int do_md_stop(mddev_t * mddev, int mode) mddev->size = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; + mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; + mddev->external = 0; + mddev->persistent = 0; } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", @@ -3546,7 +3723,7 @@ static void autorun_array(mddev_t *mddev) printk(KERN_INFO "md: running: "); - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { char b[BDEVNAME_SIZE]; printk("<%s>", bdevname(rdev->bdev,b)); } @@ -3589,7 +3766,7 @@ static void autorun_devices(int part) printk(KERN_INFO "md: considering %s ...\n", bdevname(rdev0->bdev,b)); INIT_LIST_HEAD(&candidates); - ITERATE_RDEV_PENDING(rdev,tmp) + rdev_for_each_list(rdev, tmp, pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { printk(KERN_INFO "md: adding %s ...\n", bdevname(rdev->bdev,b)); @@ -3632,7 +3809,8 @@ static void autorun_devices(int part) mddev_unlock(mddev); } else { printk(KERN_INFO "md: created %s\n", mdname(mddev)); - ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { + mddev->persistent = 1; + rdev_for_each_list(rdev, tmp, candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) export_rdev(rdev); @@ -3643,7 +3821,7 @@ static void autorun_devices(int part) /* on success, candidates will be empty, on error * it won't... */ - ITERATE_RDEV_GENERIC(candidates,rdev,tmp) + rdev_for_each_list(rdev, tmp, candidates) export_rdev(rdev); mddev_put(mddev); } @@ -3673,7 +3851,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) struct list_head *tmp; nr=working=active=failed=spare=0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -3919,8 +4097,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->raid_disk = -1; - rdev->flags = 0; - if (rdev->raid_disk < mddev->raid_disks) if (info->state & (1<<MD_DISK_SYNC)) set_bit(In_sync, &rdev->flags); @@ -4165,13 +4341,15 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) else mddev->recovery_cp = 0; mddev->persistent = ! info->not_persistent; + mddev->external = 0; mddev->layout = info->layout; mddev->chunk_size = info->chunk_size; mddev->max_disks = MD_SB_DISKS; - mddev->flags = 0; + if (mddev->persistent) + mddev->flags = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev->default_bitmap_offset = MD_SB_BYTES >> 9; @@ -4213,7 +4391,7 @@ static int update_size(mddev_t *mddev, unsigned long size) */ if (mddev->sync_thread) return -EBUSY; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { sector_t avail; avail = rdev->size * 2; @@ -4471,9 +4649,10 @@ static int md_ioctl(struct inode *inode, struct file *file, */ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ - if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY - && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE - && cmd != GET_BITMAP_FILE) { + if ((!mddev->raid_disks && !mddev->external) + && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY + && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE + && cmd != GET_BITMAP_FILE) { err = -ENODEV; goto abort_unlock; } @@ -4757,7 +4936,7 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "unused devices: "); - ITERATE_RDEV_PENDING(rdev,tmp) { + rdev_for_each_list(rdev, tmp, pending_raid_disks) { char b[BDEVNAME_SIZE]; i++; seq_printf(seq, "%s ", @@ -4953,7 +5132,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } size = 0; - ITERATE_RDEV(mddev,rdev,tmp2) { + rdev_for_each(rdev, tmp2, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -4982,7 +5161,10 @@ static int md_seq_show(struct seq_file *seq, void *v) mddev->major_version, mddev->minor_version); } - } else + } else if (mddev->external) + seq_printf(seq, " super external:%s", + mddev->metadata_type); + else seq_printf(seq, " super non-persistent"); if (mddev->pers) { @@ -5106,7 +5288,7 @@ static int is_mddev_idle(mddev_t *mddev) long curr_events; idle = 1; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; curr_events = disk_stat_read(disk, sectors[0]) + disk_stat_read(disk, sectors[1]) - @@ -5283,7 +5465,7 @@ void md_do_sync(mddev_t *mddev) set_bit(MD_RECOVERY_INTR, &mddev->recovery); goto skip; } - ITERATE_MDDEV(mddev2,tmp) { + for_each_mddev(mddev2, tmp) { if (mddev2 == mddev) continue; if (mddev2->curr_resync && @@ -5333,7 +5515,7 @@ void md_do_sync(mddev_t *mddev) /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; j = MaxSector; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -5381,8 +5563,16 @@ void md_do_sync(mddev_t *mddev) sector_t sectors; skipped = 0; + if (j >= mddev->resync_max) { + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + wait_event(mddev->recovery_wait, + mddev->resync_max > j + || kthread_should_stop()); + } + if (kthread_should_stop()) + goto interrupted; sectors = mddev->pers->sync_request(mddev, j, &skipped, - currspeed < speed_min(mddev)); + currspeed < speed_min(mddev)); if (sectors == 0) { set_bit(MD_RECOVERY_ERR, &mddev->recovery); goto out; @@ -5424,15 +5614,9 @@ void md_do_sync(mddev_t *mddev) } - if (kthread_should_stop()) { - /* - * got a signal, exit. - */ - printk(KERN_INFO - "md: md_do_sync() got signal ... exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - } + if (kthread_should_stop()) + goto interrupted; + /* * this loop exits only if either when we are slower than @@ -5484,7 +5668,7 @@ void md_do_sync(mddev_t *mddev) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -5496,9 +5680,22 @@ void md_do_sync(mddev_t *mddev) skip: mddev->curr_resync = 0; + mddev->resync_max = MaxSector; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); + return; + + interrupted: + /* + * got a signal, exit. + */ + printk(KERN_INFO + "md: md_do_sync() got signal ... exiting\n"); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto out; + } EXPORT_SYMBOL_GPL(md_do_sync); @@ -5509,8 +5706,9 @@ static int remove_and_add_spares(mddev_t *mddev) struct list_head *rtmp; int spares = 0; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && + !mddev->external && (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { @@ -5524,7 +5722,7 @@ static int remove_and_add_spares(mddev_t *mddev) } if (mddev->degraded) { - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { rdev->recovery_offset = 0; @@ -5589,7 +5787,7 @@ void md_check_recovery(mddev_t *mddev) } if ( ! ( - mddev->flags || + (mddev->flags && !mddev->external) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->safemode == 1) || @@ -5605,7 +5803,8 @@ void md_check_recovery(mddev_t *mddev) if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (mddev->persistent) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; @@ -5637,7 +5836,7 @@ void md_check_recovery(mddev_t *mddev) * information must be scrapped */ if (!mddev->degraded) - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) rdev->saved_raid_disk = -1; mddev->recovery = 0; @@ -5714,7 +5913,7 @@ static int md_notify_reboot(struct notifier_block *this, printk(KERN_INFO "md: stopping all md devices.\n"); - ITERATE_MDDEV(mddev,tmp) + for_each_mddev(mddev, tmp) if (mddev_trylock(mddev)) { do_md_stop (mddev, 1); mddev_unlock(mddev); @@ -5848,7 +6047,7 @@ static __exit void md_exit(void) unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); remove_proc_entry("mdstat", NULL); - ITERATE_MDDEV(mddev,tmp) { + for_each_mddev(mddev, tmp) { struct gendisk *disk = mddev->gendisk; if (!disk) continue; |