From 87162a28acceda05dadf10a8333183f6d2b55265 Mon Sep 17 00:00:00 2001 From: "viro@ZenIV.linux.org.uk" Date: Fri, 9 Sep 2005 20:36:43 +0100 Subject: [PATCH] trivial __user annotations (md) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 20ca80b7dc2..373ab92e367 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2087,7 +2087,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) return 0; } -static int get_bitmap_file(mddev_t * mddev, void * arg) +static int get_bitmap_file(mddev_t * mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ char *ptr, *buf = NULL; @@ -2781,7 +2781,7 @@ static int md_ioctl(struct inode *inode, struct file *file, goto done_unlock; case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, (void *)arg); + err = get_bitmap_file(mddev, argp); goto done_unlock; case GET_DISK_INFO: -- cgit v1.2.3-70-g09d2 From 1923b99a0f4748aa6be0b9b9523ce224a3449b17 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:43 -0700 Subject: [PATCH] md: don't allow new md/bitmap file to be set if one already exists ... otherwise we lose a reference and can never free the file. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 373ab92e367..63c56616518 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2430,7 +2430,7 @@ static int set_bitmap_file(mddev_t *mddev, int fd) { int err; - if (mddev->pers) + if (mddev->pers || mddev->bitmap_file) return -EBUSY; mddev->bitmap_file = fget(fd); -- cgit v1.2.3-70-g09d2 From 36fa30636fb84b209210299684e1be66d9e58217 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:45 -0700 Subject: [PATCH] md: allow hot-add and hot-remove of md intent logging bitmaps Both file-bitmaps and superblock bitmaps are supported. If you add a bitmap file on the array device, you lose. This introduces a 'default_bitmap_offset' field in mddev, as the ioctl used for adding a superblock bitmap doesn't have room for giving an offset. Later, this value will be settable via sysfs. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 104 ++++++++++++++++++++++++++++++++++++++-------- drivers/md/raid1.c | 30 +++++++++++++ include/linux/raid/md_k.h | 10 +++++ 3 files changed, 127 insertions(+), 17 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 63c56616518..ae654466dc2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -623,6 +623,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->size = sb->size; mddev->events = md_event(sb); mddev->bitmap_offset = 0; + mddev->default_bitmap_offset = MD_SB_BYTES >> 9; if (sb->state & (1<recovery_cp = MaxSector; @@ -648,7 +649,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) printk(KERN_WARNING "md: bitmaps only support for raid1\n"); return -EINVAL; } - mddev->bitmap_offset = (MD_SB_BYTES >> 9); + mddev->bitmap_offset = mddev->default_bitmap_offset; } } else if (mddev->pers == NULL) { @@ -939,6 +940,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->size = le64_to_cpu(sb->size)/2; mddev->events = le64_to_cpu(sb->events); mddev->bitmap_offset = 0; + mddev->default_bitmap_offset = 0; + if (mddev->minor_version == 0) + mddev->default_bitmap_offset = -(64*1024)/512; mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); @@ -2073,6 +2077,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.state = 0; if (mddev->in_sync) info.state = (1<bitmap && mddev->bitmap_offset) + info.state = (1<pers || mddev->bitmap_file) - return -EBUSY; + if (mddev->pers) { + if (!mddev->pers->quiesce) + return -EBUSY; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + /* we should be able to change the bitmap.. 
*/ + } - mddev->bitmap_file = fget(fd); - if (mddev->bitmap_file == NULL) { - printk(KERN_ERR "%s: error: failed to get bitmap file\n", - mdname(mddev)); - return -EBADF; - } + if (fd >= 0) { + if (mddev->bitmap) + return -EEXIST; /* cannot add when bitmap is present */ + mddev->bitmap_file = fget(fd); - err = deny_bitmap_write_access(mddev->bitmap_file); - if (err) { - printk(KERN_ERR "%s: error: bitmap file is already in use\n", - mdname(mddev)); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; - } else + if (mddev->bitmap_file == NULL) { + printk(KERN_ERR "%s: error: failed to get bitmap file\n", + mdname(mddev)); + return -EBADF; + } + + err = deny_bitmap_write_access(mddev->bitmap_file); + if (err) { + printk(KERN_ERR "%s: error: bitmap file is already in use\n", + mdname(mddev)); + fput(mddev->bitmap_file); + mddev->bitmap_file = NULL; + return err; + } mddev->bitmap_offset = 0; /* file overrides offset */ + } else if (mddev->bitmap == NULL) + return -ENOENT; /* cannot remove what isn't there */ + err = 0; + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + if (fd >= 0) + err = bitmap_create(mddev); + if (fd < 0 || err) + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } else if (fd < 0) { + if (mddev->bitmap_file) + fput(mddev->bitmap_file); + mddev->bitmap_file = NULL; + } + return err; } @@ -2528,6 +2560,11 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) { int rv = 0; int cnt = 0; + int state = 0; + + /* calculate expected state,ignoring low bits */ + if (mddev->bitmap && mddev->bitmap_offset) + state |= (1 << MD_SB_BITMAP_PRESENT); if (mddev->major_version != info->major_version || mddev->minor_version != info->minor_version || @@ -2536,12 +2573,16 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->level != info->level || /* mddev->layout != info->layout || */ !mddev->persistent != info->not_persistent|| - mddev->chunk_size != info->chunk_size ) + mddev->chunk_size != 
info->chunk_size || + /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ + ((state^info->state) & 0xfffffe00) + ) return -EINVAL; /* Check there is only one change */ if (mddev->size != info->size) cnt++; if (mddev->raid_disks != info->raid_disks) cnt++; if (mddev->layout != info->layout) cnt++; + if ((state ^ info->state) & (1< 1) return -EINVAL; @@ -2620,6 +2661,35 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) } } } + if ((state ^ info->state) & (1<pers->quiesce == NULL) + return -EINVAL; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + if (info->state & (1<bitmap) + return -EEXIST; + if (mddev->default_bitmap_offset == 0) + return -EINVAL; + mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->pers->quiesce(mddev, 1); + rv = bitmap_create(mddev); + if (rv) + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } else { + /* remove the bitmap */ + if (!mddev->bitmap) + return -ENOENT; + if (mddev->bitmap->file) + return -EINVAL; + mddev->pers->quiesce(mddev, 1); + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + mddev->bitmap_offset = 0; + } + } md_update_sb(mddev); return rv; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ace41c571ae..ba643e4bfac 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1565,6 +1565,35 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) return 0; } +void raid1_quiesce(mddev_t *mddev, int state) +{ + conf_t *conf = mddev_to_conf(mddev); + + switch(state) { + case 0: + spin_lock_irq(&conf->resync_lock); + conf->barrier++; + wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, + conf->resync_lock, raid1_unplug(mddev->queue)); + spin_unlock_irq(&conf->resync_lock); + break; + case 1: + spin_lock_irq(&conf->resync_lock); + conf->barrier--; + spin_unlock_irq(&conf->resync_lock); + wake_up(&conf->wait_resume); + wake_up(&conf->wait_idle); + break; + } + if (mddev->thread) { + if (mddev->bitmap) + mddev->thread->timeout 
= mddev->bitmap->daemon_sleep * HZ; + else + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + md_wakeup_thread(mddev->thread); + } +} + static mdk_personality_t raid1_personality = { @@ -1581,6 +1610,7 @@ static mdk_personality_t raid1_personality = .sync_request = sync_request, .resize = raid1_resize, .reshape = raid1_reshape, + .quiesce = raid1_quiesce, }; static int __init raid_init(void) diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8c14ba565a4..817062bf735 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -278,6 +278,10 @@ struct mddev_s * start of bitmap. May be * negative, but not '0' */ + long default_bitmap_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ struct list_head all_mddevs; }; @@ -314,6 +318,12 @@ struct mdk_personality_s int (*resize) (mddev_t *mddev, sector_t sectors); int (*reshape) (mddev_t *mddev, int raid_disks); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); + /* quiesce moves between quiescence states + * 0 - fully active + * 1 - no new requests allowed + * others - reserved + */ + void (*quiesce) (mddev_t *mddev, int state); }; -- cgit v1.2.3-70-g09d2 From 8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:45 -0700 Subject: [PATCH] md: support write-mostly device in raid1 This allows a device in a raid1 to be marked as "write mostly". Read requests will only be sent if there is no other option. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 18 +++++++++++ drivers/md/raid1.c | 76 ++++++++++++++++++++++++++++++++--------------- include/linux/raid/md_k.h | 3 ++ include/linux/raid/md_p.h | 11 +++++-- 4 files changed, 82 insertions(+), 26 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index ae654466dc2..f1ac356e656 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -670,6 +670,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->level != LEVEL_MULTIPATH) { rdev->faulty = 0; + rdev->flags = 0; desc = sb->disks + rdev->desc_nr; if (desc->state & (1<in_sync = 1; rdev->raid_disk = desc->raid_disk; } + if (desc->state & (1<flags); } else /* MULTIPATH are always insync */ rdev->in_sync = 1; return 0; @@ -777,6 +780,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) spare++; working++; } + if (test_bit(WriteMostly, &rdev2->flags)) + d->state |= (1<raid_disk = role; break; } + rdev->flags = 0; + if (sb->devflags & WriteMostly1) + set_bit(WriteMostly, &rdev->flags); } else /* MULTIPATH are always insync */ rdev->in_sync = 1; @@ -2152,6 +2160,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) info.state |= (1<flags)) + info.state |= (1<saved_raid_disk = rdev->raid_disk; rdev->in_sync = 0; /* just to be sure */ + if (info->state & (1<flags); + rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) @@ -2277,6 +2290,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->in_sync = 0; + if (info->state & (1<flags); + err = bind_rdev_to_array(rdev, mddev); if (err) { export_rdev(rdev); @@ -3329,6 +3345,8 @@ static int md_seq_show(struct seq_file *seq, void *v) char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); + if (test_bit(WriteMostly, &rdev->flags)) + seq_printf(seq, "(W)"); if (rdev->faulty) { seq_printf(seq, "(F)"); continue; diff 
--git a/drivers/md/raid1.c b/drivers/md/raid1.c index ba643e4bfac..28839a8193f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -360,13 +360,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const unsigned long this_sector = r1_bio->sector; int new_disk = conf->last_used, disk = new_disk; + int wonly_disk = -1; const int sectors = r1_bio->sectors; sector_t new_distance, current_distance; - mdk_rdev_t *new_rdev, *rdev; + mdk_rdev_t *rdev; rcu_read_lock(); /* - * Check if it if we can balance. We can balance on the whole + * Check if we can balance. We can balance on the whole * device if no resync is going on, or below the resync window. * We take the first readable disk when above the resync window. */ @@ -376,11 +377,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* Choose the first operation device, for consistancy */ new_disk = 0; - while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || - !new_rdev->in_sync) { - new_disk++; - if (new_disk == conf->raid_disks) { - new_disk = -1; + for (rdev = conf->mirrors[new_disk].rdev; + !rdev || !rdev->in_sync + || test_bit(WriteMostly, &rdev->flags); + rdev = conf->mirrors[++new_disk].rdev) { + + if (rdev && rdev->in_sync) + wonly_disk = new_disk; + + if (new_disk == conf->raid_disks - 1) { + new_disk = wonly_disk; break; } } @@ -389,16 +395,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* make sure the disk is operational */ - while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || - !new_rdev->in_sync) { + for (rdev = conf->mirrors[new_disk].rdev; + !rdev || !rdev->in_sync || + test_bit(WriteMostly, &rdev->flags); + rdev = conf->mirrors[new_disk].rdev) { + + if (rdev && rdev->in_sync) + wonly_disk = new_disk; + if (new_disk <= 0) new_disk = conf->raid_disks; new_disk--; if (new_disk == disk) { - new_disk = -1; - goto rb_out; + new_disk = wonly_disk; + break; } } + + if (new_disk < 0) + goto rb_out; + disk = new_disk; /* now disk == new_disk == starting point for 
search */ @@ -419,37 +435,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) disk = conf->raid_disks; disk--; - if ((rdev=conf->mirrors[disk].rdev) == NULL || - !rdev->in_sync) + rdev = conf->mirrors[disk].rdev; + + if (!rdev || + !rdev->in_sync || + test_bit(WriteMostly, &rdev->flags)) continue; if (!atomic_read(&rdev->nr_pending)) { new_disk = disk; - new_rdev = rdev; break; } new_distance = abs(this_sector - conf->mirrors[disk].head_position); if (new_distance < current_distance) { current_distance = new_distance; new_disk = disk; - new_rdev = rdev; } } while (disk != conf->last_used); -rb_out: + rb_out: if (new_disk >= 0) { - conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; - atomic_inc(&new_rdev->nr_pending); - if (!new_rdev->in_sync) { + rdev = conf->mirrors[new_disk].rdev; + if (!rdev) + goto retry; + atomic_inc(&rdev->nr_pending); + if (!rdev->in_sync) { /* cannot risk returning a device that failed * before we inc'ed nr_pending */ - atomic_dec(&new_rdev->nr_pending); + atomic_dec(&rdev->nr_pending); goto retry; } + conf->next_seq_sect = this_sector + sectors; + conf->last_used = new_disk; } rcu_read_unlock(); @@ -1109,6 +1129,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t max_sector, nr_sectors; int disk; int i; + int wonly; int write_targets = 0; int sync_blocks; int still_degraded = 0; @@ -1164,14 +1185,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ disk = conf->last_used; /* make sure disk is operational */ - + wonly = disk; while (conf->mirrors[disk].rdev == NULL || - !conf->mirrors[disk].rdev->in_sync) { + !conf->mirrors[disk].rdev->in_sync || + test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) + ) { + if (conf->mirrors[disk].rdev && + conf->mirrors[disk].rdev->in_sync) + wonly = disk; if (disk <= 0) disk = conf->raid_disks; disk--; - if (disk == conf->last_used) + if (disk == conf->last_used) { + disk = wonly; break; + 
} } conf->last_used = disk; atomic_inc(&conf->mirrors[disk].rdev->nr_pending); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 817062bf735..7ef78e15ce0 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -181,6 +181,9 @@ struct mdk_rdev_s int faulty; /* if faulty do not issue IO requests */ int in_sync; /* device is a full member of the array */ + unsigned long flags; /* Should include faulty and in_sync here. */ +#define WriteMostly 4 /* Avoid reading if at all possible */ + int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ int saved_raid_disk; /* role that device used to have in the diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index dc65cd43549..4f047f84fb1 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -79,6 +79,11 @@ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ +#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config. 
+ * read requests will only be sent here in + * dire need + */ + typedef struct mdp_device_descriptor_s { __u32 number; /* 0 Device number in the entire set */ __u32 major; /* 1 Device major number */ @@ -193,7 +198,7 @@ struct mdp_superblock_1 { __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ - __u32 layout; /* only for raid5 currently */ + __u32 layout; /* only for raid5 and raid10 currently */ __u64 size; /* used size of component devices, in 512byte sectors */ __u32 chunksize; /* in 512byte sectors */ @@ -212,7 +217,9 @@ struct mdp_superblock_1 { __u32 dev_number; /* permanent identifier of this device - not role in raid */ __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ - __u8 pad2[64-56]; /* set to 0 when writing */ + __u8 devflags; /* per-device flags. Only one defined...*/ +#define WriteMostly1 1 /* mask for writemostly flag in above */ + __u8 pad2[64-57]; /* set to 0 when writing */ /* array state information - 64 bytes */ __u64 utime; /* 40 bits second, 24 btes microseconds */ -- cgit v1.2.3-70-g09d2 From 7b1e35f6d666693e8f376ce02242efca3ec09aaf Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:50 -0700 Subject: [PATCH] md: allow hot-adding devices to arrays with non-persistent superblocks. It is possible (and occasionally useful) to have a raid1 without persistent superblocks. The code in add_new_disk for adding a device to such an array always tries to read a superblock. This will obviously fail. So do the appropriate test and call md_import_device with appropriate args. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index f1ac356e656..866c704e008 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2226,8 +2226,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) mdname(mddev)); return -EINVAL; } - rdev = md_import_device(dev, mddev->major_version, - mddev->minor_version); + if (mddev->persistent) + rdev = md_import_device(dev, mddev->major_version, + mddev->minor_version); + else + rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { printk(KERN_WARNING "md: md_import_device returned %ld\n", -- cgit v1.2.3-70-g09d2 From 71c0805cb48462c99fbe0e5fcc6c12d7b9929c09 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:51 -0700 Subject: [PATCH] md: allow md to load a superblock with feature-bit '1' set As this is used to flag an internal bitmap. Also, introduce symbolic names for feature bits. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 6 +++--- include/linux/raid/md_p.h | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 866c704e008..1be3f2de396 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -875,7 +875,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) sb->major_version != cpu_to_le32(1) || le32_to_cpu(sb->max_dev) > (4096-256)/2 || le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || - sb->feature_map != 0) + (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { @@ -954,7 +954,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = (4096-256)/2; - if ((le32_to_cpu(sb->feature_map) & 1) && + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && mddev->bitmap_file == NULL ) { if (mddev->level != 1) { printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); @@ -1029,7 +1029,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); - sb->feature_map = cpu_to_le32(1); + sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } max_dev = 0; diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 4f047f84fb1..c100fa5d4bf 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -238,5 +238,10 @@ struct mdp_superblock_1 { __u16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ }; +/* feature_map bits */ +#define MD_FEATURE_BITMAP_OFFSET 1 + +#define MD_FEATURE_ALL 1 + #endif -- cgit v1.2.3-70-g09d2 From 0002b2718dd04da67c21f8a7830de8d95a9b0345 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:53 -0700 Subject: [PATCH] md: limit size of sb read/written to 
appropriate amount version-1 superblocks are not (normally) 4K long, and can be of variable size. Writing the full 4K can cause corruption (but only in non-default configurations). With this patch the super-block-flavour can choose a size to read, and set a size to write based on what it finds. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 20 +++++++++++++++----- include/linux/raid/md_k.h | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1be3f2de396..be7873c61b3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -393,7 +393,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, return ret; } -static int read_disk_sb(mdk_rdev_t * rdev) +static int read_disk_sb(mdk_rdev_t * rdev, int size) { char b[BDEVNAME_SIZE]; if (!rdev->sb_page) { @@ -404,7 +404,7 @@ static int read_disk_sb(mdk_rdev_t * rdev) return 0; - if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) + if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) goto fail; rdev->sb_loaded = 1; return 0; @@ -531,7 +531,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version sb_offset = calc_dev_sboffset(rdev->bdev); rdev->sb_offset = sb_offset; - ret = read_disk_sb(rdev); + ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; ret = -EINVAL; @@ -564,6 +564,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; + rdev->sb_size = MD_SB_BYTES; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; @@ -837,6 +838,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) int ret; sector_t sb_offset; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + int bmask; /* * Calculate the position of the superblock. 
@@ -865,7 +867,10 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) } rdev->sb_offset = sb_offset; - ret = read_disk_sb(rdev); + /* superblock is rarely larger than 1K, but it can be larger, + * and it is safe to read 4k, so we do that + */ + ret = read_disk_sb(rdev, 4096); if (ret) return ret; @@ -891,6 +896,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; + bmask = block_size(rdev->bdev)-1; + if (rdev->sb_size & bmask) + rdev-> sb_size = (rdev->sb_size | bmask)+1; + if (refdev == 0) return 1; else { @@ -1375,7 +1385,7 @@ repeat: dprintk("%s ", bdevname(rdev->bdev,b)); if (!rdev->faulty) { md_super_write(mddev,rdev, - rdev->sb_offset<<1, MD_SB_BYTES, + rdev->sb_offset<<1, rdev->sb_size, rdev->sb_page); dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", bdevname(rdev->bdev,b), diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8042f55dd32..ebce949b144 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -102,6 +102,7 @@ struct mdk_rdev_s int sb_loaded; sector_t data_offset; /* start of data in array */ sector_t sb_offset; + int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ /* A device can be in one of three states based on two flags: -- cgit v1.2.3-70-g09d2 From 72626685dc66d455742a7f215a0535c551628b9e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:54 -0700 Subject: [PATCH] md: add write-intent-bitmap support to raid5 Most awkward part of this is delaying write requests until bitmap updates have been flushed. To achieve this, we have a sequence number (seq_flush) which is incremented each time the raid5 is unplugged. If the raid thread notices that this has changed, it flushes bitmap changes, and assigned the value of seq_flush to seq_write. 
When a write request arrives, it is given the number from seq_write, and that write request may not complete until seq_flush is larger than the saved seq number. We have a new queue for storing stripes which are waiting for a bitmap flush and an extra flag for stripes to record if the write was 'degraded' and so should not clear the a bit in the bitmap. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 3 +- drivers/md/raid5.c | 133 +++++++++++++++++++++++++++++++++++++++++---- include/linux/raid/raid5.h | 14 ++++- 3 files changed, 137 insertions(+), 13 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index be7873c61b3..dbf540a7fcc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -645,7 +645,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (sb->state & (1<bitmap_file == NULL) { - if (mddev->level != 1) { + if (mddev->level != 1 && mddev->level != 5) { /* FIXME use a better test */ printk(KERN_WARNING "md: bitmaps only support for raid1\n"); return -EINVAL; @@ -3517,7 +3517,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) */ void md_write_start(mddev_t *mddev, struct bio *bi) { - DEFINE_WAIT(w); if (bio_data_dir(bi) != WRITE) return; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ed859e08d60..4683ca24c04 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -24,6 +24,8 @@ #include #include +#include + /* * Stripe cache */ @@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - else + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + conf->seq_write == sh->bm_seq) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { + clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); + } 
md_wakeup_thread(conf->mddev->thread); } else { if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector spin_lock_irq(&conf->device_lock); do { + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0, + conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector); if (!sh) { if (!conf->inactive_blocked) @@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in { struct bio **bip; raid5_conf_t *conf = sh->raid_conf; + int firstwrite=0; PRINTK("adding bh b#%llu to stripe s#%llu\n", (unsigned long long)bi->bi_sector, @@ -811,9 +822,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in spin_lock(&sh->lock); spin_lock_irq(&conf->device_lock); - if (forwrite) + if (forwrite) { bip = &sh->dev[dd_idx].towrite; - else + if (*bip == NULL && sh->dev[dd_idx].written == NULL) + firstwrite = 1; + } else bip = &sh->dev[dd_idx].toread; while (*bip && (*bip)->bi_sector < bi->bi_sector) { if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) @@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector, dd_idx); + if (conf->mddev->bitmap && firstwrite) { + sh->bm_seq = conf->seq_write; + bitmap_startwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0); + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + if (forwrite) { /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; @@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh) * need to be failed */ if (failed > 1 && to_read+to_write+written) { - spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { + int bitmap_end = 0; + spin_lock_irq(&conf->device_lock); /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; - if (bi) to_write--; + if (bi) { to_write--; 
bitmap_end = 1; } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh) /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (bi) bitmap_end = 1; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh) bi = nextbi; } } + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); } - spin_unlock_irq(&conf->device_lock); } if (failed > 1 && syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); @@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh) test_bit(R5_UPTODATE, &dev->flags) ) { /* We can return any write requests */ struct bio *wbi, *wbi2; + int bitmap_end = 0; PRINTK("Return write for disc %d\n", i); spin_lock_irq(&conf->device_lock); wbi = dev->written; @@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh) } wbi = wbi2; } + if (dev->towrite == NULL) + bitmap_end = 1; spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), 0); } } } @@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh) } } /* now if nothing is locked, and if we have enough data, we can start a write request */ - if (locked == 0 && (rcw == 0 ||rmw == 0)) { + if (locked == 0 && (rcw == 0 ||rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state)) { PRINTK("Computing parity...\n"); compute_parity(sh, rcw==0 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); /* now every locked buffer is ready to be written */ @@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh) dev = &sh->dev[failed_num]; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); + clear_bit(STRIPE_DEGRADED, &sh->state); locked++; set_bit(STRIPE_INSYNC, &sh->state); set_bit(R5_Syncio, &dev->flags); @@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh) bi->bi_next = NULL; generic_make_request(bi); } else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); PRINTK("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); @@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf) } } +static inline void activate_bit_delay(raid5_conf_t *conf) +{ + /* device_lock is held */ + struct list_head head; + list_add(&head, &conf->bitmap_list); + list_del_init(&conf->bitmap_list); + while (!list_empty(&head)) { + struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + list_del_init(&sh->lru); + atomic_inc(&sh->count); + __release_stripe(conf, sh); + } +} + static void unplug_slaves(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q) spin_lock_irqsave(&conf->device_lock, flags); - if (blk_remove_plug(q)) + if (blk_remove_plug(q)) { + conf->seq_flush++; raid5_activate_delayed(conf); + } md_wakeup_thread(mddev->thread); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1493,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t first_sector; int raid_disks = conf->raid_disks; int data_disks = raid_disks-1; + sector_t max_sector = mddev->size << 1; + int sync_blocks; - if (sector_nr >= mddev->size <<1) { + if (sector_nr >= max_sector) { /* just being told to finish up .. 
nothing much to do */ unplug_slaves(mddev); + + if (mddev->curr_resync < max_sector) /* aborted */ + bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else /* compelted sync */ + conf->fullsync = 0; + bitmap_close_sync(mddev->bitmap); + return 0; } /* if there is 1 or more failed drives and we are trying @@ -1508,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i *skipped = 1; return rv; } + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { + /* we can skip this block, and probably more */ + sync_blocks /= STRIPE_SECTORS; + *skipped = 1; + return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ + } x = sector_nr; chunk_offset = sector_div(x, sectors_per_chunk); @@ -1525,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(1); } + bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); spin_lock(&sh->lock); set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); @@ -1558,6 +1628,13 @@ static void raid5d (mddev_t *mddev) while (1) { struct list_head *first; + if (conf->seq_flush - conf->seq_write > 0) { + int seq = conf->seq_flush; + bitmap_unplug(mddev->bitmap); + conf->seq_write = seq; + activate_bit_delay(conf); + } + if (list_empty(&conf->handle_list) && atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && !blk_queue_plugged(mddev->queue) && @@ -1591,7 +1668,7 @@ static void raid5d (mddev_t *mddev) PRINTK("--- raid5d inactive\n"); } -static int run (mddev_t *mddev) +static int run(mddev_t *mddev) { raid5_conf_t *conf; int raid_disk, memory; @@ -1621,6 +1698,7 @@ static int run (mddev_t *mddev) init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->bitmap_list); 
INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -1732,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + /* Ok, everything is just fine now */ + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; @@ -1912,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rdev->in_sync = 0; rdev->raid_disk = disk; found = 1; + if (rdev->saved_raid_disk != disk) + conf->fullsync = 1; p->rdev = rdev; break; } @@ -1941,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } +static void raid5_quiesce(mddev_t *mddev, int state) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + switch(state) { + case 1: /* stop all writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 1; + wait_event_lock_irq(conf->wait_for_stripe, + atomic_read(&conf->active_stripes) == 0, + conf->device_lock, /* nothing */); + spin_unlock_irq(&conf->device_lock); + break; + + case 0: /* re-enable writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 0; + wake_up(&conf->wait_for_stripe); + spin_unlock_irq(&conf->device_lock); + break; + } + if (mddev->thread) { + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + else + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + md_wakeup_thread(mddev->thread); + } +} static mdk_personality_t raid5_personality= { .name = "raid5", @@ -1955,6 +2067,7 @@ static mdk_personality_t raid5_personality= .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, + .quiesce = raid5_quiesce, }; static int __init raid5_init (void) diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index d63ddcb4afa..176fc653c28 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -134,6 +134,7 @@ struct 
stripe_head { unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ spinlock_t lock; + int bm_seq; /* sequence number for bitmap flushes */ struct r5dev { struct bio req; struct bio_vec vec; @@ -165,12 +166,13 @@ struct stripe_head { /* * Stripe state */ -#define STRIPE_ERROR 1 #define STRIPE_HANDLE 2 #define STRIPE_SYNCING 3 #define STRIPE_INSYNC 4 #define STRIPE_PREREAD_ACTIVE 5 #define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 /* * Plugging: @@ -210,10 +212,20 @@ struct raid5_private_data { struct list_head handle_list; /* stripes needing handling */ struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ char cache_name[20]; kmem_cache_t *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + /* * Free stripes pool */ -- cgit v1.2.3-70-g09d2 From 934ce7c840992a771ffc478b132092db9c935c42 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:55 -0700 Subject: [PATCH] md: write-intent bitmap support for raid6 This is a direct port of the raid5 patch. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- drivers/md/raid6main.c | 133 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 124 insertions(+), 11 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index dbf540a7fcc..008149e2bc4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -645,7 +645,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (sb->state & (1<bitmap_file == NULL) { - if (mddev->level != 1 && mddev->level != 5) { + if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) { /* FIXME use a better test */ printk(KERN_WARNING "md: bitmaps only support for raid1\n"); return -EINVAL; diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 09cb7272c09..267eb1430c8 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -29,6 +29,8 @@ #include #include "raid6.h" +#include + /* * Stripe cache */ @@ -98,8 +100,13 @@ static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh) if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - else + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + conf->seq_write == sh->bm_seq) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { + clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); + } md_wakeup_thread(conf->mddev->thread); } else { if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -262,6 +269,9 @@ static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector spin_lock_irq(&conf->device_lock); do { + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0, + conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector); if (!sh) { if (!conf->inactive_blocked) @@ -906,6 +916,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in { struct 
bio **bip; raid6_conf_t *conf = sh->raid_conf; + int firstwrite=0; PRINTK("adding bh b#%llu to stripe s#%llu\n", (unsigned long long)bi->bi_sector, @@ -914,9 +925,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in spin_lock(&sh->lock); spin_lock_irq(&conf->device_lock); - if (forwrite) + if (forwrite) { bip = &sh->dev[dd_idx].towrite; - else + if (*bip == NULL && sh->dev[dd_idx].written == NULL) + firstwrite = 1; + } else bip = &sh->dev[dd_idx].toread; while (*bip && (*bip)->bi_sector < bi->bi_sector) { if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) @@ -939,6 +952,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector, dd_idx); + if (conf->mddev->bitmap && firstwrite) { + sh->bm_seq = conf->seq_write; + bitmap_startwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0); + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + if (forwrite) { /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; @@ -1066,12 +1086,13 @@ static void handle_stripe(struct stripe_head *sh) * need to be failed */ if (failed > 2 && to_read+to_write+written) { - spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { + int bitmap_end = 0; + spin_lock_irq(&conf->device_lock); /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; - if (bi) to_write--; + if (bi) { to_write--; bitmap_end = 1; } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -1089,6 +1110,7 @@ static void handle_stripe(struct stripe_head *sh) /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (bi) bitmap_end = 1; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1117,8 +1139,11 @@ static void handle_stripe(struct stripe_head *sh) bi = nextbi; } } + 
spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); } - spin_unlock_irq(&conf->device_lock); } if (failed > 2 && syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); @@ -1155,6 +1180,7 @@ static void handle_stripe(struct stripe_head *sh) if (!test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags) ) { /* We can return any write requests */ + int bitmap_end = 0; struct bio *wbi, *wbi2; PRINTK("Return write for stripe %llu disc %d\n", (unsigned long long)sh->sector, i); @@ -1170,7 +1196,13 @@ static void handle_stripe(struct stripe_head *sh) } wbi = wbi2; } + if (dev->towrite == NULL) + bitmap_end = 1; spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), 0); } } } @@ -1285,7 +1317,8 @@ static void handle_stripe(struct stripe_head *sh) } } /* now if nothing is locked, and if we have enough data, we can start a write request */ - if (locked == 0 && rcw == 0) { + if (locked == 0 && rcw == 0 && + !test_bit(STRIPE_BIT_DELAY, &sh->state)) { if ( must_compute > 0 ) { /* We have failed blocks and need to compute them */ switch ( failed ) { @@ -1388,6 +1421,7 @@ static void handle_stripe(struct stripe_head *sh) bdev = &sh->dev[failed_num[1]]; locked += !test_bit(R5_LOCKED, &bdev->flags); set_bit(R5_LOCKED, &bdev->flags); + clear_bit(STRIPE_DEGRADED, &sh->state); set_bit(R5_Wantwrite, &bdev->flags); set_bit(STRIPE_INSYNC, &sh->state); @@ -1457,6 +1491,8 @@ static void handle_stripe(struct stripe_head *sh) bi->bi_next = NULL; generic_make_request(bi); } else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); PRINTK("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); @@ -1481,6 +1517,20 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf) } } +static inline void 
activate_bit_delay(raid6_conf_t *conf) +{ + /* device_lock is held */ + struct list_head head; + list_add(&head, &conf->bitmap_list); + list_del_init(&conf->bitmap_list); + while (!list_empty(&head)) { + struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + list_del_init(&sh->lru); + atomic_inc(&sh->count); + __release_stripe(conf, sh); + } +} + static void unplug_slaves(mddev_t *mddev) { raid6_conf_t *conf = mddev_to_conf(mddev); @@ -1513,8 +1563,10 @@ static void raid6_unplug_device(request_queue_t *q) spin_lock_irqsave(&conf->device_lock, flags); - if (blk_remove_plug(q)) + if (blk_remove_plug(q)) { + conf->seq_flush++; raid6_activate_delayed(conf); + } md_wakeup_thread(mddev->thread); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1652,10 +1704,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t first_sector; int raid_disks = conf->raid_disks; int data_disks = raid_disks - 2; + sector_t max_sector = mddev->size << 1; + int sync_blocks; - if (sector_nr >= mddev->size <<1) { + if (sector_nr >= max_sector) { /* just being told to finish up .. 
nothing much to do */ unplug_slaves(mddev); + + if (mddev->curr_resync < max_sector) /* aborted */ + bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else /* compelted sync */ + conf->fullsync = 0; + bitmap_close_sync(mddev->bitmap); + return 0; } /* if there are 2 or more failed drives and we are trying @@ -1667,6 +1729,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i *skipped = 1; return rv; } + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { + /* we can skip this block, and probably more */ + sync_blocks /= STRIPE_SECTORS; + *skipped = 1; + return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ + } x = sector_nr; chunk_offset = sector_div(x, sectors_per_chunk); @@ -1684,6 +1753,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(1); } + bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); spin_lock(&sh->lock); set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); @@ -1717,6 +1787,13 @@ static void raid6d (mddev_t *mddev) while (1) { struct list_head *first; + if (conf->seq_flush - conf->seq_write > 0) { + int seq = conf->seq_flush; + bitmap_unplug(mddev->bitmap); + conf->seq_write = seq; + activate_bit_delay(conf); + } + if (list_empty(&conf->handle_list) && atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && !blk_queue_plugged(mddev->queue) && @@ -1750,7 +1827,7 @@ static void raid6d (mddev_t *mddev) PRINTK("--- raid6d inactive\n"); } -static int run (mddev_t *mddev) +static int run(mddev_t *mddev) { raid6_conf_t *conf; int raid_disk, memory; @@ -1780,6 +1857,7 @@ static int run (mddev_t *mddev) init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->bitmap_list); 
INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -1899,6 +1977,9 @@ static int run (mddev_t *mddev) /* Ok, everything is just fine now */ mddev->array_size = mddev->size * (mddev->raid_disks - 2); + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + mddev->queue->unplug_fn = raid6_unplug_device; mddev->queue->issue_flush_fn = raid6_issue_flush; return 0; @@ -2076,6 +2157,8 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rdev->in_sync = 0; rdev->raid_disk = disk; found = 1; + if (rdev->saved_raid_disk != disk) + conf->fullsync = 1; p->rdev = rdev; break; } @@ -2105,6 +2188,35 @@ static int raid6_resize(mddev_t *mddev, sector_t sectors) return 0; } +static void raid6_quiesce(mddev_t *mddev, int state) +{ + raid6_conf_t *conf = mddev_to_conf(mddev); + + switch(state) { + case 1: /* stop all writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 1; + wait_event_lock_irq(conf->wait_for_stripe, + atomic_read(&conf->active_stripes) == 0, + conf->device_lock, /* nothing */); + spin_unlock_irq(&conf->device_lock); + break; + + case 0: /* re-enable writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 0; + wake_up(&conf->wait_for_stripe); + spin_unlock_irq(&conf->device_lock); + break; + } + if (mddev->thread) { + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + else + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + md_wakeup_thread(mddev->thread); + } +} static mdk_personality_t raid6_personality= { .name = "raid6", @@ -2119,6 +2231,7 @@ static mdk_personality_t raid6_personality= .spare_active = raid6_spare_active, .sync_request = sync_request, .resize = raid6_resize, + .quiesce = raid6_quiesce, }; static int __init raid6_init (void) -- cgit v1.2.3-70-g09d2 From a6fb0934f923f889055152cb0b033674f627460b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:56 -0700 Subject: [PATCH] md: 
use kthread infrastructure in md Switch MD to use the kthread infrastructure, to simplify the code and get rid of tasklist_lock abuse in md_unregister_thread. Also don't flush signals in md_thread, as the called thread will always do that. Signed-off-by: Christoph Hellwig Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 48 ++++++++++-------------------------------------- 1 file changed, 10 insertions(+), 38 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 008149e2bc4..30e3624f3d9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -3049,18 +3050,6 @@ static int md_thread(void * arg) { mdk_thread_t *thread = arg; - lock_kernel(); - - /* - * Detach thread - */ - - daemonize(thread->name, mdname(thread->mddev)); - - current->exit_signal = SIGCHLD; - allow_signal(SIGKILL); - thread->tsk = current; - /* * md_thread is a 'system-thread', it's priority should be very * high. We avoid resource deadlocks individually in each @@ -3072,14 +3061,14 @@ static int md_thread(void * arg) * bdflush, otherwise bdflush will deadlock if there are too * many dirty RAID5 blocks. 
*/ - unlock_kernel(); complete(thread->event); - while (thread->run) { + while (!kthread_should_stop()) { void (*run)(mddev_t *); wait_event_interruptible_timeout(thread->wqueue, - test_bit(THREAD_WAKEUP, &thread->flags), + test_bit(THREAD_WAKEUP, &thread->flags) + || kthread_should_stop(), thread->timeout); try_to_freeze(); @@ -3088,11 +3077,8 @@ static int md_thread(void * arg) run = thread->run; if (run) run(thread->mddev); - - if (signal_pending(current)) - flush_signals(current); } - complete(thread->event); + return 0; } @@ -3109,11 +3095,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, const char *name) { mdk_thread_t *thread; - int ret; struct completion event; - thread = (mdk_thread_t *) kmalloc - (sizeof(mdk_thread_t), GFP_KERNEL); + thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); if (!thread) return NULL; @@ -3126,8 +3110,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, thread->mddev = mddev; thread->name = name; thread->timeout = MAX_SCHEDULE_TIMEOUT; - ret = kernel_thread(md_thread, thread, 0); - if (ret < 0) { + thread->tsk = kthread_run(md_thread, thread, mdname(thread->mddev)); + if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; } @@ -3137,21 +3121,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, void md_unregister_thread(mdk_thread_t *thread) { - struct completion event; - - init_completion(&event); - - thread->event = &event; - - /* As soon as ->run is set to NULL, the task could disappear, - * so we need to hold tasklist_lock until we have sent the signal - */ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); - read_lock(&tasklist_lock); - thread->run = NULL; - send_sig(SIGKILL, thread->tsk, 1); - read_unlock(&tasklist_lock); - wait_for_completion(&event); + + kthread_stop(thread->tsk); kfree(thread); } -- cgit v1.2.3-70-g09d2 From 53e87fbb5dc887766229eef3ba8bd8ab8853b066 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 
16:23:58 -0700 Subject: [PATCH] md: choose better default offset for bitmap. On reflection, a better default location for hot-adding bitmaps with version-1 superblocks is immediately after the superblock. There might not be much room there, but there is usually at least 3k, and that is a good start. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 30e3624f3d9..fd66c395894 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -957,8 +957,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->events = le64_to_cpu(sb->events); mddev->bitmap_offset = 0; mddev->default_bitmap_offset = 0; - if (mddev->minor_version == 0) - mddev->default_bitmap_offset = -(64*1024)/512; + mddev->default_bitmap_offset = 1024; mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); -- cgit v1.2.3-70-g09d2 From 720a3dc39b030e273bc955641f2517874fd38fc5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:59 -0700 Subject: [PATCH] md: use queue_hardsect_size instead of block_size for md superblock size calc. Doh. I want the physical hard-sector-size, not the current block size... 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index fd66c395894..39b917bc0fc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -898,7 +898,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) rdev->data_offset = le64_to_cpu(sb->data_offset); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; - bmask = block_size(rdev->bdev)-1; + bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev-> sb_size = (rdev->sb_size | bmask)+1; -- cgit v1.2.3-70-g09d2 From 1cd6bf19bbe8098153d7a478d8fc551edbea9305 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:24:00 -0700 Subject: [PATCH] md: add information about superblock version to /proc/mdstat Leave it unchanged if the original (0.90) is used, in case it might be a compatibility problem. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 39b917bc0fc..0a13016829d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3346,6 +3346,15 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n %llu blocks", (unsigned long long)size); } + if (mddev->persistent) { + if (mddev->major_version != 0 || + mddev->minor_version != 90) { + seq_printf(seq," super %d.%d", + mddev->major_version, + mddev->minor_version); + } + } else + seq_printf(seq, " super non-persistent"); if (mddev->pers) { mddev->pers->status (seq, mddev); -- cgit v1.2.3-70-g09d2 From b325a32e5732d7aef70ca3c58acb3953ed20f66c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:24:00 -0700 Subject: [PATCH] md: report spare drives in /proc/mdstat Just like failed drives have (F), so spare drives now have (S). Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 0a13016829d..f27e8f644df 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3334,7 +3334,8 @@ static int md_seq_show(struct seq_file *seq, void *v) if (rdev->faulty) { seq_printf(seq, "(F)"); continue; - } + } else if (rdev->raid_disk < 0) + seq_printf(seq, "(S)"); /* spare */ size += rdev->size; } -- cgit v1.2.3-70-g09d2 From 188c18fd7982d220f4dea234cbb5cc6c75d0f512 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:24:01 -0700 Subject: [PATCH] md: make sure the new 'sb_size' is set properly device added without pre-existing superblock. There are two ways to add devices to an md/raid array. 
It can have a superblock written to it, and then given to the md driver, which will read the superblock (the new way) or md can be told (through SET_ARRAY_INFO) the shape of the array, and then told about individual drives, and md will create the required superblock (the old way). The newly introduced sb_size was only set for drives being added the new way, not the old way. Oops :-( Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index f27e8f644df..8c1f7954882 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2303,6 +2303,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->in_sync = 0; + rdev->sb_size = MD_SB_BYTES; + if (info->state & (1<flags); -- cgit v1.2.3-70-g09d2 From 611815651b2640449bda928079160b9263efac1c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:24:02 -0700 Subject: [PATCH] md: really get sb_size setting right in all cases There was another case where sb_size wasn't being set, so instead do the sensible thing and set it when filling in the content of a superblock. That ensures that whenever we write a superblock, the sb_size MUST be set. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8c1f7954882..1554b924fbb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -712,6 +712,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) int i; int active=0, working=0,failed=0,spare=0,nr_disks=0; + rdev->sb_size = MD_SB_BYTES; + sb = (mdp_super_t*)page_address(rdev->sb_page); memset(sb, 0, sizeof(*sb)); @@ -2303,8 +2305,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->in_sync = 0; - rdev->sb_size = MD_SB_BYTES; - if (info->state & (1<flags); -- cgit v1.2.3-70-g09d2