diff options
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 469 |
1 files changed, 338 insertions, 131 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index b182f86a19d..dd3dfe42d5a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -39,11 +39,13 @@ #include <linux/buffer_head.h> /* for invalidate_bdev */ #include <linux/poll.h> #include <linux/ctype.h> +#include <linux/string.h> #include <linux/hdreg.h> #include <linux/proc_fs.h> #include <linux/random.h> #include <linux/reboot.h> #include <linux/file.h> +#include <linux/compat.h> #include <linux/delay.h> #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> @@ -68,6 +70,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } /* + * Default number of read corrections we'll attempt on an rdev + * before ejecting it from the array. We divide the read error + * count by 2 for every hour elapsed between read errors. + */ +#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. * Increase it if you want to have more _guaranteed_ speed. Note that @@ -98,44 +106,40 @@ static struct ctl_table_header *raid_table_header; static ctl_table raid_table[] = { { - .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0 } + { } }; static ctl_table raid_dir_table[] = { { - .ctl_name = DEV_RAID, .procname = "raid", .maxlen = 0, .mode = S_IRUGO|S_IXUGO, .child = raid_table, }, - { .ctl_name = 0 } + { } }; static ctl_table raid_root_table[] = { { - .ctl_name = CTL_DEV, .procname = "dev", .maxlen = 0, .mode = 0555, .child = raid_dir_table, }, - { .ctl_name = 0 } + { } }; static const struct block_device_operations md_fops; @@ -217,12 +221,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) return 0; } rcu_read_lock(); - if (mddev->suspended) { + if (mddev->suspended || mddev->barrier) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) + if (!mddev->suspended && !mddev->barrier) break; rcu_read_unlock(); schedule(); @@ -264,10 +268,110 @@ static void mddev_resume(mddev_t *mddev) int mddev_congested(mddev_t *mddev, int bits) { + if (mddev->barrier) + return 1; return mddev->suspended; } EXPORT_SYMBOL(mddev_congested); +/* + * Generic barrier handling for md + */ + +#define POST_REQUEST_BARRIER ((void*)1) + +static void md_end_barrier(struct bio *bio, int err) +{ + mdk_rdev_t *rdev = bio->bi_private; + mddev_t *mddev = rdev->mddev; + if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) + set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); + + rdev_dec_pending(rdev, mddev); + + if (atomic_dec_and_test(&mddev->flush_pending)) { + if (mddev->barrier == POST_REQUEST_BARRIER) { + /* This was a post-request barrier */ + mddev->barrier = NULL; + wake_up(&mddev->sb_wait); + } else + /* The pre-request barrier has finished */ + schedule_work(&mddev->barrier_work); + } + bio_put(bio); +} + +static void submit_barriers(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + + rcu_read_lock(); + list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) { + /* Take two references, one is dropped + * when request finishes, one after + * we reclaim rcu_read_lock + */ + struct bio *bi; + atomic_inc(&rdev->nr_pending); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + bi = bio_alloc(GFP_KERNEL, 0); + bi->bi_end_io = md_end_barrier; + bi->bi_private = rdev; + bi->bi_bdev = rdev->bdev; + atomic_inc(&mddev->flush_pending); + submit_bio(WRITE_BARRIER, bi); + rcu_read_lock(); + rdev_dec_pending(rdev, mddev); + } + rcu_read_unlock(); +} + +static void md_submit_barrier(struct work_struct *ws) +{ + mddev_t *mddev = container_of(ws, mddev_t, barrier_work); + struct bio *bio = mddev->barrier; + + atomic_set(&mddev->flush_pending, 1); + + if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) + bio_endio(bio, -EOPNOTSUPP); + else if (bio->bi_size == 0) + /* an empty barrier - all done */ + bio_endio(bio, 0); + else { + bio->bi_rw &= ~(1<<BIO_RW_BARRIER); + if (mddev->pers->make_request(mddev->queue, bio)) + generic_make_request(bio); + mddev->barrier = POST_REQUEST_BARRIER; + submit_barriers(mddev); + } + if (atomic_dec_and_test(&mddev->flush_pending)) { + mddev->barrier = NULL; + wake_up(&mddev->sb_wait); + } +} + +void md_barrier_request(mddev_t *mddev, struct bio *bio) +{ + spin_lock_irq(&mddev->write_lock); + wait_event_lock_irq(mddev->sb_wait, + !mddev->barrier, + mddev->write_lock, /*nothing*/); + mddev->barrier = bio; + spin_unlock_irq(&mddev->write_lock); + + atomic_set(&mddev->flush_pending, 1); + INIT_WORK(&mddev->barrier_work, md_submit_barrier); + + submit_barriers(mddev); + + if (atomic_dec_and_test(&mddev->flush_pending)) + schedule_work(&mddev->barrier_work); +} +EXPORT_SYMBOL(md_barrier_request); static inline mddev_t *mddev_get(mddev_t *mddev) { @@ -282,7 +386,9 @@ static void mddev_put(mddev_t *mddev) if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; if (!mddev->raid_disks && list_empty(&mddev->disks) && - !mddev->hold_active) { + mddev->ctime == 0 && !mddev->hold_active) { + /* Array is not configured at all, and not held active, + * so destroy it */ list_del(&mddev->all_mddevs); if (mddev->gendisk) { /* we did a probe so need to clean up. @@ -367,6 +473,7 @@ static mddev_t * mddev_find(dev_t unit) mutex_init(&new->open_mutex); mutex_init(&new->reconfig_mutex); + mutex_init(&new->bitmap_info.mutex); INIT_LIST_HEAD(&new->disks); INIT_LIST_HEAD(&new->all_mddevs); init_timer(&new->safemode_timer); @@ -374,6 +481,7 @@ static mddev_t * mddev_find(dev_t unit) atomic_set(&new->openers, 0); atomic_set(&new->active_io, 0); spin_lock_init(&new->write_lock); + atomic_set(&new->flush_pending, 0); init_waitqueue_head(&new->sb_wait); init_waitqueue_head(&new->recovery_wait); new->reshape_position = MaxSector; @@ -752,7 +860,7 @@ struct super_type { */ int md_check_no_bitmap(mddev_t *mddev) { - if (!mddev->bitmap_file && !mddev->bitmap_offset) + if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; printk(KERN_ERR "%s: bitmaps are not supported for %s\n", mdname(mddev), mddev->pers->name); @@ -880,8 +988,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->raid_disks = sb->raid_disks; mddev->dev_sectors = sb->size * 2; mddev->events = ev1; - mddev->bitmap_offset = 0; - mddev->default_bitmap_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; if (mddev->minor_version >= 91) { mddev->reshape_position = sb->reshape_position; @@ -915,8 +1023,9 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = MD_SB_DISKS; if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && - mddev->bitmap_file == NULL) - mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->bitmap_info.file == NULL) + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling */ @@ -1033,7 +1142,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_sectors << 9; - if (mddev->bitmap && mddev->bitmap_file == NULL) + if (mddev->bitmap && mddev->bitmap_info.file == NULL) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); @@ -1111,7 +1220,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) { if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ - if (rdev->mddev->bitmap_offset) + if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ rdev->sb_start = calc_dev_sboffset(rdev->bdev); if (!num_sectors || num_sectors > rdev->sb_start) @@ -1290,8 +1399,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; - mddev->bitmap_offset = 0; - mddev->default_bitmap_offset = 1024 >> 9; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 1024 >> 9; mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); @@ -1299,8 +1408,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = (4096-256)/2; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && - mddev->bitmap_file == NULL ) - mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); + mddev->bitmap_info.file == NULL ) + mddev->bitmap_info.offset = + (__s32)le32_to_cpu(sb->bitmap_offset); if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { mddev->reshape_position = le64_to_cpu(sb->reshape_position); @@ -1394,19 +1504,17 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); - if (mddev->bitmap && mddev->bitmap_file == NULL) { - sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); + if (mddev->bitmap && mddev->bitmap_info.file == NULL) { + sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags)) { - if (rdev->recovery_offset > 0) { - sb->feature_map |= - cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); - sb->recovery_offset = - cpu_to_le64(rdev->recovery_offset); - } + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = + cpu_to_le64(rdev->recovery_offset); } if (mddev->reshape_position != MaxSector) { @@ -1440,7 +1548,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->dev_roles[i] = cpu_to_le16(0xfffe); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); - else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) + else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(0xffff); @@ -1462,7 +1570,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) max_sectors -= rdev->data_offset; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; - } else if (rdev->mddev->bitmap_offset) { + } else if (rdev->mddev->bitmap_info.offset) { /* minor version 0 with bitmap we can't move */ return 0; } else { @@ -1830,15 +1938,11 @@ static void print_sb_1(struct mdp_superblock_1 *sb) uuid = sb->set_uuid; printk(KERN_INFO - "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" - ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" + "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" "md: Name: \"%s\" CT:%llu\n", le32_to_cpu(sb->major_version), le32_to_cpu(sb->feature_map), - uuid[0], uuid[1], uuid[2], uuid[3], - uuid[4], uuid[5], uuid[6], uuid[7], - uuid[8], uuid[9], uuid[10], uuid[11], - uuid[12], uuid[13], uuid[14], uuid[15], + uuid, sb->set_name, (unsigned long long)le64_to_cpu(sb->ctime) & MD_SUPERBLOCK_1_TIME_SEC_MASK); @@ -1847,8 +1951,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb) printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" " RO:%llu\n" - "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" - ":%02x%02x%02x%02x%02x%02x\n" + "md: Dev:%08x UUID: %pU\n" "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" "md: (MaxDev:%u) \n", le32_to_cpu(sb->level), @@ -1861,10 +1964,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb) (unsigned long long)le64_to_cpu(sb->super_offset), (unsigned long long)le64_to_cpu(sb->recovery_offset), le32_to_cpu(sb->dev_number), - uuid[0], uuid[1], uuid[2], uuid[3], - uuid[4], uuid[5], uuid[6], uuid[7], - uuid[8], uuid[9], uuid[10], uuid[11], - uuid[12], uuid[13], uuid[14], uuid[15], + uuid, sb->devflags, (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, (unsigned long long)le64_to_cpu(sb->events), @@ -2446,12 +2546,49 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) static struct rdev_sysfs_entry rdev_size = __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); + +static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page) +{ + unsigned long long recovery_start = rdev->recovery_offset; + + if (test_bit(In_sync, &rdev->flags) || + recovery_start == MaxSector) + return sprintf(page, "none\n"); + + return sprintf(page, "%llu\n", recovery_start); +} + +static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len) +{ + unsigned long long recovery_start; + + if (cmd_match(buf, "none")) + recovery_start = MaxSector; + else if (strict_strtoull(buf, 10, &recovery_start)) + return -EINVAL; + + if (rdev->mddev->pers && + rdev->raid_disk >= 0) + return -EBUSY; + + rdev->recovery_offset = recovery_start; + if (recovery_start == MaxSector) + set_bit(In_sync, &rdev->flags); + else + clear_bit(In_sync, &rdev->flags); + return len; +} + +static struct rdev_sysfs_entry rdev_recovery_start = +__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, &rdev_size.attr, + &rdev_recovery_start.attr, NULL, }; static ssize_t @@ -2553,6 +2690,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi rdev->flags = 0; rdev->data_offset = 0; rdev->sb_events = 0; + rdev->last_read_error.tv_sec = 0; + rdev->last_read_error.tv_nsec = 0; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); @@ -2663,6 +2802,47 @@ static void analyze_sbs(mddev_t * mddev) } } +/* Read a fixed-point number. + * Numbers in sysfs attributes should be in "standard" units where + * possible, so time should be in seconds. + * However we internally use a a much smaller unit such as + * milliseconds or jiffies. + * This function takes a decimal number with a possible fractional + * component, and produces an integer which is the result of + * multiplying that number by 10^'scale'. + * all without any floating-point arithmetic. + */ +int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) +{ + unsigned long result = 0; + long decimals = -1; + while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { + if (*cp == '.') + decimals = 0; + else if (decimals < scale) { + unsigned int value; + value = *cp - '0'; + result = result * 10 + value; + if (decimals >= 0) + decimals++; + } + cp++; + } + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + if (decimals < 0) + decimals = 0; + while (decimals < scale) { + result *= 10; + decimals ++; + } + *res = result; + return 0; +} + + static void md_safemode_timeout(unsigned long data); static ssize_t @@ -2674,31 +2854,10 @@ safe_delay_show(mddev_t *mddev, char *page) static ssize_t safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) { - int scale=1; - int dot=0; - int i; unsigned long msec; - char buf[30]; - /* remove a period, and count digits after it */ - if (len >= sizeof(buf)) + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) return -EINVAL; - strlcpy(buf, cbuf, sizeof(buf)); - for (i=0; i<len; i++) { - if (dot) { - if (isdigit(buf[i])) { - buf[i-1] = buf[i]; - scale *= 10; - } - buf[i] = 0; - } else if (buf[i] == '.') { - dot=1; - buf[i] = 0; - } - } - if (strict_strtoul(buf, 10, &msec) < 0) - return -EINVAL; - msec = (msec * 1000) / scale; if (msec == 0) mddev->safemode_delay = 0; else { @@ -2974,7 +3133,9 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) return -EBUSY; - if (!*buf || (*e && *e != '\n')) + if (cmd_match(buf, "none")) + n = MaxSector; + else if (!*buf || (*e && *e != '\n')) return -EINVAL; mddev->recovery_cp = n; @@ -3170,6 +3331,29 @@ static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t +max_corrected_read_errors_show(mddev_t *mddev, char *page) { + return sprintf(page, "%d\n", + atomic_read(&mddev->max_corr_read_errors)); +} + +static ssize_t +max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + + if (*buf && (*e == 0 || *e == '\n')) { + atomic_set(&mddev->max_corr_read_errors, n); + return len; + } + return -EINVAL; +} + +static struct md_sysfs_entry max_corr_read_errors = +__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, + max_corrected_read_errors_store); + +static ssize_t null_show(mddev_t *mddev, char *page) { return -EINVAL; @@ -3250,8 +3434,7 @@ bitmap_store(mddev_t *mddev, const char *buf, size_t len) } if (*end && !isspace(*end)) break; bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); - buf = end; - while (isspace(*buf)) buf++; + buf = skip_spaces(end); } bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ out: @@ -3794,6 +3977,7 @@ static struct attribute *md_default_attrs[] = { &md_array_state.attr, &md_reshape_position.attr, &md_array_size.attr, + &max_corr_read_errors.attr, NULL, }; @@ -3898,6 +4082,7 @@ static void mddev_delayed_delete(struct work_struct *ws) mddev->sysfs_action = NULL; mddev->private = NULL; } + sysfs_remove_group(&mddev->kobj, &md_bitmap_group); kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); } @@ -3989,6 +4174,8 @@ static int md_alloc(dev_t dev, char *name) disk->disk_name); error = 0; } + if (sysfs_create_group(&mddev->kobj, &md_bitmap_group)) + printk(KERN_DEBUG "pointless warning\n"); abort: mutex_unlock(&disks_mutex); if (!error) { @@ -4170,7 +4357,7 @@ static int do_md_run(mddev_t * mddev) mddev->barriers_work = 1; mddev->ok_start_degraded = start_dirty_degraded; - if (start_readonly) + if (start_readonly && mddev->ro == 0) mddev->ro = 2; /* read-only, but switch on first write */ err = mddev->pers->run(mddev); @@ -4210,6 +4397,8 @@ static int do_md_run(mddev_t * mddev) mddev->ro = 0; atomic_set(&mddev->writes_pending,0); + atomic_set(&mddev->max_corr_read_errors, + MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; mddev->safemode_timer.function = md_safemode_timeout; mddev->safemode_timer.data = (unsigned long) mddev; @@ -4232,33 +4421,6 @@ static int do_md_run(mddev_t * mddev) set_capacity(disk, mddev->array_sectors); - /* If there is a partially-recovered drive we need to - * start recovery here. If we leave it to md_check_recovery, - * it will remove the drives and not do the right thing - */ - if (mddev->degraded && !mddev->sync_thread) { - int spares = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) - /* complete an interrupted recovery */ - spares++; - if (spares && mddev->pers->sync_request) { - mddev->recovery = 0; - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "resync"); - if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); - /* leave the spares where they are, it shouldn't hurt */ - mddev->recovery = 0; - } - } - } md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ @@ -4314,7 +4476,7 @@ static int deny_bitmap_write_access(struct file * file) return 0; } -static void restore_bitmap_write_access(struct file *file) +void restore_bitmap_write_access(struct file *file) { struct inode *inode = file->f_mapping->host; @@ -4409,12 +4571,12 @@ out: printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); - if (mddev->bitmap_file) { - restore_bitmap_write_access(mddev->bitmap_file); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; + if (mddev->bitmap_info.file) { + restore_bitmap_write_access(mddev->bitmap_info.file); + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; } - mddev->bitmap_offset = 0; + mddev->bitmap_info.offset = 0; /* make sure all md_delayed_delete calls have finished */ flush_scheduled_work(); @@ -4455,6 +4617,11 @@ out: mddev->degraded = 0; mddev->barriers_work = 0; mddev->safemode = 0; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.chunksize = 0; + mddev->bitmap_info.daemon_sleep = 0; + mddev->bitmap_info.max_write_behind = 0; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; @@ -4640,7 +4807,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.state = 0; if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); - if (mddev->bitmap && mddev->bitmap_offset) + if (mddev->bitmap && mddev->bitmap_info.offset) info.state = (1<<MD_SB_BITMAP_PRESENT); info.active_disks = insync; info.working_disks = working; @@ -4998,23 +5165,23 @@ static int set_bitmap_file(mddev_t *mddev, int fd) if (fd >= 0) { if (mddev->bitmap) return -EEXIST; /* cannot add when bitmap is present */ - mddev->bitmap_file = fget(fd); + mddev->bitmap_info.file = fget(fd); - if (mddev->bitmap_file == NULL) { + if (mddev->bitmap_info.file == NULL) { printk(KERN_ERR "%s: error: failed to get bitmap file\n", mdname(mddev)); return -EBADF; } - err = deny_bitmap_write_access(mddev->bitmap_file); + err = deny_bitmap_write_access(mddev->bitmap_info.file); if (err) { printk(KERN_ERR "%s: error: bitmap file is already in use\n", mdname(mddev)); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; return err; } - mddev->bitmap_offset = 0; /* file overrides offset */ + mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ err = 0; @@ -5029,11 +5196,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_file) { - restore_bitmap_write_access(mddev->bitmap_file); - fput(mddev->bitmap_file); + if (mddev->bitmap_info.file) { + restore_bitmap_write_access(mddev->bitmap_info.file); + fput(mddev->bitmap_info.file); } - mddev->bitmap_file = NULL; + mddev->bitmap_info.file = NULL; } return err; @@ -5070,6 +5237,10 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->minor_version = info->minor_version; mddev->patch_version = info->patch_version; mddev->persistent = !info->not_persistent; + /* ensure mddev_put doesn't delete this now that there + * is some minimal configuration. + */ + mddev->ctime = get_seconds(); return 0; } mddev->major_version = MD_MAJOR_VERSION; @@ -5100,8 +5271,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->flags = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); - mddev->default_bitmap_offset = MD_SB_BYTES >> 9; - mddev->bitmap_offset = 0; + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.offset = 0; mddev->reshape_position = MaxSector; @@ -5201,7 +5372,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) int state = 0; /* calculate expected state,ignoring low bits */ - if (mddev->bitmap && mddev->bitmap_offset) + if (mddev->bitmap && mddev->bitmap_info.offset) state |= (1 << MD_SB_BITMAP_PRESENT); if (mddev->major_version != info->major_version || @@ -5260,9 +5431,10 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) /* add the bitmap */ if (mddev->bitmap) return -EEXIST; - if (mddev->default_bitmap_offset == 0) + if (mddev->bitmap_info.default_offset == 0) return -EINVAL; - mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); if (rv) @@ -5277,7 +5449,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); - mddev->bitmap_offset = 0; + mddev->bitmap_info.offset = 0; } } md_update_sb(mddev, 1); @@ -5528,6 +5700,25 @@ done: abort: return err; } +#ifdef CONFIG_COMPAT +static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* These take in integer arg, do not convert */ + break; + default: + arg = (unsigned long)compat_ptr(arg); + break; + } + + return md_ioctl(bdev, mode, cmd, arg); +} +#endif /* CONFIG_COMPAT */ static int md_open(struct block_device *bdev, fmode_t mode) { @@ -5593,6 +5784,9 @@ static const struct block_device_operations md_fops = .open = md_open, .release = md_release, .ioctl = md_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = md_compat_ioctl, +#endif .getgeo = md_getgeo, .media_changed = md_media_changed, .revalidate_disk= md_revalidate, @@ -5986,14 +6180,14 @@ static int md_seq_show(struct seq_file *seq, void *v) unsigned long chunk_kb; unsigned long flags; spin_lock_irqsave(&bitmap->lock, flags); - chunk_kb = bitmap->chunksize >> 10; + chunk_kb = mddev->bitmap_info.chunksize >> 10; seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " "%lu%s chunk", bitmap->pages - bitmap->missing_pages, bitmap->pages, (bitmap->pages - bitmap->missing_pages) << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : bitmap->chunksize, + chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, chunk_kb ? "KB" : "B"); if (bitmap->file) { seq_printf(seq, ", file: "); @@ -6279,10 +6473,11 @@ void md_do_sync(mddev_t *mddev) mddev->curr_resync = 2; try_again: - if (kthread_should_stop()) { + if (kthread_should_stop()) set_bit(MD_RECOVERY_INTR, &mddev->recovery); + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; - } for_each_mddev(mddev2, tmp) { if (mddev2 == mddev) continue; @@ -6342,12 +6537,14 @@ void md_do_sync(mddev_t *mddev) /* recovery follows the physical size of devices */ max_sectors = mddev->dev_sectors; j = MaxSector; - list_for_each_entry(rdev, &mddev->disks, same_set) + rcu_read_lock(); + list_for_each_entry_rcu(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) j = rdev->recovery_offset; + rcu_read_unlock(); } printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); @@ -6384,6 +6581,7 @@ void md_do_sync(mddev_t *mddev) desc, mdname(mddev)); mddev->curr_resync = j; } + mddev->curr_resync_completed = mddev->curr_resync; while (j < max_sectors) { sector_t sectors; @@ -6516,22 +6714,29 @@ void md_do_sync(mddev_t *mddev) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - list_for_each_entry(rdev, &mddev->disks, same_set) + rcu_read_lock(); + list_for_each_entry_rcu(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); } } set_bit(MD_CHANGE_DEVS, &mddev->flags); skip: + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + /* We completed so min/max setting can be forgotten if used. */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = mddev->curr_resync_completed; mddev->curr_resync = 0; - mddev->curr_resync_completed = 0; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - /* We completed so max setting can be forgotten. */ - mddev->resync_max = MaxSector; + mddev->curr_resync_completed = 0; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); @@ -6594,6 +6799,7 @@ static int remove_and_add_spares(mddev_t *mddev) nm, mdname(mddev)); spares++; md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); } else break; } @@ -6629,7 +6835,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->bitmap) - bitmap_daemon_work(mddev->bitmap); + bitmap_daemon_work(mddev); if (mddev->ro) return; @@ -6999,5 +7205,6 @@ EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_check_recovery); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); |