diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bitmap.c | 10 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-region-hash.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 114 | ||||
-rw-r--r-- | drivers/md/faulty.c | 2 | ||||
-rw-r--r-- | drivers/md/md.c | 113 | ||||
-rw-r--r-- | drivers/md/md.h | 28 | ||||
-rw-r--r-- | drivers/md/multipath.c | 60 | ||||
-rw-r--r-- | drivers/md/multipath.h | 1 | ||||
-rw-r--r-- | drivers/md/raid1.c | 535 | ||||
-rw-r--r-- | drivers/md/raid1.h | 4 | ||||
-rw-r--r-- | drivers/md/raid10.c | 457 | ||||
-rw-r--r-- | drivers/md/raid10.h | 4 | ||||
-rw-r--r-- | drivers/md/raid5.c | 109 | ||||
-rw-r--r-- | drivers/md/raid5.h | 2 |
16 files changed, 734 insertions, 717 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5c9362792f1..70bd738b8b9 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -493,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap) spin_unlock_irqrestore(&bitmap->lock, flags); sb = kmap_atomic(bitmap->sb_page, KM_USER0); sb->events = cpu_to_le64(bitmap->mddev->events); - if (bitmap->mddev->events < bitmap->events_cleared) { + if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */ bitmap->events_cleared = bitmap->mddev->events; - sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - } + sb->events_cleared = cpu_to_le64(bitmap->events_cleared); + sb->state = cpu_to_le32(bitmap->flags); /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); @@ -618,7 +618,7 @@ success: if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) bitmap->flags |= BITMAP_HOSTENDIAN; bitmap->events_cleared = le64_to_cpu(sb->events_cleared); - if (sb->state & cpu_to_le32(BITMAP_STALE)) + if (bitmap->flags & BITMAP_STALE) bitmap->events_cleared = bitmap->mddev->events; err = 0; out: @@ -652,9 +652,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, switch (op) { case MASK_SET: sb->state |= cpu_to_le32(bits); + bitmap->flags |= bits; break; case MASK_UNSET: sb->state &= cpu_to_le32(~bits); + bitmap->flags &= ~bits; break; default: BUG(); diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 931a7a7c379..d0aeaf46d93 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -45,7 +45,7 @@ * * The counter counts pending write requests, plus the on-disk bit. * When the counter is '1' and the resync bits are clear, the on-disk - * bit can be cleared aswell, thus setting the counter to 0. + * bit can be cleared as well, thus setting the counter to 0. * When we set a bit, or in the counter (to start a write), if the fields is * 0, we first set the disk bit and set the counter to 1. * diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5ef136cdba9..e5d8904fc8f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -390,13 +390,6 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) return md_raid5_congested(&rs->md, bits); } -static void raid_unplug(struct dm_target_callbacks *cb) -{ - struct raid_set *rs = container_of(cb, struct raid_set, callbacks); - - md_raid5_kick_device(rs->md.private); -} - /* * Construct a RAID4/5/6 mapping: * Args: @@ -487,7 +480,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) } rs->callbacks.congested_fn = raid_is_congested; - rs->callbacks.unplug_fn = raid_unplug; dm_table_add_target_callbacks(ti->table, &rs->callbacks); return 0; diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index dad011aed0c..7771ed21218 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -419,7 +419,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) /* * Possible cases: * 1) DM_RH_DIRTY - * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed + * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed * 3) DM_RH_RECOVERING: flushing pending writes * Either case, the region should have not been connected to list. */ diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 416d4e258df..cb8380c9767 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -927,20 +927,80 @@ static int dm_table_build_index(struct dm_table *t) } /* + * Get a disk whose integrity profile reflects the table's profile. + * If %match_all is true, all devices' profiles must match. + * If %match_all is false, all devices must at least have an + * allocated integrity profile; but uninitialized is ok. + * Returns NULL if integrity support was inconsistent or unavailable. + */ +static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, + bool match_all) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *dd = NULL; + struct gendisk *prev_disk = NULL, *template_disk = NULL; + + list_for_each_entry(dd, devices, list) { + template_disk = dd->dm_dev.bdev->bd_disk; + if (!blk_get_integrity(template_disk)) + goto no_integrity; + if (!match_all && !blk_integrity_is_initialized(template_disk)) + continue; /* skip uninitialized profiles */ + else if (prev_disk && + blk_integrity_compare(prev_disk, template_disk) < 0) + goto no_integrity; + prev_disk = template_disk; + } + + return template_disk; + +no_integrity: + if (prev_disk) + DMWARN("%s: integrity not set: %s and %s profile mismatch", + dm_device_name(t->md), + prev_disk->disk_name, + template_disk->disk_name); + return NULL; +} + +/* * Register the mapped device for blk_integrity support if - * the underlying devices support it. + * the underlying devices have an integrity profile. But all devices + * may not have matching profiles (checking all devices isn't reliable + * during table load because this table may use other DM device(s) which + * must be resumed before they will have an initialized integity profile). + * Stacked DM devices force a 2 stage integrity profile validation: + * 1 - during load, validate all initialized integrity profiles match + * 2 - during resume, validate all integrity profiles match */ static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) { - struct list_head *devices = dm_table_get_devices(t); - struct dm_dev_internal *dd; + struct gendisk *template_disk = NULL; - list_for_each_entry(dd, devices, list) - if (bdev_get_integrity(dd->dm_dev.bdev)) { - t->integrity_supported = 1; - return blk_integrity_register(dm_disk(md), NULL); - } + template_disk = dm_table_get_integrity_disk(t, false); + if (!template_disk) + return 0; + if (!blk_integrity_is_initialized(dm_disk(md))) { + t->integrity_supported = 1; + return blk_integrity_register(dm_disk(md), NULL); + } + + /* + * If DM device already has an initalized integrity + * profile the new profile should not conflict. + */ + if (blk_integrity_is_initialized(template_disk) && + blk_integrity_compare(dm_disk(md), template_disk) < 0) { + DMWARN("%s: conflict with existing integrity profile: " + "%s profile mismatch", + dm_device_name(t->md), + template_disk->disk_name); + return 1; + } + + /* Preserve existing initialized integrity profile */ + t->integrity_supported = 1; return 0; } @@ -1094,41 +1154,27 @@ combine_limits: /* * Set the integrity profile for this device if all devices used have - * matching profiles. + * matching profiles. We're quite deep in the resume path but still + * don't know if all devices (particularly DM devices this device + * may be stacked on) have matching profiles. Even if the profiles + * don't match we have no way to fail (to resume) at this point. */ static void dm_table_set_integrity(struct dm_table *t) { - struct list_head *devices = dm_table_get_devices(t); - struct dm_dev_internal *prev = NULL, *dd = NULL; + struct gendisk *template_disk = NULL; if (!blk_get_integrity(dm_disk(t->md))) return; - list_for_each_entry(dd, devices, list) { - if (prev && - blk_integrity_compare(prev->dm_dev.bdev->bd_disk, - dd->dm_dev.bdev->bd_disk) < 0) { - DMWARN("%s: integrity not set: %s and %s mismatch", - dm_device_name(t->md), - prev->dm_dev.bdev->bd_disk->disk_name, - dd->dm_dev.bdev->bd_disk->disk_name); - goto no_integrity; - } - prev = dd; + template_disk = dm_table_get_integrity_disk(t, true); + if (!template_disk && + blk_integrity_is_initialized(dm_disk(t->md))) { + DMWARN("%s: device no longer has a valid integrity profile", + dm_device_name(t->md)); + return; } - - if (!prev || !bdev_get_integrity(prev->dm_dev.bdev)) - goto no_integrity; - blk_integrity_register(dm_disk(t->md), - bdev_get_integrity(prev->dm_dev.bdev)); - - return; - -no_integrity: - blk_integrity_register(dm_disk(t->md), NULL); - - return; + blk_get_integrity(template_disk)); } void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 339fdc67075..23078dabb6d 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -30,7 +30,7 @@ * * Different modes can be active at a time, but only * one can be set at array creation. Others can be added later. - * A mode can be one-shot or recurrent with the recurrance being + * A mode can be one-shot or recurrent with the recurrence being * once in every N requests. * The bottom 5 bits of the "layout" indicate the mode. The * remainder indicate a period, or 0 for one-shot. diff --git a/drivers/md/md.c b/drivers/md/md.c index 8b66e04c2ea..aa640a85bb2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -447,48 +447,59 @@ EXPORT_SYMBOL(md_flush_request); /* Support for plugging. * This mirrors the plugging support in request_queue, but does not - * require having a whole queue + * require having a whole queue or request structures. + * We allocate an md_plug_cb for each md device and each thread it gets + * plugged on. This links tot the private plug_handle structure in the + * personality data where we keep a count of the number of outstanding + * plugs so other code can see if a plug is active. */ -static void plugger_work(struct work_struct *work) -{ - struct plug_handle *plug = - container_of(work, struct plug_handle, unplug_work); - plug->unplug_fn(plug); -} -static void plugger_timeout(unsigned long data) -{ - struct plug_handle *plug = (void *)data; - kblockd_schedule_work(NULL, &plug->unplug_work); -} -void plugger_init(struct plug_handle *plug, - void (*unplug_fn)(struct plug_handle *)) -{ - plug->unplug_flag = 0; - plug->unplug_fn = unplug_fn; - init_timer(&plug->unplug_timer); - plug->unplug_timer.function = plugger_timeout; - plug->unplug_timer.data = (unsigned long)plug; - INIT_WORK(&plug->unplug_work, plugger_work); -} -EXPORT_SYMBOL_GPL(plugger_init); +struct md_plug_cb { + struct blk_plug_cb cb; + mddev_t *mddev; +}; -void plugger_set_plug(struct plug_handle *plug) +static void plugger_unplug(struct blk_plug_cb *cb) { - if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag)) - mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1); + struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); + if (atomic_dec_and_test(&mdcb->mddev->plug_cnt)) + md_wakeup_thread(mdcb->mddev->thread); + kfree(mdcb); } -EXPORT_SYMBOL_GPL(plugger_set_plug); -int plugger_remove_plug(struct plug_handle *plug) +/* Check that an unplug wakeup will come shortly. + * If not, wakeup the md thread immediately + */ +int mddev_check_plugged(mddev_t *mddev) { - if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) { - del_timer(&plug->unplug_timer); - return 1; - } else + struct blk_plug *plug = current->plug; + struct md_plug_cb *mdcb; + + if (!plug) return 0; -} -EXPORT_SYMBOL_GPL(plugger_remove_plug); + list_for_each_entry(mdcb, &plug->cb_list, cb.list) { + if (mdcb->cb.callback == plugger_unplug && + mdcb->mddev == mddev) { + /* Already on the list, move to top */ + if (mdcb != list_first_entry(&plug->cb_list, + struct md_plug_cb, + cb.list)) + list_move(&mdcb->cb.list, &plug->cb_list); + return 1; + } + } + /* Not currently on the callback list */ + mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); + if (!mdcb) + return 0; + + mdcb->mddev = mddev; + mdcb->cb.callback = plugger_unplug; + atomic_inc(&mddev->plug_cnt); + list_add(&mdcb->cb.list, &plug->cb_list); + return 1; +} +EXPORT_SYMBOL_GPL(mddev_check_plugged); static inline mddev_t *mddev_get(mddev_t *mddev) { @@ -538,6 +549,7 @@ void mddev_init(mddev_t *mddev) atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); + atomic_set(&mddev->plug_cnt, 0); spin_lock_init(&mddev->write_lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); @@ -3158,6 +3170,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; + mddev->degraded = 0; if (mddev->pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync @@ -3311,7 +3324,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len) char *e; unsigned long long n = simple_strtoull(buf, &e, 10); - if (mddev->pers) + if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) return -EBUSY; if (cmd_match(buf, "none")) n = MaxSector; @@ -4334,13 +4347,19 @@ static int md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; + blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); /* Allow extended partitions. This makes the * 'mdp' device redundant, but we can't really * remove it now. */ disk->flags |= GENHD_FL_EXT_DEVT; - add_disk(disk); mddev->gendisk = disk; + /* As soon as we call add_disk(), another thread could get + * through to md_open, so make sure it doesn't get too far + */ + mutex_lock(&mddev->open_mutex); + add_disk(disk); + error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk_to_dev(disk)->kobj, "%s", "md"); if (error) { @@ -4354,8 +4373,7 @@ static int md_alloc(dev_t dev, char *name) if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_bitmap_group)) printk(KERN_DEBUG "pointless warning\n"); - - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); + mutex_unlock(&mddev->open_mutex); abort: mutex_unlock(&disks_mutex); if (!error && mddev->kobj.sd) { @@ -4723,7 +4741,6 @@ static void md_clean(mddev_t *mddev) mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.max_write_behind = 0; - mddev->plug = NULL; } static void __md_stop_writes(mddev_t *mddev) @@ -5199,6 +5216,16 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) } else super_types[mddev->major_version]. validate_super(mddev, rdev); + if ((info->state & (1<<MD_DISK_SYNC)) && + (!test_bit(In_sync, &rdev->flags) || + rdev->raid_disk != info->raid_disk)) { + /* This was a hot-add request, but events doesn't + * match, so reject it. + */ + export_rdev(rdev); + return -EINVAL; + } + if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = rdev->raid_disk; else @@ -6266,7 +6293,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) * rt is a sector_t, so could be 32bit or 64bit. * So we divide before multiply in case it is 32bit and close * to the limit. - * We scale the divisor (db) by 32 to avoid loosing precision + * We scale the divisor (db) by 32 to avoid losing precision * near the end of resync when the number of remaining sectors * is close to 'db'. * We then divide rt by 32 after multiplying by db to compensate. @@ -6688,12 +6715,6 @@ int md_allow_write(mddev_t *mddev) } EXPORT_SYMBOL_GPL(md_allow_write); -void md_unplug(mddev_t *mddev) -{ - if (mddev->plug) - mddev->plug->unplug_fn(mddev->plug); -} - #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) void md_do_sync(mddev_t *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index 12215d437fc..0b1fd3f1d85 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -29,26 +29,6 @@ typedef struct mddev_s mddev_t; typedef struct mdk_rdev_s mdk_rdev_t; -/* generic plugging support - like that provided with request_queue, - * but does not require a request_queue - */ -struct plug_handle { - void (*unplug_fn)(struct plug_handle *); - struct timer_list unplug_timer; - struct work_struct unplug_work; - unsigned long unplug_flag; -}; -#define PLUGGED_FLAG 1 -void plugger_init(struct plug_handle *plug, - void (*unplug_fn)(struct plug_handle *)); -void plugger_set_plug(struct plug_handle *plug); -int plugger_remove_plug(struct plug_handle *plug); -static inline void plugger_flush(struct plug_handle *plug) -{ - del_timer_sync(&plug->unplug_timer); - cancel_work_sync(&plug->unplug_work); -} - /* * MD's 'extended' device */ @@ -94,7 +74,7 @@ struct mdk_rdev_s #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ #define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occured on an externally +#define Blocked 8 /* An error occurred on an externally * managed array, don't allow writes * until it is cleared */ wait_queue_head_t blocked_wait; @@ -199,6 +179,9 @@ struct mddev_s int delta_disks, new_level, new_layout; int new_chunk_sectors; + atomic_t plug_cnt; /* If device is expecting + * more bios soon. + */ struct mdk_thread_s *thread; /* management thread */ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ sector_t curr_resync; /* last block scheduled */ @@ -336,7 +319,6 @@ struct mddev_s struct list_head all_mddevs; struct attribute_group *to_remove; - struct plug_handle *plug; /* if used by personality */ struct bio_set *bio_set; @@ -516,7 +498,6 @@ extern int md_integrity_register(mddev_t *mddev); extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void restore_bitmap_write_access(struct file *file); -extern void md_unplug(mddev_t *mddev); extern void mddev_init(mddev_t *mddev); extern int md_run(mddev_t *mddev); @@ -530,4 +511,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, mddev_t *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, mddev_t *mddev); +extern int mddev_check_plugged(mddev_t *mddev); #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index c3589099098..3535c23af28 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -146,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) int i; seq_printf (seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", conf->multipaths[i].rdev && @@ -186,35 +186,36 @@ static int multipath_congested(void *data, int bits) static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) { multipath_conf_t *conf = mddev->private; + char b[BDEVNAME_SIZE]; - if (conf->working_disks <= 1) { + if (conf->raid_disks - mddev->degraded <= 1) { /* * Uh oh, we can do nothing if this is our last path, but * first check if this is a queued request for a device * which has just failed. */ printk(KERN_ALERT - "multipath: only one IO path left and IO error.\n"); + "multipath: only one IO path left and IO error.\n"); /* leave it active... it's all we have */ - } else { - /* - * Mark disk as unusable - */ - if (!test_bit(Faulty, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - clear_bit(In_sync, &rdev->flags); - set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - conf->working_disks--; - mddev->degraded++; - printk(KERN_ALERT "multipath: IO failure on %s," - " disabling IO path.\n" - "multipath: Operation continuing" - " on %d IO paths.\n", - bdevname (rdev->bdev,b), - conf->working_disks); - } + return; + } + /* + * Mark disk as unusable + */ + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded++; + spin_unlock_irqrestore(&conf->device_lock, flags); } + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT "multipath: IO failure on %s," + " disabling IO path.\n" + "multipath: Operation continuing" + " on %d IO paths.\n", + bdevname(rdev->bdev, b), + conf->raid_disks - mddev->degraded); } static void print_multipath_conf (multipath_conf_t *conf) @@ -227,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf) printk("(conf==NULL)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { @@ -274,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) PAGE_CACHE_SIZE - 1); } - conf->working_disks++; + spin_lock_irq(&conf->device_lock); mddev->degraded--; rdev->raid_disk = path; set_bit(In_sync, &rdev->flags); + spin_unlock_irq(&conf->device_lock); rcu_assign_pointer(p->rdev, rdev); err = 0; md_integrity_add_rdev(rdev, mddev); @@ -391,6 +393,7 @@ static int multipath_run (mddev_t *mddev) int disk_idx; struct multipath_info *disk; mdk_rdev_t *rdev; + int working_disks; if (md_check_no_bitmap(mddev)) return -EINVAL; @@ -424,7 +427,7 @@ static int multipath_run (mddev_t *mddev) goto out_free_conf; } - conf->working_disks = 0; + working_disks = 0; list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx < 0 || @@ -446,7 +449,7 @@ static int multipath_run (mddev_t *mddev) } if (!test_bit(Faulty, &rdev->flags)) - conf->working_disks++; + working_disks++; } conf->raid_disks = mddev->raid_disks; @@ -454,12 +457,12 @@ static int multipath_run (mddev_t *mddev) spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); - if (!conf->working_disks) { + if (!working_disks) { printk(KERN_ERR "multipath: no operational IO paths for %s\n", mdname(mddev)); goto out_free_conf; } - mddev->degraded = conf->raid_disks - conf->working_disks; + mddev->degraded = conf->raid_disks - working_disks; conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, sizeof(struct multipath_bh)); @@ -481,7 +484,8 @@ static int multipath_run (mddev_t *mddev) printk(KERN_INFO "multipath: array %s active with %d out of %d IO paths\n", - mdname(mddev), conf->working_disks, mddev->raid_disks); + mdname(mddev), conf->raid_disks - mddev->degraded, + mddev->raid_disks); /* * Ok, everything is just fine now */ diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index d1c2a8d7839..3c5a45eb5f8 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h @@ -9,7 +9,6 @@ struct multipath_private_data { mddev_t *mddev; struct multipath_info *multipaths; int raid_disks; - int working_disks; spinlock_t device_lock; struct list_head retry_list; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c2a21ae56d9..5d096096f95 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -297,23 +297,24 @@ static void raid1_end_read_request(struct bio *bio, int error) rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } -static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv, - int behind) +static void r1_bio_write_done(r1bio_t *r1_bio) { if (atomic_dec_and_test(&r1_bio->remaining)) { /* it really is the end of this request */ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { /* free extra copy of the data pages */ - int i = vcnt; + int i = r1_bio->behind_page_count; while (i--) - safe_put_page(bv[i].bv_page); + safe_put_page(r1_bio->behind_pages[i]); + kfree(r1_bio->behind_pages); + r1_bio->behind_pages = NULL; } /* clear the bitmap if all writes complete successfully */ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, r1_bio->sectors, !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); + test_bit(R1BIO_BehindIO, &r1_bio->state)); md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } @@ -386,7 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error) * Let's see if all mirrored write operations have finished * already. */ - r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind); + r1_bio_write_done(r1_bio); if (to_put) bio_put(to_put); @@ -411,10 +412,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const sector_t this_sector = r1_bio->sector; const int sectors = r1_bio->sectors; - int new_disk = -1; int start_disk; + int best_disk; int i; - sector_t new_distance, current_distance; + sector_t best_dist; mdk_rdev_t *rdev; int choose_first; @@ -425,6 +426,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) * We take the first readable disk when above the resync window. */ retry: + best_disk = -1; + best_dist = MaxSector; if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { choose_first = 1; @@ -434,8 +437,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) start_disk = conf->last_used; } - /* make sure the disk is operational */ for (i = 0 ; i < conf->raid_disks ; i++) { + sector_t dist; int disk = start_disk + i; if (disk >= conf->raid_disks) disk -= conf->raid_disks; @@ -443,60 +446,43 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED || rdev == NULL - || !test_bit(In_sync, &rdev->flags)) + || test_bit(Faulty, &rdev->flags)) continue; - - new_disk = disk; - if (!test_bit(WriteMostly, &rdev->flags)) - break; - } - - if (new_disk < 0 || choose_first) - goto rb_out; - - /* - * Don't change to another disk for sequential reads: - */ - if (conf->next_seq_sect == this_sector) - goto rb_out; - if (this_sector == conf->mirrors[new_disk].head_position) - goto rb_out; - - current_distance = abs(this_sector - - conf->mirrors[new_disk].head_position); - - /* look for a better disk - i.e. head is closer */ - start_disk = new_disk; - for (i = 1; i < conf->raid_disks; i++) { - int disk = start_disk + 1; - if (disk >= conf->raid_disks) - disk -= conf->raid_disks; - - rdev = rcu_dereference(conf->mirrors[disk].rdev); - if (r1_bio->bios[disk] == IO_BLOCKED - || rdev == NULL - || !test_bit(In_sync, &rdev->flags) - || test_bit(WriteMostly, &rdev->flags)) + if (!test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < this_sector + sectors) continue; - - if (!atomic_read(&rdev->nr_pending)) { - new_disk = disk; + if (test_bit(WriteMostly, &rdev->flags)) { + /* Don't balance among write-mostly, just + * use the first as a last resort */ + if (best_disk < 0) + best_disk = disk; + continue; + } + /* This is a reasonable device to use. It might + * even be best. + */ + dist = abs(this_sector - conf->mirrors[disk].head_position); + if (choose_first + /* Don't change to another disk for sequential reads */ + || conf->next_seq_sect == this_sector + || dist == 0 + /* If device is idle, use it */ + || atomic_read(&rdev->nr_pending) == 0) { + best_disk = disk; break; } - new_distance = abs(this_sector - conf->mirrors[disk].head_position); - if (new_distance < current_distance) { - current_distance = new_distance; - new_disk = disk; + if (dist < best_dist) { + best_dist = dist; + best_disk = disk; } } - rb_out: - if (new_disk >= 0) { - rdev = rcu_dereference(conf->mirrors[new_disk].rdev); + if (best_disk >= 0) { + rdev = rcu_dereference(conf->mirrors[best_disk].rdev); if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); - if (!test_bit(In_sync, &rdev->flags)) { + if (test_bit(Faulty, &rdev->flags)) { /* cannot risk returning a device that failed * before we inc'ed nr_pending */ @@ -504,11 +490,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) goto retry; } conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; + conf->last_used = best_disk; } rcu_read_unlock(); - return new_disk; + return best_disk; } static int raid1_congested(void *data, int bits) @@ -565,12 +551,6 @@ static void flush_pending_writes(conf_t *conf) spin_unlock_irq(&conf->device_lock); } -static void md_kick_device(mddev_t *mddev) -{ - blk_flush_plug(current); - md_wakeup_thread(mddev->thread); -} - /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -600,7 +580,7 @@ static void raise_barrier(conf_t *conf) /* Wait until no block IO is waiting */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, - conf->resync_lock, md_kick_device(conf->mddev)); + conf->resync_lock, ); /* block any new IO from starting */ conf->barrier++; @@ -608,7 +588,7 @@ static void raise_barrier(conf_t *conf) /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock, md_kick_device(conf->mddev)); + conf->resync_lock, ); spin_unlock_irq(&conf->resync_lock); } @@ -630,7 +610,7 @@ static void wait_barrier(conf_t *conf) conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, !conf->barrier, conf->resync_lock, - md_kick_device(conf->mddev)); + ); conf->nr_waiting--; } conf->nr_pending++; @@ -666,8 +646,7 @@ static void freeze_array(conf_t *conf) wait_event_lock_irq(conf->wait_barrier, conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - ({ flush_pending_writes(conf); - md_kick_device(conf->mddev); })); + flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(conf_t *conf) @@ -682,37 +661,36 @@ static void unfreeze_array(conf_t *conf) /* duplicate the data pages for behind I/O - * We return a list of bio_vec rather than just page pointers - * as it makes freeing easier */ -static struct bio_vec *alloc_behind_pages(struct bio *bio) +static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) { int i; struct bio_vec *bvec; - struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), + struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), GFP_NOIO); if (unlikely(!pages)) - goto do_sync_io; + return; bio_for_each_segment(bvec, bio, i) { - pages[i].bv_page = alloc_page(GFP_NOIO); - if (unlikely(!pages[i].bv_page)) + pages[i] = alloc_page(GFP_NOIO); + if (unlikely(!pages[i])) goto do_sync_io; - memcpy(kmap(pages[i].bv_page) + bvec->bv_offset, + memcpy(kmap(pages[i]) + bvec->bv_offset, kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); - kunmap(pages[i].bv_page); + kunmap(pages[i]); kunmap(bvec->bv_page); } - - return pages; + r1_bio->behind_pages = pages; + r1_bio->behind_page_count = bio->bi_vcnt; + set_bit(R1BIO_BehindIO, &r1_bio->state); + return; do_sync_io: - if (pages) - for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++) - put_page(pages[i].bv_page); + for (i = 0; i < bio->bi_vcnt; i++) + if (pages[i]) + put_page(pages[i]); kfree(pages); PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); - return NULL; } static int make_request(mddev_t *mddev, struct bio * bio) @@ -724,11 +702,11 @@ static int make_request(mddev_t *mddev, struct bio * bio) int i, targets = 0, disks; struct bitmap *bitmap; unsigned long flags; - struct bio_vec *behind_pages = NULL; const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); mdk_rdev_t *blocked_rdev; + int plugged; /* * Register the new request and wait if the reconstruction @@ -820,6 +798,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) * inc refcount on their rdev. Record them by setting * bios[x] to bio */ + plugged = mddev_check_plugged(mddev); + disks = conf->raid_disks; retry_write: blocked_rdev = NULL; @@ -874,9 +854,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait) && - (behind_pages = alloc_behind_pages(bio)) != NULL) - set_bit(R1BIO_BehindIO, &r1_bio->state); + !waitqueue_active(&bitmap->behind_wait)) + alloc_behind_pages(bio, r1_bio); atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); @@ -897,7 +876,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) mbio->bi_rw = WRITE | do_flush_fua | do_sync; mbio->bi_private = r1_bio; - if (behind_pages) { + if (r1_bio->behind_pages) { struct bio_vec *bvec; int j; @@ -909,7 +888,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) * them all */ __bio_for_each_segment(bvec, mbio, j, 0) - bvec->bv_page = behind_pages[j].bv_page; + bvec->bv_page = r1_bio->behind_pages[j]; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } @@ -919,13 +898,12 @@ static int make_request(mddev_t *mddev, struct bio * bio) bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); } - r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL); - kfree(behind_pages); /* the behind pages are attached to the bios now */ + r1_bio_write_done(r1_bio); /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - if (do_sync || !bitmap) + if (do_sync || !bitmap || !plugged) md_wakeup_thread(mddev->thread); return 0; @@ -1200,194 +1178,210 @@ static void end_sync_write(struct bio *bio, int error) } } -static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) +static int fix_sync_read_error(r1bio_t *r1_bio) { + /* Try some synchronous reads of other devices to get + * good data, much like with normal read errors. Only + * read into the pages we already have so we don't + * need to re-issue the read request. + * We don't need to freeze the array, because being in an + * active sync request, there is no normal IO, and + * no overlapping syncs. + */ + mddev_t *mddev = r1_bio->mddev; conf_t *conf = mddev->private; - int i; - int disks = conf->raid_disks; - struct bio *bio, *wbio; - - bio = r1_bio->bios[r1_bio->read_disk]; + struct bio *bio = r1_bio->bios[r1_bio->read_disk]; + sector_t sect = r1_bio->sector; + int sectors = r1_bio->sectors; + int idx = 0; + while(sectors) { + int s = sectors; + int d = r1_bio->read_disk; + int success = 0; + mdk_rdev_t *rdev; + int start; - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* We have read all readable devices. If we haven't - * got the block, then there is no hope left. - * If we have, then we want to do a comparison - * and skip the write if everything is the same. - * If any blocks failed to read, then we need to - * attempt an over-write - */ - int primary; - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { - for (i=0; i<mddev->raid_disks; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_read) - md_error(mddev, conf->mirrors[i].rdev); + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + do { + if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + READ, false)) { + success = 1; + break; + } + } + d++; + if (d == conf->raid_disks) + d = 0; + } while (!success && d != r1_bio->read_disk); - md_done_sync(mddev, r1_bio->sectors, 1); + if (!success) { + char b[BDEVNAME_SIZE]; + /* Cannot read from anywhere, array is toast */ + md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" + " for block %llu\n", + mdname(mddev), + bdevname(bio->bi_bdev, b), + (unsigned long long)r1_bio->sector); + md_done_sync(mddev, r1_bio->sectors, 0); put_buf(r1_bio); - return; + return 0; } - for (primary=0; primary<mddev->raid_disks; primary++) - if (r1_bio->bios[primary]->bi_end_io == end_sync_read && - test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { - r1_bio->bios[primary]->bi_end_io = NULL; - rdev_dec_pending(conf->mirrors[primary].rdev, mddev); - break; - } - r1_bio->read_disk = primary; - for (i=0; i<mddev->raid_disks; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_read) { - int j; - int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); - struct bio *pbio = r1_bio->bios[primary]; - struct bio *sbio = r1_bio->bios[i]; - - if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { - for (j = vcnt; j-- ; ) { - struct page *p, *s; - p = pbio->bi_io_vec[j].bv_page; - s = sbio->bi_io_vec[j].bv_page; - if (memcmp(page_address(p), - page_address(s), - PAGE_SIZE)) - break; - } - } else - j = 0; - if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; - if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { - sbio->bi_end_io = NULL; - rdev_dec_pending(conf->mirrors[i].rdev, mddev); - } else { - /* fixup the bio for reuse */ - int size; - sbio->bi_vcnt = vcnt; - sbio->bi_size = r1_bio->sectors << 9; - sbio->bi_idx = 0; - sbio->bi_phys_segments = 0; - sbio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bi_next = NULL; - sbio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - sbio->bi_bdev = conf->mirrors[i].rdev->bdev; - size = sbio->bi_size; - for (j = 0; j < vcnt ; j++) { - struct bio_vec *bi; - bi = &sbio->bi_io_vec[j]; - bi->bv_offset = 0; - if (size > PAGE_SIZE) - bi->bv_len = PAGE_SIZE; - else - bi->bv_len = size; - size -= PAGE_SIZE; - memcpy(page_address(bi->bv_page), - page_address(pbio->bi_io_vec[j].bv_page), - PAGE_SIZE); - } - } - } + start = d; + /* write it back and re-read */ + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + WRITE, false) == 0) { + r1_bio->bios[d]->bi_end_io = NULL; + rdev_dec_pending(rdev, mddev); + md_error(mddev, rdev); + } else + atomic_add(s, &rdev->corrected_errors); + } + d = start; + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + READ, false) == 0) + md_error(mddev, rdev); + } + sectors -= s; + sect += s; + idx ++; } - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* ouch - failed to read all of that. - * Try some synchronous reads of other devices to get - * good data, much like with normal read errors. Only - * read into the pages we already have so we don't - * need to re-issue the read request. - * We don't need to freeze the array, because being in an - * active sync request, there is no normal IO, and - * no overlapping syncs. - */ - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - int idx = 0; - - while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - mdk_rdev_t *rdev; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - do { - if (r1_bio->bios[d]->bi_end_io == end_sync_read) { - /* No rcu protection needed here devices - * can only be removed when no resync is - * active, and resync is currently active - */ - rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, - sect, - s<<9, - bio->bi_io_vec[idx].bv_page, - READ, false)) { - success = 1; - break; - } - } - d++; - if (d == conf->raid_disks) - d = 0; - } while (!success && d != r1_bio->read_disk); - - if (success) { - int start = d; - /* write it back and re-read */ - set_bit(R1BIO_Uptodate, &r1_bio->state); - while (d != r1_bio->read_disk) { - if (d == 0) - d = conf->raid_disks; - d--; - if (r1_bio->bios[d]->bi_end_io != end_sync_read) - continue; - rdev = conf->mirrors[d].rdev; - atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev, - sect, - s<<9, - bio->bi_io_vec[idx].bv_page, - WRITE, false) == 0) - md_error(mddev, rdev); - } - d = start; - while (d != r1_bio->read_disk) { - if (d == 0) - d = conf->raid_disks; - d--; - if (r1_bio->bios[d]->bi_end_io != end_sync_read) - continue; - rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, - sect, - s<<9, - bio->bi_io_vec[idx].bv_page, - READ, false) == 0) - md_error(mddev, rdev); - } - } else { - char b[BDEVNAME_SIZE]; - /* Cannot read from anywhere, array is toast */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" - " for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev, b), - (unsigned long long)r1_bio->sector); - md_done_sync(mddev, r1_bio->sectors, 0); - put_buf(r1_bio); - return; + set_bit(R1BIO_Uptodate, &r1_bio->state); + set_bit(BIO_UPTODATE, &bio->bi_flags); + return 1; +} + +static int process_checks(r1bio_t *r1_bio) +{ + /* We have read all readable devices. If we haven't + * got the block, then there is no hope left. + * If we have, then we want to do a comparison + * and skip the write if everything is the same. + * If any blocks failed to read, then we need to + * attempt an over-write + */ + mddev_t *mddev = r1_bio->mddev; + conf_t *conf = mddev->private; + int primary; + int i; + + for (primary = 0; primary < conf->raid_disks; primary++) + if (r1_bio->bios[primary]->bi_end_io == end_sync_read && + test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { + r1_bio->bios[primary]->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[primary].rdev, mddev); + break; + } + r1_bio->read_disk = primary; + for (i = 0; i < conf->raid_disks; i++) { + int j; + int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); + struct bio *pbio = r1_bio->bios[primary]; + struct bio *sbio = r1_bio->bios[i]; + int size; + + if (r1_bio->bios[i]->bi_end_io != end_sync_read) + continue; + + if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { + for (j = vcnt; j-- ; ) { + struct page *p, *s; + p = pbio->bi_io_vec[j].bv_page; + s = sbio->bi_io_vec[j].bv_page; + if (memcmp(page_address(p), + page_address(s), + PAGE_SIZE)) + break; } - sectors -= s; - sect += s; - idx ++; + } else + j = 0; + if (j >= 0) + mddev->resync_mismatches += r1_bio->sectors; + if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) + && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { + /* No need to write to this device. */ + sbio->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[i].rdev, mddev); + continue; + } + /* fixup the bio for reuse */ + sbio->bi_vcnt = vcnt; + sbio->bi_size = r1_bio->sectors << 9; + sbio->bi_idx = 0; + sbio->bi_phys_segments = 0; + sbio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bi_next = NULL; + sbio->bi_sector = r1_bio->sector + + conf->mirrors[i].rdev->data_offset; + sbio->bi_bdev = conf->mirrors[i].rdev->bdev; + size = sbio->bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &sbio->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; + memcpy(page_address(bi->bv_page), + page_address(pbio->bi_io_vec[j].bv_page), + PAGE_SIZE); } } + return 0; +} + +static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) +{ + conf_t *conf = mddev->private; + int i; + int disks = conf->raid_disks; + struct bio *bio, *wbio; + + bio = r1_bio->bios[r1_bio->read_disk]; + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) + /* ouch - failed to read all of that. */ + if (!fix_sync_read_error(r1_bio)) + return; + + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (process_checks(r1_bio) < 0) + return; /* * schedule writes */ @@ -1516,13 +1510,16 @@ static void raid1d(mddev_t *mddev) conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; mdk_rdev_t *rdev; + struct blk_plug plug; md_check_recovery(mddev); - + + blk_start_plug(&plug); for (;;) { char b[BDEVNAME_SIZE]; - flush_pending_writes(conf); + if (atomic_read(&mddev->plug_cnt) == 0) + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { @@ -1593,6 +1590,7 @@ static void raid1d(mddev_t *mddev) } cond_resched(); } + blk_finish_plug(&plug); } @@ -2039,7 +2037,6 @@ static int stop(mddev_t *mddev) md_unregister_thread(mddev->thread); mddev->thread = NULL; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); @@ -2064,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && - mddev->recovery_cp == MaxSector) { + mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index cbfdf1a6acd..5fc4ca1af86 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -94,7 +94,9 @@ struct r1bio_s { int read_disk; struct list_head retry_list; - struct bitmap_update *bitmap_update; + /* Next two are only valid when R1BIO_BehindIO is set */ + struct page **behind_pages; + int behind_page_count; /* * if the IO is in WRITE direction, then multiple bios are used. * We choose the number when they are allocated. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f7b62370b37..6e846688962 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -5,7 +5,7 @@ * * RAID-10 support for md. * - * Base on code in raid1.c. See raid1.c for futher copyright information. + * Base on code in raid1.c. See raid1.c for further copyright information. * * * This program is free software; you can redistribute it and/or modify @@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error) */ set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); } else { /* - * oops, read error: + * oops, read error - keep the refcount on the rdev */ char b[BDEVNAME_SIZE]; if (printk_ratelimit()) @@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error) bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); reschedule_retry(r10_bio); } - - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); } static void raid10_end_write_request(struct bio *bio, int error) @@ -340,14 +339,14 @@ static void raid10_end_write_request(struct bio *bio, int error) /* * RAID10 layout manager - * Aswell as the chunksize and raid_disks count, there are two + * As well as the chunksize and raid_disks count, there are two * parameters: near_copies and far_copies. * near_copies * far_copies must be <= raid_disks. * Normally one of these will be 1. * If both are 1, we get raid0. * If near_copies == raid_disks, we get raid1. * - * Chunks are layed out in raid0 style with near_copies copies of the + * Chunks are laid out in raid0 style with near_copies copies of the * first chunk, followed by near_copies copies of the next chunk and * so on. * If far_copies > 1, then after 1/far_copies of the array has been assigned @@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q, static int read_balance(conf_t *conf, r10bio_t *r10_bio) { const sector_t this_sector = r10_bio->sector; - int disk, slot, nslot; + int disk, slot; const int sectors = r10_bio->sectors; - sector_t new_distance, current_distance; + sector_t new_distance, best_dist; mdk_rdev_t *rdev; + int do_balance; + int best_slot; raid10_find_phys(conf, r10_bio); rcu_read_lock(); +retry: + best_slot = -1; + best_dist = MaxSector; + do_balance = 1; /* * Check if we can balance. We can balance on the whole * device if no resync is going on (recovery is ok), or below @@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) * above the resync window. */ if (conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) { - /* make sure that disk is operational */ - slot = 0; - disk = r10_bio->devs[slot].devnum; + && (this_sector + sectors >= conf->next_resync)) + do_balance = 0; - while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || - r10_bio->devs[slot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) { - slot++; - if (slot == conf->copies) { - slot = 0; - disk = -1; - break; - } - disk = r10_bio->devs[slot].devnum; - } - goto rb_out; - } - - - /* make sure the disk is operational */ - slot = 0; - disk = r10_bio->devs[slot].devnum; - while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || - r10_bio->devs[slot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) { - slot ++; - if (slot == conf->copies) { - disk = -1; - goto rb_out; - } + for (slot = 0; slot < conf->copies ; slot++) { + if (r10_bio->devs[slot].bio == IO_BLOCKED) + continue; disk = r10_bio->devs[slot].devnum; - } - - - current_distance = abs(r10_bio->devs[slot].addr - - conf->mirrors[disk].head_position); - - /* Find the disk whose head is closest, - * or - for far > 1 - find the closest to partition beginning */ - - for (nslot = slot; nslot < conf->copies; nslot++) { - int ndisk = r10_bio->devs[nslot].devnum; - - - if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || - r10_bio->devs[nslot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (rdev == NULL) + continue; + if (!test_bit(In_sync, &rdev->flags)) continue; + if (!do_balance) + break; + /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later. */ - if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) { - disk = ndisk; - slot = nslot; + if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) break; - } /* for far > 1 always use the lowest address */ if (conf->far_copies > 1) - new_distance = r10_bio->devs[nslot].addr; + new_distance = r10_bio->devs[slot].addr; else - new_distance = abs(r10_bio->devs[nslot].addr - - conf->mirrors[ndisk].head_position); - if (new_distance < current_distance) { - current_distance = new_distance; - disk = ndisk; - slot = nslot; + new_distance = abs(r10_bio->devs[slot].addr - + conf->mirrors[disk].head_position); + if (new_distance < best_dist) { + best_dist = new_distance; + best_slot = slot; } } + if (slot == conf->copies) + slot = best_slot; -rb_out: - r10_bio->read_slot = slot; -/* conf->next_seq_sect = this_sector + sectors;*/ - - if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) - atomic_inc(&conf->mirrors[disk].rdev->nr_pending); - else + if (slot >= 0) { + disk = r10_bio->devs[slot].devnum; + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (!rdev) + goto retry; + atomic_inc(&rdev->nr_pending); + if (test_bit(Faulty, &rdev->flags)) { + /* Cannot risk returning a device that failed + * before we inc'ed nr_pending + */ + rdev_dec_pending(rdev, conf->mddev); + goto retry; + } + r10_bio->read_slot = slot; + } else disk = -1; rcu_read_unlock(); @@ -634,12 +611,6 @@ static void flush_pending_writes(conf_t *conf) spin_unlock_irq(&conf->device_lock); } -static void md_kick_device(mddev_t *mddev) -{ - blk_flush_plug(current); - md_wakeup_thread(mddev->thread); -} - /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -669,15 +640,15 @@ static void raise_barrier(conf_t *conf, int force) /* Wait until no block IO is waiting (unless 'force') */ wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, - conf->resync_lock, md_kick_device(conf->mddev)); + conf->resync_lock, ); /* block any new IO from starting */ conf->barrier++; - /* No wait for all pending IO to complete */ + /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock, md_kick_device(conf->mddev)); + conf->resync_lock, ); spin_unlock_irq(&conf->resync_lock); } @@ -698,7 +669,7 @@ static void wait_barrier(conf_t *conf) conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, !conf->barrier, conf->resync_lock, - md_kick_device(conf->mddev)); + ); conf->nr_waiting--; } conf->nr_pending++; @@ -734,8 +705,8 @@ static void freeze_array(conf_t *conf) wait_event_lock_irq(conf->wait_barrier, conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - ({ flush_pending_writes(conf); - md_kick_device(conf->mddev); })); + flush_pending_writes(conf)); + spin_unlock_irq(&conf->resync_lock); } @@ -762,6 +733,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) const unsigned long do_fua = (bio->bi_rw & REQ_FUA); unsigned long flags; mdk_rdev_t *blocked_rdev; + int plugged; if (unlikely(bio->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bio); @@ -870,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) * inc refcount on their rdev. Record them by setting * bios[x] to bio */ + plugged = mddev_check_plugged(mddev); + raid10_find_phys(conf, r10_bio); retry_write: blocked_rdev = NULL; @@ -946,9 +920,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - if (do_sync || !mddev->bitmap) + if (do_sync || !mddev->bitmap || !plugged) md_wakeup_thread(mddev->thread); - return 0; } @@ -1464,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) int max_read_errors = atomic_read(&mddev->max_corr_read_errors); int d = r10_bio->devs[r10_bio->read_slot].devnum; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev) { /* If rdev is not NULL */ - char b[BDEVNAME_SIZE]; - int cur_read_error_count = 0; + /* still own a reference to this rdev, so it cannot + * have been cleared recently. + */ + rdev = conf->mirrors[d].rdev; - bdevname(rdev->bdev, b); + if (test_bit(Faulty, &rdev->flags)) + /* drive has already been failed, just ignore any + more fix_read_error() attempts */ + return; - if (test_bit(Faulty, &rdev->flags)) { - rcu_read_unlock(); - /* drive has already been failed, just ignore any - more fix_read_error() attempts */ - return; - } + check_decay_read_errors(mddev, rdev); + atomic_inc(&rdev->read_errors); + if (atomic_read(&rdev->read_errors) > max_read_errors) { + char b[BDEVNAME_SIZE]; + bdevname(rdev->bdev, b); - check_decay_read_errors(mddev, rdev); - atomic_inc(&rdev->read_errors); - cur_read_error_count = atomic_read(&rdev->read_errors); - if (cur_read_error_count > max_read_errors) { - rcu_read_unlock(); - printk(KERN_NOTICE - "md/raid10:%s: %s: Raid device exceeded " - "read_error threshold " - "[cur %d:max %d]\n", - mdname(mddev), - b, cur_read_error_count, max_read_errors); - printk(KERN_NOTICE - "md/raid10:%s: %s: Failing raid " - "device\n", mdname(mddev), b); - md_error(mddev, conf->mirrors[d].rdev); - return; - } + printk(KERN_NOTICE + "md/raid10:%s: %s: Raid device exceeded " + "read_error threshold [cur %d:max %d]\n", + mdname(mddev), b, + atomic_read(&rdev->read_errors), max_read_errors); + printk(KERN_NOTICE + "md/raid10:%s: %s: Failing raid device\n", + mdname(mddev), b); + md_error(mddev, conf->mirrors[d].rdev); + return; } - rcu_read_unlock(); while(sectors) { int s = sectors; @@ -1566,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "write failed" " (%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)(sect+ - rdev->data_offset), + (unsigned long long)( + sect + rdev->data_offset), bdevname(rdev->bdev, b)); printk(KERN_NOTICE "md/raid10:%s: %s: failing " "drive\n", @@ -1603,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "corrected sectors" " (%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)(sect+ - rdev->data_offset), + (unsigned long long)( + sect + rdev->data_offset), bdevname(rdev->bdev, b)); printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", mdname(mddev), @@ -1616,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "md/raid10:%s: read error corrected" " (%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)(sect+ - rdev->data_offset), + (unsigned long long)( + sect + rdev->data_offset), bdevname(rdev->bdev, b)); } @@ -1640,9 +1606,11 @@ static void raid10d(mddev_t *mddev) conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; mdk_rdev_t *rdev; + struct blk_plug plug; md_check_recovery(mddev); + blk_start_plug(&plug); for (;;) { char b[BDEVNAME_SIZE]; @@ -1665,7 +1633,8 @@ static void raid10d(mddev_t *mddev) else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) recovery_request_write(mddev, r10_bio); else { - int mirror; + int slot = r10_bio->read_slot; + int mirror = r10_bio->devs[slot].devnum; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. * We freeze all other IO, and try reading the block from @@ -1679,9 +1648,10 @@ static void raid10d(mddev_t *mddev) fix_read_error(conf, mddev, r10_bio); unfreeze_array(conf); } + rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); - bio = r10_bio->devs[r10_bio->read_slot].bio; - r10_bio->devs[r10_bio->read_slot].bio = + bio = r10_bio->devs[slot].bio; + r10_bio->devs[slot].bio = mddev->ro ? IO_BLOCKED : NULL; mirror = read_balance(conf, r10_bio); if (mirror == -1) { @@ -1695,6 +1665,7 @@ static void raid10d(mddev_t *mddev) } else { const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); bio_put(bio); + slot = r10_bio->read_slot; rdev = conf->mirrors[mirror].rdev; if (printk_ratelimit()) printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" @@ -1704,8 +1675,8 @@ static void raid10d(mddev_t *mddev) (unsigned long long)r10_bio->sector); bio = bio_clone_mddev(r10_bio->master_bio, GFP_NOIO, mddev); - r10_bio->devs[r10_bio->read_slot].bio = bio; - bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + r10_bio->devs[slot].bio = bio; + bio->bi_sector = r10_bio->devs[slot].addr + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_rw = READ | do_sync; @@ -1716,6 +1687,7 @@ static void raid10d(mddev_t *mddev) } cond_resched(); } + blk_finish_plug(&plug); } @@ -1764,13 +1736,13 @@ static int init_resync(conf_t *conf) * */ -static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) +static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, + int *skipped, int go_faster) { conf_t *conf = mddev->private; r10bio_t *r10_bio; struct bio *biolist = NULL, *bio; sector_t max_sector, nr_sectors; - int disk; int i; int max_sync; sector_t sync_blocks; @@ -1859,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i int j, k; r10_bio = NULL; - for (i=0 ; i<conf->raid_disks; i++) - if (conf->mirrors[i].rdev && - !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { - int still_degraded = 0; - /* want to reconstruct this device */ - r10bio_t *rb2 = r10_bio; - sector_t sect = raid10_find_virt(conf, sector_nr, i); - int must_sync; - /* Unless we are doing a full sync, we only need - * to recover the block if it is set in the bitmap - */ - must_sync = bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, 1); - if (sync_blocks < max_sync) - max_sync = sync_blocks; - if (!must_sync && - !conf->fullsync) { - /* yep, skip the sync_blocks here, but don't assume - * that there will never be anything to do here - */ - chunks_skipped = -1; - continue; - } + for (i=0 ; i<conf->raid_disks; i++) { + int still_degraded; + r10bio_t *rb2; + sector_t sect; + int must_sync; - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); - raise_barrier(conf, rb2 != NULL); - atomic_set(&r10_bio->remaining, 0); + if (conf->mirrors[i].rdev == NULL || + test_bit(In_sync, &conf->mirrors[i].rdev->flags)) + continue; - r10_bio->master_bio = (struct bio*)rb2; - if (rb2) - atomic_inc(&rb2->remaining); - r10_bio->mddev = mddev; - set_bit(R10BIO_IsRecover, &r10_bio->state); - r10_bio->sector = sect; + still_degraded = 0; + /* want to reconstruct this device */ + rb2 = r10_bio; + sect = raid10_find_virt(conf, sector_nr, i); + /* Unless we are doing a full sync, we only need + * to recover the block if it is set in the bitmap + */ + must_sync = bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, 1); + if (sync_blocks < max_sync) + max_sync = sync_blocks; + if (!must_sync && + !conf->fullsync) { + /* yep, skip the sync_blocks here, but don't assume + * that there will never be anything to do here + */ + chunks_skipped = -1; + continue; + } - raid10_find_phys(conf, r10_bio); + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + raise_barrier(conf, rb2 != NULL); + atomic_set(&r10_bio->remaining, 0); - /* Need to check if the array will still be - * degraded - */ - for (j=0; j<conf->raid_disks; j++) - if (conf->mirrors[j].rdev == NULL || - test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { - still_degraded = 1; - break; - } - - must_sync = bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, still_degraded); - - for (j=0; j<conf->copies;j++) { - int d = r10_bio->devs[j].devnum; - if (conf->mirrors[d].rdev && - test_bit(In_sync, &conf->mirrors[d].rdev->flags)) { - /* This is where we read from */ - bio = r10_bio->devs[0].bio; - bio->bi_next = biolist; - biolist = bio; - bio->bi_private = r10_bio; - bio->bi_end_io = end_sync_read; - bio->bi_rw = READ; - bio->bi_sector = r10_bio->devs[j].addr + - conf->mirrors[d].rdev->data_offset; - bio->bi_bdev = conf->mirrors[d].rdev->bdev; - atomic_inc(&conf->mirrors[d].rdev->nr_pending); - atomic_inc(&r10_bio->remaining); - /* and we write to 'i' */ - - for (k=0; k<conf->copies; k++) - if (r10_bio->devs[k].devnum == i) - break; - BUG_ON(k == conf->copies); - bio = r10_bio->devs[1].bio; - bio->bi_next = biolist; - biolist = bio; - bio->bi_private = r10_bio; - bio->bi_end_io = end_sync_write; - bio->bi_rw = WRITE; - bio->bi_sector = r10_bio->devs[k].addr + - conf->mirrors[i].rdev->data_offset; - bio->bi_bdev = conf->mirrors[i].rdev->bdev; - - r10_bio->devs[0].devnum = d; - r10_bio->devs[1].devnum = i; + r10_bio->master_bio = (struct bio*)rb2; + if (rb2) + atomic_inc(&rb2->remaining); + r10_bio->mddev = mddev; + set_bit(R10BIO_IsRecover, &r10_bio->state); + r10_bio->sector = sect; - break; - } - } - if (j == conf->copies) { - /* Cannot recover, so abort the recovery */ - put_buf(r10_bio); - if (rb2) - atomic_dec(&rb2->remaining); - r10_bio = rb2; - if (!test_and_set_bit(MD_RECOVERY_INTR, - &mddev->recovery)) - printk(KERN_INFO "md/raid10:%s: insufficient " - "working devices for recovery.\n", - mdname(mddev)); + raid10_find_phys(conf, r10_bio); + + /* Need to check if the array will still be + * degraded + */ + for (j=0; j<conf->raid_disks; j++) + if (conf->mirrors[j].rdev == NULL || + test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { + still_degraded = 1; break; } + + must_sync = bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, still_degraded); + + for (j=0; j<conf->copies;j++) { + int d = r10_bio->devs[j].devnum; + if (!conf->mirrors[d].rdev || + !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) + continue; + /* This is where we read from */ + bio = r10_bio->devs[0].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_read; + bio->bi_rw = READ; + bio->bi_sector = r10_bio->devs[j].addr + + conf->mirrors[d].rdev->data_offset; + bio->bi_bdev = conf->mirrors[d].rdev->bdev; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + /* and we write to 'i' */ + + for (k=0; k<conf->copies; k++) + if (r10_bio->devs[k].devnum == i) + break; + BUG_ON(k == conf->copies); + bio = r10_bio->devs[1].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = WRITE; + bio->bi_sector = r10_bio->devs[k].addr + + conf->mirrors[i].rdev->data_offset; + bio->bi_bdev = conf->mirrors[i].rdev->bdev; + + r10_bio->devs[0].devnum = d; + r10_bio->devs[1].devnum = i; + + break; + } + if (j == conf->copies) { + /* Cannot recover, so abort the recovery */ + put_buf(r10_bio); + if (rb2) + atomic_dec(&rb2->remaining); + r10_bio = rb2; + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) + printk(KERN_INFO "md/raid10:%s: insufficient " + "working devices for recovery.\n", + mdname(mddev)); + break; } + } if (biolist == NULL) { while (r10_bio) { r10bio_t *rb2 = r10_bio; @@ -1978,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && - !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, + &mddev->recovery)) { /* We can skip this block */ *skipped = 1; return sync_blocks + sectors_skipped; @@ -2023,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (i=0; i<conf->copies; i++) { int d = r10_bio->devs[i].devnum; if (r10_bio->devs[i].bio->bi_end_io) - rdev_dec_pending(conf->mirrors[d].rdev, mddev); + rdev_dec_pending(conf->mirrors[d].rdev, + mddev); } put_buf(r10_bio); biolist = NULL; @@ -2048,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i do { struct page *page; int len = PAGE_SIZE; - disk = 0; if (sector_nr + (len>>9) > max_sector) len = (max_sector - sector_nr) << 9; if (len == 0) break; for (bio= biolist ; bio ; bio=bio->bi_next) { + struct bio *bio2; page = bio->bi_io_vec[bio->bi_vcnt].bv_page; - if (bio_add_page(bio, page, len, 0) == 0) { - /* stop here */ - struct bio *bio2; - bio->bi_io_vec[bio->bi_vcnt].bv_page = page; - for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { - /* remove last page from this bio */ - bio2->bi_vcnt--; - bio2->bi_size -= len; - bio2->bi_flags &= ~(1<< BIO_SEG_VALID); - } - goto bio_full; + if (bio_add_page(bio, page, len, 0)) + continue; + + /* stop here */ + bio->bi_io_vec[bio->bi_vcnt].bv_page = page; + for (bio2 = biolist; + bio2 && bio2 != bio; + bio2 = bio2->bi_next) { + /* remove last page from this bio */ + bio2->bi_vcnt--; + bio2->bi_size -= len; + bio2->bi_flags &= ~(1<< BIO_SEG_VALID); } - disk = i; + goto bio_full; } nr_sectors += len>>9; sector_nr += len>>9; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 2316ac2e8e2..944b1104d3b 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -17,8 +17,8 @@ struct r10_private_data_s { spinlock_t device_lock; /* geometry */ - int near_copies; /* number of copies layed out raid0 style */ - int far_copies; /* number of copies layed out + int near_copies; /* number of copies laid out raid0 style */ + int far_copies; /* number of copies laid out * at large strides across drives */ int far_offset; /* far_copies are offset by 1 stripe diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e867ee42b15..346e69bfdab 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -27,12 +27,12 @@ * * We group bitmap updates into batches. Each batch has a number. * We may write out several batches at once, but that isn't very important. - * conf->bm_write is the number of the last batch successfully written. - * conf->bm_flush is the number of the last batch that was closed to + * conf->seq_write is the number of the last batch successfully written. + * conf->seq_flush is the number of the last batch that was closed to * new additions. * When we discover that we will need to write to any block in a stripe * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq - * the number of the batch it will be in. This is bm_flush+1. + * the number of the batch it will be in. This is seq_flush+1. * When we are ready to do a write, if that batch hasn't been written yet, * we plug the array and queue the stripe for later. * When an unplug happens, we increment bm_flush, thus closing the current @@ -199,14 +199,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { - if (test_bit(STRIPE_DELAYED, &sh->state)) { + if (test_bit(STRIPE_DELAYED, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - plugger_set_plug(&conf->plug); - } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && - sh->bm_seq - conf->seq_write > 0) { + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); - plugger_set_plug(&conf->plug); - } else { + else { clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); } @@ -461,7 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, < (conf->max_nr_stripes *3/4) || !conf->inactive_blocked), conf->device_lock, - md_raid5_kick_device(conf)); + ); conf->inactive_blocked = 0; } else init_stripe(sh, sector, previous); @@ -1470,7 +1468,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) wait_event_lock_irq(conf->wait_for_stripe, !list_empty(&conf->inactive_list), conf->device_lock, - blk_flush_plug(current)); + ); osh = get_free_stripe(conf); spin_unlock_irq(&conf->device_lock); atomic_set(&nsh->count, 1); @@ -1702,27 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) raid5_conf_t *conf = mddev->private; pr_debug("raid456: error called\n"); - if (!test_bit(Faulty, &rdev->flags)) { - set_bit(MD_CHANGE_DEVS, &mddev->flags); - if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded++; - spin_unlock_irqrestore(&conf->device_lock, flags); - /* - * if recovery was running, make sure it aborts. - */ - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - } - set_bit(Faulty, &rdev->flags); - printk(KERN_ALERT - "md/raid:%s: Disk failure on %s, disabling device.\n" - "md/raid:%s: Operation continuing on %d devices.\n", - mdname(mddev), - bdevname(rdev->bdev, b), - mdname(mddev), - conf->raid_disks - mddev->degraded); + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded++; + spin_unlock_irqrestore(&conf->device_lock, flags); + /* + * if recovery was running, make sure it aborts. + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT + "md/raid:%s: Disk failure on %s, disabling device.\n" + "md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), + bdevname(rdev->bdev, b), + mdname(mddev), + conf->raid_disks - mddev->degraded); } /* @@ -3623,8 +3619,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf) atomic_inc(&conf->preread_active_stripes); list_add_tail(&sh->lru, &conf->hold_list); } - } else - plugger_set_plug(&conf->plug); + } } static void activate_bit_delay(raid5_conf_t *conf) @@ -3641,21 +3636,6 @@ static void activate_bit_delay(raid5_conf_t *conf) } } -void md_raid5_kick_device(raid5_conf_t *conf) -{ - blk_flush_plug(current); - raid5_activate_delayed(conf); - md_wakeup_thread(conf->mddev->thread); -} -EXPORT_SYMBOL_GPL(md_raid5_kick_device); - -static void raid5_unplug(struct plug_handle *plug) -{ - raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug); - - md_raid5_kick_device(conf); -} - int md_raid5_congested(mddev_t *mddev, int bits) { raid5_conf_t *conf = mddev->private; @@ -3945,6 +3925,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) struct stripe_head *sh; const int rw = bio_data_dir(bi); int remaining; + int plugged; if (unlikely(bi->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bi); @@ -3963,6 +3944,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + plugged = mddev_check_plugged(mddev); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int disks, data_disks; @@ -3976,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) /* spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be * possible to see a half-updated value - * Ofcourse reshape_progress could change after + * Of course reshape_progress could change after * the lock is dropped, so once we get a reference * to the stripe that we think it is, we will have * to check again. @@ -4057,7 +4039,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) * add failed due to overlap. Flush everything * and wait a while */ - md_raid5_kick_device(conf); + md_wakeup_thread(mddev->thread); release_stripe(sh); schedule(); goto retry; @@ -4077,6 +4059,9 @@ static int make_request(mddev_t *mddev, struct bio * bi) } } + if (!plugged) + md_wakeup_thread(mddev->thread); + spin_lock_irq(&conf->device_lock); remaining = raid5_dec_bi_phys_segments(bi); spin_unlock_irq(&conf->device_lock); @@ -4478,24 +4463,30 @@ static void raid5d(mddev_t *mddev) struct stripe_head *sh; raid5_conf_t *conf = mddev->private; int handled; + struct blk_plug plug; pr_debug("+++ raid5d active\n"); md_check_recovery(mddev); + blk_start_plug(&plug); handled = 0; spin_lock_irq(&conf->device_lock); while (1) { struct bio *bio; - if (conf->seq_flush != conf->seq_write) { - int seq = conf->seq_flush; + if (atomic_read(&mddev->plug_cnt) == 0 && + !list_empty(&conf->bitmap_list)) { + /* Now is a good time to flush some bitmap updates */ + conf->seq_flush++; spin_unlock_irq(&conf->device_lock); bitmap_unplug(mddev->bitmap); spin_lock_irq(&conf->device_lock); - conf->seq_write = seq; + conf->seq_write = conf->seq_flush; activate_bit_delay(conf); } + if (atomic_read(&mddev->plug_cnt) == 0) + raid5_activate_delayed(conf); while ((bio = remove_bio_from_retry(conf))) { int ok; @@ -4525,6 +4516,7 @@ static void raid5d(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); async_tx_issue_pending_all(); + blk_finish_plug(&plug); pr_debug("--- raid5d inactive\n"); } @@ -5141,8 +5133,6 @@ static int run(mddev_t *mddev) mdname(mddev)); md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - plugger_init(&conf->plug, raid5_unplug); - mddev->plug = &conf->plug; if (mddev->queue) { int chunk_size; /* read-ahead size must cover two whole stripes, which @@ -5159,7 +5149,6 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->queue->queue_lock = &conf->device_lock; chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); @@ -5192,7 +5181,6 @@ static int stop(mddev_t *mddev) mddev->thread = NULL; if (mddev->queue) mddev->queue->backing_dev_info.congested_fn = NULL; - plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/ free_conf(conf); mddev->private = NULL; mddev->to_remove = &raid5_attrs_group; @@ -5401,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); - if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } @@ -5688,6 +5677,7 @@ static void raid5_quiesce(mddev_t *mddev, int state) static void *raid45_takeover_raid0(mddev_t *mddev, int level) { struct raid0_private_data *raid0_priv = mddev->private; + sector_t sectors; /* for raid0 takeover only one zone is supported */ if (raid0_priv->nr_strip_zones > 1) { @@ -5696,6 +5686,9 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level) return ERR_PTR(-EINVAL); } + sectors = raid0_priv->strip_zone[0].zone_end; + sector_div(sectors, raid0_priv->strip_zone[0].nb_dev); + mddev->dev_sectors = sectors; mddev->new_level = level; mddev->new_layout = ALGORITHM_PARITY_N; mddev->new_chunk_sectors = mddev->chunk_sectors; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 8d563a4f022..3ca77a2613b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -400,8 +400,6 @@ struct raid5_private_data { * Cleared when a sync completes. */ - struct plug_handle plug; - /* per cpu variables */ struct raid5_percpu { struct page *spare_page; /* Used when checking P/Q in raid6 */ |