summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bitmap.c10
-rw-r--r--drivers/md/bitmap.h2
-rw-r--r--drivers/md/dm-raid.c8
-rw-r--r--drivers/md/dm-region-hash.c2
-rw-r--r--drivers/md/dm-table.c114
-rw-r--r--drivers/md/faulty.c2
-rw-r--r--drivers/md/md.c113
-rw-r--r--drivers/md/md.h28
-rw-r--r--drivers/md/multipath.c60
-rw-r--r--drivers/md/multipath.h1
-rw-r--r--drivers/md/raid1.c535
-rw-r--r--drivers/md/raid1.h4
-rw-r--r--drivers/md/raid10.c457
-rw-r--r--drivers/md/raid10.h4
-rw-r--r--drivers/md/raid5.c109
-rw-r--r--drivers/md/raid5.h2
16 files changed, 734 insertions, 717 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5c9362792f1..70bd738b8b9 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -493,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
spin_unlock_irqrestore(&bitmap->lock, flags);
sb = kmap_atomic(bitmap->sb_page, KM_USER0);
sb->events = cpu_to_le64(bitmap->mddev->events);
- if (bitmap->mddev->events < bitmap->events_cleared) {
+ if (bitmap->mddev->events < bitmap->events_cleared)
/* rocking back to read-only */
bitmap->events_cleared = bitmap->mddev->events;
- sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
- }
+ sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+ sb->state = cpu_to_le32(bitmap->flags);
/* Just in case these have been changed via sysfs: */
sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
@@ -618,7 +618,7 @@ success:
if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
bitmap->flags |= BITMAP_HOSTENDIAN;
bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
- if (sb->state & cpu_to_le32(BITMAP_STALE))
+ if (bitmap->flags & BITMAP_STALE)
bitmap->events_cleared = bitmap->mddev->events;
err = 0;
out:
@@ -652,9 +652,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
switch (op) {
case MASK_SET:
sb->state |= cpu_to_le32(bits);
+ bitmap->flags |= bits;
break;
case MASK_UNSET:
sb->state &= cpu_to_le32(~bits);
+ bitmap->flags &= ~bits;
break;
default:
BUG();
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 931a7a7c379..d0aeaf46d93 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -45,7 +45,7 @@
*
* The counter counts pending write requests, plus the on-disk bit.
* When the counter is '1' and the resync bits are clear, the on-disk
- * bit can be cleared aswell, thus setting the counter to 0.
+ * bit can be cleared as well, thus setting the counter to 0.
* When we set a bit, or in the counter (to start a write), if the fields is
* 0, we first set the disk bit and set the counter to 1.
*
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 5ef136cdba9..e5d8904fc8f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -390,13 +390,6 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
return md_raid5_congested(&rs->md, bits);
}
-static void raid_unplug(struct dm_target_callbacks *cb)
-{
- struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
-
- md_raid5_kick_device(rs->md.private);
-}
-
/*
* Construct a RAID4/5/6 mapping:
* Args:
@@ -487,7 +480,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
rs->callbacks.congested_fn = raid_is_congested;
- rs->callbacks.unplug_fn = raid_unplug;
dm_table_add_target_callbacks(ti->table, &rs->callbacks);
return 0;
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index dad011aed0c..7771ed21218 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -419,7 +419,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
/*
* Possible cases:
* 1) DM_RH_DIRTY
- * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed
+ * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed
* 3) DM_RH_RECOVERING: flushing pending writes
* Either case, the region should have not been connected to list.
*/
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 416d4e258df..cb8380c9767 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -927,20 +927,80 @@ static int dm_table_build_index(struct dm_table *t)
}
/*
+ * Get a disk whose integrity profile reflects the table's profile.
+ * If %match_all is true, all devices' profiles must match.
+ * If %match_all is false, all devices must at least have an
+ * allocated integrity profile; but uninitialized is ok.
+ * Returns NULL if integrity support was inconsistent or unavailable.
+ */
+static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
+ bool match_all)
+{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *dd = NULL;
+ struct gendisk *prev_disk = NULL, *template_disk = NULL;
+
+ list_for_each_entry(dd, devices, list) {
+ template_disk = dd->dm_dev.bdev->bd_disk;
+ if (!blk_get_integrity(template_disk))
+ goto no_integrity;
+ if (!match_all && !blk_integrity_is_initialized(template_disk))
+ continue; /* skip uninitialized profiles */
+ else if (prev_disk &&
+ blk_integrity_compare(prev_disk, template_disk) < 0)
+ goto no_integrity;
+ prev_disk = template_disk;
+ }
+
+ return template_disk;
+
+no_integrity:
+ if (prev_disk)
+ DMWARN("%s: integrity not set: %s and %s profile mismatch",
+ dm_device_name(t->md),
+ prev_disk->disk_name,
+ template_disk->disk_name);
+ return NULL;
+}
+
+/*
* Register the mapped device for blk_integrity support if
- * the underlying devices support it.
+ * the underlying devices have an integrity profile. But all devices
+ * may not have matching profiles (checking all devices isn't reliable
+ * during table load because this table may use other DM device(s) which
+ * must be resumed before they will have an initialized integity profile).
+ * Stacked DM devices force a 2 stage integrity profile validation:
+ * 1 - during load, validate all initialized integrity profiles match
+ * 2 - during resume, validate all integrity profiles match
*/
static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
{
- struct list_head *devices = dm_table_get_devices(t);
- struct dm_dev_internal *dd;
+ struct gendisk *template_disk = NULL;
- list_for_each_entry(dd, devices, list)
- if (bdev_get_integrity(dd->dm_dev.bdev)) {
- t->integrity_supported = 1;
- return blk_integrity_register(dm_disk(md), NULL);
- }
+ template_disk = dm_table_get_integrity_disk(t, false);
+ if (!template_disk)
+ return 0;
+ if (!blk_integrity_is_initialized(dm_disk(md))) {
+ t->integrity_supported = 1;
+ return blk_integrity_register(dm_disk(md), NULL);
+ }
+
+ /*
+ * If DM device already has an initalized integrity
+ * profile the new profile should not conflict.
+ */
+ if (blk_integrity_is_initialized(template_disk) &&
+ blk_integrity_compare(dm_disk(md), template_disk) < 0) {
+ DMWARN("%s: conflict with existing integrity profile: "
+ "%s profile mismatch",
+ dm_device_name(t->md),
+ template_disk->disk_name);
+ return 1;
+ }
+
+ /* Preserve existing initialized integrity profile */
+ t->integrity_supported = 1;
return 0;
}
@@ -1094,41 +1154,27 @@ combine_limits:
/*
* Set the integrity profile for this device if all devices used have
- * matching profiles.
+ * matching profiles. We're quite deep in the resume path but still
+ * don't know if all devices (particularly DM devices this device
+ * may be stacked on) have matching profiles. Even if the profiles
+ * don't match we have no way to fail (to resume) at this point.
*/
static void dm_table_set_integrity(struct dm_table *t)
{
- struct list_head *devices = dm_table_get_devices(t);
- struct dm_dev_internal *prev = NULL, *dd = NULL;
+ struct gendisk *template_disk = NULL;
if (!blk_get_integrity(dm_disk(t->md)))
return;
- list_for_each_entry(dd, devices, list) {
- if (prev &&
- blk_integrity_compare(prev->dm_dev.bdev->bd_disk,
- dd->dm_dev.bdev->bd_disk) < 0) {
- DMWARN("%s: integrity not set: %s and %s mismatch",
- dm_device_name(t->md),
- prev->dm_dev.bdev->bd_disk->disk_name,
- dd->dm_dev.bdev->bd_disk->disk_name);
- goto no_integrity;
- }
- prev = dd;
+ template_disk = dm_table_get_integrity_disk(t, true);
+ if (!template_disk &&
+ blk_integrity_is_initialized(dm_disk(t->md))) {
+ DMWARN("%s: device no longer has a valid integrity profile",
+ dm_device_name(t->md));
+ return;
}
-
- if (!prev || !bdev_get_integrity(prev->dm_dev.bdev))
- goto no_integrity;
-
blk_integrity_register(dm_disk(t->md),
- bdev_get_integrity(prev->dm_dev.bdev));
-
- return;
-
-no_integrity:
- blk_integrity_register(dm_disk(t->md), NULL);
-
- return;
+ blk_get_integrity(template_disk));
}
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 339fdc67075..23078dabb6d 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -30,7 +30,7 @@
*
* Different modes can be active at a time, but only
* one can be set at array creation. Others can be added later.
- * A mode can be one-shot or recurrent with the recurrance being
+ * A mode can be one-shot or recurrent with the recurrence being
* once in every N requests.
* The bottom 5 bits of the "layout" indicate the mode. The
* remainder indicate a period, or 0 for one-shot.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8b66e04c2ea..aa640a85bb2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -447,48 +447,59 @@ EXPORT_SYMBOL(md_flush_request);
/* Support for plugging.
* This mirrors the plugging support in request_queue, but does not
- * require having a whole queue
+ * require having a whole queue or request structures.
+ * We allocate an md_plug_cb for each md device and each thread it gets
+ * plugged on. This links tot the private plug_handle structure in the
+ * personality data where we keep a count of the number of outstanding
+ * plugs so other code can see if a plug is active.
*/
-static void plugger_work(struct work_struct *work)
-{
- struct plug_handle *plug =
- container_of(work, struct plug_handle, unplug_work);
- plug->unplug_fn(plug);
-}
-static void plugger_timeout(unsigned long data)
-{
- struct plug_handle *plug = (void *)data;
- kblockd_schedule_work(NULL, &plug->unplug_work);
-}
-void plugger_init(struct plug_handle *plug,
- void (*unplug_fn)(struct plug_handle *))
-{
- plug->unplug_flag = 0;
- plug->unplug_fn = unplug_fn;
- init_timer(&plug->unplug_timer);
- plug->unplug_timer.function = plugger_timeout;
- plug->unplug_timer.data = (unsigned long)plug;
- INIT_WORK(&plug->unplug_work, plugger_work);
-}
-EXPORT_SYMBOL_GPL(plugger_init);
+struct md_plug_cb {
+ struct blk_plug_cb cb;
+ mddev_t *mddev;
+};
-void plugger_set_plug(struct plug_handle *plug)
+static void plugger_unplug(struct blk_plug_cb *cb)
{
- if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag))
- mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1);
+ struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
+ if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
+ md_wakeup_thread(mdcb->mddev->thread);
+ kfree(mdcb);
}
-EXPORT_SYMBOL_GPL(plugger_set_plug);
-int plugger_remove_plug(struct plug_handle *plug)
+/* Check that an unplug wakeup will come shortly.
+ * If not, wakeup the md thread immediately
+ */
+int mddev_check_plugged(mddev_t *mddev)
{
- if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) {
- del_timer(&plug->unplug_timer);
- return 1;
- } else
+ struct blk_plug *plug = current->plug;
+ struct md_plug_cb *mdcb;
+
+ if (!plug)
return 0;
-}
-EXPORT_SYMBOL_GPL(plugger_remove_plug);
+ list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
+ if (mdcb->cb.callback == plugger_unplug &&
+ mdcb->mddev == mddev) {
+ /* Already on the list, move to top */
+ if (mdcb != list_first_entry(&plug->cb_list,
+ struct md_plug_cb,
+ cb.list))
+ list_move(&mdcb->cb.list, &plug->cb_list);
+ return 1;
+ }
+ }
+ /* Not currently on the callback list */
+ mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
+ if (!mdcb)
+ return 0;
+
+ mdcb->mddev = mddev;
+ mdcb->cb.callback = plugger_unplug;
+ atomic_inc(&mddev->plug_cnt);
+ list_add(&mdcb->cb.list, &plug->cb_list);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(mddev_check_plugged);
static inline mddev_t *mddev_get(mddev_t *mddev)
{
@@ -538,6 +549,7 @@ void mddev_init(mddev_t *mddev)
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
+ atomic_set(&mddev->plug_cnt, 0);
spin_lock_init(&mddev->write_lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
@@ -3158,6 +3170,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
mddev->layout = mddev->new_layout;
mddev->chunk_sectors = mddev->new_chunk_sectors;
mddev->delta_disks = 0;
+ mddev->degraded = 0;
if (mddev->pers->sync_request == NULL) {
/* this is now an array without redundancy, so
* it must always be in_sync
@@ -3311,7 +3324,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
char *e;
unsigned long long n = simple_strtoull(buf, &e, 10);
- if (mddev->pers)
+ if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
return -EBUSY;
if (cmd_match(buf, "none"))
n = MaxSector;
@@ -4334,13 +4347,19 @@ static int md_alloc(dev_t dev, char *name)
disk->fops = &md_fops;
disk->private_data = mddev;
disk->queue = mddev->queue;
+ blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
/* Allow extended partitions. This makes the
* 'mdp' device redundant, but we can't really
* remove it now.
*/
disk->flags |= GENHD_FL_EXT_DEVT;
- add_disk(disk);
mddev->gendisk = disk;
+ /* As soon as we call add_disk(), another thread could get
+ * through to md_open, so make sure it doesn't get too far
+ */
+ mutex_lock(&mddev->open_mutex);
+ add_disk(disk);
+
error = kobject_init_and_add(&mddev->kobj, &md_ktype,
&disk_to_dev(disk)->kobj, "%s", "md");
if (error) {
@@ -4354,8 +4373,7 @@ static int md_alloc(dev_t dev, char *name)
if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &md_bitmap_group))
printk(KERN_DEBUG "pointless warning\n");
-
- blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
+ mutex_unlock(&mddev->open_mutex);
abort:
mutex_unlock(&disks_mutex);
if (!error && mddev->kobj.sd) {
@@ -4723,7 +4741,6 @@ static void md_clean(mddev_t *mddev)
mddev->bitmap_info.chunksize = 0;
mddev->bitmap_info.daemon_sleep = 0;
mddev->bitmap_info.max_write_behind = 0;
- mddev->plug = NULL;
}
static void __md_stop_writes(mddev_t *mddev)
@@ -5199,6 +5216,16 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
} else
super_types[mddev->major_version].
validate_super(mddev, rdev);
+ if ((info->state & (1<<MD_DISK_SYNC)) &&
+ (!test_bit(In_sync, &rdev->flags) ||
+ rdev->raid_disk != info->raid_disk)) {
+ /* This was a hot-add request, but events doesn't
+ * match, so reject it.
+ */
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+
if (test_bit(In_sync, &rdev->flags))
rdev->saved_raid_disk = rdev->raid_disk;
else
@@ -6266,7 +6293,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
* rt is a sector_t, so could be 32bit or 64bit.
* So we divide before multiply in case it is 32bit and close
* to the limit.
- * We scale the divisor (db) by 32 to avoid loosing precision
+ * We scale the divisor (db) by 32 to avoid losing precision
* near the end of resync when the number of remaining sectors
* is close to 'db'.
* We then divide rt by 32 after multiplying by db to compensate.
@@ -6688,12 +6715,6 @@ int md_allow_write(mddev_t *mddev)
}
EXPORT_SYMBOL_GPL(md_allow_write);
-void md_unplug(mddev_t *mddev)
-{
- if (mddev->plug)
- mddev->plug->unplug_fn(mddev->plug);
-}
-
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
void md_do_sync(mddev_t *mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 12215d437fc..0b1fd3f1d85 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,26 +29,6 @@
typedef struct mddev_s mddev_t;
typedef struct mdk_rdev_s mdk_rdev_t;
-/* generic plugging support - like that provided with request_queue,
- * but does not require a request_queue
- */
-struct plug_handle {
- void (*unplug_fn)(struct plug_handle *);
- struct timer_list unplug_timer;
- struct work_struct unplug_work;
- unsigned long unplug_flag;
-};
-#define PLUGGED_FLAG 1
-void plugger_init(struct plug_handle *plug,
- void (*unplug_fn)(struct plug_handle *));
-void plugger_set_plug(struct plug_handle *plug);
-int plugger_remove_plug(struct plug_handle *plug);
-static inline void plugger_flush(struct plug_handle *plug)
-{
- del_timer_sync(&plug->unplug_timer);
- cancel_work_sync(&plug->unplug_work);
-}
-
/*
* MD's 'extended' device
*/
@@ -94,7 +74,7 @@ struct mdk_rdev_s
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
#define AutoDetected 7 /* added by auto-detect */
-#define Blocked 8 /* An error occured on an externally
+#define Blocked 8 /* An error occurred on an externally
* managed array, don't allow writes
* until it is cleared */
wait_queue_head_t blocked_wait;
@@ -199,6 +179,9 @@ struct mddev_s
int delta_disks, new_level, new_layout;
int new_chunk_sectors;
+ atomic_t plug_cnt; /* If device is expecting
+ * more bios soon.
+ */
struct mdk_thread_s *thread; /* management thread */
struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
sector_t curr_resync; /* last block scheduled */
@@ -336,7 +319,6 @@ struct mddev_s
struct list_head all_mddevs;
struct attribute_group *to_remove;
- struct plug_handle *plug; /* if used by personality */
struct bio_set *bio_set;
@@ -516,7 +498,6 @@ extern int md_integrity_register(mddev_t *mddev);
extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void restore_bitmap_write_access(struct file *file);
-extern void md_unplug(mddev_t *mddev);
extern void mddev_init(mddev_t *mddev);
extern int md_run(mddev_t *mddev);
@@ -530,4 +511,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
mddev_t *mddev);
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
mddev_t *mddev);
+extern int mddev_check_plugged(mddev_t *mddev);
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index c3589099098..3535c23af28 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -146,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
int i;
seq_printf (seq, " [%d/%d] [", conf->raid_disks,
- conf->working_disks);
+ conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
conf->multipaths[i].rdev &&
@@ -186,35 +186,36 @@ static int multipath_congested(void *data, int bits)
static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
{
multipath_conf_t *conf = mddev->private;
+ char b[BDEVNAME_SIZE];
- if (conf->working_disks <= 1) {
+ if (conf->raid_disks - mddev->degraded <= 1) {
/*
* Uh oh, we can do nothing if this is our last path, but
* first check if this is a queued request for a device
* which has just failed.
*/
printk(KERN_ALERT
- "multipath: only one IO path left and IO error.\n");
+ "multipath: only one IO path left and IO error.\n");
/* leave it active... it's all we have */
- } else {
- /*
- * Mark disk as unusable
- */
- if (!test_bit(Faulty, &rdev->flags)) {
- char b[BDEVNAME_SIZE];
- clear_bit(In_sync, &rdev->flags);
- set_bit(Faulty, &rdev->flags);
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
- conf->working_disks--;
- mddev->degraded++;
- printk(KERN_ALERT "multipath: IO failure on %s,"
- " disabling IO path.\n"
- "multipath: Operation continuing"
- " on %d IO paths.\n",
- bdevname (rdev->bdev,b),
- conf->working_disks);
- }
+ return;
+ }
+ /*
+ * Mark disk as unusable
+ */
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
+ set_bit(Faulty, &rdev->flags);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ printk(KERN_ALERT "multipath: IO failure on %s,"
+ " disabling IO path.\n"
+ "multipath: Operation continuing"
+ " on %d IO paths.\n",
+ bdevname(rdev->bdev, b),
+ conf->raid_disks - mddev->degraded);
}
static void print_multipath_conf (multipath_conf_t *conf)
@@ -227,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
printk("(conf==NULL)\n");
return;
}
- printk(" --- wd:%d rd:%d\n", conf->working_disks,
+ printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);
for (i = 0; i < conf->raid_disks; i++) {
@@ -274,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
PAGE_CACHE_SIZE - 1);
}
- conf->working_disks++;
+ spin_lock_irq(&conf->device_lock);
mddev->degraded--;
rdev->raid_disk = path;
set_bit(In_sync, &rdev->flags);
+ spin_unlock_irq(&conf->device_lock);
rcu_assign_pointer(p->rdev, rdev);
err = 0;
md_integrity_add_rdev(rdev, mddev);
@@ -391,6 +393,7 @@ static int multipath_run (mddev_t *mddev)
int disk_idx;
struct multipath_info *disk;
mdk_rdev_t *rdev;
+ int working_disks;
if (md_check_no_bitmap(mddev))
return -EINVAL;
@@ -424,7 +427,7 @@ static int multipath_run (mddev_t *mddev)
goto out_free_conf;
}
- conf->working_disks = 0;
+ working_disks = 0;
list_for_each_entry(rdev, &mddev->disks, same_set) {
disk_idx = rdev->raid_disk;
if (disk_idx < 0 ||
@@ -446,7 +449,7 @@ static int multipath_run (mddev_t *mddev)
}
if (!test_bit(Faulty, &rdev->flags))
- conf->working_disks++;
+ working_disks++;
}
conf->raid_disks = mddev->raid_disks;
@@ -454,12 +457,12 @@ static int multipath_run (mddev_t *mddev)
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
- if (!conf->working_disks) {
+ if (!working_disks) {
printk(KERN_ERR "multipath: no operational IO paths for %s\n",
mdname(mddev));
goto out_free_conf;
}
- mddev->degraded = conf->raid_disks - conf->working_disks;
+ mddev->degraded = conf->raid_disks - working_disks;
conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
sizeof(struct multipath_bh));
@@ -481,7 +484,8 @@ static int multipath_run (mddev_t *mddev)
printk(KERN_INFO
"multipath: array %s active with %d out of %d IO paths\n",
- mdname(mddev), conf->working_disks, mddev->raid_disks);
+ mdname(mddev), conf->raid_disks - mddev->degraded,
+ mddev->raid_disks);
/*
* Ok, everything is just fine now
*/
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index d1c2a8d7839..3c5a45eb5f8 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -9,7 +9,6 @@ struct multipath_private_data {
mddev_t *mddev;
struct multipath_info *multipaths;
int raid_disks;
- int working_disks;
spinlock_t device_lock;
struct list_head retry_list;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c2a21ae56d9..5d096096f95 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -297,23 +297,24 @@ static void raid1_end_read_request(struct bio *bio, int error)
rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
}
-static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv,
- int behind)
+static void r1_bio_write_done(r1bio_t *r1_bio)
{
if (atomic_dec_and_test(&r1_bio->remaining))
{
/* it really is the end of this request */
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
/* free extra copy of the data pages */
- int i = vcnt;
+ int i = r1_bio->behind_page_count;
while (i--)
- safe_put_page(bv[i].bv_page);
+ safe_put_page(r1_bio->behind_pages[i]);
+ kfree(r1_bio->behind_pages);
+ r1_bio->behind_pages = NULL;
}
/* clear the bitmap if all writes complete successfully */
bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
r1_bio->sectors,
!test_bit(R1BIO_Degraded, &r1_bio->state),
- behind);
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
md_write_end(r1_bio->mddev);
raid_end_bio_io(r1_bio);
}
@@ -386,7 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
* Let's see if all mirrored write operations have finished
* already.
*/
- r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind);
+ r1_bio_write_done(r1_bio);
if (to_put)
bio_put(to_put);
@@ -411,10 +412,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
const sector_t this_sector = r1_bio->sector;
const int sectors = r1_bio->sectors;
- int new_disk = -1;
int start_disk;
+ int best_disk;
int i;
- sector_t new_distance, current_distance;
+ sector_t best_dist;
mdk_rdev_t *rdev;
int choose_first;
@@ -425,6 +426,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
* We take the first readable disk when above the resync window.
*/
retry:
+ best_disk = -1;
+ best_dist = MaxSector;
if (conf->mddev->recovery_cp < MaxSector &&
(this_sector + sectors >= conf->next_resync)) {
choose_first = 1;
@@ -434,8 +437,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
start_disk = conf->last_used;
}
- /* make sure the disk is operational */
for (i = 0 ; i < conf->raid_disks ; i++) {
+ sector_t dist;
int disk = start_disk + i;
if (disk >= conf->raid_disks)
disk -= conf->raid_disks;
@@ -443,60 +446,43 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
- || !test_bit(In_sync, &rdev->flags))
+ || test_bit(Faulty, &rdev->flags))
continue;
-
- new_disk = disk;
- if (!test_bit(WriteMostly, &rdev->flags))
- break;
- }
-
- if (new_disk < 0 || choose_first)
- goto rb_out;
-
- /*
- * Don't change to another disk for sequential reads:
- */
- if (conf->next_seq_sect == this_sector)
- goto rb_out;
- if (this_sector == conf->mirrors[new_disk].head_position)
- goto rb_out;
-
- current_distance = abs(this_sector
- - conf->mirrors[new_disk].head_position);
-
- /* look for a better disk - i.e. head is closer */
- start_disk = new_disk;
- for (i = 1; i < conf->raid_disks; i++) {
- int disk = start_disk + 1;
- if (disk >= conf->raid_disks)
- disk -= conf->raid_disks;
-
- rdev = rcu_dereference(conf->mirrors[disk].rdev);
- if (r1_bio->bios[disk] == IO_BLOCKED
- || rdev == NULL
- || !test_bit(In_sync, &rdev->flags)
- || test_bit(WriteMostly, &rdev->flags))
+ if (!test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < this_sector + sectors)
continue;
-
- if (!atomic_read(&rdev->nr_pending)) {
- new_disk = disk;
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ /* Don't balance among write-mostly, just
+ * use the first as a last resort */
+ if (best_disk < 0)
+ best_disk = disk;
+ continue;
+ }
+ /* This is a reasonable device to use. It might
+ * even be best.
+ */
+ dist = abs(this_sector - conf->mirrors[disk].head_position);
+ if (choose_first
+ /* Don't change to another disk for sequential reads */
+ || conf->next_seq_sect == this_sector
+ || dist == 0
+ /* If device is idle, use it */
+ || atomic_read(&rdev->nr_pending) == 0) {
+ best_disk = disk;
break;
}
- new_distance = abs(this_sector - conf->mirrors[disk].head_position);
- if (new_distance < current_distance) {
- current_distance = new_distance;
- new_disk = disk;
+ if (dist < best_dist) {
+ best_dist = dist;
+ best_disk = disk;
}
}
- rb_out:
- if (new_disk >= 0) {
- rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+ if (best_disk >= 0) {
+ rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
if (!rdev)
goto retry;
atomic_inc(&rdev->nr_pending);
- if (!test_bit(In_sync, &rdev->flags)) {
+ if (test_bit(Faulty, &rdev->flags)) {
/* cannot risk returning a device that failed
* before we inc'ed nr_pending
*/
@@ -504,11 +490,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
goto retry;
}
conf->next_seq_sect = this_sector + sectors;
- conf->last_used = new_disk;
+ conf->last_used = best_disk;
}
rcu_read_unlock();
- return new_disk;
+ return best_disk;
}
static int raid1_congested(void *data, int bits)
@@ -565,12 +551,6 @@ static void flush_pending_writes(conf_t *conf)
spin_unlock_irq(&conf->device_lock);
}
-static void md_kick_device(mddev_t *mddev)
-{
- blk_flush_plug(current);
- md_wakeup_thread(mddev->thread);
-}
-
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
@@ -600,7 +580,7 @@ static void raise_barrier(conf_t *conf)
/* Wait until no block IO is waiting */
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
- conf->resync_lock, md_kick_device(conf->mddev));
+ conf->resync_lock, );
/* block any new IO from starting */
conf->barrier++;
@@ -608,7 +588,7 @@ static void raise_barrier(conf_t *conf)
/* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
- conf->resync_lock, md_kick_device(conf->mddev));
+ conf->resync_lock, );
spin_unlock_irq(&conf->resync_lock);
}
@@ -630,7 +610,7 @@ static void wait_barrier(conf_t *conf)
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
conf->resync_lock,
- md_kick_device(conf->mddev));
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -666,8 +646,7 @@ static void freeze_array(conf_t *conf)
wait_event_lock_irq(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
- ({ flush_pending_writes(conf);
- md_kick_device(conf->mddev); }));
+ flush_pending_writes(conf));
spin_unlock_irq(&conf->resync_lock);
}
static void unfreeze_array(conf_t *conf)
@@ -682,37 +661,36 @@ static void unfreeze_array(conf_t *conf)
/* duplicate the data pages for behind I/O
- * We return a list of bio_vec rather than just page pointers
- * as it makes freeing easier
*/
-static struct bio_vec *alloc_behind_pages(struct bio *bio)
+static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
{
int i;
struct bio_vec *bvec;
- struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+ struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
GFP_NOIO);
if (unlikely(!pages))
- goto do_sync_io;
+ return;
bio_for_each_segment(bvec, bio, i) {
- pages[i].bv_page = alloc_page(GFP_NOIO);
- if (unlikely(!pages[i].bv_page))
+ pages[i] = alloc_page(GFP_NOIO);
+ if (unlikely(!pages[i]))
goto do_sync_io;
- memcpy(kmap(pages[i].bv_page) + bvec->bv_offset,
+ memcpy(kmap(pages[i]) + bvec->bv_offset,
kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
- kunmap(pages[i].bv_page);
+ kunmap(pages[i]);
kunmap(bvec->bv_page);
}
-
- return pages;
+ r1_bio->behind_pages = pages;
+ r1_bio->behind_page_count = bio->bi_vcnt;
+ set_bit(R1BIO_BehindIO, &r1_bio->state);
+ return;
do_sync_io:
- if (pages)
- for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++)
- put_page(pages[i].bv_page);
+ for (i = 0; i < bio->bi_vcnt; i++)
+ if (pages[i])
+ put_page(pages[i]);
kfree(pages);
PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
- return NULL;
}
static int make_request(mddev_t *mddev, struct bio * bio)
@@ -724,11 +702,11 @@ static int make_request(mddev_t *mddev, struct bio * bio)
int i, targets = 0, disks;
struct bitmap *bitmap;
unsigned long flags;
- struct bio_vec *behind_pages = NULL;
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
mdk_rdev_t *blocked_rdev;
+ int plugged;
/*
* Register the new request and wait if the reconstruction
@@ -820,6 +798,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
*/
+ plugged = mddev_check_plugged(mddev);
+
disks = conf->raid_disks;
retry_write:
blocked_rdev = NULL;
@@ -874,9 +854,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
if (bitmap &&
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
- !waitqueue_active(&bitmap->behind_wait) &&
- (behind_pages = alloc_behind_pages(bio)) != NULL)
- set_bit(R1BIO_BehindIO, &r1_bio->state);
+ !waitqueue_active(&bitmap->behind_wait))
+ alloc_behind_pages(bio, r1_bio);
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
@@ -897,7 +876,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
mbio->bi_rw = WRITE | do_flush_fua | do_sync;
mbio->bi_private = r1_bio;
- if (behind_pages) {
+ if (r1_bio->behind_pages) {
struct bio_vec *bvec;
int j;
@@ -909,7 +888,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
* them all
*/
__bio_for_each_segment(bvec, mbio, j, 0)
- bvec->bv_page = behind_pages[j].bv_page;
+ bvec->bv_page = r1_bio->behind_pages[j];
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
@@ -919,13 +898,12 @@ static int make_request(mddev_t *mddev, struct bio * bio)
bio_list_add(&conf->pending_bio_list, mbio);
spin_unlock_irqrestore(&conf->device_lock, flags);
}
- r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
- kfree(behind_pages); /* the behind pages are attached to the bios now */
+ r1_bio_write_done(r1_bio);
/* In case raid1d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
- if (do_sync || !bitmap)
+ if (do_sync || !bitmap || !plugged)
md_wakeup_thread(mddev->thread);
return 0;
@@ -1200,194 +1178,210 @@ static void end_sync_write(struct bio *bio, int error)
}
}
-static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+static int fix_sync_read_error(r1bio_t *r1_bio)
{
+ /* Try some synchronous reads of other devices to get
+ * good data, much like with normal read errors. Only
+ * read into the pages we already have so we don't
+ * need to re-issue the read request.
+ * We don't need to freeze the array, because being in an
+ * active sync request, there is no normal IO, and
+ * no overlapping syncs.
+ */
+ mddev_t *mddev = r1_bio->mddev;
conf_t *conf = mddev->private;
- int i;
- int disks = conf->raid_disks;
- struct bio *bio, *wbio;
-
- bio = r1_bio->bios[r1_bio->read_disk];
+ struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+ sector_t sect = r1_bio->sector;
+ int sectors = r1_bio->sectors;
+ int idx = 0;
+ while(sectors) {
+ int s = sectors;
+ int d = r1_bio->read_disk;
+ int success = 0;
+ mdk_rdev_t *rdev;
+ int start;
- if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
- /* We have read all readable devices. If we haven't
- * got the block, then there is no hope left.
- * If we have, then we want to do a comparison
- * and skip the write if everything is the same.
- * If any blocks failed to read, then we need to
- * attempt an over-write
- */
- int primary;
- if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
- for (i=0; i<mddev->raid_disks; i++)
- if (r1_bio->bios[i]->bi_end_io == end_sync_read)
- md_error(mddev, conf->mirrors[i].rdev);
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+ do {
+ if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
+ /* No rcu protection needed here devices
+ * can only be removed when no resync is
+ * active, and resync is currently active
+ */
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false)) {
+ success = 1;
+ break;
+ }
+ }
+ d++;
+ if (d == conf->raid_disks)
+ d = 0;
+ } while (!success && d != r1_bio->read_disk);
- md_done_sync(mddev, r1_bio->sectors, 1);
+ if (!success) {
+ char b[BDEVNAME_SIZE];
+ /* Cannot read from anywhere, array is toast */
+ md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
+ " for block %llu\n",
+ mdname(mddev),
+ bdevname(bio->bi_bdev, b),
+ (unsigned long long)r1_bio->sector);
+ md_done_sync(mddev, r1_bio->sectors, 0);
put_buf(r1_bio);
- return;
+ return 0;
}
- for (primary=0; primary<mddev->raid_disks; primary++)
- if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
- test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
- r1_bio->bios[primary]->bi_end_io = NULL;
- rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
- break;
- }
- r1_bio->read_disk = primary;
- for (i=0; i<mddev->raid_disks; i++)
- if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
- int j;
- int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
- struct bio *pbio = r1_bio->bios[primary];
- struct bio *sbio = r1_bio->bios[i];
-
- if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
- for (j = vcnt; j-- ; ) {
- struct page *p, *s;
- p = pbio->bi_io_vec[j].bv_page;
- s = sbio->bi_io_vec[j].bv_page;
- if (memcmp(page_address(p),
- page_address(s),
- PAGE_SIZE))
- break;
- }
- } else
- j = 0;
- if (j >= 0)
- mddev->resync_mismatches += r1_bio->sectors;
- if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
- && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
- sbio->bi_end_io = NULL;
- rdev_dec_pending(conf->mirrors[i].rdev, mddev);
- } else {
- /* fixup the bio for reuse */
- int size;
- sbio->bi_vcnt = vcnt;
- sbio->bi_size = r1_bio->sectors << 9;
- sbio->bi_idx = 0;
- sbio->bi_phys_segments = 0;
- sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
- sbio->bi_flags |= 1 << BIO_UPTODATE;
- sbio->bi_next = NULL;
- sbio->bi_sector = r1_bio->sector +
- conf->mirrors[i].rdev->data_offset;
- sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
- size = sbio->bi_size;
- for (j = 0; j < vcnt ; j++) {
- struct bio_vec *bi;
- bi = &sbio->bi_io_vec[j];
- bi->bv_offset = 0;
- if (size > PAGE_SIZE)
- bi->bv_len = PAGE_SIZE;
- else
- bi->bv_len = size;
- size -= PAGE_SIZE;
- memcpy(page_address(bi->bv_page),
- page_address(pbio->bi_io_vec[j].bv_page),
- PAGE_SIZE);
- }
- }
- }
+ start = d;
+ /* write it back and re-read */
+ while (d != r1_bio->read_disk) {
+ if (d == 0)
+ d = conf->raid_disks;
+ d--;
+ if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+ continue;
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ WRITE, false) == 0) {
+ r1_bio->bios[d]->bi_end_io = NULL;
+ rdev_dec_pending(rdev, mddev);
+ md_error(mddev, rdev);
+ } else
+ atomic_add(s, &rdev->corrected_errors);
+ }
+ d = start;
+ while (d != r1_bio->read_disk) {
+ if (d == 0)
+ d = conf->raid_disks;
+ d--;
+ if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+ continue;
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false) == 0)
+ md_error(mddev, rdev);
+ }
+ sectors -= s;
+ sect += s;
+ idx ++;
}
- if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
- /* ouch - failed to read all of that.
- * Try some synchronous reads of other devices to get
- * good data, much like with normal read errors. Only
- * read into the pages we already have so we don't
- * need to re-issue the read request.
- * We don't need to freeze the array, because being in an
- * active sync request, there is no normal IO, and
- * no overlapping syncs.
- */
- sector_t sect = r1_bio->sector;
- int sectors = r1_bio->sectors;
- int idx = 0;
-
- while(sectors) {
- int s = sectors;
- int d = r1_bio->read_disk;
- int success = 0;
- mdk_rdev_t *rdev;
-
- if (s > (PAGE_SIZE>>9))
- s = PAGE_SIZE >> 9;
- do {
- if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
- /* No rcu protection needed here devices
- * can only be removed when no resync is
- * active, and resync is currently active
- */
- rdev = conf->mirrors[d].rdev;
- if (sync_page_io(rdev,
- sect,
- s<<9,
- bio->bi_io_vec[idx].bv_page,
- READ, false)) {
- success = 1;
- break;
- }
- }
- d++;
- if (d == conf->raid_disks)
- d = 0;
- } while (!success && d != r1_bio->read_disk);
-
- if (success) {
- int start = d;
- /* write it back and re-read */
- set_bit(R1BIO_Uptodate, &r1_bio->state);
- while (d != r1_bio->read_disk) {
- if (d == 0)
- d = conf->raid_disks;
- d--;
- if (r1_bio->bios[d]->bi_end_io != end_sync_read)
- continue;
- rdev = conf->mirrors[d].rdev;
- atomic_add(s, &rdev->corrected_errors);
- if (sync_page_io(rdev,
- sect,
- s<<9,
- bio->bi_io_vec[idx].bv_page,
- WRITE, false) == 0)
- md_error(mddev, rdev);
- }
- d = start;
- while (d != r1_bio->read_disk) {
- if (d == 0)
- d = conf->raid_disks;
- d--;
- if (r1_bio->bios[d]->bi_end_io != end_sync_read)
- continue;
- rdev = conf->mirrors[d].rdev;
- if (sync_page_io(rdev,
- sect,
- s<<9,
- bio->bi_io_vec[idx].bv_page,
- READ, false) == 0)
- md_error(mddev, rdev);
- }
- } else {
- char b[BDEVNAME_SIZE];
- /* Cannot read from anywhere, array is toast */
- md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
- printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
- " for block %llu\n",
- mdname(mddev),
- bdevname(bio->bi_bdev, b),
- (unsigned long long)r1_bio->sector);
- md_done_sync(mddev, r1_bio->sectors, 0);
- put_buf(r1_bio);
- return;
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ return 1;
+}
+
+static int process_checks(r1bio_t *r1_bio)
+{
+ /* We have read all readable devices. If we haven't
+ * got the block, then there is no hope left.
+ * If we have, then we want to do a comparison
+ * and skip the write if everything is the same.
+ * If any blocks failed to read, then we need to
+ * attempt an over-write
+ */
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+ int primary;
+ int i;
+
+ for (primary = 0; primary < conf->raid_disks; primary++)
+ if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+ test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+ r1_bio->bios[primary]->bi_end_io = NULL;
+ rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
+ break;
+ }
+ r1_bio->read_disk = primary;
+ for (i = 0; i < conf->raid_disks; i++) {
+ int j;
+ int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
+ struct bio *pbio = r1_bio->bios[primary];
+ struct bio *sbio = r1_bio->bios[i];
+ int size;
+
+ if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+ continue;
+
+ if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+ for (j = vcnt; j-- ; ) {
+ struct page *p, *s;
+ p = pbio->bi_io_vec[j].bv_page;
+ s = sbio->bi_io_vec[j].bv_page;
+ if (memcmp(page_address(p),
+ page_address(s),
+ PAGE_SIZE))
+ break;
}
- sectors -= s;
- sect += s;
- idx ++;
+ } else
+ j = 0;
+ if (j >= 0)
+ mddev->resync_mismatches += r1_bio->sectors;
+ if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
+ && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+ /* No need to write to this device. */
+ sbio->bi_end_io = NULL;
+ rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+ continue;
+ }
+ /* fixup the bio for reuse */
+ sbio->bi_vcnt = vcnt;
+ sbio->bi_size = r1_bio->sectors << 9;
+ sbio->bi_idx = 0;
+ sbio->bi_phys_segments = 0;
+ sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ sbio->bi_flags |= 1 << BIO_UPTODATE;
+ sbio->bi_next = NULL;
+ sbio->bi_sector = r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset;
+ sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ size = sbio->bi_size;
+ for (j = 0; j < vcnt ; j++) {
+ struct bio_vec *bi;
+ bi = &sbio->bi_io_vec[j];
+ bi->bv_offset = 0;
+ if (size > PAGE_SIZE)
+ bi->bv_len = PAGE_SIZE;
+ else
+ bi->bv_len = size;
+ size -= PAGE_SIZE;
+ memcpy(page_address(bi->bv_page),
+ page_address(pbio->bi_io_vec[j].bv_page),
+ PAGE_SIZE);
}
}
+ return 0;
+}
+
+static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+{
+ conf_t *conf = mddev->private;
+ int i;
+ int disks = conf->raid_disks;
+ struct bio *bio, *wbio;
+
+ bio = r1_bio->bios[r1_bio->read_disk];
+ if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+ /* ouch - failed to read all of that. */
+ if (!fix_sync_read_error(r1_bio))
+ return;
+
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ if (process_checks(r1_bio) < 0)
+ return;
/*
* schedule writes
*/
@@ -1516,13 +1510,16 @@ static void raid1d(mddev_t *mddev)
conf_t *conf = mddev->private;
struct list_head *head = &conf->retry_list;
mdk_rdev_t *rdev;
+ struct blk_plug plug;
md_check_recovery(mddev);
-
+
+ blk_start_plug(&plug);
for (;;) {
char b[BDEVNAME_SIZE];
- flush_pending_writes(conf);
+ if (atomic_read(&mddev->plug_cnt) == 0)
+ flush_pending_writes(conf);
spin_lock_irqsave(&conf->device_lock, flags);
if (list_empty(head)) {
@@ -1593,6 +1590,7 @@ static void raid1d(mddev_t *mddev)
}
cond_resched();
}
+ blk_finish_plug(&plug);
}
@@ -2039,7 +2037,6 @@ static int stop(mddev_t *mddev)
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
@@ -2064,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
- mddev->recovery_cp == MaxSector) {
+ mddev->recovery_cp > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index cbfdf1a6acd..5fc4ca1af86 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -94,7 +94,9 @@ struct r1bio_s {
int read_disk;
struct list_head retry_list;
- struct bitmap_update *bitmap_update;
+ /* Next two are only valid when R1BIO_BehindIO is set */
+ struct page **behind_pages;
+ int behind_page_count;
/*
* if the IO is in WRITE direction, then multiple bios are used.
* We choose the number when they are allocated.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f7b62370b37..6e846688962 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -5,7 +5,7 @@
*
* RAID-10 support for md.
*
- * Base on code in raid1.c. See raid1.c for futher copyright information.
+ * Base on code in raid1.c. See raid1.c for further copyright information.
*
*
* This program is free software; you can redistribute it and/or modify
@@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
+ rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
} else {
/*
- * oops, read error:
+ * oops, read error - keep the refcount on the rdev
*/
char b[BDEVNAME_SIZE];
if (printk_ratelimit())
@@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
reschedule_retry(r10_bio);
}
-
- rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}
static void raid10_end_write_request(struct bio *bio, int error)
@@ -340,14 +339,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
/*
* RAID10 layout manager
- * Aswell as the chunksize and raid_disks count, there are two
+ * As well as the chunksize and raid_disks count, there are two
* parameters: near_copies and far_copies.
* near_copies * far_copies must be <= raid_disks.
* Normally one of these will be 1.
* If both are 1, we get raid0.
* If near_copies == raid_disks, we get raid1.
*
- * Chunks are layed out in raid0 style with near_copies copies of the
+ * Chunks are laid out in raid0 style with near_copies copies of the
* first chunk, followed by near_copies copies of the next chunk and
* so on.
* If far_copies > 1, then after 1/far_copies of the array has been assigned
@@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
static int read_balance(conf_t *conf, r10bio_t *r10_bio)
{
const sector_t this_sector = r10_bio->sector;
- int disk, slot, nslot;
+ int disk, slot;
const int sectors = r10_bio->sectors;
- sector_t new_distance, current_distance;
+ sector_t new_distance, best_dist;
mdk_rdev_t *rdev;
+ int do_balance;
+ int best_slot;
raid10_find_phys(conf, r10_bio);
rcu_read_lock();
+retry:
+ best_slot = -1;
+ best_dist = MaxSector;
+ do_balance = 1;
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on (recovery is ok), or below
@@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
* above the resync window.
*/
if (conf->mddev->recovery_cp < MaxSector
- && (this_sector + sectors >= conf->next_resync)) {
- /* make sure that disk is operational */
- slot = 0;
- disk = r10_bio->devs[slot].devnum;
+ && (this_sector + sectors >= conf->next_resync))
+ do_balance = 0;
- while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
- r10_bio->devs[slot].bio == IO_BLOCKED ||
- !test_bit(In_sync, &rdev->flags)) {
- slot++;
- if (slot == conf->copies) {
- slot = 0;
- disk = -1;
- break;
- }
- disk = r10_bio->devs[slot].devnum;
- }
- goto rb_out;
- }
-
-
- /* make sure the disk is operational */
- slot = 0;
- disk = r10_bio->devs[slot].devnum;
- while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
- r10_bio->devs[slot].bio == IO_BLOCKED ||
- !test_bit(In_sync, &rdev->flags)) {
- slot ++;
- if (slot == conf->copies) {
- disk = -1;
- goto rb_out;
- }
+ for (slot = 0; slot < conf->copies ; slot++) {
+ if (r10_bio->devs[slot].bio == IO_BLOCKED)
+ continue;
disk = r10_bio->devs[slot].devnum;
- }
-
-
- current_distance = abs(r10_bio->devs[slot].addr -
- conf->mirrors[disk].head_position);
-
- /* Find the disk whose head is closest,
- * or - for far > 1 - find the closest to partition beginning */
-
- for (nslot = slot; nslot < conf->copies; nslot++) {
- int ndisk = r10_bio->devs[nslot].devnum;
-
-
- if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
- r10_bio->devs[nslot].bio == IO_BLOCKED ||
- !test_bit(In_sync, &rdev->flags))
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (rdev == NULL)
+ continue;
+ if (!test_bit(In_sync, &rdev->flags))
continue;
+ if (!do_balance)
+ break;
+
/* This optimisation is debatable, and completely destroys
* sequential read speed for 'far copies' arrays. So only
* keep it for 'near' arrays, and review those later.
*/
- if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
- disk = ndisk;
- slot = nslot;
+ if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
break;
- }
/* for far > 1 always use the lowest address */
if (conf->far_copies > 1)
- new_distance = r10_bio->devs[nslot].addr;
+ new_distance = r10_bio->devs[slot].addr;
else
- new_distance = abs(r10_bio->devs[nslot].addr -
- conf->mirrors[ndisk].head_position);
- if (new_distance < current_distance) {
- current_distance = new_distance;
- disk = ndisk;
- slot = nslot;
+ new_distance = abs(r10_bio->devs[slot].addr -
+ conf->mirrors[disk].head_position);
+ if (new_distance < best_dist) {
+ best_dist = new_distance;
+ best_slot = slot;
}
}
+ if (slot == conf->copies)
+ slot = best_slot;
-rb_out:
- r10_bio->read_slot = slot;
-/* conf->next_seq_sect = this_sector + sectors;*/
-
- if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
- atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
- else
+ if (slot >= 0) {
+ disk = r10_bio->devs[slot].devnum;
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (!rdev)
+ goto retry;
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(Faulty, &rdev->flags)) {
+ /* Cannot risk returning a device that failed
+ * before we inc'ed nr_pending
+ */
+ rdev_dec_pending(rdev, conf->mddev);
+ goto retry;
+ }
+ r10_bio->read_slot = slot;
+ } else
disk = -1;
rcu_read_unlock();
@@ -634,12 +611,6 @@ static void flush_pending_writes(conf_t *conf)
spin_unlock_irq(&conf->device_lock);
}
-static void md_kick_device(mddev_t *mddev)
-{
- blk_flush_plug(current);
- md_wakeup_thread(mddev->thread);
-}
-
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
@@ -669,15 +640,15 @@ static void raise_barrier(conf_t *conf, int force)
/* Wait until no block IO is waiting (unless 'force') */
wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
- conf->resync_lock, md_kick_device(conf->mddev));
+ conf->resync_lock, );
/* block any new IO from starting */
conf->barrier++;
- /* No wait for all pending IO to complete */
+ /* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
- conf->resync_lock, md_kick_device(conf->mddev));
+ conf->resync_lock, );
spin_unlock_irq(&conf->resync_lock);
}
@@ -698,7 +669,7 @@ static void wait_barrier(conf_t *conf)
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
conf->resync_lock,
- md_kick_device(conf->mddev));
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -734,8 +705,8 @@ static void freeze_array(conf_t *conf)
wait_event_lock_irq(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
- ({ flush_pending_writes(conf);
- md_kick_device(conf->mddev); }));
+ flush_pending_writes(conf));
+
spin_unlock_irq(&conf->resync_lock);
}
@@ -762,6 +733,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
unsigned long flags;
mdk_rdev_t *blocked_rdev;
+ int plugged;
if (unlikely(bio->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bio);
@@ -870,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
*/
+ plugged = mddev_check_plugged(mddev);
+
raid10_find_phys(conf, r10_bio);
retry_write:
blocked_rdev = NULL;
@@ -946,9 +920,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
- if (do_sync || !mddev->bitmap)
+ if (do_sync || !mddev->bitmap || !plugged)
md_wakeup_thread(mddev->thread);
-
return 0;
}
@@ -1464,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev) { /* If rdev is not NULL */
- char b[BDEVNAME_SIZE];
- int cur_read_error_count = 0;
+ /* still own a reference to this rdev, so it cannot
+ * have been cleared recently.
+ */
+ rdev = conf->mirrors[d].rdev;
- bdevname(rdev->bdev, b);
+ if (test_bit(Faulty, &rdev->flags))
+ /* drive has already been failed, just ignore any
+ more fix_read_error() attempts */
+ return;
- if (test_bit(Faulty, &rdev->flags)) {
- rcu_read_unlock();
- /* drive has already been failed, just ignore any
- more fix_read_error() attempts */
- return;
- }
+ check_decay_read_errors(mddev, rdev);
+ atomic_inc(&rdev->read_errors);
+ if (atomic_read(&rdev->read_errors) > max_read_errors) {
+ char b[BDEVNAME_SIZE];
+ bdevname(rdev->bdev, b);
- check_decay_read_errors(mddev, rdev);
- atomic_inc(&rdev->read_errors);
- cur_read_error_count = atomic_read(&rdev->read_errors);
- if (cur_read_error_count > max_read_errors) {
- rcu_read_unlock();
- printk(KERN_NOTICE
- "md/raid10:%s: %s: Raid device exceeded "
- "read_error threshold "
- "[cur %d:max %d]\n",
- mdname(mddev),
- b, cur_read_error_count, max_read_errors);
- printk(KERN_NOTICE
- "md/raid10:%s: %s: Failing raid "
- "device\n", mdname(mddev), b);
- md_error(mddev, conf->mirrors[d].rdev);
- return;
- }
+ printk(KERN_NOTICE
+ "md/raid10:%s: %s: Raid device exceeded "
+ "read_error threshold [cur %d:max %d]\n",
+ mdname(mddev), b,
+ atomic_read(&rdev->read_errors), max_read_errors);
+ printk(KERN_NOTICE
+ "md/raid10:%s: %s: Failing raid device\n",
+ mdname(mddev), b);
+ md_error(mddev, conf->mirrors[d].rdev);
+ return;
}
- rcu_read_unlock();
while(sectors) {
int s = sectors;
@@ -1566,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"write failed"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)(sect+
- rdev->data_offset),
+ (unsigned long long)(
+ sect + rdev->data_offset),
bdevname(rdev->bdev, b));
printk(KERN_NOTICE "md/raid10:%s: %s: failing "
"drive\n",
@@ -1603,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"corrected sectors"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)(sect+
- rdev->data_offset),
+ (unsigned long long)(
+ sect + rdev->data_offset),
bdevname(rdev->bdev, b));
printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
mdname(mddev),
@@ -1616,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"md/raid10:%s: read error corrected"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)(sect+
- rdev->data_offset),
+ (unsigned long long)(
+ sect + rdev->data_offset),
bdevname(rdev->bdev, b));
}
@@ -1640,9 +1606,11 @@ static void raid10d(mddev_t *mddev)
conf_t *conf = mddev->private;
struct list_head *head = &conf->retry_list;
mdk_rdev_t *rdev;
+ struct blk_plug plug;
md_check_recovery(mddev);
+ blk_start_plug(&plug);
for (;;) {
char b[BDEVNAME_SIZE];
@@ -1665,7 +1633,8 @@ static void raid10d(mddev_t *mddev)
else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
recovery_request_write(mddev, r10_bio);
else {
- int mirror;
+ int slot = r10_bio->read_slot;
+ int mirror = r10_bio->devs[slot].devnum;
/* we got a read error. Maybe the drive is bad. Maybe just
* the block and we can fix it.
* We freeze all other IO, and try reading the block from
@@ -1679,9 +1648,10 @@ static void raid10d(mddev_t *mddev)
fix_read_error(conf, mddev, r10_bio);
unfreeze_array(conf);
}
+ rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
- bio = r10_bio->devs[r10_bio->read_slot].bio;
- r10_bio->devs[r10_bio->read_slot].bio =
+ bio = r10_bio->devs[slot].bio;
+ r10_bio->devs[slot].bio =
mddev->ro ? IO_BLOCKED : NULL;
mirror = read_balance(conf, r10_bio);
if (mirror == -1) {
@@ -1695,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
} else {
const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
bio_put(bio);
+ slot = r10_bio->read_slot;
rdev = conf->mirrors[mirror].rdev;
if (printk_ratelimit())
printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
@@ -1704,8 +1675,8 @@ static void raid10d(mddev_t *mddev)
(unsigned long long)r10_bio->sector);
bio = bio_clone_mddev(r10_bio->master_bio,
GFP_NOIO, mddev);
- r10_bio->devs[r10_bio->read_slot].bio = bio;
- bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
+ r10_bio->devs[slot].bio = bio;
+ bio->bi_sector = r10_bio->devs[slot].addr
+ rdev->data_offset;
bio->bi_bdev = rdev->bdev;
bio->bi_rw = READ | do_sync;
@@ -1716,6 +1687,7 @@ static void raid10d(mddev_t *mddev)
}
cond_resched();
}
+ blk_finish_plug(&plug);
}
@@ -1764,13 +1736,13 @@ static int init_resync(conf_t *conf)
*
*/
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
+ int *skipped, int go_faster)
{
conf_t *conf = mddev->private;
r10bio_t *r10_bio;
struct bio *biolist = NULL, *bio;
sector_t max_sector, nr_sectors;
- int disk;
int i;
int max_sync;
sector_t sync_blocks;
@@ -1859,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
int j, k;
r10_bio = NULL;
- for (i=0 ; i<conf->raid_disks; i++)
- if (conf->mirrors[i].rdev &&
- !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
- int still_degraded = 0;
- /* want to reconstruct this device */
- r10bio_t *rb2 = r10_bio;
- sector_t sect = raid10_find_virt(conf, sector_nr, i);
- int must_sync;
- /* Unless we are doing a full sync, we only need
- * to recover the block if it is set in the bitmap
- */
- must_sync = bitmap_start_sync(mddev->bitmap, sect,
- &sync_blocks, 1);
- if (sync_blocks < max_sync)
- max_sync = sync_blocks;
- if (!must_sync &&
- !conf->fullsync) {
- /* yep, skip the sync_blocks here, but don't assume
- * that there will never be anything to do here
- */
- chunks_skipped = -1;
- continue;
- }
+ for (i=0 ; i<conf->raid_disks; i++) {
+ int still_degraded;
+ r10bio_t *rb2;
+ sector_t sect;
+ int must_sync;
- r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
- raise_barrier(conf, rb2 != NULL);
- atomic_set(&r10_bio->remaining, 0);
+ if (conf->mirrors[i].rdev == NULL ||
+ test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+ continue;
- r10_bio->master_bio = (struct bio*)rb2;
- if (rb2)
- atomic_inc(&rb2->remaining);
- r10_bio->mddev = mddev;
- set_bit(R10BIO_IsRecover, &r10_bio->state);
- r10_bio->sector = sect;
+ still_degraded = 0;
+ /* want to reconstruct this device */
+ rb2 = r10_bio;
+ sect = raid10_find_virt(conf, sector_nr, i);
+ /* Unless we are doing a full sync, we only need
+ * to recover the block if it is set in the bitmap
+ */
+ must_sync = bitmap_start_sync(mddev->bitmap, sect,
+ &sync_blocks, 1);
+ if (sync_blocks < max_sync)
+ max_sync = sync_blocks;
+ if (!must_sync &&
+ !conf->fullsync) {
+ /* yep, skip the sync_blocks here, but don't assume
+ * that there will never be anything to do here
+ */
+ chunks_skipped = -1;
+ continue;
+ }
- raid10_find_phys(conf, r10_bio);
+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ raise_barrier(conf, rb2 != NULL);
+ atomic_set(&r10_bio->remaining, 0);
- /* Need to check if the array will still be
- * degraded
- */
- for (j=0; j<conf->raid_disks; j++)
- if (conf->mirrors[j].rdev == NULL ||
- test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
- still_degraded = 1;
- break;
- }
-
- must_sync = bitmap_start_sync(mddev->bitmap, sect,
- &sync_blocks, still_degraded);
-
- for (j=0; j<conf->copies;j++) {
- int d = r10_bio->devs[j].devnum;
- if (conf->mirrors[d].rdev &&
- test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
- /* This is where we read from */
- bio = r10_bio->devs[0].bio;
- bio->bi_next = biolist;
- biolist = bio;
- bio->bi_private = r10_bio;
- bio->bi_end_io = end_sync_read;
- bio->bi_rw = READ;
- bio->bi_sector = r10_bio->devs[j].addr +
- conf->mirrors[d].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[d].rdev->bdev;
- atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- atomic_inc(&r10_bio->remaining);
- /* and we write to 'i' */
-
- for (k=0; k<conf->copies; k++)
- if (r10_bio->devs[k].devnum == i)
- break;
- BUG_ON(k == conf->copies);
- bio = r10_bio->devs[1].bio;
- bio->bi_next = biolist;
- biolist = bio;
- bio->bi_private = r10_bio;
- bio->bi_end_io = end_sync_write;
- bio->bi_rw = WRITE;
- bio->bi_sector = r10_bio->devs[k].addr +
- conf->mirrors[i].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
- r10_bio->devs[0].devnum = d;
- r10_bio->devs[1].devnum = i;
+ r10_bio->master_bio = (struct bio*)rb2;
+ if (rb2)
+ atomic_inc(&rb2->remaining);
+ r10_bio->mddev = mddev;
+ set_bit(R10BIO_IsRecover, &r10_bio->state);
+ r10_bio->sector = sect;
- break;
- }
- }
- if (j == conf->copies) {
- /* Cannot recover, so abort the recovery */
- put_buf(r10_bio);
- if (rb2)
- atomic_dec(&rb2->remaining);
- r10_bio = rb2;
- if (!test_and_set_bit(MD_RECOVERY_INTR,
- &mddev->recovery))
- printk(KERN_INFO "md/raid10:%s: insufficient "
- "working devices for recovery.\n",
- mdname(mddev));
+ raid10_find_phys(conf, r10_bio);
+
+ /* Need to check if the array will still be
+ * degraded
+ */
+ for (j=0; j<conf->raid_disks; j++)
+ if (conf->mirrors[j].rdev == NULL ||
+ test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
+ still_degraded = 1;
break;
}
+
+ must_sync = bitmap_start_sync(mddev->bitmap, sect,
+ &sync_blocks, still_degraded);
+
+ for (j=0; j<conf->copies;j++) {
+ int d = r10_bio->devs[j].devnum;
+ if (!conf->mirrors[d].rdev ||
+ !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
+ continue;
+ /* This is where we read from */
+ bio = r10_bio->devs[0].bio;
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_read;
+ bio->bi_rw = READ;
+ bio->bi_sector = r10_bio->devs[j].addr +
+ conf->mirrors[d].rdev->data_offset;
+ bio->bi_bdev = conf->mirrors[d].rdev->bdev;
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ atomic_inc(&r10_bio->remaining);
+ /* and we write to 'i' */
+
+ for (k=0; k<conf->copies; k++)
+ if (r10_bio->devs[k].devnum == i)
+ break;
+ BUG_ON(k == conf->copies);
+ bio = r10_bio->devs[1].bio;
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_sector = r10_bio->devs[k].addr +
+ conf->mirrors[i].rdev->data_offset;
+ bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+
+ r10_bio->devs[0].devnum = d;
+ r10_bio->devs[1].devnum = i;
+
+ break;
+ }
+ if (j == conf->copies) {
+ /* Cannot recover, so abort the recovery */
+ put_buf(r10_bio);
+ if (rb2)
+ atomic_dec(&rb2->remaining);
+ r10_bio = rb2;
+ if (!test_and_set_bit(MD_RECOVERY_INTR,
+ &mddev->recovery))
+ printk(KERN_INFO "md/raid10:%s: insufficient "
+ "working devices for recovery.\n",
+ mdname(mddev));
+ break;
}
+ }
if (biolist == NULL) {
while (r10_bio) {
r10bio_t *rb2 = r10_bio;
@@ -1978,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
- !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
+ &mddev->recovery)) {
/* We can skip this block */
*skipped = 1;
return sync_blocks + sectors_skipped;
@@ -2023,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
for (i=0; i<conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
if (r10_bio->devs[i].bio->bi_end_io)
- rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+ rdev_dec_pending(conf->mirrors[d].rdev,
+ mddev);
}
put_buf(r10_bio);
biolist = NULL;
@@ -2048,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
do {
struct page *page;
int len = PAGE_SIZE;
- disk = 0;
if (sector_nr + (len>>9) > max_sector)
len = (max_sector - sector_nr) << 9;
if (len == 0)
break;
for (bio= biolist ; bio ; bio=bio->bi_next) {
+ struct bio *bio2;
page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
- if (bio_add_page(bio, page, len, 0) == 0) {
- /* stop here */
- struct bio *bio2;
- bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
- for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
- /* remove last page from this bio */
- bio2->bi_vcnt--;
- bio2->bi_size -= len;
- bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
- }
- goto bio_full;
+ if (bio_add_page(bio, page, len, 0))
+ continue;
+
+ /* stop here */
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+ for (bio2 = biolist;
+ bio2 && bio2 != bio;
+ bio2 = bio2->bi_next) {
+ /* remove last page from this bio */
+ bio2->bi_vcnt--;
+ bio2->bi_size -= len;
+ bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
}
- disk = i;
+ goto bio_full;
}
nr_sectors += len>>9;
sector_nr += len>>9;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 2316ac2e8e2..944b1104d3b 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -17,8 +17,8 @@ struct r10_private_data_s {
spinlock_t device_lock;
/* geometry */
- int near_copies; /* number of copies layed out raid0 style */
- int far_copies; /* number of copies layed out
+ int near_copies; /* number of copies laid out raid0 style */
+ int far_copies; /* number of copies laid out
* at large strides across drives
*/
int far_offset; /* far_copies are offset by 1 stripe
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e867ee42b15..346e69bfdab 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -27,12 +27,12 @@
*
* We group bitmap updates into batches. Each batch has a number.
* We may write out several batches at once, but that isn't very important.
- * conf->bm_write is the number of the last batch successfully written.
- * conf->bm_flush is the number of the last batch that was closed to
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
* new additions.
* When we discover that we will need to write to any block in a stripe
* (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
- * the number of the batch it will be in. This is bm_flush+1.
+ * the number of the batch it will be in. This is seq_flush+1.
* When we are ready to do a write, if that batch hasn't been written yet,
* we plug the array and queue the stripe for later.
* When an unplug happens, we increment bm_flush, thus closing the current
@@ -199,14 +199,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
BUG_ON(!list_empty(&sh->lru));
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
- if (test_bit(STRIPE_DELAYED, &sh->state)) {
+ if (test_bit(STRIPE_DELAYED, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
- plugger_set_plug(&conf->plug);
- } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
- sh->bm_seq - conf->seq_write > 0) {
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
- plugger_set_plug(&conf->plug);
- } else {
+ else {
clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
}
@@ -461,7 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
< (conf->max_nr_stripes *3/4)
|| !conf->inactive_blocked),
conf->device_lock,
- md_raid5_kick_device(conf));
+ );
conf->inactive_blocked = 0;
} else
init_stripe(sh, sector, previous);
@@ -1470,7 +1468,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
wait_event_lock_irq(conf->wait_for_stripe,
!list_empty(&conf->inactive_list),
conf->device_lock,
- blk_flush_plug(current));
+ );
osh = get_free_stripe(conf);
spin_unlock_irq(&conf->device_lock);
atomic_set(&nsh->count, 1);
@@ -1702,27 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
raid5_conf_t *conf = mddev->private;
pr_debug("raid456: error called\n");
- if (!test_bit(Faulty, &rdev->flags)) {
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * if recovery was running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- }
- set_bit(Faulty, &rdev->flags);
- printk(KERN_ALERT
- "md/raid:%s: Disk failure on %s, disabling device.\n"
- "md/raid:%s: Operation continuing on %d devices.\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- mdname(mddev),
- conf->raid_disks - mddev->degraded);
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /*
+ * if recovery was running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
}
+ set_bit(Faulty, &rdev->flags);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ printk(KERN_ALERT
+ "md/raid:%s: Disk failure on %s, disabling device.\n"
+ "md/raid:%s: Operation continuing on %d devices.\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ mdname(mddev),
+ conf->raid_disks - mddev->degraded);
}
/*
@@ -3623,8 +3619,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
atomic_inc(&conf->preread_active_stripes);
list_add_tail(&sh->lru, &conf->hold_list);
}
- } else
- plugger_set_plug(&conf->plug);
+ }
}
static void activate_bit_delay(raid5_conf_t *conf)
@@ -3641,21 +3636,6 @@ static void activate_bit_delay(raid5_conf_t *conf)
}
}
-void md_raid5_kick_device(raid5_conf_t *conf)
-{
- blk_flush_plug(current);
- raid5_activate_delayed(conf);
- md_wakeup_thread(conf->mddev->thread);
-}
-EXPORT_SYMBOL_GPL(md_raid5_kick_device);
-
-static void raid5_unplug(struct plug_handle *plug)
-{
- raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
-
- md_raid5_kick_device(conf);
-}
-
int md_raid5_congested(mddev_t *mddev, int bits)
{
raid5_conf_t *conf = mddev->private;
@@ -3945,6 +3925,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
struct stripe_head *sh;
const int rw = bio_data_dir(bi);
int remaining;
+ int plugged;
if (unlikely(bi->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bi);
@@ -3963,6 +3944,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ plugged = mddev_check_plugged(mddev);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
int disks, data_disks;
@@ -3976,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
/* spinlock is needed as reshape_progress may be
* 64bit on a 32bit platform, and so it might be
* possible to see a half-updated value
- * Ofcourse reshape_progress could change after
+ * Of course reshape_progress could change after
* the lock is dropped, so once we get a reference
* to the stripe that we think it is, we will have
* to check again.
@@ -4057,7 +4039,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
* add failed due to overlap. Flush everything
* and wait a while
*/
- md_raid5_kick_device(conf);
+ md_wakeup_thread(mddev->thread);
release_stripe(sh);
schedule();
goto retry;
@@ -4077,6 +4059,9 @@ static int make_request(mddev_t *mddev, struct bio * bi)
}
}
+ if (!plugged)
+ md_wakeup_thread(mddev->thread);
+
spin_lock_irq(&conf->device_lock);
remaining = raid5_dec_bi_phys_segments(bi);
spin_unlock_irq(&conf->device_lock);
@@ -4478,24 +4463,30 @@ static void raid5d(mddev_t *mddev)
struct stripe_head *sh;
raid5_conf_t *conf = mddev->private;
int handled;
+ struct blk_plug plug;
pr_debug("+++ raid5d active\n");
md_check_recovery(mddev);
+ blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
while (1) {
struct bio *bio;
- if (conf->seq_flush != conf->seq_write) {
- int seq = conf->seq_flush;
+ if (atomic_read(&mddev->plug_cnt) == 0 &&
+ !list_empty(&conf->bitmap_list)) {
+ /* Now is a good time to flush some bitmap updates */
+ conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
bitmap_unplug(mddev->bitmap);
spin_lock_irq(&conf->device_lock);
- conf->seq_write = seq;
+ conf->seq_write = conf->seq_flush;
activate_bit_delay(conf);
}
+ if (atomic_read(&mddev->plug_cnt) == 0)
+ raid5_activate_delayed(conf);
while ((bio = remove_bio_from_retry(conf))) {
int ok;
@@ -4525,6 +4516,7 @@ static void raid5d(mddev_t *mddev)
spin_unlock_irq(&conf->device_lock);
async_tx_issue_pending_all();
+ blk_finish_plug(&plug);
pr_debug("--- raid5d inactive\n");
}
@@ -5141,8 +5133,6 @@ static int run(mddev_t *mddev)
mdname(mddev));
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
- plugger_init(&conf->plug, raid5_unplug);
- mddev->plug = &conf->plug;
if (mddev->queue) {
int chunk_size;
/* read-ahead size must cover two whole stripes, which
@@ -5159,7 +5149,6 @@ static int run(mddev_t *mddev)
mddev->queue->backing_dev_info.congested_data = mddev;
mddev->queue->backing_dev_info.congested_fn = raid5_congested;
- mddev->queue->queue_lock = &conf->device_lock;
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
@@ -5192,7 +5181,6 @@ static int stop(mddev_t *mddev)
mddev->thread = NULL;
if (mddev->queue)
mddev->queue->backing_dev_info.congested_fn = NULL;
- plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/
free_conf(conf);
mddev->private = NULL;
mddev->to_remove = &raid5_attrs_group;
@@ -5401,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
- if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+ if (sectors > mddev->dev_sectors &&
+ mddev->recovery_cp > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
@@ -5688,6 +5677,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
static void *raid45_takeover_raid0(mddev_t *mddev, int level)
{
struct raid0_private_data *raid0_priv = mddev->private;
+ sector_t sectors;
/* for raid0 takeover only one zone is supported */
if (raid0_priv->nr_strip_zones > 1) {
@@ -5696,6 +5686,9 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level)
return ERR_PTR(-EINVAL);
}
+ sectors = raid0_priv->strip_zone[0].zone_end;
+ sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
+ mddev->dev_sectors = sectors;
mddev->new_level = level;
mddev->new_layout = ALGORITHM_PARITY_N;
mddev->new_chunk_sectors = mddev->chunk_sectors;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 8d563a4f022..3ca77a2613b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -400,8 +400,6 @@ struct raid5_private_data {
* Cleared when a sync completes.
*/
- struct plug_handle plug;
-
/* per cpu variables */
struct raid5_percpu {
struct page *spare_page; /* Used when checking P/Q in raid6 */