diff options
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 234 |
1 files changed, 184 insertions, 50 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 240ff312504..611b5f79761 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -46,6 +46,20 @@ */ #define NR_RAID1_BIOS 256 +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + /* When there are this many requests queue to be written by * the raid1 thread, we become 'congested' to provide back-pressure * for writeback. @@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect const sector_t this_sector = r1_bio->sector; int sectors; int best_good_sectors; - int start_disk; - int best_disk; - int i; + int best_disk, best_dist_disk, best_pending_disk; + int has_nonrot_disk; + int disk; sector_t best_dist; + unsigned int min_pending; struct md_rdev *rdev; int choose_first; + int choose_next_idle; rcu_read_lock(); /* @@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect retry: sectors = r1_bio->sectors; best_disk = -1; + best_dist_disk = -1; best_dist = MaxSector; + best_pending_disk = -1; + min_pending = UINT_MAX; best_good_sectors = 0; + has_nonrot_disk = 0; + choose_next_idle = 0; if (conf->mddev->recovery_cp < MaxSector && - (this_sector + sectors >= conf->next_resync)) { + (this_sector + sectors >= conf->next_resync)) choose_first = 1; - start_disk = 0; - } else { + else choose_first = 0; - start_disk = conf->last_used; - } - for (i = 0 ; i < conf->raid_disks * 2 ; i++) { + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { sector_t dist; sector_t first_bad; int bad_sectors; - - int disk = start_disk + i; - if (disk >= conf->raid_disks * 2) - disk -= conf->raid_disks * 2; + unsigned int pending; + bool nonrot; rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED @@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect } else best_good_sectors = sectors; + nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); + has_nonrot_disk |= nonrot; + pending = atomic_read(&rdev->nr_pending); dist = abs(this_sector - conf->mirrors[disk].head_position); - if (choose_first - /* Don't change to another disk for sequential reads */ - || conf->next_seq_sect == this_sector - || dist == 0 - /* If device is idle, use it */ - || atomic_read(&rdev->nr_pending) == 0) { + if (choose_first) { + best_disk = disk; + break; + } + /* Don't change to another disk for sequential reads */ + if (conf->mirrors[disk].next_seq_sect == this_sector + || dist == 0) { + int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; + struct raid1_info *mirror = &conf->mirrors[disk]; + + best_disk = disk; + /* + * If buffered sequential IO size exceeds optimal + * iosize, check if there is idle disk. If yes, choose + * the idle disk. read_balance could already choose an + * idle disk before noticing it's a sequential IO in + * this disk. This doesn't matter because this disk + * will idle, next time it will be utilized after the + * first disk has IO size exceeds optimal iosize. In + * this way, iosize of the first disk will be optimal + * iosize at least. iosize of the second disk might be + * small, but not a big deal since when the second disk + * starts IO, the first disk is likely still busy. + */ + if (nonrot && opt_iosize > 0 && + mirror->seq_start != MaxSector && + mirror->next_seq_sect > opt_iosize && + mirror->next_seq_sect - opt_iosize >= + mirror->seq_start) { + choose_next_idle = 1; + continue; + } + break; + } + /* If device is idle, use it */ + if (pending == 0) { best_disk = disk; break; } + + if (choose_next_idle) + continue; + + if (min_pending > pending) { + min_pending = pending; + best_pending_disk = disk; + } + if (dist < best_dist) { best_dist = dist; - best_disk = disk; + best_dist_disk = disk; } } + /* + * If all disks are rotational, choose the closest disk. If any disk is + * non-rotational, choose the disk with less pending request even the + * disk is rotational, which might/might not be optimal for raids with + * mixed ratation/non-rotational disks depending on workload. + */ + if (best_disk == -1) { + if (has_nonrot_disk) + best_disk = best_pending_disk; + else + best_disk = best_dist_disk; + } + if (best_disk >= 0) { rdev = rcu_dereference(conf->mirrors[best_disk].rdev); if (!rdev) @@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect goto retry; } sectors = best_good_sectors; - conf->next_seq_sect = this_sector + sectors; - conf->last_used = best_disk; + + if (conf->mirrors[best_disk].next_seq_sect != this_sector) + conf->mirrors[best_disk].seq_start = this_sector; + + conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; } rcu_read_unlock(); *max_sectors = sectors; @@ -870,10 +944,48 @@ do_sync_io: pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); } +struct raid1_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r1conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r1conf *conf = mddev->private; - struct mirror_info *mirror; + struct raid1_info *mirror; struct r1bio *r1_bio; struct bio *read_bio; int i, disks; @@ -883,6 +995,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid1_plug_cb *plug = NULL; int first_clone; int sectors_handled; int max_sectors; @@ -1185,11 +1299,22 @@ read_again: mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); + + cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid1_plug_cb, cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } /* Mustn't call r1_bio_write_done before this next test, @@ -1364,7 +1489,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) struct r1conf *conf = mddev->private; int err = -EEXIST; int mirror = 0; - struct mirror_info *p; + struct raid1_info *p; int first = 0; int last = conf->raid_disks - 1; struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -1433,7 +1558,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r1conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; - struct mirror_info *p = conf->mirrors+ number; + struct raid1_info *p = conf->mirrors + number; if (rdev != p->rdev) p = conf->mirrors + conf->raid_disks + number; @@ -1818,8 +1943,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) if (atomic_dec_and_test(&r1_bio->remaining)) { /* if we're here, all write(s) have completed, so clean up */ - md_done_sync(mddev, r1_bio->sectors, 1); - put_buf(r1_bio); + int s = r1_bio->sectors; + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, 1); + } } } @@ -2167,8 +2298,7 @@ static void raid1d(struct mddev *mddev) blk_start_plug(&plug); for (;;) { - if (atomic_read(&mddev->plug_cnt) == 0) - flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { @@ -2365,6 +2495,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bio->bi_rw = READ; bio->bi_end_io = end_sync_read; read_targets++; + } else if (!test_bit(WriteErrorSeen, &rdev->flags) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { + /* + * The device is suitable for reading (InSync), + * but has bad block(s) here. Let's try to correct them, + * if we are doing resync or repair. Otherwise, leave + * this device alone for this sync request. + */ + bio->bi_rw = WRITE; + bio->bi_end_io = end_sync_write; + write_targets++; } } if (bio->bi_end_io) { @@ -2422,7 +2564,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp /* There is nowhere to write, so all non-sync * drives must be failed - so we are finished */ - sector_t rv = max_sector - sector_nr; + sector_t rv; + if (min_bad > 0) + max_sector = sector_nr + min_bad; + rv = max_sector - sector_nr; *skipped = 1; put_buf(r1_bio); return rv; @@ -2515,7 +2660,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) { struct r1conf *conf; int i; - struct mirror_info *disk; + struct raid1_info *disk; struct md_rdev *rdev; int err = -ENOMEM; @@ -2523,7 +2668,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf) goto abort; - conf->mirrors = kzalloc(sizeof(struct mirror_info) + conf->mirrors = kzalloc(sizeof(struct raid1_info) * mddev->raid_disks * 2, GFP_KERNEL); if (!conf->mirrors) @@ -2566,6 +2711,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) mddev->merge_check_needed = 1; disk->head_position = 0; + disk->seq_start = MaxSector; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; @@ -2579,7 +2725,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->recovery_disabled = mddev->recovery_disabled - 1; err = -EIO; - conf->last_used = -1; for (i = 0; i < conf->raid_disks * 2; i++) { disk = conf->mirrors + i; @@ -2605,19 +2750,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (disk->rdev && (disk->rdev->saved_raid_disk < 0)) conf->fullsync = 1; - } else if (conf->last_used < 0) - /* - * The first working device is used as a - * starting point to read balancing. - */ - conf->last_used = i; + } } - if (conf->last_used < 0) { - printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", - mdname(mddev)); - goto abort; - } err = -ENOMEM; conf->thread = md_register_thread(raid1d, mddev, "raid1"); if (!conf->thread) { @@ -2792,7 +2927,7 @@ static int raid1_reshape(struct mddev *mddev) */ mempool_t *newpool, *oldpool; struct pool_info *newpoolinfo; - struct mirror_info *newmirrors; + struct raid1_info *newmirrors; struct r1conf *conf = mddev->private; int cnt, raid_disks; unsigned long flags; @@ -2835,7 +2970,7 @@ static int raid1_reshape(struct mddev *mddev) kfree(newpoolinfo); return -ENOMEM; } - newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, + newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2, GFP_KERNEL); if (!newmirrors) { kfree(newpoolinfo); @@ -2874,7 +3009,6 @@ static int raid1_reshape(struct mddev *mddev) conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; - conf->last_used = 0; /* just make sure it is in-range */ lower_barrier(conf); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |