diff options
author | Ingo Molnar <mingo@kernel.org> | 2012-08-21 11:27:00 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2012-08-21 11:27:00 +0200 |
commit | bcada3d4b8c96b8792c2306f363992ca5ab9da42 (patch) | |
tree | e420679a5db6ea4e1694eef57f9abb6acac8d4d3 /drivers/md/raid1.c | |
parent | 26198c21d1b286a084fe5d514a30bc7e6c712a34 (diff) | |
parent | 000078bc3ee69efb1124b8478c7527389a826074 (diff) |
Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:
* Fix include order for bison/flex-generated C files, from Ben Hutchings
* Build fixes and documentation corrections from David Ahern
* Group parsing support, from Jiri Olsa
* UI/gtk refactorings and improvements from Namhyung Kim
* NULL deref fix for perf script, from Namhyung Kim
* Assorted cleanups from Robert Richter
* Let O= makes handle relative paths, from Steven Rostedt
* perf script python fixes, from Feng Tang.
* Improve 'perf lock' error message when the needed tracepoints
are not present, from David Ahern.
* Initial bash completion support, from Frederic Weisbecker
* Allow building without libelf, from Namhyung Kim.
* Support DWARF CFI based unwind to have callchains when %bp
based unwinding is not possible, from Jiri Olsa.
* Symbol resolution fixes, while fixing support PPC64 files with an .opt ELF
section was the end goal, several fixes for code that handles all
architectures and cleanups are included, from Cody Schafer.
* Add a description for the JIT interface, from Andi Kleen.
* Assorted fixes for Documentation and build in 32 bit, from Robert Richter
* Add support for non-tracepoint events in perf script python, from Feng Tang
* Cache the libtraceevent event_format associated to each evsel early, so that we
avoid relookups, i.e. calling pevent_find_event repeatedly when processing
tracepoint events.
[ This is to reduce the surface contact with libtraceevents and make clear what
is that the perf tools needs from that lib: so far parsing the common and per
event fields. ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 224 |
1 files changed, 176 insertions, 48 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index cacd008d686..611b5f79761 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -46,6 +46,20 @@ */ #define NR_RAID1_BIOS 256 +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + /* When there are this many requests queue to be written by * the raid1 thread, we become 'congested' to provide back-pressure * for writeback. @@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect const sector_t this_sector = r1_bio->sector; int sectors; int best_good_sectors; - int start_disk; - int best_disk; - int i; + int best_disk, best_dist_disk, best_pending_disk; + int has_nonrot_disk; + int disk; sector_t best_dist; + unsigned int min_pending; struct md_rdev *rdev; int choose_first; + int choose_next_idle; rcu_read_lock(); /* @@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect retry: sectors = r1_bio->sectors; best_disk = -1; + best_dist_disk = -1; best_dist = MaxSector; + best_pending_disk = -1; + min_pending = UINT_MAX; best_good_sectors = 0; + has_nonrot_disk = 0; + choose_next_idle = 0; if (conf->mddev->recovery_cp < MaxSector && - (this_sector + sectors >= conf->next_resync)) { + (this_sector + sectors >= conf->next_resync)) choose_first = 1; - start_disk = 0; - } else { + else choose_first = 0; - start_disk = conf->last_used; - } - for (i = 0 ; i < conf->raid_disks * 2 ; i++) { + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { sector_t dist; sector_t first_bad; int bad_sectors; - - int disk = start_disk + i; - if (disk >= conf->raid_disks * 2) - disk -= conf->raid_disks * 2; + unsigned int pending; + bool nonrot; rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED @@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect } else best_good_sectors = sectors; + nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); + has_nonrot_disk |= nonrot; + pending = atomic_read(&rdev->nr_pending); dist = abs(this_sector - conf->mirrors[disk].head_position); - if (choose_first - /* Don't change to another disk for sequential reads */ - || conf->next_seq_sect == this_sector - || dist == 0 - /* If device is idle, use it */ - || atomic_read(&rdev->nr_pending) == 0) { + if (choose_first) { + best_disk = disk; + break; + } + /* Don't change to another disk for sequential reads */ + if (conf->mirrors[disk].next_seq_sect == this_sector + || dist == 0) { + int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; + struct raid1_info *mirror = &conf->mirrors[disk]; + + best_disk = disk; + /* + * If buffered sequential IO size exceeds optimal + * iosize, check if there is idle disk. If yes, choose + * the idle disk. read_balance could already choose an + * idle disk before noticing it's a sequential IO in + * this disk. This doesn't matter because this disk + * will idle, next time it will be utilized after the + * first disk has IO size exceeds optimal iosize. In + * this way, iosize of the first disk will be optimal + * iosize at least. iosize of the second disk might be + * small, but not a big deal since when the second disk + * starts IO, the first disk is likely still busy. + */ + if (nonrot && opt_iosize > 0 && + mirror->seq_start != MaxSector && + mirror->next_seq_sect > opt_iosize && + mirror->next_seq_sect - opt_iosize >= + mirror->seq_start) { + choose_next_idle = 1; + continue; + } + break; + } + /* If device is idle, use it */ + if (pending == 0) { best_disk = disk; break; } + + if (choose_next_idle) + continue; + + if (min_pending > pending) { + min_pending = pending; + best_pending_disk = disk; + } + if (dist < best_dist) { best_dist = dist; - best_disk = disk; + best_dist_disk = disk; } } + /* + * If all disks are rotational, choose the closest disk. If any disk is + * non-rotational, choose the disk with less pending request even the + * disk is rotational, which might/might not be optimal for raids with + * mixed ratation/non-rotational disks depending on workload. + */ + if (best_disk == -1) { + if (has_nonrot_disk) + best_disk = best_pending_disk; + else + best_disk = best_dist_disk; + } + if (best_disk >= 0) { rdev = rcu_dereference(conf->mirrors[best_disk].rdev); if (!rdev) @@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect goto retry; } sectors = best_good_sectors; - conf->next_seq_sect = this_sector + sectors; - conf->last_used = best_disk; + + if (conf->mirrors[best_disk].next_seq_sect != this_sector) + conf->mirrors[best_disk].seq_start = this_sector; + + conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; } rcu_read_unlock(); *max_sectors = sectors; @@ -870,10 +944,48 @@ do_sync_io: pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); } +struct raid1_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r1conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r1conf *conf = mddev->private; - struct mirror_info *mirror; + struct raid1_info *mirror; struct r1bio *r1_bio; struct bio *read_bio; int i, disks; @@ -883,6 +995,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid1_plug_cb *plug = NULL; int first_clone; int sectors_handled; int max_sectors; @@ -1185,11 +1299,22 @@ read_again: mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); + + cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid1_plug_cb, cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } /* Mustn't call r1_bio_write_done before this next test, @@ -1364,7 +1489,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) struct r1conf *conf = mddev->private; int err = -EEXIST; int mirror = 0; - struct mirror_info *p; + struct raid1_info *p; int first = 0; int last = conf->raid_disks - 1; struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -1433,7 +1558,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r1conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; - struct mirror_info *p = conf->mirrors+ number; + struct raid1_info *p = conf->mirrors + number; if (rdev != p->rdev) p = conf->mirrors + conf->raid_disks + number; @@ -2173,8 +2298,7 @@ static void raid1d(struct mddev *mddev) blk_start_plug(&plug); for (;;) { - if (atomic_read(&mddev->plug_cnt) == 0) - flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { @@ -2371,6 +2495,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bio->bi_rw = READ; bio->bi_end_io = end_sync_read; read_targets++; + } else if (!test_bit(WriteErrorSeen, &rdev->flags) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { + /* + * The device is suitable for reading (InSync), + * but has bad block(s) here. Let's try to correct them, + * if we are doing resync or repair. Otherwise, leave + * this device alone for this sync request. + */ + bio->bi_rw = WRITE; + bio->bi_end_io = end_sync_write; + write_targets++; } } if (bio->bi_end_io) { @@ -2428,7 +2564,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp /* There is nowhere to write, so all non-sync * drives must be failed - so we are finished */ - sector_t rv = max_sector - sector_nr; + sector_t rv; + if (min_bad > 0) + max_sector = sector_nr + min_bad; + rv = max_sector - sector_nr; *skipped = 1; put_buf(r1_bio); return rv; @@ -2521,7 +2660,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) { struct r1conf *conf; int i; - struct mirror_info *disk; + struct raid1_info *disk; struct md_rdev *rdev; int err = -ENOMEM; @@ -2529,7 +2668,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf) goto abort; - conf->mirrors = kzalloc(sizeof(struct mirror_info) + conf->mirrors = kzalloc(sizeof(struct raid1_info) * mddev->raid_disks * 2, GFP_KERNEL); if (!conf->mirrors) @@ -2572,6 +2711,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) mddev->merge_check_needed = 1; disk->head_position = 0; + disk->seq_start = MaxSector; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; @@ -2585,7 +2725,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->recovery_disabled = mddev->recovery_disabled - 1; err = -EIO; - conf->last_used = -1; for (i = 0; i < conf->raid_disks * 2; i++) { disk = conf->mirrors + i; @@ -2611,19 +2750,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (disk->rdev && (disk->rdev->saved_raid_disk < 0)) conf->fullsync = 1; - } else if (conf->last_used < 0) - /* - * The first working device is used as a - * starting point to read balancing. - */ - conf->last_used = i; + } } - if (conf->last_used < 0) { - printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", - mdname(mddev)); - goto abort; - } err = -ENOMEM; conf->thread = md_register_thread(raid1d, mddev, "raid1"); if (!conf->thread) { @@ -2798,7 +2927,7 @@ static int raid1_reshape(struct mddev *mddev) */ mempool_t *newpool, *oldpool; struct pool_info *newpoolinfo; - struct mirror_info *newmirrors; + struct raid1_info *newmirrors; struct r1conf *conf = mddev->private; int cnt, raid_disks; unsigned long flags; @@ -2841,7 +2970,7 @@ static int raid1_reshape(struct mddev *mddev) kfree(newpoolinfo); return -ENOMEM; } - newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, + newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2, GFP_KERNEL); if (!newmirrors) { kfree(newpoolinfo); @@ -2880,7 +3009,6 @@ static int raid1_reshape(struct mddev *mddev) conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; - conf->last_used = 0; /* just make sure it is in-range */ lower_barrier(conf); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |