Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c   |  23
-rw-r--r--  drivers/md/dm-crypt.c |  58
-rw-r--r--  drivers/md/dm-io.c    |   2
-rw-r--r--  drivers/md/dm-raid1.c |  11
-rw-r--r--  drivers/md/dm-snap.c  |   2
-rw-r--r--  drivers/md/kcopyd.c   |  10
-rw-r--r--  drivers/md/kcopyd.h   |   4
-rw-r--r--  drivers/md/md.c       |  64
-rw-r--r--  drivers/md/raid1.c    |  73
-rw-r--r--  drivers/md/raid10.c   |  87
-rw-r--r--  drivers/md/raid5.c    |  57
11 files changed, 245 insertions, 146 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 7aeceedcf7d..c14dacdacfa 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1045,8 +1045,14 @@ void bitmap_daemon_work(struct bitmap *bitmap) if (bitmap == NULL) return; if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ)) - return; + goto done; + bitmap->daemon_lastrun = jiffies; + if (bitmap->allclean) { + bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + return; + } + bitmap->allclean = 1; for (j = 0; j < bitmap->chunks; j++) { bitmap_counter_t *bmc; @@ -1068,8 +1074,10 @@ void bitmap_daemon_work(struct bitmap *bitmap) clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); - if (need_write) + if (need_write) { write_page(bitmap, page, 0); + bitmap->allclean = 0; + } continue; } @@ -1098,6 +1106,9 @@ void bitmap_daemon_work(struct bitmap *bitmap) /* if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc); */ + if (*bmc) + bitmap->allclean = 0; + if (*bmc == 2) { *bmc=1; /* maybe clear the bit next time */ set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); @@ -1132,6 +1143,9 @@ void bitmap_daemon_work(struct bitmap *bitmap) } } + done: + if (bitmap->allclean == 0) + bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ; } static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, @@ -1226,6 +1240,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect sectors -= blocks; else sectors = 0; } + bitmap->allclean = 0; return 0; } @@ -1296,6 +1311,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, } } spin_unlock_irq(&bitmap->lock); + bitmap->allclean = 0; return rv; } @@ -1332,6 +1348,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int ab } unlock: spin_unlock_irqrestore(&bitmap->lock, flags); + bitmap->allclean = 0; } void bitmap_close_sync(struct bitmap *bitmap) @@ -1399,7 +1416,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } spin_unlock_irq(&bitmap->lock); - + bitmap->allclean = 0; } /* dirty the memory and file bits for bitmap chunks "s" to "e" */ diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b04f98df94e..835def11419 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2003 Christophe Saout <christophe@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> - * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -93,6 +93,8 @@ struct crypt_config { struct workqueue_struct *io_queue; struct workqueue_struct *crypt_queue; + wait_queue_head_t writeq; + /* * crypto related data */ @@ -331,14 +333,7 @@ static void crypt_convert_init(struct crypt_config *cc, ctx->idx_out = bio_out ? bio_out->bi_idx : 0; ctx->sector = sector + cc->iv_offset; init_completion(&ctx->restart); - /* - * Crypto operation can be asynchronous, - * ctx->pending is increased after request submission. - * We need to ensure that we don't call the crypt finish - * operation before pending got incremented - * (dependent on crypt submission return code). 
- */ - atomic_set(&ctx->pending, 2); + atomic_set(&ctx->pending, 1); } static int crypt_convert_block(struct crypt_config *cc, @@ -411,43 +406,42 @@ static void crypt_alloc_req(struct crypt_config *cc, static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { - int r = 0; + int r; while(ctx->idx_in < ctx->bio_in->bi_vcnt && ctx->idx_out < ctx->bio_out->bi_vcnt) { crypt_alloc_req(cc, ctx); + atomic_inc(&ctx->pending); + r = crypt_convert_block(cc, ctx, cc->req); switch (r) { + /* async */ case -EBUSY: wait_for_completion(&ctx->restart); INIT_COMPLETION(ctx->restart); /* fall through*/ case -EINPROGRESS: - atomic_inc(&ctx->pending); cc->req = NULL; - r = 0; - /* fall through*/ + ctx->sector++; + continue; + + /* sync */ case 0: + atomic_dec(&ctx->pending); ctx->sector++; continue; - } - break; + /* error */ + default: + atomic_dec(&ctx->pending); + return r; + } } - /* - * If there are pending crypto operation run async - * code. Otherwise process return code synchronously. - * The step of 2 ensures that async finish doesn't - * call crypto finish too early. - */ - if (atomic_sub_return(2, &ctx->pending)) - return -EINPROGRESS; - - return r; + return 0; } static void dm_crypt_bio_destructor(struct bio *bio) @@ -624,8 +618,10 @@ static void kcryptd_io_read(struct dm_crypt_io *io) static void kcryptd_io_write(struct dm_crypt_io *io) { struct bio *clone = io->ctx.bio_out; + struct crypt_config *cc = io->target->private; generic_make_request(clone); + wake_up(&cc->writeq); } static void kcryptd_io(struct work_struct *work) @@ -698,7 +694,8 @@ static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io) r = crypt_convert(cc, &io->ctx); - if (r != -EINPROGRESS) { + if (atomic_dec_and_test(&io->ctx.pending)) { + /* processed, no running async crypto */ kcryptd_crypt_write_io_submit(io, r, 0); if (unlikely(r < 0)) return; @@ -706,8 +703,12 @@ static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io) atomic_inc(&io->pending); /* out of memory -> run queues */ - if (unlikely(remaining)) + if (unlikely(remaining)) { + /* wait for async crypto then reinitialize pending */ + wait_event(cc->writeq, !atomic_read(&io->ctx.pending)); + atomic_set(&io->ctx.pending, 1); congestion_wait(WRITE, HZ/100); + } } } @@ -746,7 +747,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) r = crypt_convert(cc, &io->ctx); - if (r != -EINPROGRESS) + if (atomic_dec_and_test(&io->ctx.pending)) kcryptd_crypt_read_done(io, r); crypt_dec_pending(io); @@ -1047,6 +1048,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_crypt_queue; } + init_waitqueue_head(&cc->writeq); ti->private = cc; return 0; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index b8e342fe758..8f25f628ef1 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -114,7 +114,7 @@ static void dec_count(struct io *io, unsigned int region, int error) wake_up_process(io->sleeper); else { - int r = io->error; + unsigned long r = io->error; io_notify_fn fn = io->callback; void *context = io->context; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 2928ef22810..762cb086bb7 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -753,7 +753,7 @@ out: * are in the no-sync state. We have to recover these by * recopying from the default mirror to all the others. 
*---------------------------------------------------------------*/ -static void recovery_complete(int read_err, unsigned int write_err, +static void recovery_complete(int read_err, unsigned long write_err, void *context) { struct region *reg = (struct region *)context; @@ -767,7 +767,7 @@ static void recovery_complete(int read_err, unsigned int write_err, } if (write_err) { - DMERR_LIMIT("Write error during recovery (error = 0x%x)", + DMERR_LIMIT("Write error during recovery (error = 0x%lx)", write_err); /* * Bits correspond to devices (excluding default mirror). @@ -1695,14 +1695,15 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, * information for a retry or there was no other * mirror in-sync. */ - DMERR_LIMIT("Mirror read failed from %s.", - m->dev->name); + DMERR_LIMIT("Mirror read failed."); return -EIO; } + + m = read_record->m; + DMERR("Mirror read failed from %s. Trying alternative device.", m->dev->name); - m = read_record->m; fail_mirror(m, DM_RAID1_READ_ERROR); /* diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index ae24eab8cd8..4dc8a43c034 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -804,7 +804,7 @@ static void commit_callback(void *context, int success) * Called when the copy I/O has finished. kcopyd actually runs * this code so don't block. */ -static void copy_callback(int read_err, unsigned int write_err, void *context) +static void copy_callback(int read_err, unsigned long write_err, void *context) { struct dm_snap_pending_exception *pe = context; struct dm_snapshot *s = pe->snap; diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index f3831f31223..e76b52ade69 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -169,7 +169,7 @@ struct kcopyd_job { * Error state of the job. 
*/ int read_err; - unsigned int write_err; + unsigned long write_err; /* * Either READ or WRITE @@ -293,7 +293,7 @@ static int run_complete_job(struct kcopyd_job *job) { void *context = job->context; int read_err = job->read_err; - unsigned int write_err = job->write_err; + unsigned long write_err = job->write_err; kcopyd_notify_fn fn = job->fn; struct kcopyd_client *kc = job->kc; @@ -396,7 +396,7 @@ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) if (r < 0) { /* error this rogue job */ if (job->rw == WRITE) - job->write_err = (unsigned int) -1; + job->write_err = (unsigned long) -1L; else job->read_err = 1; push(&_complete_jobs, job); @@ -448,8 +448,8 @@ static void dispatch_job(struct kcopyd_job *job) } #define SUB_JOB_SIZE 128 -static void segment_complete(int read_err, - unsigned int write_err, void *context) +static void segment_complete(int read_err, unsigned long write_err, + void *context) { /* FIXME: tidy this function */ sector_t progress = 0; diff --git a/drivers/md/kcopyd.h b/drivers/md/kcopyd.h index 4621ea055c0..4845f2a0c67 100644 --- a/drivers/md/kcopyd.h +++ b/drivers/md/kcopyd.h @@ -32,8 +32,8 @@ void kcopyd_client_destroy(struct kcopyd_client *kc); * read_err is a boolean, * write_err is a bitset, with 1 bit for each destination region */ -typedef void (*kcopyd_notify_fn)(int read_err, - unsigned int write_err, void *context); +typedef void (*kcopyd_notify_fn)(int read_err, unsigned long write_err, + void *context); int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, unsigned int num_dests, struct io_region *dests, diff --git a/drivers/md/md.c b/drivers/md/md.c index 7da6ec244e1..5ebfb4d7990 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1105,7 +1105,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) - rdev-> sb_size = (rdev->sb_size | bmask)+1; + rdev->sb_size = (rdev->sb_size | bmask) + 1; + + if (minor_version + && rdev->data_offset < sb_offset + (rdev->sb_size/512)) + return -EINVAL; if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) rdev->desc_nr = -1; @@ -1137,7 +1141,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) else ret = 0; } - if (minor_version) + if (minor_version) rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; else rdev->size = rdev->sb_offset; @@ -1499,7 +1503,8 @@ static void export_rdev(mdk_rdev_t * rdev) free_disk_sb(rdev); list_del_init(&rdev->same_set); #ifndef MODULE - md_autodetect_dev(rdev->bdev->bd_dev); + if (test_bit(AutoDetected, &rdev->flags)) + md_autodetect_dev(rdev->bdev->bd_dev); #endif unlock_rdev(rdev); kobject_put(&rdev->kobj); @@ -1859,17 +1864,6 @@ static struct rdev_sysfs_entry rdev_state = __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); static ssize_t -super_show(mdk_rdev_t *rdev, char *page) -{ - if (rdev->sb_loaded && rdev->sb_size) { - memcpy(page, page_address(rdev->sb_page), rdev->sb_size); - return rdev->sb_size; - } else - return 0; -} -static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); - -static ssize_t errors_show(mdk_rdev_t *rdev, char *page) { return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); @@ -1996,9 +1990,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) char *e; unsigned long long size = simple_strtoull(buf, &e, 10); unsigned long long oldsize = rdev->size; + mddev_t 
*my_mddev = rdev->mddev; + if (e==buf || (*e && *e != '\n')) return -EINVAL; - if (rdev->mddev->pers) + if (my_mddev->pers) return -EBUSY; rdev->size = size; if (size > oldsize && rdev->mddev->external) { @@ -2011,7 +2007,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) int overlap = 0; struct list_head *tmp, *tmp2; - mddev_unlock(rdev->mddev); + mddev_unlock(my_mddev); for_each_mddev(mddev, tmp) { mdk_rdev_t *rdev2; @@ -2031,7 +2027,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) break; } } - mddev_lock(rdev->mddev); + mddev_lock(my_mddev); if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. @@ -2043,8 +2039,8 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) return -EBUSY; } } - if (size < rdev->mddev->size || rdev->mddev->size == 0) - rdev->mddev->size = size; + if (size < my_mddev->size || my_mddev->size == 0) + my_mddev->size = size; return len; } @@ -2053,7 +2049,6 @@ __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, - &rdev_super.attr, &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, @@ -2065,10 +2060,21 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); + mddev_t *mddev = rdev->mddev; + ssize_t rv; if (!entry->show) return -EIO; - return entry->show(rdev, page); + + rv = mddev ? mddev_lock(mddev) : -EBUSY; + if (!rv) { + if (rdev->mddev == NULL) + rv = -EBUSY; + else + rv = entry->show(rdev, page); + mddev_unlock(mddev); + } + return rv; } static ssize_t @@ -2077,15 +2083,19 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); - int rv; + ssize_t rv; + mddev_t *mddev = rdev->mddev; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - rv = mddev_lock(rdev->mddev); + rv = mddev ? mddev_lock(mddev): -EBUSY; if (!rv) { - rv = entry->store(rdev, page, length); + if (rdev->mddev == NULL) + rv = -EBUSY; + else + rv = entry->store(rdev, page, length); mddev_unlock(rdev->mddev); } return rv; @@ -4142,7 +4152,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) return 0; busy: - printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", + printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; } @@ -5127,7 +5137,7 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev->ro==1) seq_printf(seq, " (read-only)"); if (mddev->ro==2) - seq_printf(seq, "(auto-read-only)"); + seq_printf(seq, " (auto-read-only)"); seq_printf(seq, " %s", mddev->pers->name); } @@ -5351,6 +5361,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) mddev->ro = 0; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); } atomic_inc(&mddev->writes_pending); if (mddev->in_sync) { @@ -6021,6 +6032,7 @@ static void autostart_arrays(int part) MD_BUG(); continue; } + set_bit(AutoDetected, &rdev->flags); list_add(&rdev->same_set, &pending_raid_disks); i_passed++; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5c7fef091ce..ff61b309129 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -592,6 +592,37 @@ static int raid1_congested(void *data, int bits) } +static int flush_pending_writes(conf_t *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. + * We return 1 if any requests were actually submitted. + */ + int rv = 0; + + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct bio *bio; + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(conf->mddev->queue); + spin_unlock_irq(&conf->device_lock); + /* flush any pending bitmap writes to + * disk before proceeding w/ I/O */ + bitmap_unplug(conf->mddev->bitmap); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + rv = 1; + } else + spin_unlock_irq(&conf->device_lock); + return rv; +} + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -673,15 +704,23 @@ static void freeze_array(conf_t *conf) /* stop syncio and normal IO and wait for everything to * go quite. * We increment barrier and nr_waiting, and then - * wait until barrier+nr_pending match nr_queued+2 + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. 
*/ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, - conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + ({ flush_pending_writes(conf); + raid1_unplug(conf->mddev->queue); })); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(conf_t *conf) @@ -907,6 +946,9 @@ static int make_request(struct request_queue *q, struct bio * bio) blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); + /* In case raid1d snuck into freeze_array */ + wake_up(&conf->wait_barrier); + if (do_sync) md_wakeup_thread(mddev->thread); #if 0 @@ -1473,28 +1515,14 @@ static void raid1d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; - spin_lock_irqsave(&conf->device_lock, flags); - - if (conf->pending_bio_list.head) { - bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - bitmap_unplug(mddev->bitmap); - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = next; - } - unplug = 1; + unplug += flush_pending_writes(conf); - continue; - } - - if (list_empty(head)) + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); break; + } r1_bio = list_entry(head->prev, r1bio_t, retry_list); list_del(head->prev); conf->nr_queued--; @@ -1590,7 +1618,6 @@ static void raid1d(mddev_t *mddev) } } } - spin_unlock_irqrestore(&conf->device_lock, flags); if (unplug) unplug_slaves(mddev); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 017f58113c3..32389d2f18f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -537,7 +537,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) current_distance = abs(r10_bio->devs[slot].addr - conf->mirrors[disk].head_position); - /* Find the disk whose head is closest */ + /* Find the disk whose head is closest, + * or - for far > 1 - find the closest to partition beginning */ for (nslot = slot; nslot < conf->copies; nslot++) { int ndisk = r10_bio->devs[nslot].devnum; @@ -557,8 +558,13 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) slot = nslot; break; } - new_distance = abs(r10_bio->devs[nslot].addr - - conf->mirrors[ndisk].head_position); + + /* for far > 1 always use the lowest address */ + if (conf->far_copies > 1) + new_distance = r10_bio->devs[nslot].addr; + else + new_distance = abs(r10_bio->devs[nslot].addr - + conf->mirrors[ndisk].head_position); if (new_distance < current_distance) { current_distance = new_distance; disk = ndisk; @@ -629,7 +635,36 @@ static int raid10_congested(void *data, int bits) return ret; } - +static int flush_pending_writes(conf_t *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. + * We return 1 if any requests were actually submitted. 
+ */ + int rv = 0; + + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct bio *bio; + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(conf->mddev->queue); + spin_unlock_irq(&conf->device_lock); + /* flush any pending bitmap writes to disk + * before proceeding w/ I/O */ + bitmap_unplug(conf->mddev->bitmap); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + rv = 1; + } else + spin_unlock_irq(&conf->device_lock); + return rv; +} /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -712,15 +747,23 @@ static void freeze_array(conf_t *conf) /* stop syncio and normal IO and wait for everything to * go quiet. * We increment barrier and nr_waiting, and then - * wait until barrier+nr_pending match nr_queued+2 + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, - conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - raid10_unplug(conf->mddev->queue)); + ({ flush_pending_writes(conf); + raid10_unplug(conf->mddev->queue); })); spin_unlock_irq(&conf->resync_lock); } @@ -892,6 +935,9 @@ static int make_request(struct request_queue *q, struct bio * bio) blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); + /* In case raid10d snuck in to freeze_array */ + wake_up(&conf->wait_barrier); + if (do_sync) md_wakeup_thread(mddev->thread); @@ -1464,28 +1510,14 @@ static void raid10d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; - spin_lock_irqsave(&conf->device_lock, flags); - if (conf->pending_bio_list.head) { - bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - bitmap_unplug(mddev->bitmap); - - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = next; - } - unplug = 1; - - continue; - } + unplug += flush_pending_writes(conf); - if (list_empty(head)) + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); break; + } r10_bio = list_entry(head->prev, r10bio_t, retry_list); list_del(head->prev); conf->nr_queued--; @@ -1548,7 +1580,6 @@ static void raid10d(mddev_t *mddev) } } } - spin_unlock_irqrestore(&conf->device_lock, flags); if (unplug) unplug_slaves(mddev); } @@ -1787,6 +1818,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (j == conf->copies) { /* Cannot recover, so abort the recovery */ put_buf(r10_bio); + if (rb2) + atomic_dec(&rb2->remaining); r10_bio = rb2; if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 
2d6f1a51359..b162b839a66 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1143,7 +1143,7 @@ static void raid5_end_read_request(struct bio * bi, int error) rdev = conf->disks[i].rdev; printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)sh->sector + rdev->data_offset, + (unsigned long long)(sh->sector + rdev->data_offset), bdevname(rdev->bdev, b)); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); @@ -1160,13 +1160,13 @@ static void raid5_end_read_request(struct bio * bi, int error) if (conf->mddev->degraded) printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n", mdname(conf->mddev), - (unsigned long long)sh->sector + rdev->data_offset, + (unsigned long long)(sh->sector + rdev->data_offset), bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n", mdname(conf->mddev), - (unsigned long long)sh->sector + rdev->data_offset, + (unsigned long long)(sh->sector + rdev->data_offset), bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) @@ -2348,25 +2348,15 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { + int canceled_check = 0; + set_bit(STRIPE_HANDLE, &sh->state); - /* Take one of the following actions: - * 1/ start a check parity operation if (uptodate == disks) - * 2/ finish a check parity operation and act on the result - * 3/ skip to the writeback section if we previously - * initiated a recovery operation - */ - if (s->failed == 0 && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { - BUG_ON(s->uptodate != disks); - clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - sh->ops.count++; - s->uptodate--; - } else if ( - test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + /* complete a check operation */ + if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { + clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); + clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + if (s->failed == 0) { if (sh->ops.zero_sum_result == 0) /* parity is correct (on disc, * not in buffer any more) @@ -2391,7 +2381,8 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, s->uptodate++; } } - } + } else + canceled_check = 1; /* STRIPE_INSYNC is not set */ } /* check if we can clear a parity disk reconstruct */ @@ -2404,12 +2395,28 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); } + /* start a new check operation if there are no failures, the stripe is + * not insync, and a repair is not in flight + */ + if (s->failed == 0 && + !test_bit(STRIPE_INSYNC, &sh->state) && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + BUG_ON(s->uptodate != disks); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + sh->ops.count++; + s->uptodate--; + } + } + /* Wait for check parity and compute block operations to complete - * before write-back + * before write-back. 
If a failure occurred while the check operation + * was in flight we need to cycle this stripe through handle_stripe + * since the parity block may not be uptodate */ - if (!test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { + if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && + !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { struct r5dev *dev; /* either failed parity check, or recovery is happening */ if (s->failed == 0) |
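Note on the kcopyd callback change threaded through dm-io.c, dm-raid1.c, dm-snap.c, kcopyd.c and kcopyd.h above: the patch widens write_err from unsigned int to unsigned long because, as the kcopyd.h comment states, read_err is a boolean while write_err is a bitset with one bit per destination region, and recovery_complete() in dm-raid1.c maps the set bits back to the mirrors that failed. The following standalone C sketch only illustrates that decoding convention; copy_done(), num_dests and the printf reporting are hypothetical stand-ins, not the kernel API (the real callbacks in this patch are recovery_complete() and copy_callback(), registered as kcopyd_notify_fn via kcopyd_copy()).

#include <stdio.h>

/* Mirrors the kcopyd convention documented in kcopyd.h:
 * read_err is a boolean, write_err is a bitset with one bit per
 * destination region.  All names here are illustrative only. */
static void copy_done(int read_err, unsigned long write_err, void *context)
{
    unsigned int num_dests = *(unsigned int *)context;
    unsigned int i;

    if (read_err)
        printf("read from the source region failed\n");

    /* Walk the bitset and report each destination region that failed. */
    for (i = 0; i < num_dests; i++)
        if (write_err & (1UL << i))
            printf("write to destination %u failed\n", i);
}

int main(void)
{
    unsigned int num_dests = 3;

    /* Example: destinations 0 and 2 failed, destination 1 succeeded,
     * corresponding to kcopyd reporting write_err = 0b101. */
    copy_done(0, (1UL << 0) | (1UL << 2), &num_dests);
    return 0;
}

Using unsigned long rather than unsigned int also matches the long-based bit handling elsewhere in the patch, e.g. process_jobs() marking every destination failed with (unsigned long) -1L and dm-io's dec_count() carrying the error field as unsigned long.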