Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c    |  23
-rw-r--r--  drivers/md/dm-crypt.c  |  58
-rw-r--r--  drivers/md/dm-io.c     |   2
-rw-r--r--  drivers/md/dm-raid1.c  |  11
-rw-r--r--  drivers/md/dm-snap.c   |   2
-rw-r--r--  drivers/md/kcopyd.c    |  10
-rw-r--r--  drivers/md/kcopyd.h    |   4
-rw-r--r--  drivers/md/md.c        |  64
-rw-r--r--  drivers/md/raid1.c     |  73
-rw-r--r--  drivers/md/raid10.c    |  87
-rw-r--r--  drivers/md/raid5.c     |  57
11 files changed, 245 insertions(+), 146 deletions(-)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7aeceedcf7d..c14dacdacfa 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1045,8 +1045,14 @@ void bitmap_daemon_work(struct bitmap *bitmap)
if (bitmap == NULL)
return;
if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ))
- return;
+ goto done;
+
bitmap->daemon_lastrun = jiffies;
+ if (bitmap->allclean) {
+ bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ return;
+ }
+ bitmap->allclean = 1;
for (j = 0; j < bitmap->chunks; j++) {
bitmap_counter_t *bmc;
@@ -1068,8 +1074,10 @@ void bitmap_daemon_work(struct bitmap *bitmap)
clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
- if (need_write)
+ if (need_write) {
write_page(bitmap, page, 0);
+ bitmap->allclean = 0;
+ }
continue;
}
@@ -1098,6 +1106,9 @@ void bitmap_daemon_work(struct bitmap *bitmap)
/*
if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc);
*/
+ if (*bmc)
+ bitmap->allclean = 0;
+
if (*bmc == 2) {
*bmc=1; /* maybe clear the bit next time */
set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
@@ -1132,6 +1143,9 @@ void bitmap_daemon_work(struct bitmap *bitmap)
}
}
+ done:
+ if (bitmap->allclean == 0)
+ bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ;
}
static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1226,6 +1240,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
sectors -= blocks;
else sectors = 0;
}
+ bitmap->allclean = 0;
return 0;
}
@@ -1296,6 +1311,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
}
}
spin_unlock_irq(&bitmap->lock);
+ bitmap->allclean = 0;
return rv;
}
@@ -1332,6 +1348,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int ab
}
unlock:
spin_unlock_irqrestore(&bitmap->lock, flags);
+ bitmap->allclean = 0;
}
void bitmap_close_sync(struct bitmap *bitmap)
@@ -1399,7 +1416,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
}
spin_unlock_irq(&bitmap->lock);
-
+ bitmap->allclean = 0;
}
/* dirty the memory and file bits for bitmap chunks "s" to "e" */
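The bitmap changes above implement an idle-detection idiom: each daemon pass starts by optimistically assuming the bitmap is all clean, any activity (a started write, a sync, a write_page, a nonzero counter) refutes the assumption, and a pass that survives with it intact parks the daemon via MAX_SCHEDULE_TIMEOUT until a writer wakes it. A minimal user-space sketch of that idiom, with hypothetical names rather than the kernel API:

    #include <stdbool.h>
    #include <limits.h>

    struct daemon_state {
        bool allclean;          /* assumption carried between passes */
        unsigned long timeout;  /* next wakeup interval, in ticks */
    };

    /* Any writer calls this: new dirty state invalidates the assumption
     * and restores the normal polling interval. */
    static void mark_dirty(struct daemon_state *d, unsigned long normal)
    {
        d->allclean = false;
        d->timeout = normal;
    }

    /* One pass over hypothetical chunk counters, mirroring the shape of
     * bitmap_daemon_work(): park if the previous pass saw no activity,
     * otherwise assume clean and let any remaining work refute it. */
    static void daemon_pass(struct daemon_state *d, int *bmc, int chunks,
                            unsigned long normal)
    {
        if (d->allclean) {
            d->timeout = ULONG_MAX; /* like MAX_SCHEDULE_TIMEOUT */
            return;
        }
        d->allclean = true;         /* optimistic; cleared below */

        for (int j = 0; j < chunks; j++) {
            if (bmc[j]) {
                d->allclean = false; /* still counting down */
                if (bmc[j] == 2)
                    bmc[j] = 1;      /* maybe clear the bit next pass */
            }
        }
        if (!d->allclean)
            d->timeout = normal;
    }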
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index b04f98df94e..835def11419 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
/*
* Copyright (C) 2003 Christophe Saout <christophe@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@@ -93,6 +93,8 @@ struct crypt_config {
struct workqueue_struct *io_queue;
struct workqueue_struct *crypt_queue;
+ wait_queue_head_t writeq;
+
/*
* crypto related data
*/
@@ -331,14 +333,7 @@ static void crypt_convert_init(struct crypt_config *cc,
ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
ctx->sector = sector + cc->iv_offset;
init_completion(&ctx->restart);
- /*
- * Crypto operation can be asynchronous,
- * ctx->pending is increased after request submission.
- * We need to ensure that we don't call the crypt finish
- * operation before pending got incremented
- * (dependent on crypt submission return code).
- */
- atomic_set(&ctx->pending, 2);
+ atomic_set(&ctx->pending, 1);
}
static int crypt_convert_block(struct crypt_config *cc,
@@ -411,43 +406,42 @@ static void crypt_alloc_req(struct crypt_config *cc,
static int crypt_convert(struct crypt_config *cc,
struct convert_context *ctx)
{
- int r = 0;
+ int r;
while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
ctx->idx_out < ctx->bio_out->bi_vcnt) {
crypt_alloc_req(cc, ctx);
+ atomic_inc(&ctx->pending);
+
r = crypt_convert_block(cc, ctx, cc->req);
switch (r) {
+ /* async */
case -EBUSY:
wait_for_completion(&ctx->restart);
INIT_COMPLETION(ctx->restart);
/* fall through*/
case -EINPROGRESS:
- atomic_inc(&ctx->pending);
cc->req = NULL;
- r = 0;
- /* fall through*/
+ ctx->sector++;
+ continue;
+
+ /* sync */
case 0:
+ atomic_dec(&ctx->pending);
ctx->sector++;
continue;
- }
- break;
+ /* error */
+ default:
+ atomic_dec(&ctx->pending);
+ return r;
+ }
}
- /*
- * If there are pending crypto operation run async
- * code. Otherwise process return code synchronously.
- * The step of 2 ensures that async finish doesn't
- * call crypto finish too early.
- */
- if (atomic_sub_return(2, &ctx->pending))
- return -EINPROGRESS;
-
- return r;
+ return 0;
}
static void dm_crypt_bio_destructor(struct bio *bio)
@@ -624,8 +618,10 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
static void kcryptd_io_write(struct dm_crypt_io *io)
{
struct bio *clone = io->ctx.bio_out;
+ struct crypt_config *cc = io->target->private;
generic_make_request(clone);
+ wake_up(&cc->writeq);
}
static void kcryptd_io(struct work_struct *work)
@@ -698,7 +694,8 @@ static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
r = crypt_convert(cc, &io->ctx);
- if (r != -EINPROGRESS) {
+ if (atomic_dec_and_test(&io->ctx.pending)) {
+ /* processed, no running async crypto */
kcryptd_crypt_write_io_submit(io, r, 0);
if (unlikely(r < 0))
return;
@@ -706,8 +703,12 @@ static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
atomic_inc(&io->pending);
/* out of memory -> run queues */
- if (unlikely(remaining))
+ if (unlikely(remaining)) {
+ /* wait for async crypto then reinitialize pending */
+ wait_event(cc->writeq, !atomic_read(&io->ctx.pending));
+ atomic_set(&io->ctx.pending, 1);
congestion_wait(WRITE, HZ/100);
+ }
}
}
@@ -746,7 +747,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
r = crypt_convert(cc, &io->ctx);
- if (r != -EINPROGRESS)
+ if (atomic_dec_and_test(&io->ctx.pending))
kcryptd_crypt_read_done(io, r);
crypt_dec_pending(io);
@@ -1047,6 +1048,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_crypt_queue;
}
+ init_waitqueue_head(&cc->writeq);
ti->private = cc;
return 0;
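The conversion-loop rework above replaces the old "pre-bias pending by 2" trick with the standard reference-counting pattern for mixed synchronous/asynchronous completion: pending starts at 1 (the submitter's own reference), each block takes a reference before submission, synchronous completions drop theirs immediately, and whoever performs the final decrement, submitter or async callback, finishes the I/O. A user-space sketch of the pattern with C11 atomics; the names are illustrative, not the dm-crypt API:

    #include <stdatomic.h>

    struct ctx {
        atomic_int pending;
    };

    static void finish_io(struct ctx *c) { (void)c; /* complete the request */ }

    /* Async completion: drop this block's reference; the last dropper
     * (callback or submitter) runs the completion. */
    static void async_done(struct ctx *c)
    {
        if (atomic_fetch_sub(&c->pending, 1) == 1)
            finish_io(c);
    }

    enum { RC_OK = 0, RC_ASYNC = 1, RC_ERR = -1 };

    static int submit_block(struct ctx *c, int i)   /* stub for the sketch */
    {
        (void)c; (void)i;
        return RC_OK;
    }

    static void convert(struct ctx *c, int nblocks)
    {
        atomic_init(&c->pending, 1);            /* submitter's reference */

        for (int i = 0; i < nblocks; i++) {
            atomic_fetch_add(&c->pending, 1);   /* reference for this block */
            int r = submit_block(c, i);
            if (r == RC_ASYNC)
                continue;                       /* async_done() drops it later */
            atomic_fetch_sub(&c->pending, 1);   /* sync completion or error */
            if (r < 0)
                break;
        }
        /* Drop our own reference; if every block already finished, the
         * completion runs here rather than in async_done(). */
        if (atomic_fetch_sub(&c->pending, 1) == 1)
            finish_io(c);
    }

The new cc->writeq wait in kcryptd_crypt_write_convert_loop() leans on the same counter: the out-of-memory retry path must not reinitialize io->ctx.pending until all in-flight async crypto has drained it to zero.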
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b8e342fe758..8f25f628ef1 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -114,7 +114,7 @@ static void dec_count(struct io *io, unsigned int region, int error)
wake_up_process(io->sleeper);
else {
- int r = io->error;
+ unsigned long r = io->error;
io_notify_fn fn = io->callback;
void *context = io->context;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 2928ef22810..762cb086bb7 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -753,7 +753,7 @@ out:
* are in the no-sync state. We have to recover these by
* recopying from the default mirror to all the others.
*---------------------------------------------------------------*/
-static void recovery_complete(int read_err, unsigned int write_err,
+static void recovery_complete(int read_err, unsigned long write_err,
void *context)
{
struct region *reg = (struct region *)context;
@@ -767,7 +767,7 @@ static void recovery_complete(int read_err, unsigned int write_err,
}
if (write_err) {
- DMERR_LIMIT("Write error during recovery (error = 0x%x)",
+ DMERR_LIMIT("Write error during recovery (error = 0x%lx)",
write_err);
/*
* Bits correspond to devices (excluding default mirror).
@@ -1695,14 +1695,15 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
* information for a retry or there was no other
* mirror in-sync.
*/
- DMERR_LIMIT("Mirror read failed from %s.",
- m->dev->name);
+ DMERR_LIMIT("Mirror read failed.");
return -EIO;
}
+
+ m = read_record->m;
+
DMERR("Mirror read failed from %s. Trying alternative device.",
m->dev->name);
- m = read_record->m;
fail_mirror(m, DM_RAID1_READ_ERROR);
/*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index ae24eab8cd8..4dc8a43c034 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -804,7 +804,7 @@ static void commit_callback(void *context, int success)
* Called when the copy I/O has finished. kcopyd actually runs
* this code so don't block.
*/
-static void copy_callback(int read_err, unsigned int write_err, void *context)
+static void copy_callback(int read_err, unsigned long write_err, void *context)
{
struct dm_snap_pending_exception *pe = context;
struct dm_snapshot *s = pe->snap;
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index f3831f31223..e76b52ade69 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -169,7 +169,7 @@ struct kcopyd_job {
* Error state of the job.
*/
int read_err;
- unsigned int write_err;
+ unsigned long write_err;
/*
* Either READ or WRITE
@@ -293,7 +293,7 @@ static int run_complete_job(struct kcopyd_job *job)
{
void *context = job->context;
int read_err = job->read_err;
- unsigned int write_err = job->write_err;
+ unsigned long write_err = job->write_err;
kcopyd_notify_fn fn = job->fn;
struct kcopyd_client *kc = job->kc;
@@ -396,7 +396,7 @@ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
if (r < 0) {
/* error this rogue job */
if (job->rw == WRITE)
- job->write_err = (unsigned int) -1;
+ job->write_err = (unsigned long) -1L;
else
job->read_err = 1;
push(&_complete_jobs, job);
@@ -448,8 +448,8 @@ static void dispatch_job(struct kcopyd_job *job)
}
#define SUB_JOB_SIZE 128
-static void segment_complete(int read_err,
- unsigned int write_err, void *context)
+static void segment_complete(int read_err, unsigned long write_err,
+ void *context)
{
/* FIXME: tidy this function */
sector_t progress = 0;
diff --git a/drivers/md/kcopyd.h b/drivers/md/kcopyd.h
index 4621ea055c0..4845f2a0c67 100644
--- a/drivers/md/kcopyd.h
+++ b/drivers/md/kcopyd.h
@@ -32,8 +32,8 @@ void kcopyd_client_destroy(struct kcopyd_client *kc);
* read_err is a boolean,
* write_err is a bitset, with 1 bit for each destination region
*/
-typedef void (*kcopyd_notify_fn)(int read_err,
- unsigned int write_err, void *context);
+typedef void (*kcopyd_notify_fn)(int read_err, unsigned long write_err,
+ void *context);
int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
unsigned int num_dests, struct io_region *dests,
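The widened write_err type in this header (and in dm-io, dm-raid1, dm-snap and kcopyd.c above) matters because write_err is a bitset with one bit per destination region: an unsigned int caps it at 32 destinations and does not match the unsigned long word that the kernel's bitops and the `(unsigned long) -1L` sentinel above operate on. A hedged user-space sketch of how a notify callback might decode such a bitset; the names are illustrative, not a real kcopyd client:

    #include <stdio.h>
    #include <limits.h>

    /* kcopyd convention: read_err is a boolean, write_err has one bit
     * per destination region. */
    static void copy_done(int read_err, unsigned long write_err,
                          unsigned int num_dests)
    {
        if (read_err)
            fprintf(stderr, "read from the source region failed\n");

        for (unsigned int i = 0;
             i < num_dests && i < sizeof(write_err) * CHAR_BIT; i++)
            if (write_err & (1UL << i))
                fprintf(stderr, "write to destination %u failed\n", i);
    }

    int main(void)
    {
        copy_done(0, (1UL << 0) | (1UL << 2), 3); /* dests 0 and 2 failed */
        return 0;
    }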
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7da6ec244e1..5ebfb4d7990 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1105,7 +1105,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
if (rdev->sb_size & bmask)
- rdev-> sb_size = (rdev->sb_size | bmask)+1;
+ rdev->sb_size = (rdev->sb_size | bmask) + 1;
+
+ if (minor_version
+ && rdev->data_offset < sb_offset + (rdev->sb_size/512))
+ return -EINVAL;
if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
rdev->desc_nr = -1;
@@ -1137,7 +1141,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
else
ret = 0;
}
- if (minor_version)
+ if (minor_version)
rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
else
rdev->size = rdev->sb_offset;
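Two details in the hunk above are easy to misread. `(sb_size | bmask) + 1` rounds the superblock size up to the next multiple of the (power-of-two) hardware sector size, and the new minor_version check then rejects layouts where the data area would start inside that rounded superblock. A worked example of the round-up idiom in plain C, with made-up values:

    #include <assert.h>

    /* Round x up to a multiple of align (a power of two). With
     * mask = align - 1, (x | mask) + 1 jumps to the next boundary;
     * the kernel guards it with `if (x & mask)` so already-aligned
     * sizes are left alone. */
    static unsigned int round_up_pow2(unsigned int x, unsigned int align)
    {
        unsigned int mask = align - 1;
        if (x & mask)
            x = (x | mask) + 1;
        return x;
    }

    int main(void)
    {
        assert(round_up_pow2(1000, 512) == 1024);
        assert(round_up_pow2(1024, 512) == 1024);
        return 0;
    }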
@@ -1499,7 +1503,8 @@ static void export_rdev(mdk_rdev_t * rdev)
free_disk_sb(rdev);
list_del_init(&rdev->same_set);
#ifndef MODULE
- md_autodetect_dev(rdev->bdev->bd_dev);
+ if (test_bit(AutoDetected, &rdev->flags))
+ md_autodetect_dev(rdev->bdev->bd_dev);
#endif
unlock_rdev(rdev);
kobject_put(&rdev->kobj);
@@ -1859,17 +1864,6 @@ static struct rdev_sysfs_entry rdev_state =
__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
static ssize_t
-super_show(mdk_rdev_t *rdev, char *page)
-{
- if (rdev->sb_loaded && rdev->sb_size) {
- memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
- return rdev->sb_size;
- } else
- return 0;
-}
-static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
-
-static ssize_t
errors_show(mdk_rdev_t *rdev, char *page)
{
return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
@@ -1996,9 +1990,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
char *e;
unsigned long long size = simple_strtoull(buf, &e, 10);
unsigned long long oldsize = rdev->size;
+ mddev_t *my_mddev = rdev->mddev;
+
if (e==buf || (*e && *e != '\n'))
return -EINVAL;
- if (rdev->mddev->pers)
+ if (my_mddev->pers)
return -EBUSY;
rdev->size = size;
if (size > oldsize && rdev->mddev->external) {
@@ -2011,7 +2007,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
int overlap = 0;
struct list_head *tmp, *tmp2;
- mddev_unlock(rdev->mddev);
+ mddev_unlock(my_mddev);
for_each_mddev(mddev, tmp) {
mdk_rdev_t *rdev2;
@@ -2031,7 +2027,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
break;
}
}
- mddev_lock(rdev->mddev);
+ mddev_lock(my_mddev);
if (overlap) {
/* Someone else could have slipped in a size
* change here, but doing so is just silly.
@@ -2043,8 +2039,8 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
return -EBUSY;
}
}
- if (size < rdev->mddev->size || rdev->mddev->size == 0)
- rdev->mddev->size = size;
+ if (size < my_mddev->size || my_mddev->size == 0)
+ my_mddev->size = size;
return len;
}
@@ -2053,7 +2049,6 @@ __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
static struct attribute *rdev_default_attrs[] = {
&rdev_state.attr,
- &rdev_super.attr,
&rdev_errors.attr,
&rdev_slot.attr,
&rdev_offset.attr,
@@ -2065,10 +2060,21 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+ mddev_t *mddev = rdev->mddev;
+ ssize_t rv;
if (!entry->show)
return -EIO;
- return entry->show(rdev, page);
+
+ rv = mddev ? mddev_lock(mddev) : -EBUSY;
+ if (!rv) {
+ if (rdev->mddev == NULL)
+ rv = -EBUSY;
+ else
+ rv = entry->show(rdev, page);
+ mddev_unlock(mddev);
+ }
+ return rv;
}
static ssize_t
@@ -2077,15 +2083,19 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
{
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
- int rv;
+ ssize_t rv;
+ mddev_t *mddev = rdev->mddev;
if (!entry->store)
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- rv = mddev_lock(rdev->mddev);
+ rv = mddev ? mddev_lock(mddev): -EBUSY;
if (!rv) {
- rv = entry->store(rdev, page, length);
+ if (rdev->mddev == NULL)
+ rv = -EBUSY;
+ else
+ rv = entry->store(rdev, page, length);
mddev_unlock(rdev->mddev);
}
return rv;
@@ -4142,7 +4152,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
return 0;
busy:
- printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
+ printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
bdevname(rdev->bdev,b), mdname(mddev));
return -EBUSY;
}
@@ -5127,7 +5137,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (mddev->ro==1)
seq_printf(seq, " (read-only)");
if (mddev->ro==2)
- seq_printf(seq, "(auto-read-only)");
+ seq_printf(seq, " (auto-read-only)");
seq_printf(seq, " %s", mddev->pers->name);
}
@@ -5351,6 +5361,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
mddev->ro = 0;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
+ md_wakeup_thread(mddev->sync_thread);
}
atomic_inc(&mddev->writes_pending);
if (mddev->in_sync) {
@@ -6021,6 +6032,7 @@ static void autostart_arrays(int part)
MD_BUG();
continue;
}
+ set_bit(AutoDetected, &rdev->flags);
list_add(&rdev->same_set, &pending_raid_disks);
i_passed++;
}
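The rdev_attr_show()/rdev_attr_store() changes above apply a common pattern for objects whose back-pointer can be cleared by a concurrent detach: snapshot the pointer, lock through the snapshot, then re-check the live pointer under the lock before trusting it. A minimal pthread sketch of the pattern, using hypothetical types rather than the md locking API:

    #include <pthread.h>
    #include <errno.h>
    #include <stddef.h>
    #include <sys/types.h>

    struct array  { pthread_mutex_t lock; };
    struct member { struct array *array; /* NULLed by a detach path */ };

    static ssize_t do_show(struct member *m, char *page)
    {
        (void)m; (void)page;
        return 0;                         /* stub for the sketch */
    }

    static ssize_t attr_show(struct member *m, char *page)
    {
        struct array *a = m->array;       /* snapshot the back-pointer */
        ssize_t rv;

        if (!a)
            return -EBUSY;                /* already detached */
        pthread_mutex_lock(&a->lock);
        if (m->array == NULL)             /* re-check: detach won the race */
            rv = -EBUSY;
        else
            rv = do_show(m, page);
        pthread_mutex_unlock(&a->lock);   /* unlock via the locked snapshot */
        return rv;
    }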
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5c7fef091ce..ff61b309129 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -592,6 +592,37 @@ static int raid1_congested(void *data, int bits)
}
+static int flush_pending_writes(conf_t *conf)
+{
+ /* Any writes that have been queued but are awaiting
+ * bitmap updates get flushed here.
+ * We return 1 if any requests were actually submitted.
+ */
+ int rv = 0;
+
+ spin_lock_irq(&conf->device_lock);
+
+ if (conf->pending_bio_list.head) {
+ struct bio *bio;
+ bio = bio_list_get(&conf->pending_bio_list);
+ blk_remove_plug(conf->mddev->queue);
+ spin_unlock_irq(&conf->device_lock);
+ /* flush any pending bitmap writes to
+ * disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = next;
+ }
+ rv = 1;
+ } else
+ spin_unlock_irq(&conf->device_lock);
+ return rv;
+}
+
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
@@ -673,15 +704,23 @@ static void freeze_array(conf_t *conf)
/* stop syncio and normal IO and wait for everything to
* go quiet.
* We increment barrier and nr_waiting, and then
- * wait until barrier+nr_pending match nr_queued+2
+ * wait until nr_pending matches nr_queued+1
+ * This is called in the context of one normal IO request
+ * that has failed. Thus any sync request that might be pending
+ * will be blocked by nr_pending, and we need to wait for
+ * pending IO requests to complete or be queued for re-try.
+ * Thus the number queued (nr_queued) plus this request (1)
+ * must match the number of pending IOs (nr_pending) before
+ * we continue.
*/
spin_lock_irq(&conf->resync_lock);
conf->barrier++;
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier,
- conf->barrier+conf->nr_pending == conf->nr_queued+2,
+ conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
- raid1_unplug(conf->mddev->queue));
+ ({ flush_pending_writes(conf);
+ raid1_unplug(conf->mddev->queue); }));
spin_unlock_irq(&conf->resync_lock);
}
static void unfreeze_array(conf_t *conf)
@@ -907,6 +946,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
blk_plug_device(mddev->queue);
spin_unlock_irqrestore(&conf->device_lock, flags);
+ /* In case raid1d snuck into freeze_array */
+ wake_up(&conf->wait_barrier);
+
if (do_sync)
md_wakeup_thread(mddev->thread);
#if 0
@@ -1473,28 +1515,14 @@ static void raid1d(mddev_t *mddev)
for (;;) {
char b[BDEVNAME_SIZE];
- spin_lock_irqsave(&conf->device_lock, flags);
-
- if (conf->pending_bio_list.head) {
- bio = bio_list_get(&conf->pending_bio_list);
- blk_remove_plug(mddev->queue);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /* flush any pending bitmap writes to disk before proceeding w/ I/O */
- bitmap_unplug(mddev->bitmap);
- while (bio) { /* submit pending writes */
- struct bio *next = bio->bi_next;
- bio->bi_next = NULL;
- generic_make_request(bio);
- bio = next;
- }
- unplug = 1;
+ unplug += flush_pending_writes(conf);
- continue;
- }
-
- if (list_empty(head))
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (list_empty(head)) {
+ spin_unlock_irqrestore(&conf->device_lock, flags);
break;
+ }
r1_bio = list_entry(head->prev, r1bio_t, retry_list);
list_del(head->prev);
conf->nr_queued--;
@@ -1590,7 +1618,6 @@ static void raid1d(mddev_t *mddev)
}
}
}
- spin_unlock_irqrestore(&conf->device_lock, flags);
if (unplug)
unplug_slaves(mddev);
}
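Factoring the submission path into flush_pending_writes() serves two ends visible in this diff: raid1d()'s loop becomes simpler, and freeze_array() can now push out queued writes itself while waiting for nr_pending to drain, instead of deadlocking on work that only raid1d used to perform. The helper's shape, detach the whole list under the spinlock and only then submit with the lock dropped, is the standard way to keep a sleeping submission path out of a spinlock's critical section. A simplified user-space rendition (a pthread mutex stands in for the spinlock; submit() is a hypothetical stand-in for generic_make_request()):

    #include <pthread.h>
    #include <stddef.h>

    struct node { struct node *next; };

    struct conf {
        pthread_mutex_t device_lock;
        struct node *pending_head;      /* queued, bitmap-delayed writes */
    };

    static void submit(struct node *b) { (void)b; /* may sleep */ }

    /* Returns 1 if any queued writes were submitted. */
    static int flush_pending(struct conf *conf)
    {
        struct node *bio;

        pthread_mutex_lock(&conf->device_lock);
        bio = conf->pending_head;       /* detach the whole list */
        conf->pending_head = NULL;
        pthread_mutex_unlock(&conf->device_lock);

        if (!bio)
            return 0;
        /* (the real code flushes pending bitmap writes to disk here) */
        while (bio) {
            struct node *next = bio->next;
            bio->next = NULL;
            submit(bio);                /* safe: no locks held */
            bio = next;
        }
        return 1;
    }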
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 017f58113c3..32389d2f18f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -537,7 +537,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
current_distance = abs(r10_bio->devs[slot].addr -
conf->mirrors[disk].head_position);
- /* Find the disk whose head is closest */
+ /* Find the disk whose head is closest,
+ * or, for far > 1, find the copy closest to the beginning of its partition */
for (nslot = slot; nslot < conf->copies; nslot++) {
int ndisk = r10_bio->devs[nslot].devnum;
@@ -557,8 +558,13 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
slot = nslot;
break;
}
- new_distance = abs(r10_bio->devs[nslot].addr -
- conf->mirrors[ndisk].head_position);
+
+ /* for far > 1 always use the lowest address */
+ if (conf->far_copies > 1)
+ new_distance = r10_bio->devs[nslot].addr;
+ else
+ new_distance = abs(r10_bio->devs[nslot].addr -
+ conf->mirrors[ndisk].head_position);
if (new_distance < current_distance) {
current_distance = new_distance;
disk = ndisk;
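The read_balance() change above reflects the "far" layout: with far_copies > 1 each drive holds copies in widely separated zones, so comparing a copy's address against head_position can reward whichever copy happens to sit in a slow far zone; always preferring the lowest address keeps reads in the first zone (typically the fastest, outer region of the disk) and makes the choice stable. A toy comparison of the two metrics with made-up numbers:

    #include <stdio.h>
    #include <stdlib.h>

    /* Two copies of one block in a hypothetical far=2 layout. */
    struct candidate { const char *disk; long addr; long head_position; };

    int main(void)
    {
        struct candidate c[] = {
            { "A", 1000,   480000 },  /* copy in the near zone, head far away */
            { "B", 500000, 499000 },  /* copy in the far zone, head close by  */
        };

        for (int i = 0; i < 2; i++)
            printf("disk %s: seek distance=%ld lowest-address metric=%ld\n",
                   c[i].disk,
                   labs(c[i].addr - c[i].head_position),  /* old metric     */
                   c[i].addr);                            /* far > 1 metric */

        /* Seek distance picks B (1000 vs 479000); the lowest-address rule
         * picks A (1000 vs 500000), keeping the read in the first zone. */
        return 0;
    }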
@@ -629,7 +635,36 @@ static int raid10_congested(void *data, int bits)
return ret;
}
-
+static int flush_pending_writes(conf_t *conf)
+{
+ /* Any writes that have been queued but are awaiting
+ * bitmap updates get flushed here.
+ * We return 1 if any requests were actually submitted.
+ */
+ int rv = 0;
+
+ spin_lock_irq(&conf->device_lock);
+
+ if (conf->pending_bio_list.head) {
+ struct bio *bio;
+ bio = bio_list_get(&conf->pending_bio_list);
+ blk_remove_plug(conf->mddev->queue);
+ spin_unlock_irq(&conf->device_lock);
+ /* flush any pending bitmap writes to disk
+ * before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = next;
+ }
+ rv = 1;
+ } else
+ spin_unlock_irq(&conf->device_lock);
+ return rv;
+}
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
@@ -712,15 +747,23 @@ static void freeze_array(conf_t *conf)
/* stop syncio and normal IO and wait for everything to
* go quiet.
* We increment barrier and nr_waiting, and then
- * wait until barrier+nr_pending match nr_queued+2
+ * wait until nr_pending matches nr_queued+1
+ * This is called in the context of one normal IO request
+ * that has failed. Thus any sync request that might be pending
+ * will be blocked by nr_pending, and we need to wait for
+ * pending IO requests to complete or be queued for re-try.
+ * Thus the number queued (nr_queued) plus this request (1)
+ * must match the number of pending IOs (nr_pending) before
+ * we continue.
*/
spin_lock_irq(&conf->resync_lock);
conf->barrier++;
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier,
- conf->barrier+conf->nr_pending == conf->nr_queued+2,
+ conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
- raid10_unplug(conf->mddev->queue));
+ ({ flush_pending_writes(conf);
+ raid10_unplug(conf->mddev->queue); }));
spin_unlock_irq(&conf->resync_lock);
}
@@ -892,6 +935,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
blk_plug_device(mddev->queue);
spin_unlock_irqrestore(&conf->device_lock, flags);
+ /* In case raid10d snuck into freeze_array */
+ wake_up(&conf->wait_barrier);
+
if (do_sync)
md_wakeup_thread(mddev->thread);
@@ -1464,28 +1510,14 @@ static void raid10d(mddev_t *mddev)
for (;;) {
char b[BDEVNAME_SIZE];
- spin_lock_irqsave(&conf->device_lock, flags);
- if (conf->pending_bio_list.head) {
- bio = bio_list_get(&conf->pending_bio_list);
- blk_remove_plug(mddev->queue);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /* flush any pending bitmap writes to disk before proceeding w/ I/O */
- bitmap_unplug(mddev->bitmap);
-
- while (bio) { /* submit pending writes */
- struct bio *next = bio->bi_next;
- bio->bi_next = NULL;
- generic_make_request(bio);
- bio = next;
- }
- unplug = 1;
-
- continue;
- }
+ unplug += flush_pending_writes(conf);
- if (list_empty(head))
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (list_empty(head)) {
+ spin_unlock_irqrestore(&conf->device_lock, flags);
break;
+ }
r10_bio = list_entry(head->prev, r10bio_t, retry_list);
list_del(head->prev);
conf->nr_queued--;
@@ -1548,7 +1580,6 @@ static void raid10d(mddev_t *mddev)
}
}
}
- spin_unlock_irqrestore(&conf->device_lock, flags);
if (unplug)
unplug_slaves(mddev);
}
@@ -1787,6 +1818,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (j == conf->copies) {
/* Cannot recover, so abort the recovery */
put_buf(r10_bio);
+ if (rb2)
+ atomic_dec(&rb2->remaining);
r10_bio = rb2;
if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery))
printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2d6f1a51359..b162b839a66 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1143,7 +1143,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
rdev = conf->disks[i].rdev;
printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
mdname(conf->mddev), STRIPE_SECTORS,
- (unsigned long long)sh->sector + rdev->data_offset,
+ (unsigned long long)(sh->sector + rdev->data_offset),
bdevname(rdev->bdev, b));
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
@@ -1160,13 +1160,13 @@ static void raid5_end_read_request(struct bio * bi, int error)
if (conf->mddev->degraded)
printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
mdname(conf->mddev),
- (unsigned long long)sh->sector + rdev->data_offset,
+ (unsigned long long)(sh->sector + rdev->data_offset),
bdn);
else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
/* Oh, no!!! */
printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
mdname(conf->mddev),
- (unsigned long long)sh->sector + rdev->data_offset,
+ (unsigned long long)(sh->sector + rdev->data_offset),
bdn);
else if (atomic_read(&rdev->read_errors)
> conf->max_nr_stripes)
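The three printk fixes above are pure operator precedence: a cast binds tighter than '+', so the old `(unsigned long long)sh->sector + rdev->data_offset` converts only sh->sector before the addition, while the parenthesized form converts the already-computed sector_t sum, which is what the message means to print. A two-line demonstration of the parse in plain C:

    #include <stdio.h>

    int main(void)
    {
        unsigned int a = 10, b = 3;

        unsigned long long x = (unsigned long long)a + b;   /* ((ull)a) + b */
        unsigned long long y = (unsigned long long)(a + b); /* (ull)(a + b) */

        printf("%llu %llu\n", x, y);  /* both 13 here; they diverge only if
                                       * a + b wraps in the narrower type */
        return 0;
    }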
@@ -2348,25 +2348,15 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks)
{
+ int canceled_check = 0;
+
set_bit(STRIPE_HANDLE, &sh->state);
- /* Take one of the following actions:
- * 1/ start a check parity operation if (uptodate == disks)
- * 2/ finish a check parity operation and act on the result
- * 3/ skip to the writeback section if we previously
- * initiated a recovery operation
- */
- if (s->failed == 0 &&
- !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
- if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
- BUG_ON(s->uptodate != disks);
- clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
- sh->ops.count++;
- s->uptodate--;
- } else if (
- test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
- clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
- clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+ /* complete a check operation */
+ if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
+ clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
+ clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+ if (s->failed == 0) {
if (sh->ops.zero_sum_result == 0)
/* parity is correct (on disc,
* not in buffer any more)
@@ -2391,7 +2381,8 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
s->uptodate++;
}
}
- }
+ } else
+ canceled_check = 1; /* STRIPE_INSYNC is not set */
}
/* check if we can clear a parity disk reconstruct */
@@ -2404,12 +2395,28 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
}
+ /* start a new check operation if there are no failures, the stripe is
+ * not insync, and a repair is not in flight
+ */
+ if (s->failed == 0 &&
+ !test_bit(STRIPE_INSYNC, &sh->state) &&
+ !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+ if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+ BUG_ON(s->uptodate != disks);
+ clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+ sh->ops.count++;
+ s->uptodate--;
+ }
+ }
+
/* Wait for check parity and compute block operations to complete
- * before write-back
+ * before write-back. If a failure occurred while the check operation
+ * was in flight we need to cycle this stripe through handle_stripe
+ * since the parity block may not be uptodate
*/
- if (!test_bit(STRIPE_INSYNC, &sh->state) &&
- !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
- !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
+ if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
+ !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
+ !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
struct r5dev *dev;
/* either failed parity check, or recovery is happening */
if (s->failed == 0)