Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c              |  10
-rw-r--r--  drivers/md/dm-io.c               |  27
-rw-r--r--  drivers/md/dm-kcopyd.c           | 168
-rw-r--r--  drivers/md/dm-log.c              |   3
-rw-r--r--  drivers/md/dm-mpath.c            |   2
-rw-r--r--  drivers/md/dm-raid1.c            |  10
-rw-r--r--  drivers/md/dm-snap-persistent.c  |  13
-rw-r--r--  drivers/md/dm-snap.c             |  10
-rw-r--r--  drivers/md/dm-table.c            |  23
-rw-r--r--  drivers/md/md.c                  |  23
-rw-r--r--  drivers/md/multipath.c           |  60
-rw-r--r--  drivers/md/multipath.h           |   1
-rw-r--r--  drivers/md/raid1.c               | 506
-rw-r--r--  drivers/md/raid1.h               |   4
-rw-r--r--  drivers/md/raid10.c              | 424
-rw-r--r--  drivers/md/raid5.c               |  43
16 files changed, 672 insertions, 655 deletions
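
Across the device-mapper hunks below, dm_io_client_create() loses its num_pages argument and dm_kcopyd_client_create() changes from returning an int through an output parameter to returning the client pointer itself, with failure encoded via ERR_PTR() and recovered by callers through IS_ERR()/PTR_ERR(). A minimal userspace sketch of that return convention follows; the ERR_PTR helpers and the client type here are simplified stand-ins for illustration, not the kernel definitions.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Userspace stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() helpers. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-4095;
}

/* Hypothetical client type standing in for struct dm_kcopyd_client. */
struct client { int dummy; };

/* Returns a client on success or an ERR_PTR()-encoded errno on failure,
 * mirroring the shape of the post-patch dm_kcopyd_client_create(). */
static struct client *client_create(void)
{
	struct client *c = malloc(sizeof(*c));

	if (!c)
		return ERR_PTR(-ENOMEM);
	return c;
}

int main(void)
{
	struct client *c = client_create();

	if (IS_ERR(c)) {
		/* Callers recover the errno with PTR_ERR(), as the updated
		 * dm-raid1 and dm-snap constructors do below. */
		fprintf(stderr, "create failed: %ld\n", PTR_ERR(c));
		return 1;
	}
	puts("client created");
	free(c);
	return 0;
}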
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5c9362792f1..70bd738b8b9 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -493,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap) spin_unlock_irqrestore(&bitmap->lock, flags); sb = kmap_atomic(bitmap->sb_page, KM_USER0); sb->events = cpu_to_le64(bitmap->mddev->events); - if (bitmap->mddev->events < bitmap->events_cleared) { + if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */ bitmap->events_cleared = bitmap->mddev->events; - sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - } + sb->events_cleared = cpu_to_le64(bitmap->events_cleared); + sb->state = cpu_to_le32(bitmap->flags); /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); @@ -618,7 +618,7 @@ success: if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) bitmap->flags |= BITMAP_HOSTENDIAN; bitmap->events_cleared = le64_to_cpu(sb->events_cleared); - if (sb->state & cpu_to_le32(BITMAP_STALE)) + if (bitmap->flags & BITMAP_STALE) bitmap->events_cleared = bitmap->mddev->events; err = 0; out: @@ -652,9 +652,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, switch (op) { case MASK_SET: sb->state |= cpu_to_le32(bits); + bitmap->flags |= bits; break; case MASK_UNSET: sb->state &= cpu_to_le32(~bits); + bitmap->flags &= ~bits; break; default: BUG(); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 76a5af00a26..2067288f61f 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -19,6 +19,8 @@ #define DM_MSG_PREFIX "io" #define DM_IO_MAX_REGIONS BITS_PER_LONG +#define MIN_IOS 16 +#define MIN_BIOS 16 struct dm_io_client { mempool_t *pool; @@ -41,33 +43,21 @@ struct io { static struct kmem_cache *_dm_io_cache; /* - * io contexts are only dynamically allocated for asynchronous - * io. Since async io is likely to be the majority of io we'll - * have the same number of io contexts as bios! (FIXME: must reduce this). - */ - -static unsigned int pages_to_ios(unsigned int pages) -{ - return 4 * pages; /* too many ? */ -} - -/* * Create a client with mempool and bioset. 
*/ -struct dm_io_client *dm_io_client_create(unsigned num_pages) +struct dm_io_client *dm_io_client_create(void) { - unsigned ios = pages_to_ios(num_pages); struct dm_io_client *client; client = kmalloc(sizeof(*client), GFP_KERNEL); if (!client) return ERR_PTR(-ENOMEM); - client->pool = mempool_create_slab_pool(ios, _dm_io_cache); + client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache); if (!client->pool) goto bad; - client->bios = bioset_create(16, 0); + client->bios = bioset_create(MIN_BIOS, 0); if (!client->bios) goto bad; @@ -81,13 +71,6 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages) } EXPORT_SYMBOL(dm_io_client_create); -int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client) -{ - return mempool_resize(client->pool, pages_to_ios(num_pages), - GFP_KERNEL); -} -EXPORT_SYMBOL(dm_io_client_resize); - void dm_io_client_destroy(struct dm_io_client *client) { mempool_destroy(client->pool); diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 1bb73a13ca4..819e37eaaeb 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -27,15 +27,19 @@ #include "dm.h" +#define SUB_JOB_SIZE 128 +#define SPLIT_COUNT 8 +#define MIN_JOBS 8 +#define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE)) + /*----------------------------------------------------------------- * Each kcopyd client has its own little pool of preallocated * pages for kcopyd io. *---------------------------------------------------------------*/ struct dm_kcopyd_client { - spinlock_t lock; struct page_list *pages; - unsigned int nr_pages; - unsigned int nr_free_pages; + unsigned nr_reserved_pages; + unsigned nr_free_pages; struct dm_io_client *io_client; @@ -67,15 +71,18 @@ static void wake(struct dm_kcopyd_client *kc) queue_work(kc->kcopyd_wq, &kc->kcopyd_work); } -static struct page_list *alloc_pl(void) +/* + * Obtain one page for the use of kcopyd. + */ +static struct page_list *alloc_pl(gfp_t gfp) { struct page_list *pl; - pl = kmalloc(sizeof(*pl), GFP_KERNEL); + pl = kmalloc(sizeof(*pl), gfp); if (!pl) return NULL; - pl->page = alloc_page(GFP_KERNEL); + pl->page = alloc_page(gfp); if (!pl->page) { kfree(pl); return NULL; @@ -90,41 +97,56 @@ static void free_pl(struct page_list *pl) kfree(pl); } -static int kcopyd_get_pages(struct dm_kcopyd_client *kc, - unsigned int nr, struct page_list **pages) +/* + * Add the provided pages to a client's free page list, releasing + * back to the system any beyond the reserved_pages limit. 
+ */ +static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) { - struct page_list *pl; - - spin_lock(&kc->lock); - if (kc->nr_free_pages < nr) { - spin_unlock(&kc->lock); - return -ENOMEM; - } - - kc->nr_free_pages -= nr; - for (*pages = pl = kc->pages; --nr; pl = pl->next) - ; + struct page_list *next; - kc->pages = pl->next; - pl->next = NULL; + do { + next = pl->next; - spin_unlock(&kc->lock); + if (kc->nr_free_pages >= kc->nr_reserved_pages) + free_pl(pl); + else { + pl->next = kc->pages; + kc->pages = pl; + kc->nr_free_pages++; + } - return 0; + pl = next; + } while (pl); } -static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) +static int kcopyd_get_pages(struct dm_kcopyd_client *kc, + unsigned int nr, struct page_list **pages) { - struct page_list *cursor; + struct page_list *pl; + + *pages = NULL; + + do { + pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY); + if (unlikely(!pl)) { + /* Use reserved pages */ + pl = kc->pages; + if (unlikely(!pl)) + goto out_of_memory; + kc->pages = pl->next; + kc->nr_free_pages--; + } + pl->next = *pages; + *pages = pl; + } while (--nr); - spin_lock(&kc->lock); - for (cursor = pl; cursor->next; cursor = cursor->next) - kc->nr_free_pages++; + return 0; - kc->nr_free_pages++; - cursor->next = kc->pages; - kc->pages = pl; - spin_unlock(&kc->lock); +out_of_memory: + if (*pages) + kcopyd_put_pages(kc, *pages); + return -ENOMEM; } /* @@ -141,13 +163,16 @@ static void drop_pages(struct page_list *pl) } } -static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr) +/* + * Allocate and reserve nr_pages for the use of a specific client. + */ +static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages) { - unsigned int i; + unsigned i; struct page_list *pl = NULL, *next; - for (i = 0; i < nr; i++) { - next = alloc_pl(); + for (i = 0; i < nr_pages; i++) { + next = alloc_pl(GFP_KERNEL); if (!next) { if (pl) drop_pages(pl); @@ -157,17 +182,18 @@ static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr) pl = next; } + kc->nr_reserved_pages += nr_pages; kcopyd_put_pages(kc, pl); - kc->nr_pages += nr; + return 0; } static void client_free_pages(struct dm_kcopyd_client *kc) { - BUG_ON(kc->nr_free_pages != kc->nr_pages); + BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages); drop_pages(kc->pages); kc->pages = NULL; - kc->nr_free_pages = kc->nr_pages = 0; + kc->nr_free_pages = kc->nr_reserved_pages = 0; } /*----------------------------------------------------------------- @@ -216,16 +242,17 @@ struct kcopyd_job { struct mutex lock; atomic_t sub_jobs; sector_t progress; -}; -/* FIXME: this should scale with the number of pages */ -#define MIN_JOBS 512 + struct kcopyd_job *master_job; +}; static struct kmem_cache *_job_cache; int __init dm_kcopyd_init(void) { - _job_cache = KMEM_CACHE(kcopyd_job, 0); + _job_cache = kmem_cache_create("kcopyd_job", + sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1), + __alignof__(struct kcopyd_job), 0, NULL); if (!_job_cache) return -ENOMEM; @@ -299,7 +326,12 @@ static int run_complete_job(struct kcopyd_job *job) if (job->pages) kcopyd_put_pages(kc, job->pages); - mempool_free(job, kc->job_pool); + /* + * If this is the master job, the sub jobs have already + * completed so we can free everything. 
+ */ + if (job->master_job == job) + mempool_free(job, kc->job_pool); fn(read_err, write_err, context); if (atomic_dec_and_test(&kc->nr_jobs)) @@ -460,14 +492,14 @@ static void dispatch_job(struct kcopyd_job *job) wake(kc); } -#define SUB_JOB_SIZE 128 static void segment_complete(int read_err, unsigned long write_err, void *context) { /* FIXME: tidy this function */ sector_t progress = 0; sector_t count = 0; - struct kcopyd_job *job = (struct kcopyd_job *) context; + struct kcopyd_job *sub_job = (struct kcopyd_job *) context; + struct kcopyd_job *job = sub_job->master_job; struct dm_kcopyd_client *kc = job->kc; mutex_lock(&job->lock); @@ -498,8 +530,6 @@ static void segment_complete(int read_err, unsigned long write_err, if (count) { int i; - struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool, - GFP_NOIO); *sub_job = *job; sub_job->source.sector += progress; @@ -511,7 +541,7 @@ static void segment_complete(int read_err, unsigned long write_err, } sub_job->fn = segment_complete; - sub_job->context = job; + sub_job->context = sub_job; dispatch_job(sub_job); } else if (atomic_dec_and_test(&job->sub_jobs)) { @@ -531,19 +561,19 @@ static void segment_complete(int read_err, unsigned long write_err, } /* - * Create some little jobs that will do the move between - * them. + * Create some sub jobs to share the work between them. */ -#define SPLIT_COUNT 8 -static void split_job(struct kcopyd_job *job) +static void split_job(struct kcopyd_job *master_job) { int i; - atomic_inc(&job->kc->nr_jobs); + atomic_inc(&master_job->kc->nr_jobs); - atomic_set(&job->sub_jobs, SPLIT_COUNT); - for (i = 0; i < SPLIT_COUNT; i++) - segment_complete(0, 0u, job); + atomic_set(&master_job->sub_jobs, SPLIT_COUNT); + for (i = 0; i < SPLIT_COUNT; i++) { + master_job[i + 1].master_job = master_job; + segment_complete(0, 0u, &master_job[i + 1]); + } } int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, @@ -553,7 +583,8 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, struct kcopyd_job *job; /* - * Allocate a new job. + * Allocate an array of jobs consisting of one master job + * followed by SPLIT_COUNT sub jobs. 
*/ job = mempool_alloc(kc->job_pool, GFP_NOIO); @@ -577,10 +608,10 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, job->fn = fn; job->context = context; + job->master_job = job; - if (job->source.count < SUB_JOB_SIZE) + if (job->source.count <= SUB_JOB_SIZE) dispatch_job(job); - else { mutex_init(&job->lock); job->progress = 0; @@ -606,17 +637,15 @@ int kcopyd_cancel(struct kcopyd_job *job, int block) /*----------------------------------------------------------------- * Client setup *---------------------------------------------------------------*/ -int dm_kcopyd_client_create(unsigned int nr_pages, - struct dm_kcopyd_client **result) +struct dm_kcopyd_client *dm_kcopyd_client_create(void) { int r = -ENOMEM; struct dm_kcopyd_client *kc; kc = kmalloc(sizeof(*kc), GFP_KERNEL); if (!kc) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - spin_lock_init(&kc->lock); spin_lock_init(&kc->job_lock); INIT_LIST_HEAD(&kc->complete_jobs); INIT_LIST_HEAD(&kc->io_jobs); @@ -633,12 +662,12 @@ int dm_kcopyd_client_create(unsigned int nr_pages, goto bad_workqueue; kc->pages = NULL; - kc->nr_pages = kc->nr_free_pages = 0; - r = client_alloc_pages(kc, nr_pages); + kc->nr_reserved_pages = kc->nr_free_pages = 0; + r = client_reserve_pages(kc, RESERVE_PAGES); if (r) goto bad_client_pages; - kc->io_client = dm_io_client_create(nr_pages); + kc->io_client = dm_io_client_create(); if (IS_ERR(kc->io_client)) { r = PTR_ERR(kc->io_client); goto bad_io_client; @@ -647,8 +676,7 @@ int dm_kcopyd_client_create(unsigned int nr_pages, init_waitqueue_head(&kc->destroyq); atomic_set(&kc->nr_jobs, 0); - *result = kc; - return 0; + return kc; bad_io_client: client_free_pages(kc); @@ -659,7 +687,7 @@ bad_workqueue: bad_slab: kfree(kc); - return r; + return ERR_PTR(r); } EXPORT_SYMBOL(dm_kcopyd_client_create); diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index a1f32188967..948e3f4925b 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -449,8 +449,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, lc->io_req.mem.type = DM_IO_VMA; lc->io_req.notify.fn = NULL; - lc->io_req.client = dm_io_client_create(dm_div_up(buf_size, - PAGE_SIZE)); + lc->io_req.client = dm_io_client_create(); if (IS_ERR(lc->io_req.client)) { r = PTR_ERR(lc->io_req.client); DMWARN("couldn't allocate disk io client"); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index a550a057d99..aa4e570c2cb 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1290,7 +1290,7 @@ static int do_end_io(struct multipath *m, struct request *clone, if (!error && !clone->errors) return 0; /* I/O complete */ - if (error == -EOPNOTSUPP || error == -EREMOTEIO) + if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ) return error; if (mpio->pgpath) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 976ad4688af..9bfd057be68 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -22,8 +22,6 @@ #define DM_MSG_PREFIX "raid1" #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. 
*/ -#define DM_IO_PAGES 64 -#define DM_KCOPYD_PAGES 64 #define DM_RAID1_HANDLE_ERRORS 0x01 #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) @@ -887,7 +885,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, return NULL; } - ms->io_client = dm_io_client_create(DM_IO_PAGES); + ms->io_client = dm_io_client_create(); if (IS_ERR(ms->io_client)) { ti->error = "Error creating dm_io client"; mempool_destroy(ms->read_record_pool); @@ -1117,9 +1115,11 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err_destroy_wq; } - r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client); - if (r) + ms->kcopyd_client = dm_kcopyd_client_create(); + if (IS_ERR(ms->kcopyd_client)) { + r = PTR_ERR(ms->kcopyd_client); goto err_destroy_wq; + } wakeup_mirrord(ms); return 0; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 95891dfcbca..135c2f1fdbf 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -154,11 +154,6 @@ struct pstore { struct workqueue_struct *metadata_wq; }; -static unsigned sectors_to_pages(unsigned sectors) -{ - return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9); -} - static int alloc_area(struct pstore *ps) { int r = -ENOMEM; @@ -318,8 +313,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) chunk_size_supplied = 0; } - ps->io_client = dm_io_client_create(sectors_to_pages(ps->store-> - chunk_size)); + ps->io_client = dm_io_client_create(); if (IS_ERR(ps->io_client)) return PTR_ERR(ps->io_client); @@ -368,11 +362,6 @@ static int read_header(struct pstore *ps, int *new_snapshot) return r; } - r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size), - ps->io_client); - if (r) - return r; - r = alloc_area(ps); return r; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a2d330942cb..9ecff5f3023 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -40,11 +40,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; #define SNAPSHOT_COPY_PRIORITY 2 /* - * Reserve 1MB for each snapshot initially (with minimum of 1 page). - */ -#define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1) - -/* * The size of the mempool used to track chunks in use. */ #define MIN_IOS 256 @@ -1116,8 +1111,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_hash_tables; } - r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); - if (r) { + s->kcopyd_client = dm_kcopyd_client_create(); + if (IS_ERR(s->kcopyd_client)) { + r = PTR_ERR(s->kcopyd_client); ti->error = "Could not create kcopyd client"; goto bad_kcopyd; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index cb8380c9767..451c3bb176d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -362,6 +362,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { + struct request_queue *q; struct queue_limits *limits = data; struct block_device *bdev = dev->bdev; sector_t dev_size = @@ -370,6 +371,22 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, limits->logical_block_size >> SECTOR_SHIFT; char b[BDEVNAME_SIZE]; + /* + * Some devices exist without request functions, + * such as loop devices not yet bound to backing files. + * Forbid the use of such devices. 
+ */ + q = bdev_get_queue(bdev); + if (!q || !q->make_request_fn) { + DMWARN("%s: %s is not yet initialised: " + "start=%llu, len=%llu, dev_size=%llu", + dm_device_name(ti->table->md), bdevname(bdev, b), + (unsigned long long)start, + (unsigned long long)len, + (unsigned long long)dev_size); + return 1; + } + if (!dev_size) return 0; @@ -1346,7 +1363,8 @@ bool dm_table_supports_discards(struct dm_table *t) return 0; /* - * Ensure that at least one underlying device supports discards. + * Unless any target used by the table set discards_supported, + * require at least one underlying device to support discards. * t->devices includes internal dm devices such as mirror logs * so we need to use iterate_devices here, which targets * supporting discard must provide. @@ -1354,6 +1372,9 @@ bool dm_table_supports_discards(struct dm_table *t) while (i < dm_table_get_num_targets(t)) { ti = dm_table_get_target(t, i++); + if (ti->discards_supported) + return 1; + if (ti->type->iterate_devices && ti->type->iterate_devices(ti, device_discard_capable, NULL)) return 1; diff --git a/drivers/md/md.c b/drivers/md/md.c index 7d6f7f18a92..aa640a85bb2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3324,7 +3324,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len) char *e; unsigned long long n = simple_strtoull(buf, &e, 10); - if (mddev->pers) + if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) return -EBUSY; if (cmd_match(buf, "none")) n = MaxSector; @@ -4347,13 +4347,19 @@ static int md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; + blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); /* Allow extended partitions. This makes the * 'mdp' device redundant, but we can't really * remove it now. */ disk->flags |= GENHD_FL_EXT_DEVT; - add_disk(disk); mddev->gendisk = disk; + /* As soon as we call add_disk(), another thread could get + * through to md_open, so make sure it doesn't get too far + */ + mutex_lock(&mddev->open_mutex); + add_disk(disk); + error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk_to_dev(disk)->kobj, "%s", "md"); if (error) { @@ -4367,8 +4373,7 @@ static int md_alloc(dev_t dev, char *name) if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_bitmap_group)) printk(KERN_DEBUG "pointless warning\n"); - - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); + mutex_unlock(&mddev->open_mutex); abort: mutex_unlock(&disks_mutex); if (!error && mddev->kobj.sd) { @@ -5211,6 +5216,16 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) } else super_types[mddev->major_version]. validate_super(mddev, rdev); + if ((info->state & (1<<MD_DISK_SYNC)) && + (!test_bit(In_sync, &rdev->flags) || + rdev->raid_disk != info->raid_disk)) { + /* This was a hot-add request, but events doesn't + * match, so reject it. 
+ */ + export_rdev(rdev); + return -EINVAL; + } + if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = rdev->raid_disk; else diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index c3589099098..3535c23af28 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -146,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) int i; seq_printf (seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", conf->multipaths[i].rdev && @@ -186,35 +186,36 @@ static int multipath_congested(void *data, int bits) static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) { multipath_conf_t *conf = mddev->private; + char b[BDEVNAME_SIZE]; - if (conf->working_disks <= 1) { + if (conf->raid_disks - mddev->degraded <= 1) { /* * Uh oh, we can do nothing if this is our last path, but * first check if this is a queued request for a device * which has just failed. */ printk(KERN_ALERT - "multipath: only one IO path left and IO error.\n"); + "multipath: only one IO path left and IO error.\n"); /* leave it active... it's all we have */ - } else { - /* - * Mark disk as unusable - */ - if (!test_bit(Faulty, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - clear_bit(In_sync, &rdev->flags); - set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - conf->working_disks--; - mddev->degraded++; - printk(KERN_ALERT "multipath: IO failure on %s," - " disabling IO path.\n" - "multipath: Operation continuing" - " on %d IO paths.\n", - bdevname (rdev->bdev,b), - conf->working_disks); - } + return; + } + /* + * Mark disk as unusable + */ + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded++; + spin_unlock_irqrestore(&conf->device_lock, flags); } + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT "multipath: IO failure on %s," + " disabling IO path.\n" + "multipath: Operation continuing" + " on %d IO paths.\n", + bdevname(rdev->bdev, b), + conf->raid_disks - mddev->degraded); } static void print_multipath_conf (multipath_conf_t *conf) @@ -227,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf) printk("(conf==NULL)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { @@ -274,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) PAGE_CACHE_SIZE - 1); } - conf->working_disks++; + spin_lock_irq(&conf->device_lock); mddev->degraded--; rdev->raid_disk = path; set_bit(In_sync, &rdev->flags); + spin_unlock_irq(&conf->device_lock); rcu_assign_pointer(p->rdev, rdev); err = 0; md_integrity_add_rdev(rdev, mddev); @@ -391,6 +393,7 @@ static int multipath_run (mddev_t *mddev) int disk_idx; struct multipath_info *disk; mdk_rdev_t *rdev; + int working_disks; if (md_check_no_bitmap(mddev)) return -EINVAL; @@ -424,7 +427,7 @@ static int multipath_run (mddev_t *mddev) goto out_free_conf; } - conf->working_disks = 0; + working_disks = 0; list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx < 0 || @@ -446,7 +449,7 @@ static int multipath_run (mddev_t *mddev) } if (!test_bit(Faulty, &rdev->flags)) - conf->working_disks++; + working_disks++; } conf->raid_disks = mddev->raid_disks; @@ -454,12 +457,12 @@ static int 
multipath_run (mddev_t *mddev) spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); - if (!conf->working_disks) { + if (!working_disks) { printk(KERN_ERR "multipath: no operational IO paths for %s\n", mdname(mddev)); goto out_free_conf; } - mddev->degraded = conf->raid_disks - conf->working_disks; + mddev->degraded = conf->raid_disks - working_disks; conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, sizeof(struct multipath_bh)); @@ -481,7 +484,8 @@ static int multipath_run (mddev_t *mddev) printk(KERN_INFO "multipath: array %s active with %d out of %d IO paths\n", - mdname(mddev), conf->working_disks, mddev->raid_disks); + mdname(mddev), conf->raid_disks - mddev->degraded, + mddev->raid_disks); /* * Ok, everything is just fine now */ diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index d1c2a8d7839..3c5a45eb5f8 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h @@ -9,7 +9,6 @@ struct multipath_private_data { mddev_t *mddev; struct multipath_info *multipaths; int raid_disks; - int working_disks; spinlock_t device_lock; struct list_head retry_list; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 2b7a7ff401d..5d096096f95 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -297,23 +297,24 @@ static void raid1_end_read_request(struct bio *bio, int error) rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } -static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv, - int behind) +static void r1_bio_write_done(r1bio_t *r1_bio) { if (atomic_dec_and_test(&r1_bio->remaining)) { /* it really is the end of this request */ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { /* free extra copy of the data pages */ - int i = vcnt; + int i = r1_bio->behind_page_count; while (i--) - safe_put_page(bv[i].bv_page); + safe_put_page(r1_bio->behind_pages[i]); + kfree(r1_bio->behind_pages); + r1_bio->behind_pages = NULL; } /* clear the bitmap if all writes complete successfully */ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, r1_bio->sectors, !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); + test_bit(R1BIO_BehindIO, &r1_bio->state)); md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } @@ -386,7 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error) * Let's see if all mirrored write operations have finished * already. */ - r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind); + r1_bio_write_done(r1_bio); if (to_put) bio_put(to_put); @@ -411,10 +412,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const sector_t this_sector = r1_bio->sector; const int sectors = r1_bio->sectors; - int new_disk = -1; int start_disk; + int best_disk; int i; - sector_t new_distance, current_distance; + sector_t best_dist; mdk_rdev_t *rdev; int choose_first; @@ -425,6 +426,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) * We take the first readable disk when above the resync window. 
*/ retry: + best_disk = -1; + best_dist = MaxSector; if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { choose_first = 1; @@ -434,8 +437,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) start_disk = conf->last_used; } - /* make sure the disk is operational */ for (i = 0 ; i < conf->raid_disks ; i++) { + sector_t dist; int disk = start_disk + i; if (disk >= conf->raid_disks) disk -= conf->raid_disks; @@ -443,60 +446,43 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED || rdev == NULL - || !test_bit(In_sync, &rdev->flags)) + || test_bit(Faulty, &rdev->flags)) continue; - - new_disk = disk; - if (!test_bit(WriteMostly, &rdev->flags)) - break; - } - - if (new_disk < 0 || choose_first) - goto rb_out; - - /* - * Don't change to another disk for sequential reads: - */ - if (conf->next_seq_sect == this_sector) - goto rb_out; - if (this_sector == conf->mirrors[new_disk].head_position) - goto rb_out; - - current_distance = abs(this_sector - - conf->mirrors[new_disk].head_position); - - /* look for a better disk - i.e. head is closer */ - start_disk = new_disk; - for (i = 1; i < conf->raid_disks; i++) { - int disk = start_disk + 1; - if (disk >= conf->raid_disks) - disk -= conf->raid_disks; - - rdev = rcu_dereference(conf->mirrors[disk].rdev); - if (r1_bio->bios[disk] == IO_BLOCKED - || rdev == NULL - || !test_bit(In_sync, &rdev->flags) - || test_bit(WriteMostly, &rdev->flags)) + if (!test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < this_sector + sectors) continue; - - if (!atomic_read(&rdev->nr_pending)) { - new_disk = disk; + if (test_bit(WriteMostly, &rdev->flags)) { + /* Don't balance among write-mostly, just + * use the first as a last resort */ + if (best_disk < 0) + best_disk = disk; + continue; + } + /* This is a reasonable device to use. It might + * even be best. 
+ */ + dist = abs(this_sector - conf->mirrors[disk].head_position); + if (choose_first + /* Don't change to another disk for sequential reads */ + || conf->next_seq_sect == this_sector + || dist == 0 + /* If device is idle, use it */ + || atomic_read(&rdev->nr_pending) == 0) { + best_disk = disk; break; } - new_distance = abs(this_sector - conf->mirrors[disk].head_position); - if (new_distance < current_distance) { - current_distance = new_distance; - new_disk = disk; + if (dist < best_dist) { + best_dist = dist; + best_disk = disk; } } - rb_out: - if (new_disk >= 0) { - rdev = rcu_dereference(conf->mirrors[new_disk].rdev); + if (best_disk >= 0) { + rdev = rcu_dereference(conf->mirrors[best_disk].rdev); if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); - if (!test_bit(In_sync, &rdev->flags)) { + if (test_bit(Faulty, &rdev->flags)) { /* cannot risk returning a device that failed * before we inc'ed nr_pending */ @@ -504,11 +490,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) goto retry; } conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; + conf->last_used = best_disk; } rcu_read_unlock(); - return new_disk; + return best_disk; } static int raid1_congested(void *data, int bits) @@ -675,37 +661,36 @@ static void unfreeze_array(conf_t *conf) /* duplicate the data pages for behind I/O - * We return a list of bio_vec rather than just page pointers - * as it makes freeing easier */ -static struct bio_vec *alloc_behind_pages(struct bio *bio) +static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) { int i; struct bio_vec *bvec; - struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), + struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), GFP_NOIO); if (unlikely(!pages)) - goto do_sync_io; + return; bio_for_each_segment(bvec, bio, i) { - pages[i].bv_page = alloc_page(GFP_NOIO); - if (unlikely(!pages[i].bv_page)) + pages[i] = alloc_page(GFP_NOIO); + if (unlikely(!pages[i])) goto do_sync_io; - memcpy(kmap(pages[i].bv_page) + bvec->bv_offset, + memcpy(kmap(pages[i]) + bvec->bv_offset, kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); - kunmap(pages[i].bv_page); + kunmap(pages[i]); kunmap(bvec->bv_page); } - - return pages; + r1_bio->behind_pages = pages; + r1_bio->behind_page_count = bio->bi_vcnt; + set_bit(R1BIO_BehindIO, &r1_bio->state); + return; do_sync_io: - if (pages) - for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++) - put_page(pages[i].bv_page); + for (i = 0; i < bio->bi_vcnt; i++) + if (pages[i]) + put_page(pages[i]); kfree(pages); PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); - return NULL; } static int make_request(mddev_t *mddev, struct bio * bio) @@ -717,7 +702,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) int i, targets = 0, disks; struct bitmap *bitmap; unsigned long flags; - struct bio_vec *behind_pages = NULL; const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); @@ -870,9 +854,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait) && - (behind_pages = alloc_behind_pages(bio)) != NULL) - set_bit(R1BIO_BehindIO, &r1_bio->state); + !waitqueue_active(&bitmap->behind_wait)) + alloc_behind_pages(bio, r1_bio); atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); @@ -893,7 +876,7 @@ static int 
make_request(mddev_t *mddev, struct bio * bio) mbio->bi_rw = WRITE | do_flush_fua | do_sync; mbio->bi_private = r1_bio; - if (behind_pages) { + if (r1_bio->behind_pages) { struct bio_vec *bvec; int j; @@ -905,7 +888,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) * them all */ __bio_for_each_segment(bvec, mbio, j, 0) - bvec->bv_page = behind_pages[j].bv_page; + bvec->bv_page = r1_bio->behind_pages[j]; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } @@ -915,8 +898,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); } - r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL); - kfree(behind_pages); /* the behind pages are attached to the bios now */ + r1_bio_write_done(r1_bio); /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); @@ -1196,194 +1178,210 @@ static void end_sync_write(struct bio *bio, int error) } } -static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) +static int fix_sync_read_error(r1bio_t *r1_bio) { + /* Try some synchronous reads of other devices to get + * good data, much like with normal read errors. Only + * read into the pages we already have so we don't + * need to re-issue the read request. + * We don't need to freeze the array, because being in an + * active sync request, there is no normal IO, and + * no overlapping syncs. + */ + mddev_t *mddev = r1_bio->mddev; conf_t *conf = mddev->private; - int i; - int disks = conf->raid_disks; - struct bio *bio, *wbio; - - bio = r1_bio->bios[r1_bio->read_disk]; + struct bio *bio = r1_bio->bios[r1_bio->read_disk]; + sector_t sect = r1_bio->sector; + int sectors = r1_bio->sectors; + int idx = 0; + while(sectors) { + int s = sectors; + int d = r1_bio->read_disk; + int success = 0; + mdk_rdev_t *rdev; + int start; - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* We have read all readable devices. If we haven't - * got the block, then there is no hope left. - * If we have, then we want to do a comparison - * and skip the write if everything is the same. 
- * If any blocks failed to read, then we need to - * attempt an over-write - */ - int primary; - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { - for (i=0; i<mddev->raid_disks; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_read) - md_error(mddev, conf->mirrors[i].rdev); + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + do { + if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + READ, false)) { + success = 1; + break; + } + } + d++; + if (d == conf->raid_disks) + d = 0; + } while (!success && d != r1_bio->read_disk); - md_done_sync(mddev, r1_bio->sectors, 1); + if (!success) { + char b[BDEVNAME_SIZE]; + /* Cannot read from anywhere, array is toast */ + md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" + " for block %llu\n", + mdname(mddev), + bdevname(bio->bi_bdev, b), + (unsigned long long)r1_bio->sector); + md_done_sync(mddev, r1_bio->sectors, 0); put_buf(r1_bio); - return; + return 0; } - for (primary=0; primary<mddev->raid_disks; primary++) - if (r1_bio->bios[primary]->bi_end_io == end_sync_read && - test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { - r1_bio->bios[primary]->bi_end_io = NULL; - rdev_dec_pending(conf->mirrors[primary].rdev, mddev); - break; - } - r1_bio->read_disk = primary; - for (i=0; i<mddev->raid_disks; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_read) { - int j; - int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); - struct bio *pbio = r1_bio->bios[primary]; - struct bio *sbio = r1_bio->bios[i]; - - if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { - for (j = vcnt; j-- ; ) { - struct page *p, *s; - p = pbio->bi_io_vec[j].bv_page; - s = sbio->bi_io_vec[j].bv_page; - if (memcmp(page_address(p), - page_address(s), - PAGE_SIZE)) - break; - } - } else - j = 0; - if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; - if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { - sbio->bi_end_io = NULL; - rdev_dec_pending(conf->mirrors[i].rdev, mddev); - } else { - /* fixup the bio for reuse */ - int size; - sbio->bi_vcnt = vcnt; - sbio->bi_size = r1_bio->sectors << 9; - sbio->bi_idx = 0; - sbio->bi_phys_segments = 0; - sbio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bi_next = NULL; - sbio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - sbio->bi_bdev = conf->mirrors[i].rdev->bdev; - size = sbio->bi_size; - for (j = 0; j < vcnt ; j++) { - struct bio_vec *bi; - bi = &sbio->bi_io_vec[j]; - bi->bv_offset = 0; - if (size > PAGE_SIZE) - bi->bv_len = PAGE_SIZE; - else - bi->bv_len = size; - size -= PAGE_SIZE; - memcpy(page_address(bi->bv_page), - page_address(pbio->bi_io_vec[j].bv_page), - PAGE_SIZE); - } - } - } + start = d; + /* write it back and re-read */ + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + WRITE, false) == 0) { + r1_bio->bios[d]->bi_end_io = NULL; + rdev_dec_pending(rdev, mddev); + md_error(mddev, rdev); + } else + atomic_add(s, &rdev->corrected_errors); + } + d = start; + while (d != r1_bio->read_disk) { + if (d == 0) + d = 
conf->raid_disks; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + READ, false) == 0) + md_error(mddev, rdev); + } + sectors -= s; + sect += s; + idx ++; } - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* ouch - failed to read all of that. - * Try some synchronous reads of other devices to get - * good data, much like with normal read errors. Only - * read into the pages we already have so we don't - * need to re-issue the read request. - * We don't need to freeze the array, because being in an - * active sync request, there is no normal IO, and - * no overlapping syncs. - */ - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - int idx = 0; - - while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - mdk_rdev_t *rdev; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - do { - if (r1_bio->bios[d]->bi_end_io == end_sync_read) { - /* No rcu protection needed here devices - * can only be removed when no resync is - * active, and resync is currently active - */ - rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, - sect, - s<<9, - bio->bi_io_vec[idx].bv_page, - READ, false)) { - success = 1; - break; - } - } - d++; - if (d == conf->raid_disks) - d = 0; - } while (!success && d != r1_bio->read_disk); - - if (success) { - int start = d; - /* write it back and re-read */ - set_bit(R1BIO_Uptodate, &r1_bio->state); - while (d != r1_bio->read_disk) { - if (d == 0) - d = conf->raid_disks; - d--; - if (r1_bio->bios[d]->bi_end_io != end_sync_read) - continue; - rdev = conf->mirrors[d].rdev; - atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev, - sect, - s<<9, - bio->bi_io_vec[idx].bv_page, - WRITE, false) == 0) - md_error(mddev, rdev); - } - d = start; - while (d != r1_bio->read_disk) { - if (d == 0) - d = conf->raid_disks; - d--; - if (r1_bio->bios[d]->bi_end_io != end_sync_read) - continue; - rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, - sect, - s<<9, - bio->bi_io_vec[idx].bv_page, - READ, false) == 0) - md_error(mddev, rdev); - } - } else { - char b[BDEVNAME_SIZE]; - /* Cannot read from anywhere, array is toast */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" - " for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev, b), - (unsigned long long)r1_bio->sector); - md_done_sync(mddev, r1_bio->sectors, 0); - put_buf(r1_bio); - return; + set_bit(R1BIO_Uptodate, &r1_bio->state); + set_bit(BIO_UPTODATE, &bio->bi_flags); + return 1; +} + +static int process_checks(r1bio_t *r1_bio) +{ + /* We have read all readable devices. If we haven't + * got the block, then there is no hope left. + * If we have, then we want to do a comparison + * and skip the write if everything is the same. 
+ * If any blocks failed to read, then we need to + * attempt an over-write + */ + mddev_t *mddev = r1_bio->mddev; + conf_t *conf = mddev->private; + int primary; + int i; + + for (primary = 0; primary < conf->raid_disks; primary++) + if (r1_bio->bios[primary]->bi_end_io == end_sync_read && + test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { + r1_bio->bios[primary]->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[primary].rdev, mddev); + break; + } + r1_bio->read_disk = primary; + for (i = 0; i < conf->raid_disks; i++) { + int j; + int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); + struct bio *pbio = r1_bio->bios[primary]; + struct bio *sbio = r1_bio->bios[i]; + int size; + + if (r1_bio->bios[i]->bi_end_io != end_sync_read) + continue; + + if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { + for (j = vcnt; j-- ; ) { + struct page *p, *s; + p = pbio->bi_io_vec[j].bv_page; + s = sbio->bi_io_vec[j].bv_page; + if (memcmp(page_address(p), + page_address(s), + PAGE_SIZE)) + break; } - sectors -= s; - sect += s; - idx ++; + } else + j = 0; + if (j >= 0) + mddev->resync_mismatches += r1_bio->sectors; + if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) + && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { + /* No need to write to this device. */ + sbio->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[i].rdev, mddev); + continue; + } + /* fixup the bio for reuse */ + sbio->bi_vcnt = vcnt; + sbio->bi_size = r1_bio->sectors << 9; + sbio->bi_idx = 0; + sbio->bi_phys_segments = 0; + sbio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bi_next = NULL; + sbio->bi_sector = r1_bio->sector + + conf->mirrors[i].rdev->data_offset; + sbio->bi_bdev = conf->mirrors[i].rdev->bdev; + size = sbio->bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &sbio->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; + memcpy(page_address(bi->bv_page), + page_address(pbio->bi_io_vec[j].bv_page), + PAGE_SIZE); } } + return 0; +} +static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) +{ + conf_t *conf = mddev->private; + int i; + int disks = conf->raid_disks; + struct bio *bio, *wbio; + + bio = r1_bio->bios[r1_bio->read_disk]; + + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) + /* ouch - failed to read all of that. */ + if (!fix_sync_read_error(r1_bio)) + return; + + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (process_checks(r1_bio) < 0) + return; /* * schedule writes */ @@ -2063,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && - mddev->recovery_cp == MaxSector) { + mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index cbfdf1a6acd..5fc4ca1af86 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -94,7 +94,9 @@ struct r1bio_s { int read_disk; struct list_head retry_list; - struct bitmap_update *bitmap_update; + /* Next two are only valid when R1BIO_BehindIO is set */ + struct page **behind_pages; + int behind_page_count; /* * if the IO is in WRITE direction, then multiple bios are used. * We choose the number when they are allocated. 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8e9462626ec..6e846688962 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error) */ set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); } else { /* - * oops, read error: + * oops, read error - keep the refcount on the rdev */ char b[BDEVNAME_SIZE]; if (printk_ratelimit()) @@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error) bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); reschedule_retry(r10_bio); } - - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); } static void raid10_end_write_request(struct bio *bio, int error) @@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q, static int read_balance(conf_t *conf, r10bio_t *r10_bio) { const sector_t this_sector = r10_bio->sector; - int disk, slot, nslot; + int disk, slot; const int sectors = r10_bio->sectors; - sector_t new_distance, current_distance; + sector_t new_distance, best_dist; mdk_rdev_t *rdev; + int do_balance; + int best_slot; raid10_find_phys(conf, r10_bio); rcu_read_lock(); +retry: + best_slot = -1; + best_dist = MaxSector; + do_balance = 1; /* * Check if we can balance. We can balance on the whole * device if no resync is going on (recovery is ok), or below @@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) * above the resync window. */ if (conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) { - /* make sure that disk is operational */ - slot = 0; - disk = r10_bio->devs[slot].devnum; - - while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || - r10_bio->devs[slot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) { - slot++; - if (slot == conf->copies) { - slot = 0; - disk = -1; - break; - } - disk = r10_bio->devs[slot].devnum; - } - goto rb_out; - } - + && (this_sector + sectors >= conf->next_resync)) + do_balance = 0; - /* make sure the disk is operational */ - slot = 0; - disk = r10_bio->devs[slot].devnum; - while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || - r10_bio->devs[slot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) { - slot ++; - if (slot == conf->copies) { - disk = -1; - goto rb_out; - } + for (slot = 0; slot < conf->copies ; slot++) { + if (r10_bio->devs[slot].bio == IO_BLOCKED) + continue; disk = r10_bio->devs[slot].devnum; - } - - - current_distance = abs(r10_bio->devs[slot].addr - - conf->mirrors[disk].head_position); - - /* Find the disk whose head is closest, - * or - for far > 1 - find the closest to partition beginning */ - - for (nslot = slot; nslot < conf->copies; nslot++) { - int ndisk = r10_bio->devs[nslot].devnum; - - - if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || - r10_bio->devs[nslot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (rdev == NULL) continue; + if (!test_bit(In_sync, &rdev->flags)) + continue; + + if (!do_balance) + break; /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later. 
*/ - if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) { - disk = ndisk; - slot = nslot; + if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) break; - } /* for far > 1 always use the lowest address */ if (conf->far_copies > 1) - new_distance = r10_bio->devs[nslot].addr; + new_distance = r10_bio->devs[slot].addr; else - new_distance = abs(r10_bio->devs[nslot].addr - - conf->mirrors[ndisk].head_position); - if (new_distance < current_distance) { - current_distance = new_distance; - disk = ndisk; - slot = nslot; + new_distance = abs(r10_bio->devs[slot].addr - + conf->mirrors[disk].head_position); + if (new_distance < best_dist) { + best_dist = new_distance; + best_slot = slot; } } + if (slot == conf->copies) + slot = best_slot; -rb_out: - r10_bio->read_slot = slot; -/* conf->next_seq_sect = this_sector + sectors;*/ - - if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) - atomic_inc(&conf->mirrors[disk].rdev->nr_pending); - else + if (slot >= 0) { + disk = r10_bio->devs[slot].devnum; + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (!rdev) + goto retry; + atomic_inc(&rdev->nr_pending); + if (test_bit(Faulty, &rdev->flags)) { + /* Cannot risk returning a device that failed + * before we inc'ed nr_pending + */ + rdev_dec_pending(rdev, conf->mddev); + goto retry; + } + r10_bio->read_slot = slot; + } else disk = -1; rcu_read_unlock(); @@ -1460,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) int max_read_errors = atomic_read(&mddev->max_corr_read_errors); int d = r10_bio->devs[r10_bio->read_slot].devnum; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev) { /* If rdev is not NULL */ - char b[BDEVNAME_SIZE]; - int cur_read_error_count = 0; + /* still own a reference to this rdev, so it cannot + * have been cleared recently. 
+ */ + rdev = conf->mirrors[d].rdev; - bdevname(rdev->bdev, b); + if (test_bit(Faulty, &rdev->flags)) + /* drive has already been failed, just ignore any + more fix_read_error() attempts */ + return; - if (test_bit(Faulty, &rdev->flags)) { - rcu_read_unlock(); - /* drive has already been failed, just ignore any - more fix_read_error() attempts */ - return; - } + check_decay_read_errors(mddev, rdev); + atomic_inc(&rdev->read_errors); + if (atomic_read(&rdev->read_errors) > max_read_errors) { + char b[BDEVNAME_SIZE]; + bdevname(rdev->bdev, b); - check_decay_read_errors(mddev, rdev); - atomic_inc(&rdev->read_errors); - cur_read_error_count = atomic_read(&rdev->read_errors); - if (cur_read_error_count > max_read_errors) { - rcu_read_unlock(); - printk(KERN_NOTICE - "md/raid10:%s: %s: Raid device exceeded " - "read_error threshold " - "[cur %d:max %d]\n", - mdname(mddev), - b, cur_read_error_count, max_read_errors); - printk(KERN_NOTICE - "md/raid10:%s: %s: Failing raid " - "device\n", mdname(mddev), b); - md_error(mddev, conf->mirrors[d].rdev); - return; - } + printk(KERN_NOTICE + "md/raid10:%s: %s: Raid device exceeded " + "read_error threshold [cur %d:max %d]\n", + mdname(mddev), b, + atomic_read(&rdev->read_errors), max_read_errors); + printk(KERN_NOTICE + "md/raid10:%s: %s: Failing raid device\n", + mdname(mddev), b); + md_error(mddev, conf->mirrors[d].rdev); + return; } - rcu_read_unlock(); while(sectors) { int s = sectors; @@ -1562,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "write failed" " (%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)(sect+ - rdev->data_offset), + (unsigned long long)( + sect + rdev->data_offset), bdevname(rdev->bdev, b)); printk(KERN_NOTICE "md/raid10:%s: %s: failing " "drive\n", @@ -1599,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "corrected sectors" " (%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)(sect+ - rdev->data_offset), + (unsigned long long)( + sect + rdev->data_offset), bdevname(rdev->bdev, b)); printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", mdname(mddev), @@ -1612,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "md/raid10:%s: read error corrected" " (%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)(sect+ - rdev->data_offset), + (unsigned long long)( + sect + rdev->data_offset), bdevname(rdev->bdev, b)); } @@ -1663,7 +1633,8 @@ static void raid10d(mddev_t *mddev) else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) recovery_request_write(mddev, r10_bio); else { - int mirror; + int slot = r10_bio->read_slot; + int mirror = r10_bio->devs[slot].devnum; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. * We freeze all other IO, and try reading the block from @@ -1677,9 +1648,10 @@ static void raid10d(mddev_t *mddev) fix_read_error(conf, mddev, r10_bio); unfreeze_array(conf); } + rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); - bio = r10_bio->devs[r10_bio->read_slot].bio; - r10_bio->devs[r10_bio->read_slot].bio = + bio = r10_bio->devs[slot].bio; + r10_bio->devs[slot].bio = mddev->ro ? 
IO_BLOCKED : NULL;
mirror = read_balance(conf, r10_bio);
if (mirror == -1) {
@@ -1693,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
} else {
const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
bio_put(bio);
+ slot = r10_bio->read_slot;
rdev = conf->mirrors[mirror].rdev;
if (printk_ratelimit())
printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
@@ -1702,8 +1675,8 @@ static void raid10d(mddev_t *mddev)
(unsigned long long)r10_bio->sector);
bio = bio_clone_mddev(r10_bio->master_bio,
GFP_NOIO, mddev);
- r10_bio->devs[r10_bio->read_slot].bio = bio;
- bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
+ r10_bio->devs[slot].bio = bio;
+ bio->bi_sector = r10_bio->devs[slot].addr
+ rdev->data_offset;
bio->bi_bdev = rdev->bdev;
bio->bi_rw = READ | do_sync;
@@ -1763,13 +1736,13 @@ static int init_resync(conf_t *conf)
*
*/
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
+ int *skipped, int go_faster)
{
conf_t *conf = mddev->private;
r10bio_t *r10_bio;
struct bio *biolist = NULL, *bio;
sector_t max_sector, nr_sectors;
- int disk;
int i;
int max_sync;
sector_t sync_blocks;
@@ -1858,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
int j, k;
r10_bio = NULL;
- for (i=0 ; i<conf->raid_disks; i++)
- if (conf->mirrors[i].rdev &&
- !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
- int still_degraded = 0;
- /* want to reconstruct this device */
- r10bio_t *rb2 = r10_bio;
- sector_t sect = raid10_find_virt(conf, sector_nr, i);
- int must_sync;
- /* Unless we are doing a full sync, we only need
- * to recover the block if it is set in the bitmap
- */
- must_sync = bitmap_start_sync(mddev->bitmap, sect,
- &sync_blocks, 1);
- if (sync_blocks < max_sync)
- max_sync = sync_blocks;
- if (!must_sync &&
- !conf->fullsync) {
- /* yep, skip the sync_blocks here, but don't assume
- * that there will never be anything to do here
- */
- chunks_skipped = -1;
- continue;
- }
+ for (i=0 ; i<conf->raid_disks; i++) {
+ int still_degraded;
+ r10bio_t *rb2;
+ sector_t sect;
+ int must_sync;
- r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
- raise_barrier(conf, rb2 != NULL);
- atomic_set(&r10_bio->remaining, 0);
+ if (conf->mirrors[i].rdev == NULL ||
+ test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+ continue;
- r10_bio->master_bio = (struct bio*)rb2;
- if (rb2)
- atomic_inc(&rb2->remaining);
- r10_bio->mddev = mddev;
- set_bit(R10BIO_IsRecover, &r10_bio->state);
- r10_bio->sector = sect;
+ still_degraded = 0;
+ /* want to reconstruct this device */
+ rb2 = r10_bio;
+ sect = raid10_find_virt(conf, sector_nr, i);
+ /* Unless we are doing a full sync, we only need
+ * to recover the block if it is set in the bitmap
+ */
+ must_sync = bitmap_start_sync(mddev->bitmap, sect,
+ &sync_blocks, 1);
+ if (sync_blocks < max_sync)
+ max_sync = sync_blocks;
+ if (!must_sync &&
+ !conf->fullsync) {
+ /* yep, skip the sync_blocks here, but don't assume
+ * that there will never be anything to do here
+ */
+ chunks_skipped = -1;
+ continue;
+ }
- raid10_find_phys(conf, r10_bio);
+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ raise_barrier(conf, rb2 != NULL);
+ atomic_set(&r10_bio->remaining, 0);
- /* Need to check if the array will still be
- * degraded
- */
- for (j=0; j<conf->raid_disks; j++)
- if (conf->mirrors[j].rdev == NULL ||
- test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
- still_degraded = 1;
- break;
- }
-
- must_sync = bitmap_start_sync(mddev->bitmap, sect,
- &sync_blocks, still_degraded);
-
- for (j=0; j<conf->copies;j++) {
- int d = r10_bio->devs[j].devnum;
- if (conf->mirrors[d].rdev &&
- test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
- /* This is where we read from */
- bio = r10_bio->devs[0].bio;
- bio->bi_next = biolist;
- biolist = bio;
- bio->bi_private = r10_bio;
- bio->bi_end_io = end_sync_read;
- bio->bi_rw = READ;
- bio->bi_sector = r10_bio->devs[j].addr +
- conf->mirrors[d].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[d].rdev->bdev;
- atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- atomic_inc(&r10_bio->remaining);
- /* and we write to 'i' */
-
- for (k=0; k<conf->copies; k++)
- if (r10_bio->devs[k].devnum == i)
- break;
- BUG_ON(k == conf->copies);
- bio = r10_bio->devs[1].bio;
- bio->bi_next = biolist;
- biolist = bio;
- bio->bi_private = r10_bio;
- bio->bi_end_io = end_sync_write;
- bio->bi_rw = WRITE;
- bio->bi_sector = r10_bio->devs[k].addr +
- conf->mirrors[i].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
- r10_bio->devs[0].devnum = d;
- r10_bio->devs[1].devnum = i;
+ r10_bio->master_bio = (struct bio*)rb2;
+ if (rb2)
+ atomic_inc(&rb2->remaining);
+ r10_bio->mddev = mddev;
+ set_bit(R10BIO_IsRecover, &r10_bio->state);
+ r10_bio->sector = sect;
- break;
- }
- }
- if (j == conf->copies) {
- /* Cannot recover, so abort the recovery */
- put_buf(r10_bio);
- if (rb2)
- atomic_dec(&rb2->remaining);
- r10_bio = rb2;
- if (!test_and_set_bit(MD_RECOVERY_INTR,
- &mddev->recovery))
- printk(KERN_INFO "md/raid10:%s: insufficient "
- "working devices for recovery.\n",
- mdname(mddev));
+ raid10_find_phys(conf, r10_bio);
+
+ /* Need to check if the array will still be
+ * degraded
+ */
+ for (j=0; j<conf->raid_disks; j++)
+ if (conf->mirrors[j].rdev == NULL ||
+ test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
+ still_degraded = 1;
break;
}
+
+ must_sync = bitmap_start_sync(mddev->bitmap, sect,
+ &sync_blocks, still_degraded);
+
+ for (j=0; j<conf->copies;j++) {
+ int d = r10_bio->devs[j].devnum;
+ if (!conf->mirrors[d].rdev ||
+ !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
+ continue;
+ /* This is where we read from */
+ bio = r10_bio->devs[0].bio;
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_read;
+ bio->bi_rw = READ;
+ bio->bi_sector = r10_bio->devs[j].addr +
+ conf->mirrors[d].rdev->data_offset;
+ bio->bi_bdev = conf->mirrors[d].rdev->bdev;
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ atomic_inc(&r10_bio->remaining);
+ /* and we write to 'i' */
+
+ for (k=0; k<conf->copies; k++)
+ if (r10_bio->devs[k].devnum == i)
+ break;
+ BUG_ON(k == conf->copies);
+ bio = r10_bio->devs[1].bio;
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_sector = r10_bio->devs[k].addr +
+ conf->mirrors[i].rdev->data_offset;
+ bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+
+ r10_bio->devs[0].devnum = d;
+ r10_bio->devs[1].devnum = i;
+
+ break;
+ }
+ if (j == conf->copies) {
+ /* Cannot recover, so abort the recovery */
+ put_buf(r10_bio);
+ if (rb2)
+ atomic_dec(&rb2->remaining);
+ r10_bio = rb2;
+ if (!test_and_set_bit(MD_RECOVERY_INTR,
+ &mddev->recovery))
+ printk(KERN_INFO "md/raid10:%s: insufficient "
+ "working devices for recovery.\n",
+ mdname(mddev));
+ break;
}
+ }
if (biolist == NULL) {
while (r10_bio) {
r10bio_t *rb2 = r10_bio;
@@ -1977,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
- !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
+ &mddev->recovery)) {
/* We can skip this block */
*skipped = 1;
return sync_blocks + sectors_skipped;
@@ -2022,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
for (i=0; i<conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
if (r10_bio->devs[i].bio->bi_end_io)
- rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+ rdev_dec_pending(conf->mirrors[d].rdev,
+ mddev);
}
put_buf(r10_bio);
biolist = NULL;
@@ -2047,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
do {
struct page *page;
int len = PAGE_SIZE;
- disk = 0;
if (sector_nr + (len>>9) > max_sector)
len = (max_sector - sector_nr) << 9;
if (len == 0)
break;
for (bio= biolist ; bio ; bio=bio->bi_next) {
+ struct bio *bio2;
page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
- if (bio_add_page(bio, page, len, 0) == 0) {
- /* stop here */
- struct bio *bio2;
- bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
- for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
- /* remove last page from this bio */
- bio2->bi_vcnt--;
- bio2->bi_size -= len;
- bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
- }
- goto bio_full;
+ if (bio_add_page(bio, page, len, 0))
+ continue;
+
+ /* stop here */
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+ for (bio2 = biolist;
+ bio2 && bio2 != bio;
+ bio2 = bio2->bi_next) {
+ /* remove last page from this bio */
+ bio2->bi_vcnt--;
+ bio2->bi_size -= len;
+ bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
}
- disk = i;
+ goto bio_full;
}
nr_sectors += len>>9;
sector_nr += len>>9;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 49bf5f89143..346e69bfdab 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1700,27 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
raid5_conf_t *conf = mddev->private;
pr_debug("raid456: error called\n");
- if (!test_bit(Faulty, &rdev->flags)) {
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * if recovery was running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- }
- set_bit(Faulty, &rdev->flags);
- printk(KERN_ALERT
- "md/raid:%s: Disk failure on %s, disabling device.\n"
- "md/raid:%s: Operation continuing on %d devices.\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- mdname(mddev),
- conf->raid_disks - mddev->degraded);
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /*
+ * if recovery was running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
}
+ set_bit(Faulty, &rdev->flags);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ printk(KERN_ALERT
+ "md/raid:%s: Disk failure on %s, disabling device.\n"
+ "md/raid:%s: Operation continuing on %d devices.\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ mdname(mddev),
+ conf->raid_disks - mddev->degraded);
}
/*
@@ -3960,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
/* spinlock is needed as reshape_progress may be
* 64bit on a 32bit platform, and so it might be
* possible to see a half-updated value
- * Ofcourse reshape_progress could change after
+ * Of course reshape_progress could change after
* the lock is dropped, so once we get a reference
* to the stripe that we think it is, we will have
* to check again.
@@ -5391,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
- if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+ if (sectors > mddev->dev_sectors &&
+ mddev->recovery_cp > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}