Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c    |  95
-rw-r--r--  drivers/md/dm-crypt.c  |  18
-rw-r--r--  drivers/md/dm-linear.c |  38
-rw-r--r--  drivers/md/dm-log.c    |   4
-rw-r--r--  drivers/md/dm-mpath.c  |  23
-rw-r--r--  drivers/md/dm-snap.c   | 163
-rw-r--r--  drivers/md/dm-snap.h   |  11
-rw-r--r--  drivers/md/dm-table.c  |  42
-rw-r--r--  drivers/md/dm.c        |  46
-rw-r--r--  drivers/md/dm.h        |   6
-rw-r--r--  drivers/md/faulty.c    |   2
-rw-r--r--  drivers/md/linear.c    |  20
-rw-r--r--  drivers/md/md.c        | 662
-rw-r--r--  drivers/md/multipath.c |  17
-rw-r--r--  drivers/md/raid0.c     |   8
-rw-r--r--  drivers/md/raid1.c     |  30
-rw-r--r--  drivers/md/raid10.c    |  34
-rw-r--r--  drivers/md/raid5.c     | 796
18 files changed, 1155 insertions(+), 860 deletions(-)
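The recurring pattern in the device-mapper changes below is the new ->merge method on
struct target_type: the DM core (dm_merge_bvec) asks a target how many bytes may be
added to a bio at a given sector, and a simple remapping target answers by translating
the sector exactly as its ->map would and delegating to the underlying queue's
merge_bvec_fn. The following is an illustrative sketch only, not part of the patch;
"example_c" and "example_merge" are hypothetical stand-ins for a real target such as
dm-linear:

	/* Sketch of the ->merge callback pattern this diff introduces. */
	static int example_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
				 struct bio_vec *biovec, int max_size)
	{
		struct example_c *ec = ti->private;	/* hypothetical per-target data */
		struct request_queue *q = bdev_get_queue(ec->dev->bdev);

		/* Underlying device imposes no constraint: accept DM's maximum. */
		if (!q->merge_bvec_fn)
			return max_size;

		/* Remap to the underlying device, exactly as ->map would. */
		bvm->bi_bdev = ec->dev->bdev;
		bvm->bi_sector = ec->start + (bvm->bi_sector - ti->begin);

		/* Never report more than the DM core already allows. */
		return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
	}

With a ->merge method registered, dm-table.c no longer has to cap max_sectors at
PAGE_SIZE when the underlying queue has a merge_bvec_fn (see the dm-table.c hunk below).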
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index b26927ce889..ac89a5deaca 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -225,7 +225,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde || test_bit(Faulty, &rdev->flags)) continue; - target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512); + target = rdev->sb_start + offset + index * (PAGE_SIZE/512); if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) { page->index = index; @@ -238,15 +238,47 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde } +static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) +{ + /* Iterate the disks of an mddev, using rcu to protect access to the + * linked list, and raising the refcount of devices we return to ensure + * they don't disappear while in use. + * As devices are only added or removed when raid_disk is < 0 and + * nr_pending is 0 and In_sync is clear, the entries we return will + * still be in the same position on the list when we re-enter + * list_for_each_continue_rcu. + */ + struct list_head *pos; + rcu_read_lock(); + if (rdev == NULL) + /* start at the beginning */ + pos = &mddev->disks; + else { + /* release the previous rdev and start from there. */ + rdev_dec_pending(rdev, mddev); + pos = &rdev->same_set; + } + list_for_each_continue_rcu(pos, &mddev->disks) { + rdev = list_entry(pos, mdk_rdev_t, same_set); + if (rdev->raid_disk >= 0 && + test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { + /* this is a usable devices */ + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + return rdev; + } + } + rcu_read_unlock(); + return NULL; +} + static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { - mdk_rdev_t *rdev; - struct list_head *tmp; + mdk_rdev_t *rdev = NULL; mddev_t *mddev = bitmap->mddev; - rdev_for_each(rdev, tmp, mddev) - if (test_bit(In_sync, &rdev->flags) - && !test_bit(Faulty, &rdev->flags)) { + while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, @@ -260,32 +292,36 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) + (long)(page->index * (PAGE_SIZE/512)) + size/512 > 0) /* bitmap runs in to metadata */ - return -EINVAL; + goto bad_alignment; if (rdev->data_offset + mddev->size*2 - > rdev->sb_offset*2 + bitmap->offset) + > rdev->sb_start + bitmap->offset) /* data runs in to bitmap */ - return -EINVAL; - } else if (rdev->sb_offset*2 < rdev->data_offset) { + goto bad_alignment; + } else if (rdev->sb_start < rdev->data_offset) { /* METADATA BITMAP DATA */ - if (rdev->sb_offset*2 + if (rdev->sb_start + bitmap->offset + page->index*(PAGE_SIZE/512) + size/512 > rdev->data_offset) /* bitmap runs in to data */ - return -EINVAL; + goto bad_alignment; } else { /* DATA METADATA BITMAP - no problems */ } md_super_write(mddev, rdev, - (rdev->sb_offset<<1) + bitmap->offset + rdev->sb_start + bitmap->offset + page->index * (PAGE_SIZE/512), size, page); - } + } if (wait) md_super_wait(mddev); return 0; + + bad_alignment: + rcu_read_unlock(); + return -EINVAL; } static void bitmap_file_kick(struct bitmap *bitmap); @@ -454,8 +490,11 @@ void bitmap_update_sb(struct bitmap *bitmap) spin_unlock_irqrestore(&bitmap->lock, flags); sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); sb->events = cpu_to_le64(bitmap->mddev->events); - if (!bitmap->mddev->degraded) - sb->events_cleared = 
cpu_to_le64(bitmap->mddev->events); + if (bitmap->mddev->events < bitmap->events_cleared) { + /* rocking back to read-only */ + bitmap->events_cleared = bitmap->mddev->events; + sb->events_cleared = cpu_to_le64(bitmap->events_cleared); + } kunmap_atomic(sb, KM_USER0); write_page(bitmap, bitmap->sb_page, 1); } @@ -1085,9 +1124,19 @@ void bitmap_daemon_work(struct bitmap *bitmap) } else spin_unlock_irqrestore(&bitmap->lock, flags); lastpage = page; -/* - printk("bitmap clean at page %lu\n", j); -*/ + + /* We are possibly going to clear some bits, so make + * sure that events_cleared is up-to-date. + */ + if (bitmap->need_sync) { + bitmap_super_t *sb; + bitmap->need_sync = 0; + sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb->events_cleared = + cpu_to_le64(bitmap->events_cleared); + kunmap_atomic(sb, KM_USER0); + write_page(bitmap, bitmap->sb_page, 1); + } spin_lock_irqsave(&bitmap->lock, flags); clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } @@ -1216,7 +1265,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect case 0: bitmap_file_set_bit(bitmap, offset); bitmap_count_page(bitmap,offset, 1); - blk_plug_device(bitmap->mddev->queue); + blk_plug_device_unlocked(bitmap->mddev->queue); /* fall through */ case 1: *bmc = 2; @@ -1257,6 +1306,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto return; } + if (success && + bitmap->events_cleared < bitmap->mddev->events) { + bitmap->events_cleared = bitmap->mddev->events; + bitmap->need_sync = 1; + } + if (!success && ! (*bmc & NEEDED_MASK)) *bmc |= NEEDED_MASK; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ab6a61db63c..13956437bc8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1216,9 +1216,24 @@ error: return -EINVAL; } +static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct crypt_config *cc = ti->private; + struct request_queue *q = bdev_get_queue(cc->dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = cc->dev->bdev; + bvm->bi_sector = cc->start + bvm->bi_sector - ti->begin; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static struct target_type crypt_target = { .name = "crypt", - .version= {1, 5, 0}, + .version= {1, 6, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, @@ -1228,6 +1243,7 @@ static struct target_type crypt_target = { .preresume = crypt_preresume, .resume = crypt_resume, .message = crypt_message, + .merge = crypt_merge, }; static int __init dm_crypt_init(void) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 17753d80ad2..6449bcdf84c 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -69,13 +69,25 @@ static void linear_dtr(struct dm_target *ti) kfree(lc); } -static int linear_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) { - struct linear_c *lc = (struct linear_c *) ti->private; + struct linear_c *lc = ti->private; + + return lc->start + (bi_sector - ti->begin); +} + +static void linear_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct linear_c *lc = ti->private; bio->bi_bdev = lc->dev->bdev; - bio->bi_sector = lc->start + (bio->bi_sector - ti->begin); + bio->bi_sector = linear_map_sector(ti, bio->bi_sector); +} + +static int linear_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + linear_map_bio(ti, bio); return 
DM_MAPIO_REMAPPED; } @@ -114,15 +126,31 @@ static int linear_ioctl(struct dm_target *ti, struct inode *inode, return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg); } +static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct linear_c *lc = ti->private; + struct request_queue *q = bdev_get_queue(lc->dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = lc->dev->bdev; + bvm->bi_sector = linear_map_sector(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static struct target_type linear_target = { .name = "linear", - .version= {1, 0, 2}, + .version= {1, 0, 3}, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, .map = linear_map, .status = linear_status, .ioctl = linear_ioctl, + .merge = linear_merge, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 67a6f31b7fc..5b48478c79f 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -831,7 +831,7 @@ static struct dm_dirty_log_type _disk_type = { .status = disk_status, }; -int __init dm_dirty_log_init(void) +static int __init dm_dirty_log_init(void) { int r; @@ -848,7 +848,7 @@ int __init dm_dirty_log_init(void) return r; } -void __exit dm_dirty_log_exit(void) +static void __exit dm_dirty_log_exit(void) { dm_dirty_log_type_unregister(&_disk_type); dm_dirty_log_type_unregister(&_core_type); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 9f7302d4878..71dd65aa31b 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -147,9 +147,12 @@ static struct priority_group *alloc_priority_group(void) static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) { struct pgpath *pgpath, *tmp; + struct multipath *m = ti->private; list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { list_del(&pgpath->list); + if (m->hw_handler_name) + scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); dm_put_device(ti, pgpath->path.dev); free_pgpath(pgpath); } @@ -525,8 +528,10 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, } r = read_param(_params, shift(as), &ps_argc, &ti->error); - if (r) + if (r) { + dm_put_path_selector(pst); return -EINVAL; + } r = pst->create(&pg->ps, ps_argc, as->argv); if (r) { @@ -546,6 +551,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, { int r; struct pgpath *p; + struct multipath *m = ti->private; /* we need at least a path arg */ if (as->argc < 1) { @@ -564,6 +570,15 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, goto bad; } + if (m->hw_handler_name) { + r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), + m->hw_handler_name); + if (r < 0) { + dm_put_device(ti, p->path.dev); + goto bad; + } + } + r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); if (r) { dm_put_device(ti, p->path.dev); @@ -623,8 +638,10 @@ static struct priority_group *parse_priority_group(struct arg_set *as, struct pgpath *pgpath; struct arg_set path_args; - if (as->argc < nr_params) + if (as->argc < nr_params) { + ti->error = "not enough path parameters"; goto bad; + } path_args.argc = nr_params; path_args.argv = as->argv; @@ -867,7 +884,7 @@ static int reinstate_path(struct pgpath *pgpath) if (pgpath->path.is_active) goto out; - if (!pgpath->pg->ps.type) { + if (!pgpath->pg->ps.type->reinstate_path) { DMWARN("Reinstate path not supported by path selector %s", pgpath->pg->ps.type->name); r = 
-EINVAL; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 1ba8a47d61b..6e5528aecc9 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -40,6 +40,11 @@ */ #define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1) +/* + * The size of the mempool used to track chunks in use. + */ +#define MIN_IOS 256 + static struct workqueue_struct *ksnapd; static void flush_queued_bios(struct work_struct *work); @@ -91,7 +96,63 @@ struct dm_snap_pending_exception { */ static struct kmem_cache *exception_cache; static struct kmem_cache *pending_cache; -static mempool_t *pending_pool; + +struct dm_snap_tracked_chunk { + struct hlist_node node; + chunk_t chunk; +}; + +static struct kmem_cache *tracked_chunk_cache; + +static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s, + chunk_t chunk) +{ + struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool, + GFP_NOIO); + unsigned long flags; + + c->chunk = chunk; + + spin_lock_irqsave(&s->tracked_chunk_lock, flags); + hlist_add_head(&c->node, + &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]); + spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); + + return c; +} + +static void stop_tracking_chunk(struct dm_snapshot *s, + struct dm_snap_tracked_chunk *c) +{ + unsigned long flags; + + spin_lock_irqsave(&s->tracked_chunk_lock, flags); + hlist_del(&c->node); + spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); + + mempool_free(c, s->tracked_chunk_pool); +} + +static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) +{ + struct dm_snap_tracked_chunk *c; + struct hlist_node *hn; + int found = 0; + + spin_lock_irq(&s->tracked_chunk_lock); + + hlist_for_each_entry(c, hn, + &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) { + if (c->chunk == chunk) { + found = 1; + break; + } + } + + spin_unlock_irq(&s->tracked_chunk_lock); + + return found; +} /* * One of these per registered origin, held in the snapshot_origins hash @@ -302,14 +363,19 @@ static void free_exception(struct dm_snap_exception *e) kmem_cache_free(exception_cache, e); } -static struct dm_snap_pending_exception *alloc_pending_exception(void) +static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s) { - return mempool_alloc(pending_pool, GFP_NOIO); + struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool, + GFP_NOIO); + + pe->snap = s; + + return pe; } static void free_pending_exception(struct dm_snap_pending_exception *pe) { - mempool_free(pe, pending_pool); + mempool_free(pe, pe->snap->pending_pool); } static void insert_completed_exception(struct dm_snapshot *s, @@ -482,6 +548,7 @@ static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg, static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct dm_snapshot *s; + int i; int r = -EINVAL; char persistent; char *origin_path; @@ -564,11 +631,30 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad5; } + s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache); + if (!s->pending_pool) { + ti->error = "Could not allocate mempool for pending exceptions"; + goto bad6; + } + + s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS, + tracked_chunk_cache); + if (!s->tracked_chunk_pool) { + ti->error = "Could not allocate tracked_chunk mempool for " + "tracking reads"; + goto bad_tracked_chunk_pool; + } + + for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) + INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]); + + spin_lock_init(&s->tracked_chunk_lock); 
+ /* Metadata must only be loaded into one table at once */ r = s->store.read_metadata(&s->store); if (r < 0) { ti->error = "Failed to read snapshot metadata"; - goto bad6; + goto bad_load_and_register; } else if (r > 0) { s->valid = 0; DMWARN("Snapshot is marked invalid."); @@ -582,7 +668,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (register_snapshot(s)) { r = -EINVAL; ti->error = "Cannot register snapshot origin"; - goto bad6; + goto bad_load_and_register; } ti->private = s; @@ -590,6 +676,12 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) return 0; + bad_load_and_register: + mempool_destroy(s->tracked_chunk_pool); + + bad_tracked_chunk_pool: + mempool_destroy(s->pending_pool); + bad6: dm_kcopyd_client_destroy(s->kcopyd_client); @@ -624,6 +716,9 @@ static void __free_exceptions(struct dm_snapshot *s) static void snapshot_dtr(struct dm_target *ti) { +#ifdef CONFIG_DM_DEBUG + int i; +#endif struct dm_snapshot *s = ti->private; flush_workqueue(ksnapd); @@ -632,8 +727,17 @@ static void snapshot_dtr(struct dm_target *ti) /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); +#ifdef CONFIG_DM_DEBUG + for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) + BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i])); +#endif + + mempool_destroy(s->tracked_chunk_pool); + __free_exceptions(s); + mempool_destroy(s->pending_pool); + dm_put_device(ti, s->origin); dm_put_device(ti, s->cow); @@ -772,6 +876,13 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) } /* + * Check for conflicting reads. This is extremely improbable, + * so yield() is sufficient and there is no need for a wait queue. + */ + while (__chunk_is_tracked(s, pe->e.old_chunk)) + yield(); + + /* * Add a proper exception, and remove the * in-flight exception from the list. */ @@ -873,7 +984,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) * to hold the lock while we do this. */ up_write(&s->lock); - pe = alloc_pending_exception(); + pe = alloc_pending_exception(s); down_write(&s->lock); if (!s->valid) { @@ -893,7 +1004,6 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) bio_list_init(&pe->snapshot_bios); pe->primary_pe = NULL; atomic_set(&pe->ref_count, 0); - pe->snap = s; pe->started = 0; if (s->store.prepare_exception(&s->store, &pe->e)) { @@ -974,14 +1084,10 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, start_copy(pe); goto out; } - } else - /* - * FIXME: this read path scares me because we - * always use the origin when we have a pending - * exception. However I can't think of a - * situation where this is wrong - ejt. 
- */ + } else { bio->bi_bdev = s->origin->bdev; + map_context->ptr = track_chunk(s, chunk); + } out_unlock: up_write(&s->lock); @@ -989,6 +1095,18 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, return r; } +static int snapshot_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + struct dm_snapshot *s = ti->private; + struct dm_snap_tracked_chunk *c = map_context->ptr; + + if (c) + stop_tracking_chunk(s, c); + + return 0; +} + static void snapshot_resume(struct dm_target *ti) { struct dm_snapshot *s = ti->private; @@ -1266,6 +1384,7 @@ static struct target_type snapshot_target = { .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_map, + .end_io = snapshot_end_io, .resume = snapshot_resume, .status = snapshot_status, }; @@ -1306,9 +1425,9 @@ static int __init dm_snapshot_init(void) goto bad4; } - pending_pool = mempool_create_slab_pool(128, pending_cache); - if (!pending_pool) { - DMERR("Couldn't create pending pool."); + tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); + if (!tracked_chunk_cache) { + DMERR("Couldn't create cache to track chunks in use."); r = -ENOMEM; goto bad5; } @@ -1317,13 +1436,13 @@ static int __init dm_snapshot_init(void) if (!ksnapd) { DMERR("Failed to create ksnapd workqueue."); r = -ENOMEM; - goto bad6; + goto bad_pending_pool; } return 0; - bad6: - mempool_destroy(pending_pool); + bad_pending_pool: + kmem_cache_destroy(tracked_chunk_cache); bad5: kmem_cache_destroy(pending_cache); bad4: @@ -1352,9 +1471,9 @@ static void __exit dm_snapshot_exit(void) DMERR("origin unregister failed %d", r); exit_origin_hash(); - mempool_destroy(pending_pool); kmem_cache_destroy(pending_cache); kmem_cache_destroy(exception_cache); + kmem_cache_destroy(tracked_chunk_cache); } /* Module hooks */ diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h index 24f9fb73b98..292c15609ae 100644 --- a/drivers/md/dm-snap.h +++ b/drivers/md/dm-snap.h @@ -130,6 +130,10 @@ struct exception_store { void *context; }; +#define DM_TRACKED_CHUNK_HASH_SIZE 16 +#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ + (DM_TRACKED_CHUNK_HASH_SIZE - 1)) + struct dm_snapshot { struct rw_semaphore lock; struct dm_target *ti; @@ -157,6 +161,8 @@ struct dm_snapshot { /* The last percentage we notified */ int last_percent; + mempool_t *pending_pool; + struct exception_table pending; struct exception_table complete; @@ -174,6 +180,11 @@ struct dm_snapshot { /* Queue of snapshot writes for ksnapd to flush */ struct bio_list queued_bios; struct work_struct queued_bios_work; + + /* Chunks with outstanding reads */ + mempool_t *tracked_chunk_pool; + spinlock_t tracked_chunk_lock; + struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; }; /* diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 94116eaf470..61f44140923 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -316,29 +316,12 @@ static inline int check_space(struct dm_table *t) */ static int lookup_device(const char *path, dev_t *dev) { - int r; - struct nameidata nd; - struct inode *inode; - - if ((r = path_lookup(path, LOOKUP_FOLLOW, &nd))) - return r; - - inode = nd.path.dentry->d_inode; - if (!inode) { - r = -ENOENT; - goto out; - } - - if (!S_ISBLK(inode->i_mode)) { - r = -ENOTBLK; - goto out; - } - - *dev = inode->i_rdev; - - out: - path_put(&nd.path); - return r; + struct block_device *bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + *dev = bdev->bd_dev; + bdput(bdev); + return 0; } /* @@ -506,14 +489,13 @@ void 
dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) rs->max_sectors = min_not_zero(rs->max_sectors, q->max_sectors); - /* FIXME: Device-Mapper on top of RAID-0 breaks because DM - * currently doesn't honor MD's merge_bvec_fn routine. - * In this case, we'll force DM to use PAGE_SIZE or - * smaller I/O, just to be safe. A better fix is in the - * works, but add this for the time being so it will at - * least operate correctly. + /* + * Check if merge fn is supported. + * If not we'll force DM to use PAGE_SIZE or + * smaller I/O, just to be safe. */ - if (q->merge_bvec_fn) + + if (q->merge_bvec_fn && !ti->type->merge) rs->max_sectors = min_not_zero(rs->max_sectors, (unsigned int) (PAGE_SIZE >> 9)); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 372369b1cc2..bca448e1187 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -37,8 +37,8 @@ static DEFINE_SPINLOCK(_minor_lock); struct dm_io { struct mapped_device *md; int error; - struct bio *bio; atomic_t io_count; + struct bio *bio; unsigned long start_time; }; @@ -829,6 +829,49 @@ static int __split_bio(struct mapped_device *md, struct bio *bio) * CRUD END *---------------------------------------------------------------*/ +static int dm_merge_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_table(md); + struct dm_target *ti; + sector_t max_sectors; + int max_size; + + if (unlikely(!map)) + return 0; + + ti = dm_table_find_target(map, bvm->bi_sector); + + /* + * Find maximum amount of I/O that won't need splitting + */ + max_sectors = min(max_io_len(md, bvm->bi_sector, ti), + (sector_t) BIO_MAX_SECTORS); + max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; + if (max_size < 0) + max_size = 0; + + /* + * merge_bvec_fn() returns number of bytes + * it can accept at this offset + * max is precomputed maximal io size + */ + if (max_size && ti->type->merge) + max_size = ti->type->merge(ti, bvm, biovec, max_size); + + /* + * Always allow an entire first page + */ + if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) + max_size = biovec->bv_len; + + dm_table_put(map); + + return max_size; +} + /* * The request function that just remaps the bio built up by * dm_merge_bvec. 
@@ -1032,6 +1075,7 @@ static struct mapped_device *alloc_dev(int minor) blk_queue_make_request(md->queue, dm_request); blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); md->queue->unplug_fn = dm_unplug_all; + blk_queue_merge_bvec(md->queue, dm_merge_bvec); md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); if (!md->io_pool) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 8c03b634e62..1e59a0b0a78 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -100,12 +100,6 @@ int dm_lock_for_deletion(struct mapped_device *md); void dm_kobject_uevent(struct mapped_device *md); -/* - * Dirty log - */ -int dm_dirty_log_init(void); -void dm_dirty_log_exit(void); - int dm_kcopyd_init(void); void dm_kcopyd_exit(void); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index d107ddceefc..268547dbfbd 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -297,7 +297,7 @@ static int run(mddev_t *mddev) rdev_for_each(rdev, tmp, mddev) conf->rdev = rdev; - mddev->array_size = mddev->size; + mddev->array_sectors = mddev->size * 2; mddev->private = conf; reconfig(mddev, mddev->layout, -1); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 6a866d7c8ae..b1eebf88c20 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -122,13 +122,13 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) return NULL; cnt = 0; - conf->array_size = 0; + conf->array_sectors = 0; rdev_for_each(rdev, tmp, mddev) { int j = rdev->raid_disk; dev_info_t *disk = conf->disks + j; - if (j < 0 || j > raid_disks || disk->rdev) { + if (j < 0 || j >= raid_disks || disk->rdev) { printk("linear: disk numbering problem. Aborting!\n"); goto out; } @@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->size = rdev->size; - conf->array_size += rdev->size; + conf->array_sectors += rdev->size * 2; cnt++; } @@ -155,7 +155,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) goto out; } - min_spacing = conf->array_size; + min_spacing = conf->array_sectors / 2; sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); /* min_spacing is the minimum spacing that will fit the hash @@ -164,7 +164,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) * that is larger than min_spacing as use the size of that as * the actual spacing */ - conf->hash_spacing = conf->array_size; + conf->hash_spacing = conf->array_sectors / 2; for (i=0; i < cnt-1 ; i++) { sector_t sz = 0; int j; @@ -194,7 +194,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) unsigned round; unsigned long base; - sz = conf->array_size >> conf->preshift; + sz = conf->array_sectors >> (conf->preshift + 1); sz += 1; /* force round-up */ base = conf->hash_spacing >> conf->preshift; round = sector_div(sz, base); @@ -221,7 +221,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) curr_offset = 0; i = 0; for (curr_offset = 0; - curr_offset < conf->array_size; + curr_offset < conf->array_sectors / 2; curr_offset += conf->hash_spacing) { while (i < raid_disks-1 && @@ -258,7 +258,7 @@ static int linear_run (mddev_t *mddev) if (!conf) return 1; mddev->private = conf; - mddev->array_size = conf->array_size; + mddev->array_sectors = conf->array_sectors; blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@ -292,8 +292,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) newconf->prev = mddev_to_conf(mddev); mddev->private = newconf; 
mddev->raid_disks++; - mddev->array_size = newconf->array_size; - set_capacity(mddev->gendisk, mddev->array_size << 1); + mddev->array_sectors = newconf->array_sectors; + set_capacity(mddev->gendisk, mddev->array_sectors); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 2580ac1b9b0..deeac4b4417 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); } EXPORT_SYMBOL_GPL(md_new_event); @@ -274,10 +273,12 @@ static mddev_t * mddev_find(dev_t unit) INIT_LIST_HEAD(&new->all_mddevs); init_timer(&new->safemode_timer); atomic_set(&new->active, 1); + atomic_set(&new->openers, 0); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); init_waitqueue_head(&new->recovery_wait); new->reshape_position = MaxSector; + new->resync_min = 0; new->resync_max = MaxSector; new->level = LEVEL_NONE; @@ -347,21 +348,20 @@ static struct mdk_personality *find_pers(int level, char *clevel) return NULL; } +/* return the offset of the super block in 512byte sectors */ static inline sector_t calc_dev_sboffset(struct block_device *bdev) { - sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; - return MD_NEW_SIZE_BLOCKS(size); + sector_t num_sectors = bdev->bd_inode->i_size / 512; + return MD_NEW_SIZE_SECTORS(num_sectors); } -static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) +static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) { - sector_t size; - - size = rdev->sb_offset; + sector_t num_sectors = rdev->sb_start; if (chunk_size) - size &= ~((sector_t)chunk_size/1024 - 1); - return size; + num_sectors &= ~((sector_t)chunk_size/512 - 1); + return num_sectors; } static int alloc_disk_sb(mdk_rdev_t * rdev) @@ -372,7 +372,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev) rdev->sb_page = alloc_page(GFP_KERNEL); if (!rdev->sb_page) { printk(KERN_ALERT "md: out of memory.\n"); - return -EINVAL; + return -ENOMEM; } return 0; @@ -384,7 +384,7 @@ static void free_disk_sb(mdk_rdev_t * rdev) put_page(rdev->sb_page); rdev->sb_loaded = 0; rdev->sb_page = NULL; - rdev->sb_offset = 0; + rdev->sb_start = 0; rdev->size = 0; } } @@ -530,7 +530,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) return 0; - if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) + if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) goto fail; rdev->sb_loaded = 1; return 0; @@ -543,17 +543,12 @@ fail: static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) { - if ( (sb1->set_uuid0 == sb2->set_uuid0) && - (sb1->set_uuid1 == sb2->set_uuid1) && - (sb1->set_uuid2 == sb2->set_uuid2) && - (sb1->set_uuid3 == sb2->set_uuid3)) - - return 1; - - return 0; + return sb1->set_uuid0 == sb2->set_uuid0 && + sb1->set_uuid1 == sb2->set_uuid1 && + sb1->set_uuid2 == sb2->set_uuid2 && + sb1->set_uuid3 == sb2->set_uuid3; } - static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) { int ret; @@ -564,7 +559,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) if (!tmp1 || !tmp2) { ret = 0; - printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); goto abort; } @@ -577,11 +572,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) tmp1->nr_disks = 0; tmp2->nr_disks = 0; - if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) - ret = 0; - else - ret = 1; - + ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 
0); abort: kfree(tmp1); kfree(tmp2); @@ -658,11 +649,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) */ struct super_type { - char *name; - struct module *owner; - int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); - int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); - void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); + char *name; + struct module *owner; + int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, + int minor_version); + int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); + void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); + unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, + sector_t num_sectors); }; /* @@ -673,16 +667,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; int ret; - sector_t sb_offset; /* - * Calculate the position of the superblock, + * Calculate the position of the superblock (512byte sectors), * it's at the end of the disk. * * It also happens to be a multiple of 4Kb. */ - sb_offset = calc_dev_sboffset(rdev->bdev); - rdev->sb_offset = sb_offset; + rdev->sb_start = calc_dev_sboffset(rdev->bdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; @@ -759,7 +751,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else ret = 0; } - rdev->size = calc_dev_size(rdev, sb->chunk_size); + rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; if (rdev->size < sb->size && sb->level > 1) /* "this cannot possibly happen" ... */ @@ -1004,6 +996,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) } /* + * rdev_size_change for 0.90.0 + */ +static unsigned long long +super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) +{ + if (num_sectors && num_sectors < rdev->mddev->size * 2) + return 0; /* component must fit device */ + if (rdev->mddev->bitmap_offset) + return 0; /* can't move bitmap */ + rdev->sb_start = calc_dev_sboffset(rdev->bdev); + if (!num_sectors || num_sectors > rdev->sb_start) + num_sectors = rdev->sb_start; + md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, + rdev->sb_page); + md_super_wait(rdev->mddev); + return num_sectors / 2; /* kB for sysfs */ +} + + +/* * version 1 superblock */ @@ -1034,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) { struct mdp_superblock_1 *sb; int ret; - sector_t sb_offset; + sector_t sb_start; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; /* - * Calculate the position of the superblock. + * Calculate the position of the superblock in 512byte sectors. 
* It is always aligned to a 4K boundary and * depeding on minor_version, it can be: * 0: At least 8K, but less than 12K, from end of device @@ -1048,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) */ switch(minor_version) { case 0: - sb_offset = rdev->bdev->bd_inode->i_size >> 9; - sb_offset -= 8*2; - sb_offset &= ~(sector_t)(4*2-1); - /* convert from sectors to K */ - sb_offset /= 2; + sb_start = rdev->bdev->bd_inode->i_size >> 9; + sb_start -= 8*2; + sb_start &= ~(sector_t)(4*2-1); break; case 1: - sb_offset = 0; + sb_start = 0; break; case 2: - sb_offset = 4; + sb_start = 8; break; default: return -EINVAL; } - rdev->sb_offset = sb_offset; + rdev->sb_start = sb_start; /* superblock is rarely larger than 1K, but it can be larger, * and it is safe to read 4k, so we do that @@ -1077,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || sb->major_version != cpu_to_le32(1) || le32_to_cpu(sb->max_dev) > (4096-256)/2 || - le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || + le64_to_cpu(sb->super_offset) != rdev->sb_start || (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) return -EINVAL; @@ -1113,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) rdev->sb_size = (rdev->sb_size | bmask) + 1; if (minor_version - && rdev->data_offset < sb_offset + (rdev->sb_size/512)) + && rdev->data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) @@ -1149,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (minor_version) rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; else - rdev->size = rdev->sb_offset; + rdev->size = rdev->sb_start / 2; if (rdev->size < le64_to_cpu(sb->data_size)/2) return -EINVAL; rdev->size = le64_to_cpu(sb->data_size)/2; @@ -1328,35 +1338,74 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->sb_csum = calc_sb_1_csum(sb); } +static unsigned long long +super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) +{ + struct mdp_superblock_1 *sb; + sector_t max_sectors; + if (num_sectors && num_sectors < rdev->mddev->size * 2) + return 0; /* component must fit device */ + if (rdev->sb_start < rdev->data_offset) { + /* minor versions 1 and 2; superblock before data */ + max_sectors = rdev->bdev->bd_inode->i_size >> 9; + max_sectors -= rdev->data_offset; + if (!num_sectors || num_sectors > max_sectors) + num_sectors = max_sectors; + } else if (rdev->mddev->bitmap_offset) { + /* minor version 0 with bitmap we can't move */ + return 0; + } else { + /* minor version 0; superblock after data */ + sector_t sb_start; + sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; + sb_start &= ~(sector_t)(4*2 - 1); + max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; + if (!num_sectors || num_sectors > max_sectors) + num_sectors = max_sectors; + rdev->sb_start = sb_start; + } + sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); + sb->data_size = cpu_to_le64(num_sectors); + sb->super_offset = rdev->sb_start; + sb->sb_csum = calc_sb_1_csum(sb); + md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, + rdev->sb_page); + md_super_wait(rdev->mddev); + return num_sectors / 2; /* kB for sysfs */ +} static struct super_type super_types[] = { [0] = { .name = "0.90.0", .owner = THIS_MODULE, - .load_super = super_90_load, - .validate_super = 
super_90_validate, - .sync_super = super_90_sync, + .load_super = super_90_load, + .validate_super = super_90_validate, + .sync_super = super_90_sync, + .rdev_size_change = super_90_rdev_size_change, }, [1] = { .name = "md-1", .owner = THIS_MODULE, - .load_super = super_1_load, - .validate_super = super_1_validate, - .sync_super = super_1_sync, + .load_super = super_1_load, + .validate_super = super_1_validate, + .sync_super = super_1_sync, + .rdev_size_change = super_1_rdev_size_change, }, }; static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) { - struct list_head *tmp, *tmp2; mdk_rdev_t *rdev, *rdev2; - rdev_for_each(rdev, tmp, mddev1) - rdev_for_each(rdev2, tmp2, mddev2) + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev1) + rdev_for_each_rcu(rdev2, mddev2) if (rdev->bdev->bd_contains == - rdev2->bdev->bd_contains) + rdev2->bdev->bd_contains) { + rcu_read_unlock(); return 1; - + } + rcu_read_unlock(); return 0; } @@ -1423,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) kobject_del(&rdev->kobj); goto fail; } - list_add(&rdev->same_set, &mddev->disks); + list_add_rcu(&rdev->same_set, &mddev->disks); bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); return 0; @@ -1448,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) return; } bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); - list_del_init(&rdev->same_set); + list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); /* We need to delay this, otherwise we can deadlock when - * writing to 'remove' to "dev/state" + * writing to 'remove' to "dev/state". We also need + * to delay it due to rcu usage. */ + synchronize_rcu(); INIT_WORK(&rdev->del_work, md_delayed_delete); kobject_get(&rdev->kobj); schedule_work(&rdev->del_work); @@ -1511,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev) if (rdev->mddev) MD_BUG(); free_disk_sb(rdev); - list_del_init(&rdev->same_set); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); @@ -1758,11 +1808,11 @@ repeat: dprintk("%s ", bdevname(rdev->bdev,b)); if (!test_bit(Faulty, &rdev->flags)) { md_super_write(mddev,rdev, - rdev->sb_offset<<1, rdev->sb_size, + rdev->sb_start, rdev->sb_size, rdev->sb_page); dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", bdevname(rdev->bdev,b), - (unsigned long long)rdev->sb_offset); + (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; } else @@ -1787,7 +1837,7 @@ repeat: } -/* words written to sysfs files may, or my not, be \n terminated. +/* words written to sysfs files may, or may not, be \n terminated. * We want to accept with case. For this we use cmd_match. */ static int cmd_match(const char *cmd, const char *str) @@ -1886,6 +1936,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) err = 0; } + if (!err) + sysfs_notify(&rdev->kobj, NULL, "state"); return err ? err : len; } static struct rdev_sysfs_entry rdev_state = @@ -1931,7 +1983,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) slot = -1; else if (e==buf || (*e && *e!= '\n')) return -EINVAL; - if (rdev->mddev->pers) { + if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also * updating the 'rd%d' link, and communicating * with the personality with ->hot_*_disk. @@ -1939,8 +1991,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) * failed/spare devices. 
This normally happens automatically, * but not when the metadata is externally managed. */ - if (slot != -1) - return -EBUSY; if (rdev->raid_disk == -1) return -EEXIST; /* personality does all needed checks */ @@ -1954,6 +2004,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) sysfs_remove_link(&rdev->mddev->kobj, nm); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); + } else if (rdev->mddev->pers) { + mdk_rdev_t *rdev2; + struct list_head *tmp; + /* Activating a spare .. or possibly reactivating + * if we every get bitmaps working here. + */ + + if (rdev->raid_disk != -1) + return -EBUSY; + + if (rdev->mddev->pers->hot_add_disk == NULL) + return -EINVAL; + + rdev_for_each(rdev2, tmp, rdev->mddev) + if (rdev2->raid_disk == slot) + return -EEXIST; + + rdev->raid_disk = slot; + if (test_bit(In_sync, &rdev->flags)) + rdev->saved_raid_disk = slot; + else + rdev->saved_raid_disk = -1; + err = rdev->mddev->pers-> + hot_add_disk(rdev->mddev, rdev); + if (err) { + rdev->raid_disk = -1; + return err; + } else + sysfs_notify(&rdev->kobj, NULL, "state"); + sprintf(nm, "rd%d", rdev->raid_disk); + if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) + printk(KERN_WARNING + "md: cannot register " + "%s for %s\n", + nm, mdname(rdev->mddev)); + + /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks) return -ENOSPC; @@ -1962,6 +2049,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) clear_bit(Faulty, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); set_bit(In_sync, &rdev->flags); + sysfs_notify(&rdev->kobj, NULL, "state"); } return len; } @@ -1983,7 +2071,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) unsigned long long offset = simple_strtoull(buf, &e, 10); if (e==buf || (*e && *e != '\n')) return -EINVAL; - if (rdev->mddev->pers) + if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; if (rdev->size && rdev->mddev->external) /* Must set offset before size, so overlap checks @@ -2015,17 +2103,30 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { - char *e; - unsigned long long size = simple_strtoull(buf, &e, 10); + unsigned long long size; unsigned long long oldsize = rdev->size; mddev_t *my_mddev = rdev->mddev; - if (e==buf || (*e && *e != '\n')) + if (strict_strtoull(buf, 10, &size) < 0) return -EINVAL; - if (my_mddev->pers) - return -EBUSY; + if (size < my_mddev->size) + return -EINVAL; + if (my_mddev->pers && rdev->raid_disk >= 0) { + if (my_mddev->persistent) { + size = super_types[my_mddev->major_version]. + rdev_size_change(rdev, size * 2); + if (!size) + return -EBUSY; + } else if (!size) { + size = (rdev->bdev->bd_inode->i_size >> 10); + size -= rdev->data_offset/2; + } + if (size < my_mddev->size) + return -EINVAL; /* component must fit device */ + } + rdev->size = size; - if (size > oldsize && rdev->mddev->external) { + if (size > oldsize && my_mddev->external) { /* need to check that all other rdevs with the same ->bdev * do not overlap. We need to unlock the mddev to avoid * a deadlock. 
We have already changed rdev->size, and if @@ -2044,8 +2145,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (test_bit(AllReserved, &rdev2->flags) || (rdev->bdev == rdev2->bdev && rdev != rdev2 && - overlaps(rdev->data_offset, rdev->size, - rdev2->data_offset, rdev2->size))) { + overlaps(rdev->data_offset, rdev->size * 2, + rdev2->data_offset, + rdev2->size * 2))) { overlap = 1; break; } @@ -2067,8 +2169,6 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) return -EBUSY; } } - if (size < my_mddev->size || my_mddev->size == 0) - my_mddev->size = size; return len; } @@ -2293,6 +2393,8 @@ static void analyze_sbs(mddev_t * mddev) } +static void md_safemode_timeout(unsigned long data); + static ssize_t safe_delay_show(mddev_t *mddev, char *page) { @@ -2332,9 +2434,12 @@ safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) if (msec == 0) mddev->safemode_delay = 0; else { + unsigned long old_delay = mddev->safemode_delay; mddev->safemode_delay = (msec*HZ)/1000; if (mddev->safemode_delay == 0) mddev->safemode_delay = 1; + if (mddev->safemode_delay < old_delay) + md_safemode_timeout((unsigned long)mddev); } return len; } @@ -2512,7 +2617,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); * When written, doesn't tear down array, but just stops it * suspended (not supported yet) * All IO requests will block. The array can be reconfigured. - * Writing this, if accepted, will block until array is quiessent + * Writing this, if accepted, will block until array is quiescent * readonly * no resync can happen. no superblocks get written. * write requests fail @@ -2585,7 +2690,7 @@ array_state_show(mddev_t *mddev, char *page) return sprintf(page, "%s\n", array_states[st]); } -static int do_md_stop(mddev_t * mddev, int ro); +static int do_md_stop(mddev_t * mddev, int ro, int is_open); static int do_md_run(mddev_t * mddev); static int restart_array(mddev_t *mddev); @@ -2599,16 +2704,16 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; case clear: /* stopping an active array */ - if (atomic_read(&mddev->active) > 1) + if (atomic_read(&mddev->openers) > 0) return -EBUSY; - err = do_md_stop(mddev, 0); + err = do_md_stop(mddev, 0, 0); break; case inactive: /* stopping an active array */ if (mddev->pers) { - if (atomic_read(&mddev->active) > 1) + if (atomic_read(&mddev->openers) > 0) return -EBUSY; - err = do_md_stop(mddev, 2); + err = do_md_stop(mddev, 2, 0); } else err = 0; /* already inactive */ break; @@ -2616,7 +2721,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; /* not supported yet */ case readonly: if (mddev->pers) - err = do_md_stop(mddev, 1); + err = do_md_stop(mddev, 1, 0); else { mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); @@ -2626,7 +2731,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case read_auto: if (mddev->pers) { if (mddev->ro != 1) - err = do_md_stop(mddev, 1); + err = do_md_stop(mddev, 1, 0); else err = restart_array(mddev); if (err == 0) { @@ -2681,8 +2786,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) } if (err) return err; - else + else { + sysfs_notify(&mddev->kobj, NULL, "array_state"); return len; + } } static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); @@ -2785,7 +2892,7 @@ size_show(mddev_t *mddev, char *page) return sprintf(page, "%llu\n", (unsigned long long)mddev->size); } -static int update_size(mddev_t *mddev, unsigned long size); +static 
int update_size(mddev_t *mddev, sector_t num_sectors); static ssize_t size_store(mddev_t *mddev, const char *buf, size_t len) @@ -2802,7 +2909,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len) return -EINVAL; if (mddev->pers) { - err = update_size(mddev, size); + err = update_size(mddev, size * 2); md_update_sb(mddev, 1); } else { if (mddev->size == 0 || @@ -2899,7 +3006,7 @@ action_show(mddev_t *mddev, char *page) type = "check"; else type = "repair"; - } else + } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) type = "recover"; } return sprintf(page, "%s\n", type); @@ -2921,15 +3028,19 @@ action_store(mddev_t *mddev, const char *page, size_t len) } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return -EBUSY; - else if (cmd_match(page, "resync") || cmd_match(page, "recover")) + else if (cmd_match(page, "resync")) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - else if (cmd_match(page, "reshape")) { + else if (cmd_match(page, "recover")) { + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } else if (cmd_match(page, "reshape")) { int err; if (mddev->pers->start_reshape == NULL) return -EINVAL; err = mddev->pers->start_reshape(mddev); if (err) return err; + sysfs_notify(&mddev->kobj, NULL, "degraded"); } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -2940,6 +3051,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + sysfs_notify(&mddev->kobj, NULL, "sync_action"); return len; } @@ -3049,11 +3161,11 @@ static ssize_t sync_speed_show(mddev_t *mddev, char *page) { unsigned long resync, dt, db; - resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); - dt = ((jiffies - mddev->resync_mark) / HZ); + resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); + dt = (jiffies - mddev->resync_mark) / HZ; if (!dt) dt++; - db = resync - (mddev->resync_mark_cnt); - return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ + db = resync - mddev->resync_mark_cnt; + return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ } static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); @@ -3075,6 +3187,36 @@ sync_completed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); static ssize_t +min_sync_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)mddev->resync_min); +} +static ssize_t +min_sync_store(mddev_t *mddev, const char *buf, size_t len) +{ + unsigned long long min; + if (strict_strtoull(buf, 10, &min)) + return -EINVAL; + if (min > mddev->resync_max) + return -EINVAL; + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + /* Must be a multiple of chunk_size */ + if (mddev->chunk_size) { + if (min & (sector_t)((mddev->chunk_size>>9)-1)) + return -EINVAL; + } + mddev->resync_min = min; + + return len; +} + +static struct md_sysfs_entry md_min_sync = +__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); + +static ssize_t max_sync_show(mddev_t *mddev, char *page) { if (mddev->resync_max == MaxSector) @@ -3089,9 +3231,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { - char *ep; - unsigned long long max = simple_strtoull(buf, &ep, 10); - if (ep == buf || (*ep != 0 && *ep != '\n')) + unsigned long long 
max; + if (strict_strtoull(buf, 10, &max)) + return -EINVAL; + if (max < mddev->resync_min) return -EINVAL; if (max < mddev->resync_max && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -3222,6 +3365,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, + &md_min_sync.attr, &md_max_sync.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, @@ -3326,9 +3470,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) disk->queue = mddev->queue; add_disk(disk); mddev->gendisk = disk; - mutex_unlock(&disks_mutex); error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, "%s", "md"); + mutex_unlock(&disks_mutex); if (error) printk(KERN_WARNING "md: cannot register %s/md - name in use\n", disk->disk_name); @@ -3341,7 +3485,11 @@ static void md_safemode_timeout(unsigned long data) { mddev_t *mddev = (mddev_t *) data; - mddev->safemode = 1; + if (!atomic_read(&mddev->writes_pending)) { + mddev->safemode = 1; + if (mddev->external) + set_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags); + } md_wakeup_thread(mddev->thread); } @@ -3432,22 +3580,23 @@ static int do_md_run(mddev_t * mddev) * We don't want the data to overlap the metadata, * Internal Bitmap issues has handled elsewhere. */ - if (rdev->data_offset < rdev->sb_offset) { + if (rdev->data_offset < rdev->sb_start) { if (mddev->size && rdev->data_offset + mddev->size*2 - > rdev->sb_offset*2) { + > rdev->sb_start) { printk("md: %s: data overlaps metadata\n", mdname(mddev)); return -EINVAL; } } else { - if (rdev->sb_offset*2 + rdev->sb_size/512 + if (rdev->sb_start + rdev->sb_size/512 > rdev->data_offset) { printk("md: %s: metadata overlaps data\n", mdname(mddev)); return -EINVAL; } } + sysfs_notify(&rdev->kobj, NULL, "state"); } md_probe(mddev->unit, NULL, NULL); @@ -3519,7 +3668,9 @@ static int do_md_run(mddev_t * mddev) mddev->ro = 2; /* read-only, but switch on first write */ err = mddev->pers->run(mddev); - if (!err && mddev->pers->sync_request) { + if (err) + printk(KERN_ERR "md: pers->run() failed ...\n"); + else if (mddev->pers->sync_request) { err = bitmap_create(mddev); if (err) { printk(KERN_ERR "%s: failed to create bitmap (%d)\n", @@ -3528,7 +3679,6 @@ static int do_md_run(mddev_t * mddev) } } if (err) { - printk(KERN_ERR "md: pers->run() failed ...\n"); module_put(mddev->pers->owner); mddev->pers = NULL; bitmap_destroy(mddev); @@ -3563,7 +3713,7 @@ static int do_md_run(mddev_t * mddev) if (mddev->flags) md_update_sb(mddev, 0); - set_capacity(disk, mddev->array_size<<1); + set_capacity(disk, mddev->array_sectors); /* If we call blk_queue_make_request here, it will * re-initialise max_sectors etc which may have been @@ -3608,6 +3758,9 @@ static int do_md_run(mddev_t * mddev) mddev->changed = 1; md_new_event(mddev); + sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify(&mddev->kobj, NULL, "sync_action"); + sysfs_notify(&mddev->kobj, NULL, "degraded"); kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); return 0; } @@ -3615,38 +3768,25 @@ static int do_md_run(mddev_t * mddev) static int restart_array(mddev_t *mddev) { struct gendisk *disk = mddev->gendisk; - int err; - /* - * Complain if it has no devices - */ - err = -ENXIO; + /* Complain if it has no devices */ if (list_empty(&mddev->disks)) - goto out; - - if (mddev->pers) { - err = -EBUSY; - if (!mddev->ro) - goto out; - - mddev->safemode = 0; - mddev->ro = 0; - set_disk_ro(disk, 0); - - printk(KERN_INFO "md: %s switched to read-write mode.\n", - mdname(mddev)); - /* - * 
Kick recovery or resync if necessary - */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - md_wakeup_thread(mddev->sync_thread); - err = 0; - } else - err = -EINVAL; - -out: - return err; + return -ENXIO; + if (!mddev->pers) + return -EINVAL; + if (!mddev->ro) + return -EBUSY; + mddev->safemode = 0; + mddev->ro = 0; + set_disk_ro(disk, 0); + printk(KERN_INFO "md: %s switched to read-write mode.\n", + mdname(mddev)); + /* Kick recovery or resync if necessary */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); + sysfs_notify(&mddev->kobj, NULL, "array_state"); + return 0; } /* similar to deny_write_access, but accounts for our holding a reference @@ -3680,16 +3820,17 @@ static void restore_bitmap_write_access(struct file *file) * 1 - switch to readonly * 2 - stop but do not disassemble array */ -static int do_md_stop(mddev_t * mddev, int mode) +static int do_md_stop(mddev_t * mddev, int mode, int is_open) { int err = 0; struct gendisk *disk = mddev->gendisk; + if (atomic_read(&mddev->openers) > is_open) { + printk("md: %s still in use.\n",mdname(mddev)); + return -EBUSY; + } + if (mddev->pers) { - if (atomic_read(&mddev->active)>2) { - printk("md: %s still in use.\n",mdname(mddev)); - return -EBUSY; - } if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -3700,8 +3841,6 @@ static int do_md_stop(mddev_t * mddev, int mode) del_timer_sync(&mddev->safemode_timer); - invalidate_partition(disk, 0); - switch(mode) { case 1: /* readonly */ err = -ENXIO; @@ -3773,10 +3912,11 @@ static int do_md_stop(mddev_t * mddev, int mode) export_array(mddev); - mddev->array_size = 0; + mddev->array_sectors = 0; mddev->size = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; + mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; mddev->external = 0; @@ -3811,6 +3951,7 @@ static int do_md_stop(mddev_t * mddev, int mode) mdname(mddev)); err = 0; md_new_event(mddev); + sysfs_notify(&mddev->kobj, NULL, "array_state"); out: return err; } @@ -3836,7 +3977,7 @@ static void autorun_array(mddev_t *mddev) err = do_md_run (mddev); if (err) { printk(KERN_WARNING "md: do_md_run() returned %d\n", err); - do_md_stop (mddev, 0); + do_md_stop (mddev, 0, 0); } } @@ -3927,8 +4068,10 @@ static void autorun_devices(int part) /* on success, candidates will be empty, on error * it won't... */ - rdev_for_each_list(rdev, tmp, candidates) + rdev_for_each_list(rdev, tmp, candidates) { + list_del_init(&rdev->same_set); export_rdev(rdev); + } mddev_put(mddev); } printk(KERN_INFO "md: ... 
autorun DONE.\n"); @@ -4009,9 +4152,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) char *ptr, *buf = NULL; int err = -ENOMEM; - md_allow_write(mddev); + if (md_allow_write(mddev)) + file = kmalloc(sizeof(*file), GFP_NOIO); + else + file = kmalloc(sizeof(*file), GFP_KERNEL); - file = kmalloc(sizeof(*file), GFP_KERNEL); if (!file) goto out; @@ -4044,15 +4189,12 @@ out: static int get_disk_info(mddev_t * mddev, void __user * arg) { mdu_disk_info_t info; - unsigned int nr; mdk_rdev_t *rdev; if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; - nr = info.number; - - rdev = find_rdev_nr(mddev, nr); + rdev = find_rdev_nr(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -4172,8 +4314,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) } if (err) export_rdev(rdev); + else + sysfs_notify(&rdev->kobj, NULL, "state"); md_update_sb(mddev, 1); + if (mddev->degraded) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); return err; @@ -4212,10 +4358,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (!mddev->persistent) { printk(KERN_INFO "md: nonpersistent superblock ...\n"); - rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; } else - rdev->sb_offset = calc_dev_sboffset(rdev->bdev); - rdev->size = calc_dev_size(rdev, mddev->chunk_size); + rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; err = bind_rdev_to_array(rdev, mddev); if (err) { @@ -4232,9 +4378,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) char b[BDEVNAME_SIZE]; mdk_rdev_t *rdev; - if (!mddev->pers) - return -ENODEV; - rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; @@ -4257,7 +4400,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) { char b[BDEVNAME_SIZE]; int err; - unsigned int size; mdk_rdev_t *rdev; if (!mddev->pers) @@ -4285,13 +4427,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } if (mddev->persistent) - rdev->sb_offset = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev->bdev); else - rdev->sb_offset = - rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - size = calc_dev_size(rdev, mddev->chunk_size); - rdev->size = size; + rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING @@ -4476,44 +4616,50 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) return 0; } -static int update_size(mddev_t *mddev, unsigned long size) +static int update_size(mddev_t *mddev, sector_t num_sectors) { mdk_rdev_t * rdev; int rv; struct list_head *tmp; - int fit = (size == 0); + int fit = (num_sectors == 0); if (mddev->pers->resize == NULL) return -EINVAL; - /* The "size" is the amount of each device that is used. - * This can only make sense for arrays with redundancy. - * linear and raid0 always use whatever space is available - * We can only consider changing the size if no resync - * or reconstruction is happening, and if the new size - * is acceptable. It must fit before the sb_offset or, - * if that is <data_offset, it must fit before the - * size of each device. - * If size is zero, we find the largest size that fits. + /* The "num_sectors" is the number of sectors of each device that + * is used. 
This can only make sense for arrays with redundancy. + * linear and raid0 always use whatever space is available. We can only + * consider changing this number if no resync or reconstruction is + * happening, and if the new size is acceptable. It must fit before the + * sb_start or, if that is <data_offset, it must fit before the size + * of each device. If num_sectors is zero, we find the largest size + * that fits. + */ if (mddev->sync_thread) return -EBUSY; + if (mddev->bitmap) + /* Sorry, cannot grow a bitmap yet, just remove it, + * grow, and re-add. + */ + return -EBUSY; rdev_for_each(rdev, tmp, mddev) { sector_t avail; avail = rdev->size * 2; - if (fit && (size == 0 || size > avail/2)) - size = avail/2; - if (avail < ((sector_t)size << 1)) + if (fit && (num_sectors == 0 || num_sectors > avail)) + num_sectors = avail; + if (avail < num_sectors) return -ENOSPC; } - rv = mddev->pers->resize(mddev, (sector_t)size *2); + rv = mddev->pers->resize(mddev, num_sectors); if (!rv) { struct block_device *bdev; bdev = bdget_disk(mddev->gendisk, 0); if (bdev) { mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); + i_size_write(bdev->bd_inode, + (loff_t)mddev->array_sectors << 9); mutex_unlock(&bdev->bd_inode->i_mutex); bdput(bdev); } @@ -4588,7 +4734,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) return mddev->pers->reconfig(mddev, info->layout, -1); } if (info->size >= 0 && mddev->size != info->size) - rv = update_size(mddev, info->size); + rv = update_size(mddev, (sector_t)info->size * 2); if (mddev->raid_disks != info->raid_disks) rv = update_raid_disks(mddev, info->raid_disks); @@ -4641,6 +4787,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) return 0; } +/* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) { mddev_t *mddev = bdev->bd_disk->private_data; @@ -4785,19 +4937,13 @@ static int md_ioctl(struct inode *inode, struct file *file, goto done_unlock; case STOP_ARRAY: - err = do_md_stop (mddev, 0); + err = do_md_stop (mddev, 0, 1); goto done_unlock; case STOP_ARRAY_RO: - err = do_md_stop (mddev, 1); + err = do_md_stop (mddev, 1, 1); goto done_unlock; - /* - * We have a problem here : there is no easy way to give a CHS - * virtual geometry. We currently pretend that we have a 2 heads - * 4 sectors (with a BIG number of cylinders...). This drives - * dosfs just mad... ;-) - */ } /* @@ -4807,13 +4953,12 @@ static int md_ioctl(struct inode *inode, struct file *file, * here and hit the 'default' below, so only disallow * 'md' ioctls, and switch to rw mode if started auto-readonly. 
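 * [Editor's gloss, not part of the patch text: mddev->ro == 0 means
 * read-write, 1 means read-only, and 2 means "auto-readonly" -
 * read-only only until the first write arrives, as set up by
 * do_md_run() above ("read-only, but switch on first write").]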
*/ - if (_IOC_TYPE(cmd) == MD_MAJOR && - mddev->ro && mddev->pers) { + if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { if (mddev->ro == 2) { mddev->ro = 0; - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - + sysfs_notify(&mddev->kobj, NULL, "array_state"); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); } else { err = -EROFS; goto abort_unlock; @@ -4883,6 +5028,7 @@ static int md_open(struct inode *inode, struct file *file) err = 0; mddev_get(mddev); + atomic_inc(&mddev->openers); mddev_unlock(mddev); check_disk_change(inode->i_bdev); @@ -4895,6 +5041,7 @@ static int md_release(struct inode *inode, struct file * file) mddev_t *mddev = inode->i_bdev->bd_disk->private_data; BUG_ON(!mddev); + atomic_dec(&mddev->openers); mddev_put(mddev); return 0; @@ -5029,6 +5176,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); + if (mddev->degraded) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(StateChanged, &rdev->flags); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -5258,10 +5408,11 @@ static int md_seq_show(struct seq_file *seq, void *v) if (!list_empty(&mddev->disks)) { if (mddev->pers) seq_printf(seq, "\n %llu blocks", - (unsigned long long)mddev->array_size); + (unsigned long long) + mddev->array_sectors / 2); else seq_printf(seq, "\n %llu blocks", - (unsigned long long)size); + (unsigned long long)size); } if (mddev->persistent) { if (mddev->major_version != 0 || @@ -5391,12 +5542,12 @@ int unregister_md_personality(struct mdk_personality *p) static int is_mddev_idle(mddev_t *mddev) { mdk_rdev_t * rdev; - struct list_head *tmp; int idle; long curr_events; idle = 1; - rdev_for_each(rdev, tmp, mddev) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; curr_events = disk_stat_read(disk, sectors[0]) + disk_stat_read(disk, sectors[1]) - @@ -5428,6 +5579,7 @@ static int is_mddev_idle(mddev_t *mddev) idle = 0; } } + rcu_read_unlock(); return idle; } @@ -5451,6 +5603,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) */ void md_write_start(mddev_t *mddev, struct bio *bi) { + int did_change = 0; if (bio_data_dir(bi) != WRITE) return; @@ -5461,6 +5614,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); + did_change = 1; } atomic_inc(&mddev->writes_pending); if (mddev->safemode == 1) @@ -5471,10 +5625,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi) mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); md_wakeup_thread(mddev->thread); + did_change = 1; } spin_unlock_irq(&mddev->write_lock); - sysfs_notify(&mddev->kobj, NULL, "array_state"); } + if (did_change) + sysfs_notify(&mddev->kobj, NULL, "array_state"); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); @@ -5495,13 +5651,18 @@ void md_write_end(mddev_t *mddev) * may proceed without blocking. It is important to call this before * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. + * + * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock + * is dropped, so return -EAGAIN after notifying userspace. 
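 * [Editor's aside, not part of the patch text: a caller that cannot
 * proceed until the "dirty" state is safely recorded propagates the
 * -EAGAIN, e.g.
 *
 *	err = md_allow_write(mddev);
 *	if (err)
 *		return err;
 *
 * which is exactly what raid1_reshape() and resize_stripes() do
 * further down in this diff.]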
*/ -void md_allow_write(mddev_t *mddev) +int md_allow_write(mddev_t *mddev) { if (!mddev->pers) - return; + return 0; if (mddev->ro) - return; + return 0; + if (!mddev->pers->sync_request) + return 0; spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { @@ -5512,14 +5673,14 @@ void md_allow_write(mddev_t *mddev) mddev->safemode = 1; spin_unlock_irq(&mddev->write_lock); md_update_sb(mddev, 0); - sysfs_notify(&mddev->kobj, NULL, "array_state"); - /* wait for the dirty state to be recorded in the metadata */ - wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } else spin_unlock_irq(&mddev->write_lock); + + if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + return -EAGAIN; + else + return 0; } EXPORT_SYMBOL_GPL(md_allow_write); @@ -5600,7 +5761,11 @@ void md_do_sync(mddev_t *mddev) * time 'round when curr_resync == 2 */ continue; - prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); + /* We need to wait 'interruptible' so as not to + * contribute to the load average, and not to + * be caught by 'softlockup' + */ + prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); if (!kthread_should_stop() && mddev2->curr_resync >= mddev->curr_resync) { printk(KERN_INFO "md: delaying %s of %s" @@ -5608,6 +5773,8 @@ void md_do_sync(mddev_t *mddev) " share one or more physical units)\n", desc, mdname(mddev), mdname(mddev2)); mddev_put(mddev2); + if (signal_pending(current)) + flush_signals(current); schedule(); finish_wait(&resync_wait, &wq); goto try_again; @@ -5625,9 +5792,11 @@ void md_do_sync(mddev_t *mddev) max_sectors = mddev->resync_max_sectors; mddev->resync_mismatches = 0; /* we don't use the checkpoint if there's a bitmap */ - if (!mddev->bitmap && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + j = mddev->resync_min; + else if (!mddev->bitmap) j = mddev->recovery_cp; + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->size << 1; else { @@ -5796,6 +5965,7 @@ void md_do_sync(mddev_t *mddev) skip: mddev->curr_resync = 0; + mddev->resync_min = 0; mddev->resync_max = MaxSector; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); @@ -5837,15 +6007,17 @@ static int remove_and_add_spares(mddev_t *mddev) } } - if (mddev->degraded) { + if (mddev->degraded && ! mddev->ro) { rdev_for_each(rdev, rtmp, mddev) { if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags)) + !test_bit(In_sync, &rdev->flags) && + !test_bit(Blocked, &rdev->flags)) spares++; if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { rdev->recovery_offset = 0; - if (mddev->pers->hot_add_disk(mddev,rdev)) { + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, @@ -5894,6 +6066,9 @@ void md_check_recovery(mddev_t *mddev) if (mddev->bitmap) bitmap_daemon_work(mddev->bitmap); + if (test_and_clear_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags)) + sysfs_notify(&mddev->kobj, NULL, "array_state"); + if (mddev->ro) return; @@ -5906,6 +6081,8 @@ void md_check_recovery(mddev_t *mddev) flush_signals(current); } + if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + return; if ( ! 
( (mddev->flags && !mddev->external) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
@@ -5919,24 +6096,41 @@ void md_check_recovery(mddev_t *mddev)
 	if (mddev_trylock(mddev)) {
 		int spares = 0;
 
+		if (mddev->ro) {
+			/* Only thing we do on a ro array is remove
+			 * failed devices.
+			 */
+			remove_and_add_spares(mddev);
+			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			goto unlock;
+		}
+
 		if (!mddev->external) {
+			int did_change = 0;
 			spin_lock_irq(&mddev->write_lock);
 			if (mddev->safemode &&
 			    !atomic_read(&mddev->writes_pending) &&
 			    !mddev->in_sync &&
 			    mddev->recovery_cp == MaxSector) {
 				mddev->in_sync = 1;
+				did_change = 1;
 				if (mddev->persistent)
 					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
 			}
 			if (mddev->safemode == 1)
 				mddev->safemode = 0;
 			spin_unlock_irq(&mddev->write_lock);
+			if (did_change)
+				sysfs_notify(&mddev->kobj, NULL,
					     "array_state");
 		}
 
 		if (mddev->flags)
 			md_update_sb(mddev, 0);
 
+		rdev_for_each(rdev, rtmp, mddev)
+			if (test_and_clear_bit(StateChanged, &rdev->flags))
+				sysfs_notify(&rdev->kobj, NULL, "state");
+
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -5948,10 +6142,13 @@ void md_check_recovery(mddev_t *mddev)
 			/* resync has finished, collect result */
 			md_unregister_thread(mddev->sync_thread);
 			mddev->sync_thread = NULL;
-			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+			    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 				/* success...*/
 				/* activate any spares */
-				mddev->pers->spare_active(mddev);
+				if (mddev->pers->spare_active(mddev))
+					sysfs_notify(&mddev->kobj, NULL,
						     "degraded");
 			}
 			md_update_sb(mddev, 1);
@@ -5965,13 +6162,18 @@ void md_check_recovery(mddev_t *mddev)
 			mddev->recovery = 0;
 			/* flag recovery needed just to double check */
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			sysfs_notify(&mddev->kobj, NULL, "sync_action");
 			md_new_event(mddev);
 			goto unlock;
 		}
+		/* Set RUNNING before clearing NEEDED to avoid
+		 * any transients in the value of "sync_action".
+		 */
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		/* Clear some bits that don't mean anything, but
 		 * might be left set
 		 */
-		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
@@ -5989,17 +6191,20 @@ void md_check_recovery(mddev_t *mddev)
 				/* Cannot proceed */
 				goto unlock;
 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if ((spares = remove_and_add_spares(mddev))) {
 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if (mddev->recovery_cp < MaxSector) {
 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 			/* nothing to be done ... */
 			goto unlock;
 
 		if (mddev->pers->sync_request) {
-			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
 				/* We are adding a device or devices to an array
 				 * which has the bitmap stored on all devices.
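
[Editor's note: the "Set RUNNING before clearing NEEDED" ordering above
matters because userspace reads "sync_action" locklessly.  A minimal
sketch of such a reader, loosely modelled on md's action_show() - the
helper name and the reduced set of states are illustrative, not code
from this patch:

	static const char *sync_action_name(mddev_t *mddev)
	{
		/* RUNNING is set before NEEDED is cleared, so a
		 * transition between the two never makes a concurrent
		 * reader fall through to "idle".
		 */
		if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
			return "idle";
		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
			return "reshape";
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			return test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
				? "check" : "resync";
		return "recover";
	}
]
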
@@ -6018,9 +6223,16 @@ void md_check_recovery(mddev_t *mddev)
 				mddev->recovery = 0;
 			} else
 				md_wakeup_thread(mddev->sync_thread);
+			sysfs_notify(&mddev->kobj, NULL, "sync_action");
 			md_new_event(mddev);
 		}
 	unlock:
+		if (!mddev->sync_thread) {
+			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
+					       &mddev->recovery))
+				sysfs_notify(&mddev->kobj, NULL, "sync_action");
+		}
 		mddev_unlock(mddev);
 	}
 }
@@ -6047,7 +6259,11 @@ static int md_notify_reboot(struct notifier_block *this,
 
 		for_each_mddev(mddev, tmp)
 			if (mddev_trylock(mddev)) {
-				do_md_stop (mddev, 1);
+				/* Force a switch to readonly even if the
+				 * array appears to still be in use.  Hence
+				 * the '100'.
+				 */
+				do_md_stop (mddev, 1, 100);
 				mddev_unlock(mddev);
 			}
 		/*
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e968116e0de..c4779ccba1c 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	multipath_conf_t *conf = mddev->private;
 	struct request_queue *q;
-	int found = 0;
+	int err = -EEXIST;
 	int path;
 	struct multipath_info *p;
+	int first = 0;
+	int last = mddev->raid_disks - 1;
+
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
 
 	print_multipath_conf(conf);
 
-	for (path=0; path<mddev->raid_disks; path++)
+	for (path = first; path <= last; path++)
 		if ((p=conf->multipaths+path)->rdev == NULL) {
 			q = rdev->bdev->bd_disk->queue;
 			blk_queue_stack_limits(mddev->queue, q);
@@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			rdev->raid_disk = path;
 			set_bit(In_sync, &rdev->flags);
 			rcu_assign_pointer(p->rdev, rdev);
-			found = 1;
+			err = 0;
+			break;
 		}
 
 	print_multipath_conf(conf);
-	return found;
+
+	return err;
 }
 
 static int multipath_remove_disk(mddev_t *mddev, int number)
@@ -497,7 +504,7 @@ static int multipath_run (mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_size = mddev->size;
+	mddev->array_sectors = mddev->size * 2;
 
 	mddev->queue->unplug_fn = multipath_unplug;
 	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index bcbb82594a1..18361063566 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -295,16 +295,16 @@ static int raid0_run (mddev_t *mddev)
 		goto out_free_conf;
 
 	/* calculate array device size */
-	mddev->array_size = 0;
+	mddev->array_sectors = 0;
 	rdev_for_each(rdev, tmp, mddev)
-		mddev->array_size += rdev->size;
+		mddev->array_sectors += rdev->size * 2;
 
 	printk("raid0 : md_size is %llu blocks.\n",
-		(unsigned long long)mddev->array_size);
+		(unsigned long long)mddev->array_sectors / 2);
 	printk("raid0 : conf->hash_spacing is %llu blocks.\n",
 		(unsigned long long)conf->hash_spacing);
 	{
-		sector_t s = mddev->array_size;
+		sector_t s = mddev->array_sectors / 2;
 		sector_t space = conf->hash_spacing;
 		int round;
 		conf->preshift = 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c610b947218..03a5ab705c2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1100,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev)
 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	conf_t *conf = mddev->private;
-	int found = 0;
+	int err = -EEXIST;
 	int mirror = 0;
 	mirror_info_t *p;
+	int first = 0;
+	int last = mddev->raid_disks - 1;
 
-	for (mirror=0; mirror < mddev->raid_disks; mirror++)
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
+
+	for (mirror = first; mirror <= last; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {
blk_queue_stack_limits(mddev->queue,
@@ -1119,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
-			found = 1;
+			err = 0;
 			/* As all devices are equivalent, we don't need a full recovery
 			 * if this was recently any drive of the array
 			 */
@@ -1130,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 
 	print_conf(conf);
-	return found;
+	return err;
 }
 
 static int raid1_remove_disk(mddev_t *mddev, int number)
@@ -2038,7 +2043,7 @@ static int run(mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_size = mddev->size;
+	mddev->array_sectors = mddev->size * 2;
 
 	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2100,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
-	mddev->array_size = sectors>>1;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->array_sectors = sectors;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
-	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
+	if (mddev->array_sectors / 2 > mddev->size &&
+	    mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->size << 1;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
-	mddev->size = mddev->array_size;
+	mddev->size = mddev->array_sectors / 2;
 	mddev->resync_max_sectors = sectors;
 	return 0;
 }
@@ -2131,7 +2137,7 @@ static int raid1_reshape(mddev_t *mddev)
 	conf_t *conf = mddev_to_conf(mddev);
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2;
+	int d, d2, err;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_size != mddev->new_chunk ||
@@ -2143,7 +2149,9 @@ static int raid1_reshape(mddev_t *mddev)
 		return -EINVAL;
 	}
 
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 22bb2b1b886..e34cd0e6247 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -76,11 +76,13 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 	kfree(r10_bio);
 }
 
+/* Maximum size of each resync request */
 #define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
-#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+/* amount of memory to reserve for resync requests */
+#define RESYNC_WINDOW (1024*1024)
+/* maximum number of concurrent requests, memory permitting */
+#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
 
 /*
  * When performing a resync, we need to read and compare, so
@@ -215,6 +217,9 @@ static void reschedule_retry(r10bio_t *r10_bio)
 	conf->nr_queued ++;
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
+	/* wake up frozen array... */
+	wake_up(&conf->wait_barrier);
+
 	md_wakeup_thread(mddev->thread);
 }
 
@@ -687,7 +692,6 @@ static int flush_pending_writes(conf_t *conf)
 * there is no normal IO happening.  It must arrange to call
 * lower_barrier when the particular background IO completes.
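 * [Editor's sketch of the intended pairing, illustrative only:
 *
 *	raise_barrier(conf, 0);
 *	... issue the resync/recovery requests ...
 *	lower_barrier(conf);
 *
 * with RESYNC_DEPTH, defined above, bounding how many such requests
 * may be in flight at once.]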
*/ -#define RESYNC_DEPTH 32 static void raise_barrier(conf_t *conf, int force) { @@ -1114,24 +1118,30 @@ static int raid10_spare_active(mddev_t *mddev) static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int mirror; mirror_info_t *p; + int first = 0; + int last = mddev->raid_disks - 1; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ - return 0; + return -EBUSY; if (!enough(conf)) - return 0; + return -EINVAL; + + if (rdev->raid_disk) + first = last = rdev->raid_disk; if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else - mirror = 0; - for ( ; mirror < mddev->raid_disks; mirror++) + mirror = first; + for ( ; mirror <= last ; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { blk_queue_stack_limits(mddev->queue, @@ -1146,7 +1156,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) p->head_position = 0; rdev->raid_disk = mirror; - found = 1; + err = 0; if (rdev->saved_raid_disk != mirror) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); @@ -1154,7 +1164,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) } print_conf(conf); - return found; + return err; } static int raid10_remove_disk(mddev_t *mddev, int number) @@ -2159,7 +2169,7 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_size = size << (conf->chunk_shift-1); + mddev->array_sectors = size << conf->chunk_shift; mddev->resync_max_sectors = size << conf->chunk_shift; mddev->queue->unplug_fn = raid10_unplug; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9ce7154845c..224de022e7c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 
0 : -EIO); + bio_endio(bi, 0); bi = return_bi; } } static void print_raid5_conf (raid5_conf_t *conf); +static int stripe_operations_active(struct stripe_head *sh) +{ + return sh->check_state || sh->reconstruct_state || + test_bit(STRIPE_BIOFILL_RUN, &sh->state) || + test_bit(STRIPE_COMPUTE_RUN, &sh->state); +} + static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) { if (atomic_dec_and_test(&sh->count)) { @@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } md_wakeup_thread(conf->mddev->thread); } else { - BUG_ON(sh->ops.pending); + BUG_ON(stripe_operations_active(sh)); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + BUG_ON(stripe_operations_active(sh)); CHECK_DEVLOCK(); pr_debug("init_stripe called, stripe %llu\n", @@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } -/* test_and_ack_op() ensures that we only dequeue an operation once */ -#define test_and_ack_op(op, pend) \ -do { \ - if (test_bit(op, &sh->ops.pending) && \ - !test_bit(op, &sh->ops.complete)) { \ - if (test_and_set_bit(op, &sh->ops.ack)) \ - clear_bit(op, &pend); \ - else \ - ack++; \ - } else \ - clear_bit(op, &pend); \ -} while (0) - -/* find new work to run, do not resubmit work that is already - * in flight - */ -static unsigned long get_stripe_work(struct stripe_head *sh) -{ - unsigned long pending; - int ack = 0; - - pending = sh->ops.pending; - - test_and_ack_op(STRIPE_OP_BIOFILL, pending); - test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); - test_and_ack_op(STRIPE_OP_PREXOR, pending); - test_and_ack_op(STRIPE_OP_BIODRAIN, pending); - test_and_ack_op(STRIPE_OP_POSTXOR, pending); - test_and_ack_op(STRIPE_OP_CHECK, pending); - if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) - ack++; - - sh->ops.count -= ack; - if (unlikely(sh->ops.count < 0)) { - printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " - "ops.complete: %#lx\n", pending, sh->ops.pending, - sh->ops.ack, sh->ops.complete); - BUG(); - } - - return pending; -} - static void raid5_end_read_request(struct bio *bi, int error); static void raid5_end_write_request(struct bio *bi, int error); -static void ops_run_io(struct stripe_head *sh) +static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { raid5_conf_t *conf = sh->raid_conf; int i, disks = sh->disks; might_sleep(); - set_bit(STRIPE_IO_STARTED, &sh->state); for (i = disks; i--; ) { int rw; struct bio *bi; @@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (test_bit(STRIPE_SYNCING, &sh->state) || - test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || - test_bit(STRIPE_EXPAND_READY, &sh->state)) + if (s->syncing || s->expanding || s->expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); + set_bit(STRIPE_IO_STARTED, &sh->state); + bi->bi_bdev = rdev->bdev; pr_debug("%s: for %llu schedule op %ld on disc %d\n", __func__, (unsigned long long)sh->sector, @@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref) (unsigned long long)sh->sector); /* clear completed biofills */ + spin_lock_irq(&conf->device_lock); for (i = sh->disks; 
i--; ) { struct r5dev *dev = &sh->dev[i]; /* acknowledge completion of a biofill operation */ /* and check if we need to reply to a read request, * new R5_Wantfill requests are held off until - * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) + * !STRIPE_BIOFILL_RUN */ if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi, *rbi2; - /* The access to dev->read is outside of the - * spin_lock_irq(&conf->device_lock), but is protected - * by the STRIPE_OP_BIOFILL pending bit - */ BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); if (--rbi->bi_phys_segments == 0) { rbi->bi_next = return_bi; return_bi = rbi; } - spin_unlock_irq(&conf->device_lock); rbi = rbi2; } } } - set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); + spin_unlock_irq(&conf->device_lock); + clear_bit(STRIPE_BIOFILL_RUN, &sh->state); return_io(return_bi); @@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref) set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); + if (sh->check_state == check_state_compute_run) + sh->check_state = check_state_compute_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } -static struct dma_async_tx_descriptor * -ops_run_compute5(struct stripe_head *sh, unsigned long pending) +static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute5, sh); - /* ack now if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) - async_tx_ack(tx); - return tx; } @@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); } static struct dma_async_tx_descriptor * @@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) + if (test_bit(R5_Wantdrain, &dev->flags)) xor_srcs[count++] = dev->page; } @@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } static struct dma_async_tx_descriptor * -ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) +ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - int pd_idx = sh->pd_idx, i; - - /* check if prexor is active which means only process blocks - * that are part of a read-modify-write (Wantprexor) - */ - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; struct bio *chosen; - int towrite; - - towrite = 0; - if (prexor) { /* rmw */ - if (dev->towrite && - test_bit(R5_Wantprexor, &dev->flags)) - towrite = 1; - } else { /* rcw */ - if (i != pd_idx && 
dev->towrite && - test_bit(R5_LOCKED, &dev->flags)) - towrite = 1; - } - if (towrite) { + if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; spin_lock(&sh->lock); @@ -747,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, static void ops_complete_postxor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static void ops_complete_write(void *stripe_head_ref) -{ - struct stripe_head *sh = stripe_head_ref; int disks = sh->disks, i, pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, @@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref) set_bit(R5_UPTODATE, &dev->flags); } - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + if (sh->reconstruct_state == reconstruct_state_drain_run) + sh->reconstruct_state = reconstruct_state_drain_result; + else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) + sh->reconstruct_state = reconstruct_state_prexor_drain_result; + else { + BUG_ON(sh->reconstruct_state != reconstruct_state_run); + sh->reconstruct_state = reconstruct_state_result; + } set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } static void -ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) +ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int prexor = 0; unsigned long flags; - dma_async_tx_callback callback; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ - if (prexor) { + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + prexor = 1; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, } } - /* check whether this postxor is part of a write */ - callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? 
- ops_complete_write : ops_complete_postxor; - /* 1/ if we prexor'd then the dest is reused as a source * 2/ if we did not prexor then we are redoing the parity * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST @@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, if (unlikely(count == 1)) { flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } else tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } static void ops_complete_check(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && - sh->ops.zero_sum_result == 0) - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); + sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh) tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); - if (tx) - set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - else - clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - atomic_inc(&sh->count); tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, ops_complete_check, sh); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) +static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; - if (test_bit(STRIPE_OP_BIOFILL, &pending)) { + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) - tx = ops_run_compute5(sh, pending); + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { + tx = ops_run_compute5(sh); + /* terminate the chain if postxor is not set to be run */ + if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) + async_tx_ack(tx); + } - if (test_bit(STRIPE_OP_PREXOR, &pending)) + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) tx = ops_run_prexor(sh, tx); - if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { - tx = ops_run_biodrain(sh, tx, pending); + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { + tx = ops_run_biodrain(sh, tx); overlap_clear++; } - if (test_bit(STRIPE_OP_POSTXOR, &pending)) - ops_run_postxor(sh, tx, pending); + if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) + ops_run_postxor(sh, tx); - if (test_bit(STRIPE_OP_CHECK, &pending)) + if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); - if (test_bit(STRIPE_OP_IO, &pending)) - ops_run_io(sh); - if (overlap_clear) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err = 0; + int err; struct kmem_cache *sc; int i; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ - md_allow_write(conf->mddev); + err = md_allow_write(conf->mddev); + if (err) + return err; /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int 
dd_idx2) } } -static int -handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +static void +schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, + int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; - int locked = 0; if (rcw) { /* if we are not expanding this is a proper write request, and @@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) * stripe cache */ if (!expand) { - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - sh->ops.count++; - } + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - sh->ops.count++; + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->towrite) { set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); if (!expand) clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } - if (locked + 1 == disks) + if (s->locked + 1 == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&sh->raid_conf->pending_full_writes); } else { BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - - sh->ops.count += 3; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (i == pd_idx) continue; - /* For a read-modify write there may be blocks that are - * locked for reading while others are ready to be - * written so we distinguish these blocks by the - * R5_Wantprexor bit - */ if (dev->towrite && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - set_bit(R5_Wantprexor, &dev->flags); + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantdrain, &dev->flags); set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } } @@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - locked++; + s->locked++; - pr_debug("%s: stripe %llu locked: %d pending: %lx\n", + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, - locked, sh->ops.pending); - - return locked; + s->locked, s->ops_request); } /* @@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) } static void -handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, +handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, struct bio **return_bi) { @@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, md_wakeup_thread(conf->mddev->thread); } -/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks - * to process +/* fetch_block5 - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. 
+ * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill5 to continue */ -static int __handle_issuing_new_read_requests5(struct stripe_head *sh, - struct stripe_head_state *s, int disk_idx, int disks) +static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *failed_dev = &sh->dev[s->failed_num]; - /* don't schedule compute operations or reads on the parity block while - * a check is in flight - */ - if ((disk_idx == sh->pd_idx) && - test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) - return ~0; - /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || (s->failed && - (failed_dev->toread || (failed_dev->towrite && - !test_bit(R5_OVERWRITE, &failed_dev->flags) - ))))) { - /* 1/ We would like to get this block, possibly by computing it, - * but we might not be able to. - * - * 2/ Since parity check operations potentially make the parity - * block !uptodate it will need to be refreshed before any - * compute operations on data disks are scheduled. - * - * 3/ We hold off parity block re-reads until check operations - * have quiesced. + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed && + (failed_dev->toread || + (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { + /* We would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync */ if ((s->uptodate == disks - 1) && - (s->failed && disk_idx == s->failed_num) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + (s->failed && disk_idx == s->failed_num)) { + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; s->req_compute = 1; - sh->ops.count++; /* Careful: from this point on 'uptodate' is in the eye * of raid5_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will @@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * subsequent operation. */ s->uptodate++; - return 0; /* uptodate + compute == disks */ + return 1; /* uptodate + compute == disks */ } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; pr_debug("Reading block %d (sync=%d)\n", disk_idx, s->syncing); } } - return ~0; + return 0; } -static void handle_issuing_new_read_requests5(struct stripe_head *sh, +/** + * handle_stripe_fill5 - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill5(struct stripe_head *sh, struct stripe_head_state *s, int disks) { int i; - /* Clear completed compute operations. 
Parity recovery - * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled - * later on in this routine - */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write */ - if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !sh->reconstruct_state) for (i = disks; i--; ) - if (__handle_issuing_new_read_requests5( - sh, s, i, disks) == 0) + if (fetch_block5(sh, s, i, disks)) break; - } set_bit(STRIPE_HANDLE, &sh->state); } -static void handle_issuing_new_read_requests6(struct stripe_head *sh, +static void handle_stripe_fill6(struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, } -/* handle_completed_write_requests +/* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * never LOCKED, so we don't need to test 'failed' directly. */ -static void handle_completed_write_requests(raid5_conf_t *conf, +static void handle_stripe_clean_event(raid5_conf_t *conf, struct stripe_head *sh, int disks, struct bio **return_bi) { int i; @@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf, md_wakeup_thread(conf->mddev->thread); } -static void handle_issuing_new_write_requests5(raid5_conf_t *conf, +static void handle_stripe_dirtying5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { int rmw = 0, rcw = 0, i; @@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. 
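 * [Editor's gloss: rmw counts the reads still outstanding for a
 * read-modify-write of the parity block, rcw those for a full
 * reconstruct-write; writes are only scheduled once every block
 * needed by at least one of the two strategies is up to date, hence
 * the "rcw == 0 || rmw == 0" test below.]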
*/ - if ((s->req_compute || - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && - (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state))) - s->locked += handle_write_operations5(sh, rcw == 0, 0); + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) + schedule_reconstruction5(sh, s, rcw == 0, 0); } -static void handle_issuing_new_write_requests6(raid5_conf_t *conf, +static void handle_stripe_dirtying6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { - int canceled_check = 0; + struct r5dev *dev = NULL; set_bit(STRIPE_HANDLE, &sh->state); - /* complete a check operation */ - if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are no failures */ if (s->failed == 0) { - if (sh->ops.zero_sum_result == 0) - /* parity is correct (on disc, - * not in buffer any more) - */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - conf->mddev->resync_mismatches += - STRIPE_SECTORS; - if (test_bit( - MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - set_bit(STRIPE_OP_COMPUTE_BLK, - &sh->ops.pending); - set_bit(STRIPE_OP_MOD_REPAIR_PD, - &sh->ops.pending); - set_bit(R5_Wantcompute, - &sh->dev[sh->pd_idx].flags); - sh->ops.target = sh->pd_idx; - sh->ops.count++; - s->uptodate++; - } - } - } else - canceled_check = 1; /* STRIPE_INSYNC is not set */ - } - - /* start a new check operation if there are no failures, the stripe is - * not insync, and a repair is not in flight - */ - if (s->failed == 0 && - !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { BUG_ON(s->uptodate != disks); + sh->check_state = check_state_run; + set_bit(STRIPE_OP_CHECK, &s->ops_request); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - sh->ops.count++; s->uptodate--; + break; } - } - - /* check if we can clear a parity disk reconstruct */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - - clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - + dev = &sh->dev[s->failed_num]; + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + if (!dev) + dev = &sh->dev[sh->pd_idx]; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; - /* Wait for check parity and compute block operations to complete - * before write-back. 
If a failure occurred while the check operation - * was in flight we need to cycle this stripe through handle_stripe - * since the parity block may not be uptodate - */ - if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { - struct r5dev *dev; /* either failed parity check, or recovery is happening */ - if (s->failed == 0) - s->failed_num = sh->pd_idx; - dev = &sh->dev[s->failed_num]; BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); BUG_ON(s->uptodate != disks); set_bit(R5_LOCKED, &dev->flags); + s->locked++; set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); - s->locked++; set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* if a failure occurred during the check operation, leave + * STRIPE_INSYNC not set and let the stripe be handled again + */ + if (s->failed) + break; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + s->uptodate++; + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); } } @@ -2634,22 +2507,21 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, * */ -static void handle_stripe5(struct stripe_head *sh) +static bool handle_stripe5(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; struct bio *return_bi = NULL; struct stripe_head_state s; struct r5dev *dev; - unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; int prexor; memset(&s, 0, sizeof(s)); - pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " - "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, - sh->ops.pending, sh->ops.ack, sh->ops.complete); + pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " + "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->check_state, + sh->reconstruct_state); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); @@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh) s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ - - /* clean-up completed biofill operations */ - if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); - 
clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); - } + /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; @@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh) /* maybe we can request a biofill operation * * new wantfill requests are only permitted while - * STRIPE_OP_BIOFILL is clear + * ops_complete_biofill is guaranteed to be inactive */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && - !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) set_bit(R5_Wantfill, &dev->flags); /* now count some things */ @@ -2703,10 +2568,10 @@ static void handle_stripe5(struct stripe_head *sh) if (dev->written) s.written++; rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + if (blocked_rdev == NULL && + rdev && unlikely(test_bit(Blocked, &rdev->flags))) { blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); - break; } if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ @@ -2723,12 +2588,20 @@ static void handle_stripe5(struct stripe_head *sh) rcu_read_unlock(); if (unlikely(blocked_rdev)) { - set_bit(STRIPE_HANDLE, &sh->state); - goto unlock; + if (s.syncing || s.expanding || s.expanded || + s.to_write || s.written) { + set_bit(STRIPE_HANDLE, &sh->state); + goto unlock; + } + /* There is nothing for the blocked_rdev to block */ + rdev_dec_pending(blocked_rdev, conf->mddev); + blocked_rdev = NULL; } - if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) - sh->ops.count++; + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d\n", @@ -2738,8 +2611,7 @@ static void handle_stripe5(struct stripe_head *sh) * need to be failed */ if (s.failed > 1 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 1 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@ -2755,48 +2627,25 @@ static void handle_stripe5(struct stripe_head *sh) !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) || (s.failed == 1 && s.failed_num == sh->pd_idx))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. 
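 * [Editor's gloss: handle_stripe_fill5() below issues those reads,
 * except that when all other blocks are already up to date
 * (uptodate == disks - 1) fetch_block5() schedules a compute of the
 * missing block instead of a read.]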
@@ -2738,8 +2611,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * need to be failed
 	 */
 	if (s.failed > 1 && s.to_read+s.to_write+s.written)
-		handle_requests_to_failed_array(conf, sh, &s, disks,
-						&return_bi);
+		handle_failed_stripe(conf, sh, &s, disks, &return_bi);
 	if (s.failed > 1 && s.syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
@@ -2755,48 +2627,25 @@ static void handle_stripe5(struct stripe_head *sh)
 		       !test_bit(R5_LOCKED, &dev->flags) &&
 		       test_bit(R5_UPTODATE, &dev->flags)) ||
 		       (s.failed == 1 && s.failed_num == sh->pd_idx)))
-		handle_completed_write_requests(conf, sh, disks, &return_bi);
+		handle_stripe_clean_event(conf, sh, disks, &return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
 	if (s.to_read || s.non_overwrite ||
-	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
-	    test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
-		handle_issuing_new_read_requests5(sh, &s, disks);
+	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
+		handle_stripe_fill5(sh, &s, disks);
 
 	/* Now we check to see if any write operations have recently
 	 * completed
 	 */
-
-	/* leave prexor set until postxor is done, allows us to distinguish
-	 * a rmw from a rcw during biodrain
-	 */
 	prexor = 0;
-	if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
-	    test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
 		prexor = 1;
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
-
-		for (i = disks; i--; )
-			clear_bit(R5_Wantprexor, &sh->dev[i].flags);
-	}
-
-	/* if only POSTXOR is set then this is an 'expand' postxor */
-	if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
-	    test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
-
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
+	if (sh->reconstruct_state == reconstruct_state_drain_result ||
+	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
+		sh->reconstruct_state = reconstruct_state_idle;
 
 		/* All the 'written' buffers and the parity block are ready to
 		 * be written back to disk
@@ -2808,9 +2657,6 @@ static void handle_stripe5(struct stripe_head *sh)
 			    (i == sh->pd_idx || dev->written)) {
 				pr_debug("Writing block %d\n", i);
 				set_bit(R5_Wantwrite, &dev->flags);
-				if (!test_and_set_bit(
-				    STRIPE_OP_IO, &sh->ops.pending))
-					sh->ops.count++;
 				if (prexor)
 					continue;
 				if (!test_bit(R5_Insync, &dev->flags) ||
@@ -2832,20 +2678,18 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
 	 *    block.
 	 */
-	if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
-			  !test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
-		handle_issuing_new_write_requests5(conf, sh, &s, disks);
+	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
+		handle_stripe_dirtying5(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
 	 * data is available.  The parity check is held off while parity
 	 * dependent operations are in flight.
 	 */
-	if ((s.syncing && s.locked == 0 &&
-	     !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
-	     !test_bit(STRIPE_INSYNC, &sh->state)) ||
-	      test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
-	      test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
+	if (sh->check_state ||
+	    (s.syncing && s.locked == 0 &&
+	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
+	     !test_bit(STRIPE_INSYNC, &sh->state)))
		handle_parity_checks5(conf, sh, &s, disks);
 
 	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
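Worth noting in the hunk above: whether a write was built read-modify-write is now recovered from which reconstruct_state result the drain finished in, instead of from a separate STRIPE_OP_PREXOR bit that had to be cleared afterwards. A compact sketch of that idea (the enum names follow the kernel's reconstruct_states; the helper is illustrative):

#include <stdio.h>

enum reconstruct_states {
	reconstruct_state_idle,
	reconstruct_state_drain_run,
	reconstruct_state_drain_result,
	reconstruct_state_prexor_drain_run,
	reconstruct_state_prexor_drain_result,
	reconstruct_state_result,
};

/* Was this write built read-modify-write (prexor) or reconstruct-write?
 * The answer is encoded in which result state the drain finished in,
 * so no extra per-stripe flag needs clearing afterwards.
 */
static int finished_with_prexor(enum reconstruct_states rs)
{
	return rs == reconstruct_state_prexor_drain_result;
}

int main(void)
{
	printf("prexor=%d\n",
	       finished_with_prexor(reconstruct_state_prexor_drain_result));
	return 0;
}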
@@ -2864,52 +2708,36 @@ static void handle_stripe5(struct stripe_head *sh)
 		dev = &sh->dev[s.failed_num];
 		if (!test_bit(R5_ReWrite, &dev->flags)) {
 			set_bit(R5_Wantwrite, &dev->flags);
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
 			set_bit(R5_ReWrite, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
 			s.locked++;
 		} else {
 			/* let's read it back */
 			set_bit(R5_Wantread, &dev->flags);
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
 			set_bit(R5_LOCKED, &dev->flags);
 			s.locked++;
 		}
 	}
 
-	/* Finish postxor operations initiated by the expansion
-	 * process
-	 */
-	if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
-	    !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
-
+	/* Finish reconstruct operations initiated by the expansion process */
+	if (sh->reconstruct_state == reconstruct_state_result) {
+		sh->reconstruct_state = reconstruct_state_idle;
 		clear_bit(STRIPE_EXPANDING, &sh->state);
-
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-
 		for (i = conf->raid_disks; i--; ) {
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
-			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_LOCKED, &sh->dev[i].flags);
 			s.locked++;
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
 		}
 	}
 
 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
-	    !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+	    !sh->reconstruct_state) {
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
 		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
 			conf->raid_disks);
-		s.locked += handle_write_operations5(sh, 1, 1);
-	} else if (s.expanded &&
-		   s.locked == 0 &&
-		   !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+		schedule_reconstruction5(sh, &s, 1, 1);
+	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
@@ -2917,12 +2745,9 @@ static void handle_stripe5(struct stripe_head *sh)
 	}
 
 	if (s.expanding && s.locked == 0 &&
-	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
+	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
 		handle_stripe_expansion(conf, sh, NULL);
 
-	if (sh->ops.count)
-		pending = get_stripe_work(sh);
-
  unlock:
 	spin_unlock(&sh->lock);
 
@@ -2930,14 +2755,17 @@ static void handle_stripe5(struct stripe_head *sh)
 	if (unlikely(blocked_rdev))
 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
-	if (pending)
-		raid5_run_ops(sh, pending);
+	if (s.ops_request)
+		raid5_run_ops(sh, s.ops_request);
+
+	ops_run_io(sh, &s);
 
 	return_io(return_bi);
+
+	return blocked_rdev == NULL;
 }
 
-static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
+static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 {
 	raid6_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
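The expansion-finish hunk above also quietly fixes a bug: the old loop locked whatever device the stale `dev` pointer still referenced from earlier in the function, rather than the disk being iterated, so only one device ever got R5_LOCKED. Distilled into a toy program (types invented for the demonstration):

#include <stdio.h>

#define NDISKS 4

struct dev_model { int locked; };	/* toy stand-in for r5dev */

int main(void)
{
	struct dev_model devs[NDISKS] = { 0 };
	struct dev_model *dev = &devs[0];	/* stale pointer, as in the old code */
	int i;

	(void)dev;	/* the buggy form wrote dev->locked = 1 every pass,
			 * flagging devs[0] four times and the rest never */
	for (i = NDISKS; i--; )
		devs[i].locked = 1;		/* fixed form indexes the loop disk */

	for (i = 0; i < NDISKS; i++)
		printf("dev %d locked=%d\n", i, devs[i].locked);
	return 0;
}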
@@ -3010,10 +2838,10 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		if (dev->written)
 			s.written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+		if (blocked_rdev == NULL &&
+		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
-			break;
 		}
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
@@ -3031,9 +2859,16 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	rcu_read_unlock();
 
 	if (unlikely(blocked_rdev)) {
-		set_bit(STRIPE_HANDLE, &sh->state);
-		goto unlock;
+		if (s.syncing || s.expanding || s.expanded ||
+		    s.to_write || s.written) {
+			set_bit(STRIPE_HANDLE, &sh->state);
+			goto unlock;
+		}
+		/* There is nothing for the blocked_rdev to block */
+		rdev_dec_pending(blocked_rdev, conf->mddev);
+		blocked_rdev = NULL;
 	}
+
 	pr_debug("locked=%d uptodate=%d to_read=%d"
 		" to_write=%d failed=%d failed_num=%d,%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3042,8 +2877,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	 * might need to be failed
 	 */
 	if (s.failed > 2 && s.to_read+s.to_write+s.written)
-		handle_requests_to_failed_array(conf, sh, &s, disks,
-						&return_bi);
+		handle_failed_stripe(conf, sh, &s, disks, &return_bi);
 	if (s.failed > 2 && s.syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
@@ -3068,7 +2902,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	    ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
 			     && test_bit(R5_UPTODATE, &qdev->flags)))))
-		handle_completed_write_requests(conf, sh, disks, &return_bi);
+		handle_stripe_clean_event(conf, sh, disks, &return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
@@ -3076,11 +2910,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	 */
 	if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
 	    (s.syncing && (s.uptodate < disks)) || s.expanding)
-		handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
+		handle_stripe_fill6(sh, &s, &r6s, disks);
 
 	/* now to consider writing and what else, if anything should be read */
 	if (s.to_write)
-		handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
+		handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
@@ -3136,7 +2970,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	}
 
 	if (s.expanding && s.locked == 0 &&
-	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
+	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
 		handle_stripe_expansion(conf, sh, &r6s);
 
  unlock:
@@ -3146,76 +2980,20 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	if (unlikely(blocked_rdev))
 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
-	return_io(return_bi);
-
-	for (i=disks; i-- ;) {
-		int rw;
-		struct bio *bi;
-		mdk_rdev_t *rdev;
-		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-			rw = WRITE;
-		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-			rw = READ;
-		else
-			continue;
-
-		set_bit(STRIPE_IO_STARTED, &sh->state);
-
-		bi = &sh->dev[i].req;
-
-		bi->bi_rw = rw;
-		if (rw == WRITE)
-			bi->bi_end_io = raid5_end_write_request;
-		else
-			bi->bi_end_io = raid5_end_read_request;
-
-		rcu_read_lock();
-		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && test_bit(Faulty, &rdev->flags))
-			rdev = NULL;
-		if (rdev)
-			atomic_inc(&rdev->nr_pending);
-		rcu_read_unlock();
+	ops_run_io(sh, &s);
 
-		if (rdev) {
-			if (s.syncing || s.expanding || s.expanded)
-				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+	return_io(return_bi);
 
-			bi->bi_bdev = rdev->bdev;
-			pr_debug("for %llu schedule op %ld on disc %d\n",
-				(unsigned long long)sh->sector, bi->bi_rw, i);
-			atomic_inc(&sh->count);
-			bi->bi_sector = sh->sector + rdev->data_offset;
-			bi->bi_flags = 1 << BIO_UPTODATE;
-			bi->bi_vcnt = 1;
-			bi->bi_max_vecs = 1;
-			bi->bi_idx = 0;
-			bi->bi_io_vec = &sh->dev[i].vec;
-			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
-			bi->bi_io_vec[0].bv_offset = 0;
-			bi->bi_size = STRIPE_SIZE;
-			bi->bi_next = NULL;
-			if (rw == WRITE &&
-			    test_bit(R5_ReWrite, &sh->dev[i].flags))
-				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
-			generic_make_request(bi);
-		} else {
-			if (rw == WRITE)
-				set_bit(STRIPE_DEGRADED, &sh->state);
-			pr_debug("skip op %ld on disc %d for sector %llu\n",
-				bi->bi_rw, i, (unsigned long long)sh->sector);
-			clear_bit(R5_LOCKED, &sh->dev[i].flags);
-			set_bit(STRIPE_HANDLE, &sh->state);
-		}
-	}
+
+	return blocked_rdev == NULL;
 }
 
-static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
+/* returns true if the stripe was handled */
+static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 {
 	if (sh->raid_conf->level == 6)
-		handle_stripe6(sh, tmp_page);
+		return handle_stripe6(sh, tmp_page);
 	else
-		handle_stripe5(sh);
+		return handle_stripe5(sh);
 }
 
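The large removal above is the raid6 copy of the per-device bio dispatch loop; both personalities now share a single ops_run_io() that turns Wantread/Wantwrite flags into requests. A stripped-down model of that flag-driven dispatch (flag names and types invented for the sketch):

#include <stdio.h>

enum { WANT_READ = 1 << 0, WANT_WRITE = 1 << 1 };

struct dev_model { unsigned flags; };	/* toy stand-in for r5dev */

/* consume the want-bits and report the single rw op chosen per device */
static void run_io(struct dev_model *devs, int ndisks)
{
	for (int i = ndisks; i--; ) {
		int rw;

		if (devs[i].flags & WANT_WRITE)
			rw = 'W';
		else if (devs[i].flags & WANT_READ)
			rw = 'R';
		else
			continue;	/* nothing requested for this disk */
		devs[i].flags &= ~(WANT_READ | WANT_WRITE);
		printf("disk %d: issue %c\n", i, rw);
	}
}

int main(void)
{
	struct dev_model devs[3] = { { WANT_WRITE }, { 0 }, { WANT_READ } };

	run_io(devs, 3);
	return 0;
}

Centralising the loop means a flag-handling fix lands in one place rather than needing to be mirrored between the raid5 and raid6 paths.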
@@ -3697,9 +3475,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 		if ( rw == WRITE )
 			md_write_end(mddev);
 
-		bi->bi_end_io(bi,
-			      test_bit(BIO_UPTODATE, &bi->bi_flags)
-			        ? 0 : -EIO);
+		bio_endio(bi, 0);
 	}
 	return 0;
 }
@@ -3785,7 +3561,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 			    j == raid6_next_disk(sh->pd_idx, sh->disks))
 				continue;
 			s = compute_blocknr(sh, j);
-			if (s < (mddev->array_size<<1)) {
+			if (s < mddev->array_sectors) {
 				skipped = 1;
 				continue;
 			}
@@ -3935,7 +3711,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	clear_bit(STRIPE_INSYNC, &sh->state);
 	spin_unlock(&sh->lock);
 
-	handle_stripe(sh, NULL);
+	/* wait for any blocked device to be handled */
+	while(unlikely(!handle_stripe(sh, NULL)))
+		;
 	release_stripe(sh);
 
 	return STRIPE_SECTORS;
@@ -4002,12 +3780,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	spin_lock_irq(&conf->device_lock);
 	remaining = --raid_bio->bi_phys_segments;
 	spin_unlock_irq(&conf->device_lock);
-	if (remaining == 0) {
-
-		raid_bio->bi_end_io(raid_bio,
-			      test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
-			        ? 0 : -EIO);
-	}
+	if (remaining == 0)
+		bio_endio(raid_bio, 0);
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_stripe);
 	return handled;
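With handle_stripe() now returning false when a Blocked rdev forced it to bail, sync_request() above simply retries; the actual sleeping happens inside md_wait_for_blocked_rdev(), so the loop is not a hot spin. In miniature (the countdown stands in for the real unblock event):

#include <stdbool.h>
#include <stdio.h>

static int blocked_attempts = 2;	/* pretend the rdev unblocks after two tries */

/* returns true if the stripe was handled, false if a blocked rdev
 * forced the caller to try again (after an internal wait)
 */
static bool handle_stripe_model(void)
{
	if (blocked_attempts > 0) {
		blocked_attempts--;	/* models md_wait_for_blocked_rdev() */
		return false;
	}
	return true;
}

int main(void)
{
	int passes = 0;

	/* mirrors: while(unlikely(!handle_stripe(sh, NULL))) ; */
	while (!handle_stripe_model())
		passes++;
	printf("retried %d time(s)\n", passes);
	return 0;
}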
@@ -4058,10 +3832,8 @@ static void raid5d(mddev_t *mddev)
 
 		sh = __get_priority_stripe(conf);
 
-		if (!sh) {
-			async_tx_issue_pending_all();
+		if (!sh)
 			break;
-		}
 		spin_unlock_irq(&conf->device_lock);
 
 		handled++;
@@ -4074,6 +3846,7 @@ static void raid5d(mddev_t *mddev)
 
 	spin_unlock_irq(&conf->device_lock);
 
+	async_tx_issue_pending_all();
 	unplug_slaves(mddev);
 
 	pr_debug("--- raid5d inactive\n");
@@ -4094,6 +3867,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long new;
+	int err;
+
 	if (len >= PAGE_SIZE)
 		return -EINVAL;
 	if (!conf)
@@ -4109,7 +3884,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 		else
 			break;
 	}
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 	while (new > conf->max_nr_stripes) {
 		if (grow_one_stripe(conf))
 			conf->max_nr_stripes++;
@@ -4434,7 +4211,7 @@ static int run(mddev_t *mddev)
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 
-	mddev->array_size = mddev->size * (conf->previous_raid_disks -
+	mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
 					    conf->max_degraded);
 
 	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
@@ -4609,35 +4386,41 @@ abort:
 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	raid5_conf_t *conf = mddev->private;
-	int found = 0;
+	int err = -EEXIST;
 	int disk;
 	struct disk_info *p;
+	int first = 0;
+	int last = conf->raid_disks - 1;
 
 	if (mddev->degraded > conf->max_degraded)
 		/* no point adding a device */
-		return 0;
+		return -EINVAL;
+
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
 
 	/*
 	 * find the disk ... but prefer rdev->saved_raid_disk
 	 * if possible.
 	 */
 	if (rdev->saved_raid_disk >= 0 &&
+	    rdev->saved_raid_disk >= first &&
 	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
 		disk = rdev->saved_raid_disk;
 	else
-		disk = 0;
-	for ( ; disk < conf->raid_disks; disk++)
+		disk = first;
+	for ( ; disk <= last ; disk++)
 		if ((p=conf->disks + disk)->rdev == NULL) {
 			clear_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = disk;
-			found = 1;
+			err = 0;
 			if (rdev->saved_raid_disk != disk)
 				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
 			break;
 		}
 	print_raid5_conf(conf);
-	return found;
+	return err;
 }
 
 static int raid5_resize(mddev_t *mddev, sector_t sectors)
@@ -4652,8 +4435,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 
 	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
-	mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->array_sectors = sectors * (mddev->raid_disks
+					  - conf->max_degraded);
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
 	if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->size << 1;
@@ -4675,6 +4459,9 @@ static int raid5_check_reshape(mddev_t *mddev)
 		return -EINVAL; /* Cannot shrink array or change level yet */
 	if (mddev->delta_disks == 0)
 		return 0; /* nothing to do */
+	if (mddev->bitmap)
+		/* Cannot grow a bitmap yet */
+		return -EBUSY;
 
 	/* Can only proceed if there are plenty of stripe_heads.
 	 * We need a minimum of one full stripe, and for sensible progress
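raid5_add_disk() above switches from a found flag to an errno and, when the rdev arrives with a slot pre-assigned, collapses the search window to [first, last]; the `== 0` test in raid5_start_reshape() in the next hunk matches the new convention. The search, reduced to a sketch (a toy slot table in place of conf->disks):

#include <errno.h>
#include <stdio.h>

#define RAID_DISKS 5

/* toy slot table: nonzero means the slot already holds a device */
static int slots[RAID_DISKS];

/* preferred < 0 means "any slot"; otherwise only that slot is acceptable */
static int add_disk_model(int preferred)
{
	int first = 0, last = RAID_DISKS - 1;
	int disk;

	if (preferred >= 0)
		first = last = preferred;
	for (disk = first; disk <= last; disk++)
		if (slots[disk] == 0) {
			slots[disk] = 1;
			return 0;		/* success, like err = 0 */
		}
	return -EEXIST;				/* every candidate slot occupied */
}

int main(void)
{
	slots[2] = 1;					/* slot 2 already in use */
	printf("any slot: %d\n", add_disk_model(-1));	/* 0, takes slot 0 */
	printf("slot 2:   %d\n", add_disk_model(2));	/* -EEXIST */
	return 0;
}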
@@ -4738,7 +4525,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 	rdev_for_each(rdev, rtmp, mddev)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
-			if (raid5_add_disk(mddev, rdev)) {
+			if (raid5_add_disk(mddev, rdev) == 0) {
 				char nm[20];
 				set_bit(In_sync, &rdev->flags);
 				added_devices++;
@@ -4786,15 +4573,16 @@ static void end_reshape(raid5_conf_t *conf)
 		struct block_device *bdev;
 
 		if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-			conf->mddev->array_size = conf->mddev->size *
+			conf->mddev->array_sectors = 2 * conf->mddev->size *
 				(conf->raid_disks - conf->max_degraded);
-			set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+			set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
 			conf->mddev->changed = 1;
 
 			bdev = bdget_disk(conf->mddev->gendisk, 0);
 			if (bdev) {
 				mutex_lock(&bdev->bd_inode->i_mutex);
-				i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
+				i_size_write(bdev->bd_inode,
+					     (loff_t)conf->mddev->array_sectors << 9);
 				mutex_unlock(&bdev->bd_inode->i_mutex);
 				bdput(bdev);
 			}
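Running through this file, mddev->array_size (in KiB) gives way to mddev->array_sectors (in 512-byte sectors), which is why the `<< 1` disappears at each producer and the i_size_write() shift drops from 10 to 9. A standalone sanity check of the arithmetic (sample numbers, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long long size_kib = 1048576;		/* per-device data size, KiB */
	int data_disks = 3;				/* raid_disks - max_degraded */

	/* old: array_size in KiB; new: array_sectors in 512-byte sectors */
	unsigned long long array_kib = size_kib * data_disks;
	unsigned long long array_sectors = 2 * size_kib * data_disks;

	/* both forms must name the same byte count */
	printf("bytes via KiB:     %llu\n", array_kib << 10);
	printf("bytes via sectors: %llu\n", array_sectors << 9);
	printf("set_capacity arg:  %llu (was array_size << 1)\n", array_sectors);
	return 0;
}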