diff options
| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-29 08:47:36 -0700 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-29 08:47:36 -0700 |
| commit | d8d048f69a618c531575cb1f398a7186f0532ef2 (patch) | |
| tree | cb8d6ee7a3cb2aea53e5e95de0ca689cc69411a2 /drivers | |
| parent | 53113b06e48c6c38f7612c1f8043b8a0d2adf72b (diff) | |
| parent | f3ac8bf7ce1c5abd763ea762e95d1cdcf7799372 (diff) | |
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md:
md: tidy up device searches in read_balance.
md/raid1: fix some typos in comments.
md/raid1: discard unused variable.
md: unplug writes to external bitmaps.
md: use separate bio pool for each md device.
md: change type of first arg to sync_page_io.
md/raid1: perform mem allocation before disabling writes during resync.
md: use bio_kmalloc rather than bio_alloc when failure is acceptable.
md: Fix possible deadlock with multiple mempool allocations.
md: fix and update workqueue usage
md: use sector_t in bitmap_get_counter
md: remove md_mutex locking.
md: Fix regression with raid1 arrays without persistent metadata.
Diffstat (limited to 'drivers')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | drivers/md/bitmap.c | 30 |
| -rw-r--r-- | drivers/md/bitmap.h | 4 |
| -rw-r--r-- | drivers/md/faulty.c | 2 |
| -rw-r--r-- | drivers/md/md.c | 162 |
| -rw-r--r-- | drivers/md/md.h | 8 |
| -rw-r--r-- | drivers/md/raid1.c | 224 |
| -rw-r--r-- | drivers/md/raid1.h | 2 |
| -rw-r--r-- | drivers/md/raid10.c | 42 |
| -rw-r--r-- | drivers/md/raid5.c | 6 |
9 files changed, 275 insertions, 205 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index e4fb58db545..5a1ffe3527a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -212,7 +212,7 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, target = rdev->sb_start + offset + index * (PAGE_SIZE/512); - if (sync_page_io(rdev->bdev, target, + if (sync_page_io(rdev, target, roundup(size, bdev_logical_block_size(rdev->bdev)), page, READ)) { page->index = index; @@ -343,7 +343,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) atomic_inc(&bitmap->pending_writes); set_buffer_locked(bh); set_buffer_mapped(bh); - submit_bh(WRITE, bh); + submit_bh(WRITE | REQ_UNPLUG | REQ_SYNC, bh); bh = bh->b_this_page; } @@ -1101,7 +1101,7 @@ static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) bitmap_checkfree(bitmap, page); } static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, - sector_t offset, int *blocks, + sector_t offset, sector_t *blocks, int create); /* @@ -1115,7 +1115,7 @@ void bitmap_daemon_work(mddev_t *mddev) unsigned long j; unsigned long flags; struct page *page = NULL, *lastpage = NULL; - int blocks; + sector_t blocks; void *paddr; struct dm_dirty_log *log = mddev->bitmap_info.log; @@ -1258,7 +1258,7 @@ void bitmap_daemon_work(mddev_t *mddev) } static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, - sector_t offset, int *blocks, + sector_t offset, sector_t *blocks, int create) __releases(bitmap->lock) __acquires(bitmap->lock) @@ -1316,7 +1316,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect } while (sectors) { - int blocks; + sector_t blocks; bitmap_counter_t *bmc; spin_lock_irq(&bitmap->lock); @@ -1381,7 +1381,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto success = 0; while (sectors) { - int blocks; + sector_t blocks; unsigned long flags; bitmap_counter_t *bmc; @@ -1423,7 +1423,7 @@ void bitmap_endwrite(struct bitmap *bitmap, 
sector_t offset, unsigned long secto } EXPORT_SYMBOL(bitmap_endwrite); -static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, +static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded) { bitmap_counter_t *bmc; @@ -1452,7 +1452,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *bloc return rv; } -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded) { /* bitmap_start_sync must always report on multiples of whole @@ -1463,7 +1463,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, * Return the 'or' of the result. */ int rv = 0; - int blocks1; + sector_t blocks1; *blocks = 0; while (*blocks < (PAGE_SIZE>>9)) { @@ -1476,7 +1476,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, } EXPORT_SYMBOL(bitmap_start_sync); -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) { bitmap_counter_t *bmc; unsigned long flags; @@ -1515,7 +1515,7 @@ void bitmap_close_sync(struct bitmap *bitmap) * RESYNC bit wherever it is still on */ sector_t sector = 0; - int blocks; + sector_t blocks; if (!bitmap) return; while (sector < bitmap->mddev->resync_max_sectors) { @@ -1528,7 +1528,7 @@ EXPORT_SYMBOL(bitmap_close_sync); void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) { sector_t s = 0; - int blocks; + sector_t blocks; if (!bitmap) return; @@ -1562,7 +1562,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n * be 0 at this point */ - int secs; + sector_t secs; bitmap_counter_t *bmc; spin_lock_irq(&bitmap->lock); bmc = bitmap_get_counter(bitmap, offset, &secs, 1); @@ -1790,7 +1790,7 @@ int bitmap_load(mddev_t *mddev) * All chunks should be clean, 
but some might need_sync. */ while (sector < mddev->resync_max_sectors) { - int blocks; + sector_t blocks; bitmap_start_sync(bitmap, sector, &blocks, 0); sector += blocks; } diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index e872a7bad6b..931a7a7c379 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -271,8 +271,8 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind); void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int success, int behind); -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 1a898788461..339fdc67075 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -210,7 +210,7 @@ static int make_request(mddev_t *mddev, struct bio *bio) } } if (failit) { - struct bio *b = bio_clone(bio, GFP_NOIO); + struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); b->bi_bdev = conf->rdev->bdev; b->bi_private = bio; b->bi_end_io = faulty_fail; diff --git a/drivers/md/md.c b/drivers/md/md.c index 225815197a3..4e957f3140a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -57,8 +57,6 @@ #define DEBUG 0 #define dprintk(x...) 
((void)(DEBUG && printk(x))) -static DEFINE_MUTEX(md_mutex); - #ifndef MODULE static void autostart_arrays(int part); #endif @@ -69,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock); static void md_print_devices(void); static DECLARE_WAIT_QUEUE_HEAD(resync_wait); +static struct workqueue_struct *md_wq; +static struct workqueue_struct *md_misc_wq; #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } @@ -149,6 +149,72 @@ static const struct block_device_operations md_fops; static int start_readonly; +/* bio_clone_mddev + * like bio_clone, but with a local bio set + */ + +static void mddev_bio_destructor(struct bio *bio) +{ + mddev_t *mddev, **mddevp; + + mddevp = (void*)bio; + mddev = mddevp[-1]; + + bio_free(bio, mddev->bio_set); +} + +struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, + mddev_t *mddev) +{ + struct bio *b; + mddev_t **mddevp; + + if (!mddev || !mddev->bio_set) + return bio_alloc(gfp_mask, nr_iovecs); + + b = bio_alloc_bioset(gfp_mask, nr_iovecs, + mddev->bio_set); + if (!b) + return NULL; + mddevp = (void*)b; + mddevp[-1] = mddev; + b->bi_destructor = mddev_bio_destructor; + return b; +} +EXPORT_SYMBOL_GPL(bio_alloc_mddev); + +struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, + mddev_t *mddev) +{ + struct bio *b; + mddev_t **mddevp; + + if (!mddev || !mddev->bio_set) + return bio_clone(bio, gfp_mask); + + b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, + mddev->bio_set); + if (!b) + return NULL; + mddevp = (void*)b; + mddevp[-1] = mddev; + b->bi_destructor = mddev_bio_destructor; + __bio_clone(b, bio); + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); + + if (ret < 0) { + bio_put(b); + return NULL; + } + } + + return b; +} +EXPORT_SYMBOL_GPL(bio_clone_mddev); + /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -300,7 +366,7 @@ static void md_end_flush(struct 
bio *bio, int err) if (atomic_dec_and_test(&mddev->flush_pending)) { /* The pre-request flush has finished */ - schedule_work(&mddev->flush_work); + queue_work(md_wq, &mddev->flush_work); } bio_put(bio); } @@ -321,7 +387,7 @@ static void submit_flushes(mddev_t *mddev) atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - bi = bio_alloc(GFP_KERNEL, 0); + bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bi->bi_bdev = rdev->bdev; @@ -369,7 +435,7 @@ void md_flush_request(mddev_t *mddev, struct bio *bio) submit_flushes(mddev); if (atomic_dec_and_test(&mddev->flush_pending)) - schedule_work(&mddev->flush_work); + queue_work(md_wq, &mddev->flush_work); } EXPORT_SYMBOL(md_flush_request); @@ -428,6 +494,8 @@ static void mddev_delayed_delete(struct work_struct *ws); static void mddev_put(mddev_t *mddev) { + struct bio_set *bs = NULL; + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; if (!mddev->raid_disks && list_empty(&mddev->disks) && @@ -435,19 +503,22 @@ static void mddev_put(mddev_t *mddev) /* Array is not configured at all, and not held active, * so destroy it */ list_del(&mddev->all_mddevs); + bs = mddev->bio_set; + mddev->bio_set = NULL; if (mddev->gendisk) { - /* we did a probe so need to clean up. - * Call schedule_work inside the spinlock - * so that flush_scheduled_work() after - * mddev_find will succeed in waiting for the - * work to be done. + /* We did a probe so need to clean up. Call + * queue_work inside the spinlock so that + * flush_workqueue() after mddev_find will + * succeed in waiting for the work to be done. */ INIT_WORK(&mddev->del_work, mddev_delayed_delete); - schedule_work(&mddev->del_work); + queue_work(md_misc_wq, &mddev->del_work); } else kfree(mddev); } spin_unlock(&all_mddevs_lock); + if (bs) + bioset_free(bs); } void mddev_init(mddev_t *mddev) @@ -691,7 +762,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * if zero is reached. 
* If an error occurred, call md_error */ - struct bio *bio = bio_alloc(GFP_NOIO, 1); + struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); bio->bi_bdev = rdev->bdev; bio->bi_sector = sector; @@ -722,16 +793,16 @@ static void bi_complete(struct bio *bio, int error) complete((struct completion*)bio->bi_private); } -int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw) +int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, + struct page *page, int rw) { - struct bio *bio = bio_alloc(GFP_NOIO, 1); + struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); struct completion event; int ret; rw |= REQ_SYNC | REQ_UNPLUG; - bio->bi_bdev = bdev; + bio->bi_bdev = rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); init_completion(&event); @@ -757,7 +828,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) return 0; - if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) + if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ)) goto fail; rdev->sb_loaded = 1; return 0; @@ -1850,7 +1921,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) synchronize_rcu(); INIT_WORK(&rdev->del_work, md_delayed_delete); kobject_get(&rdev->kobj); - schedule_work(&rdev->del_work); + queue_work(md_misc_wq, &rdev->del_work); } /* @@ -2108,6 +2179,8 @@ repeat: if (!mddev->persistent) { clear_bit(MD_CHANGE_CLEAN, &mddev->flags); clear_bit(MD_CHANGE_DEVS, &mddev->flags); + if (!mddev->external) + clear_bit(MD_CHANGE_PENDING, &mddev->flags); wake_up(&mddev->sb_wait); return; } @@ -4192,10 +4265,10 @@ static int md_alloc(dev_t dev, char *name) shift = partitioned ? MdpMinorShift : 0; unit = MINOR(mddev->unit) >> shift; - /* wait for any previous instance if this device - * to be completed removed (mddev_delayed_delete). + /* wait for any previous instance of this device to be + * completely removed (mddev_delayed_delete). 
*/ - flush_scheduled_work(); + flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); error = -EEXIST; @@ -4378,6 +4451,9 @@ int md_run(mddev_t *mddev) sysfs_notify_dirent_safe(rdev->sysfs_state); } + if (mddev->bio_set == NULL) + mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); + spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { @@ -5885,16 +5961,14 @@ static int md_open(struct block_device *bdev, fmode_t mode) mddev_t *mddev = mddev_find(bdev->bd_dev); int err; - mutex_lock(&md_mutex); if (mddev->gendisk != bdev->bd_disk) { /* we are racing with mddev_put which is discarding this * bd_disk. */ mddev_put(mddev); /* Wait until bdev->bd_disk is definitely gone */ - flush_scheduled_work(); + flush_workqueue(md_misc_wq); /* Then retry the open from the top */ - mutex_unlock(&md_mutex); return -ERESTARTSYS; } BUG_ON(mddev != bdev->bd_disk->private_data); @@ -5908,7 +5982,6 @@ static int md_open(struct block_device *bdev, fmode_t mode) check_disk_size_change(mddev->gendisk, bdev); out: - mutex_unlock(&md_mutex); return err; } @@ -5917,10 +5990,8 @@ static int md_release(struct gendisk *disk, fmode_t mode) mddev_t *mddev = disk->private_data; BUG_ON(!mddev); - mutex_lock(&md_mutex); atomic_dec(&mddev->openers); mddev_put(mddev); - mutex_unlock(&md_mutex); return 0; } @@ -6052,7 +6123,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); if (mddev->event_work.func) - schedule_work(&mddev->event_work); + queue_work(md_misc_wq, &mddev->event_work); md_new_event_inintr(mddev); } @@ -7212,12 +7283,23 @@ static void md_geninit(void) static int __init md_init(void) { - if (register_blkdev(MD_MAJOR, "md")) - return -1; - if ((mdp_major=register_blkdev(0, "mdp"))<=0) { - unregister_blkdev(MD_MAJOR, "md"); - return -1; - } + int ret = -ENOMEM; + + md_wq = alloc_workqueue("md", WQ_RESCUER, 0); + if (!md_wq) + goto err_wq; + + 
md_misc_wq = alloc_workqueue("md_misc", 0, 0); + if (!md_misc_wq) + goto err_misc_wq; + + if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) + goto err_md; + + if ((ret = register_blkdev(0, "mdp")) < 0) + goto err_mdp; + mdp_major = ret; + blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, @@ -7228,8 +7310,16 @@ static int __init md_init(void) md_geninit(); return 0; -} +err_mdp: + unregister_blkdev(MD_MAJOR, "md"); +err_md: + destroy_workqueue(md_misc_wq); +err_misc_wq: + destroy_workqueue(md_wq); +err_wq: + return ret; +} #ifndef MODULE @@ -7316,6 +7406,8 @@ static __exit void md_exit(void) export_array(mddev); mddev->hold_active = 0; } + destroy_workqueue(md_misc_wq); + destroy_workqueue(md_wq); } subsys_initcall(md_init); diff --git a/drivers/md/md.h b/drivers/md/md.h index 112a2c32db0..d05bab55df4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -331,6 +331,8 @@ struct mddev_s struct attribute_group *to_remove; struct plug_handle *plug; /* if used by personality */ + struct bio_set *bio_set; + /* Generic flush handling. * The last to finish preflush schedules a worker to submit * the rest of the request (without the REQ_FLUSH flag). 
@@ -495,7 +497,7 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, +extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, struct page *page, int rw); extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); @@ -517,4 +519,8 @@ extern void md_rdev_init(mdk_rdev_t *rdev); extern void mddev_suspend(mddev_t *mddev); extern void mddev_resume(mddev_t *mddev); +extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, + mddev_t *mddev); +extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, + mddev_t *mddev); #endif /* _MD_MD_H */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 378a25894c5..45f8324196e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -100,7 +100,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) * Allocate bios : 1 for reading, n-1 for writing */ for (j = pi->raid_disks ; j-- ; ) { - bio = bio_alloc(gfp_flags, RESYNC_PAGES); + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); if (!bio) goto out_free_bio; r1_bio->bios[j] = bio; @@ -306,6 +306,28 @@ static void raid1_end_read_request(struct bio *bio, int error) rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } +static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv, + int behind) +{ + if (atomic_dec_and_test(&r1_bio->remaining)) + { + /* it really is the end of this request */ + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = vcnt; + while (i--) + safe_put_page(bv[i].bv_page); + } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); + 
md_write_end(r1_bio->mddev); + raid_end_bio_io(r1_bio); + } +} + static void raid1_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -373,21 +395,7 @@ static void raid1_end_write_request(struct bio *bio, int error) * Let's see if all mirrored write operations have finished * already. */ - if (atomic_dec_and_test(&r1_bio->remaining)) { - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = bio->bi_vcnt; - while (i--) - safe_put_page(bio->bi_io_vec[i].bv_page); - } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); - md_write_end(r1_bio->mddev); - raid_end_bio_io(r1_bio); - } + r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind); if (to_put) bio_put(to_put); @@ -411,11 +419,13 @@ static void raid1_end_write_request(struct bio *bio, int error) static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const sector_t this_sector = r1_bio->sector; - int new_disk = conf->last_used, disk = new_disk; - int wonly_disk = -1; const int sectors = r1_bio->sectors; + int new_disk = -1; + int start_disk; + int i; sector_t new_distance, current_distance; mdk_rdev_t *rdev; + int choose_first; rcu_read_lock(); /* @@ -426,54 +436,33 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) retry: if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { - /* Choose the first operational device, for consistancy */ - new_disk = 0; - - for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); - r1_bio->bios[new_disk] == IO_BLOCKED || - !rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(WriteMostly, &rdev->flags); - rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { - - if (rdev && test_bit(In_sync, &rdev->flags) && - r1_bio->bios[new_disk] != IO_BLOCKED) - wonly_disk = new_disk; - - if 
(new_disk == conf->raid_disks - 1) { - new_disk = wonly_disk; - break; - } - } - goto rb_out; + choose_first = 1; + start_disk = 0; + } else { + choose_first = 0; + start_disk = conf->last_used; } - /* make sure the disk is operational */ - for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); - r1_bio->bios[new_disk] == IO_BLOCKED || - !rdev || !test_bit(In_sync, &rdev->flags) || - test_bit(WriteMostly, &rdev->flags); - rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { - - if (rdev && test_bit(In_sync, &rdev->flags) && - r1_bio->bios[new_disk] != IO_BLOCKED) - wonly_disk = new_disk; - - if (new_disk <= 0) - new_disk = conf->raid_disks; - new_disk--; - if (new_disk == disk) { - new_disk = wonly_disk; + for (i = 0 ; i < conf->raid_disks ; i++) { + int disk = start_disk + i; + if (disk >= conf->raid_disks) + disk -= conf->raid_disks; + + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (r1_bio->bios[disk] == IO_BLOCKED + || rdev == NULL + || !test_bit(In_sync, &rdev->flags)) + continue; + + new_disk = disk; + if (!test_bit(WriteMostly, &rdev->flags)) break; - } } - if (new_disk < 0) + if (new_disk < 0 || choose_first) goto rb_out; - disk = new_disk; - /* now disk == new_disk == starting point for search */ - /* * Don't change to another disk for sequential reads: */ @@ -482,20 +471,21 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) if (this_sector == conf->mirrors[new_disk].head_position) goto rb_out; - current_distance = abs(this_sector - conf->mirrors[disk].head_position); - - /* Find the disk whose head is closest */ + current_distance = abs(this_sector + - conf->mirrors[new_disk].head_position); - do { - if (disk <= 0) - disk = conf->raid_disks; - disk--; + /* look for a better disk - i.e. 
head is closer */ + start_disk = new_disk; + for (i = 1; i < conf->raid_disks; i++) { + int disk = start_disk + 1; + if (disk >= conf->raid_disks) + disk -= conf->raid_disks; rdev = rcu_dereference(conf->mirrors[disk].rdev); - - if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags) || - test_bit(WriteMostly, &rdev->flags)) + if (r1_bio->bios[disk] == IO_BLOCKED + || rdev == NULL + || !test_bit(In_sync, &rdev->flags) + || test_bit(WriteMostly, &rdev->flags)) continue; if (!atomic_read(&rdev->nr_pending)) { @@ -507,11 +497,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) current_distance = new_distance; new_disk = disk; } - } while (disk != conf->last_used); + } rb_out: - - if (new_disk >= 0) { rdev = rcu_dereference(conf->mirrors[new_disk].rdev); if (!rdev) @@ -658,7 +646,7 @@ static void raise_barrier(conf_t *conf) /* block any new IO from starting */ conf->barrier++; - /* No wait for all pending IO to complete */ + /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, conf->resync_lock, @@ -735,23 +723,26 @@ static void unfreeze_array(conf_t *conf) } -/* duplicate the data pages for behind I/O */ -static struct page **alloc_behind_pages(struct bio *bio) +/* duplicate the data pages for behind I/O + * We return a list of bio_vec rather than just page pointers + * as it makes freeing easier + */ +static struct bio_vec *alloc_behind_pages(struct bio *bio) { int i; struct bio_vec *bvec; - struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), + struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), GFP_NOIO); if (unlikely(!pages)) goto do_sync_io; bio_for_each_segment(bvec, bio, i) { - pages[i] = alloc_page(GFP_NOIO); - if (unlikely(!pages[i])) + pages[i].bv_page = alloc_page(GFP_NOIO); + if (unlikely(!pages[i].bv_page)) goto do_sync_io; - memcpy(kmap(pages[i]) + bvec->bv_offset, + memcpy(kmap(pages[i].bv_page) + 
bvec->bv_offset, kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); - kunmap(pages[i]); + kunmap(pages[i].bv_page); kunmap(bvec->bv_page); } @@ -759,8 +750,8 @@ static struct page **alloc_behind_pages(struct bio *bio) do_sync_io: if (pages) - for (i = 0; i < bio->bi_vcnt && pages[i]; i++) - put_page(pages[i]); + for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++) + put_page(pages[i].bv_page); kfree(pages); PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); return NULL; @@ -775,8 +766,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) int i, targets = 0, disks; struct bitmap *bitmap; unsigned long flags; - struct bio_list bl; - struct page **behind_pages = NULL; + struct bio_vec *behind_pages = NULL; const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); @@ -851,7 +841,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) } r1_bio->read_disk = rdisk; - read_bio = bio_clone(bio, GFP_NOIO); + read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); r1_bio->bios[rdisk] = read_bio; @@ -873,13 +863,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) * bios[x] to bio */ disks = conf->raid_disks; -#if 0 - { static int first=1; - if (first) printk("First Write sector %llu disks %d\n", - (unsigned long long)r1_bio->sector, disks); - first = 0; - } -#endif retry_write: blocked_rdev = NULL; rcu_read_lock(); @@ -937,16 +920,17 @@ static int make_request(mddev_t *mddev, struct bio * bio) (behind_pages = alloc_behind_pages(bio)) != NULL) set_bit(R1BIO_BehindIO, &r1_bio->state); - atomic_set(&r1_bio->remaining, 0); + atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); - bio_list_init(&bl); + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); for (i = 0; i < disks; i++) { struct bio *mbio; if (!r1_bio->bios[i]) continue; - mbio = bio_clone(bio, 
GFP_NOIO); + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); r1_bio->bios[i] = mbio; mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; @@ -963,39 +947,29 @@ static int make_request(mddev_t *mddev, struct bio * bio) * we clear any unused pointer in the io_vec, rather * than leave them unchanged. This is important * because when we come to free the pages, we won't - * know the originial bi_idx, so we just free + * know the original bi_idx, so we just free * them all */ __bio_for_each_segment(bvec, mbio, j, 0) - bvec->bv_page = behind_pages[j]; + bvec->bv_page = behind_pages[j].bv_page; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } atomic_inc(&r1_bio->remaining); - - bio_list_add(&bl, mbio); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + blk_plug_device(mddev->queue); + spin_unlock_irqrestore(&conf->device_lock, flags); } + r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL); kfree(behind_pages); /* the behind pages are attached to the bios now */ - bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, - test_bit(R1BIO_BehindIO, &r1_bio->state)); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_merge(&conf->pending_bio_list, &bl); - bio_list_init(&bl); - - blk_plug_device(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - - /* In case raid1d snuck into freeze_array */ + /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); if (do_sync) md_wakeup_thread(mddev->thread); -#if 0 - while ((bio = bio_list_pop(&bl)) != NULL) - generic_make_request(bio); -#endif return 0; } @@ -1183,7 +1157,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } - /* Only remove non-faulty devices is recovery + /* Only remove non-faulty devices if recovery * is not possible. 
*/ if (!test_bit(Faulty, &rdev->flags) && @@ -1245,7 +1219,7 @@ static void end_sync_write(struct bio *bio, int error) break; } if (!uptodate) { - int sync_blocks = 0; + sector_t sync_blocks = 0; sector_t s = r1_bio->sector; long sectors_to_go = r1_bio->sectors; /* make sure these bits doesn't get cleared. */ @@ -1388,7 +1362,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) * active, and resync is currently active */ rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev->bdev, + if (sync_page_io(rdev, sect + rdev->data_offset, s<<9, bio->bi_io_vec[idx].bv_page, @@ -1414,7 +1388,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) continue; rdev = conf->mirrors[d].rdev; atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev->bdev, + if (sync_page_io(rdev, sect + rdev->data_offset, s<<9, bio->bi_io_vec[idx].bv_page, @@ -1429,7 +1403,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev->bdev, + if (sync_page_io(rdev, sect + rdev->data_offset, s<<9, bio->bi_io_vec[idx].bv_page, @@ -1513,7 +1487,7 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev->bdev, + sync_page_io(rdev, sect + rdev->data_offset, s<<9, conf->tmppage, READ)) @@ -1539,7 +1513,7 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, + if (sync_page_io(rdev, sect + rdev->data_offset, s<<9, conf->tmppage, WRITE) == 0) @@ -1556,7 +1530,7 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, + if (sync_page_io(rdev, sect + rdev->data_offset, s<<9, conf->tmppage, READ) == 0) @@ -1646,7 +1620,8 @@ static void 
raid1d(mddev_t *mddev) mddev->ro ? IO_BLOCKED : NULL; r1_bio->read_disk = disk; bio_put(bio); - bio = bio_clone(r1_bio->master_bio, GFP_NOIO); + bio = bio_clone_mddev(r1_bio->master_bio, + GFP_NOIO, mddev); r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; if (printk_ratelimit()) @@ -1705,7 +1680,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i int i; int wonly = -1; int write_targets = 0, read_targets = 0; - int sync_blocks; + sector_t sync_blocks; int still_degraded = 0; if (!conf->r1buf_pool) @@ -1755,11 +1730,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i msleep_interruptible(1000); bitmap_cond_end_sync(mddev->bitmap, sector_nr); + r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); raise_barrier(conf); conf->next_resync = sector_nr; - r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); rcu_read_lock(); /* * If we get a correctably read error during resync or recovery, @@ -1971,7 +1946,6 @@ static conf_t *setup_conf(mddev_t *mddev) init_waitqueue_head(&conf->wait_barrier); bio_list_init(&conf->pending_bio_list); - bio_list_init(&conf->flushing_bio_list); conf->last_used = -1; for (i = 0; i < conf->raid_disks; i++) { diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index adf8cfd7331..cbfdf1a6acd 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -35,8 +35,6 @@ struct r1_private_data_s { struct list_head retry_list; /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; - /* queue of writes that have been unplugged */ - struct bio_list flushing_bio_list; /* for use when syncing mirrors: */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f0d082f749b..c67aa54694a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -120,7 +120,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) * Allocate bios. 
*/ for (j = nalloc ; j-- ; ) { - bio = bio_alloc(gfp_flags, RESYNC_PAGES); + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); if (!bio) goto out_free_bio; r10_bio->devs[j].bio = bio; @@ -801,7 +801,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); - struct bio_list bl; unsigned long flags; mdk_rdev_t *blocked_rdev; @@ -890,7 +889,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) } mirror = conf->mirrors + disk; - read_bio = bio_clone(bio, GFP_NOIO); + read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); r10_bio->devs[slot].bio = read_bio; @@ -950,16 +949,16 @@ static int make_request(mddev_t *mddev, struct bio * bio) goto retry_write; } - atomic_set(&r10_bio->remaining, 0); + atomic_set(&r10_bio->remaining, 1); + bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); - bio_list_init(&bl); for (i = 0; i < conf->copies; i++) { struct bio *mbio; int d = r10_bio->devs[i].devnum; if (!r10_bio->devs[i].bio) continue; - mbio = bio_clone(bio, GFP_NOIO); + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); r10_bio->devs[i].bio = mbio; mbio->bi_sector = r10_bio->devs[i].addr+ @@ -970,22 +969,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); - bio_list_add(&bl, mbio); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + blk_plug_device(mddev->queue); + spin_unlock_irqrestore(&conf->device_lock, flags); } - if (unlikely(!atomic_read(&r10_bio->remaining))) { - /* the array is dead */ + if (atomic_dec_and_test(&r10_bio->remaining)) { + /* This matches the end of raid10_end_write_request() */ + bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, + r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + 0); md_write_end(mddev); raid_end_bio_io(r10_bio); - return 0; } - 
bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_merge(&conf->pending_bio_list, &bl); - blk_plug_device(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); @@ -1558,7 +1557,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) test_bit(In_sync, &rdev->flags)) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - success = sync_page_io(rdev->bdev, + success = sync_page_io(rdev, r10_bio->devs[sl].addr + sect + rdev->data_offset, s<<9, @@ -1597,7 +1596,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev->bdev, + if (sync_page_io(rdev, r10_bio->devs[sl].addr + sect + rdev->data_offset, s<<9, conf->tmppage, WRITE) @@ -1634,7 +1633,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) char b[BDEVNAME_SIZE]; atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - if (sync_page_io(rdev->bdev, + if (sync_page_io(rdev, r10_bio->devs[sl].addr + sect + rdev->data_offset, s<<9, conf->tmppage, @@ -1747,7 +1746,8 @@ static void raid10d(mddev_t *mddev) mdname(mddev), bdevname(rdev->bdev,b), (unsigned long long)r10_bio->sector); - bio = bio_clone(r10_bio->master_bio, GFP_NOIO); + bio = bio_clone_mddev(r10_bio->master_bio, + GFP_NOIO, mddev); r10_bio->devs[r10_bio->read_slot].bio = bio; bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset; @@ -1820,7 +1820,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i int disk; int i; int max_sync; - int sync_blocks; + sector_t sync_blocks; sector_t sectors_skipped = 0; int chunks_skipped = 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 31140d1259d..dc574f303f8 100644 --- a/drivers/md/raid5.c +++ 
b/drivers/md/raid5.c @@ -3876,9 +3876,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) return 0; } /* - * use bio_clone to make a copy of the bio + * use bio_clone_mddev to make a copy of the bio */ - align_bi = bio_clone(raid_bio, GFP_NOIO); + align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); if (!align_bi) return 0; /* @@ -4360,7 +4360,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski raid5_conf_t *conf = mddev->private; struct stripe_head *sh; sector_t max_sector = mddev->dev_sectors; - int sync_blocks; + sector_t sync_blocks; int still_degraded = 0; int i; |