diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bitmap.c | 17 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 1 | ||||
-rw-r--r-- | drivers/md/linear.c | 10 | ||||
-rw-r--r-- | drivers/md/md.c | 87 | ||||
-rw-r--r-- | drivers/md/multipath.c | 3 | ||||
-rw-r--r-- | drivers/md/raid0.c | 10 | ||||
-rw-r--r-- | drivers/md/raid1.c | 29 | ||||
-rw-r--r-- | drivers/md/raid10.c | 31 | ||||
-rw-r--r-- | drivers/md/raid5.c | 78 |
9 files changed, 185 insertions, 81 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index c14dacdacfa..b26927ce889 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -203,17 +203,6 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) * bitmap file handling - read and write the bitmap file and its superblock */ -/* copy the pathname of a file to a buffer */ -char *file_path(struct file *file, char *buf, int count) -{ - if (!buf) - return NULL; - - buf = d_path(&file->f_path, buf, count); - - return IS_ERR(buf) ? NULL : buf; -} - /* * basic page I/O operations */ @@ -721,11 +710,13 @@ static void bitmap_file_kick(struct bitmap *bitmap) if (bitmap->file) { path = kmalloc(PAGE_SIZE, GFP_KERNEL); if (path) - ptr = file_path(bitmap->file, path, PAGE_SIZE); + ptr = d_path(&bitmap->file->f_path, path, + PAGE_SIZE); + printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n", - bmname(bitmap), ptr ? ptr : ""); + bmname(bitmap), IS_ERR(ptr) ? "" : ptr); kfree(path); } else diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 835def11419..ab6a61db63c 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -432,6 +432,7 @@ static int crypt_convert(struct crypt_config *cc, case 0: atomic_dec(&ctx->pending); ctx->sector++; + cond_resched(); continue; /* error */ diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 10748240cb2..6a866d7c8ae 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -50,17 +50,19 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) /** * linear_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int linear_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; dev_info_t *dev0; - unsigned long maxsectors, bio_sectors = bio->bi_size >> 9; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); dev0 = which_dev(mddev, sector); maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); diff --git a/drivers/md/md.c b/drivers/md/md.c index 83eb78b0013..2580ac1b9b0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -74,6 +74,8 @@ static DEFINE_SPINLOCK(pers_lock); static void md_print_devices(void); +static DECLARE_WAIT_QUEUE_HEAD(resync_wait); + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } /* @@ -274,6 +276,7 @@ static mddev_t * mddev_find(dev_t unit) atomic_set(&new->active, 1); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); + init_waitqueue_head(&new->recovery_wait); new->reshape_position = MaxSector; new->resync_max = MaxSector; new->level = LEVEL_NONE; @@ -3013,6 +3016,36 @@ degraded_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); static ssize_t +sync_force_parallel_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->parallel_resync); +} + +static ssize_t +sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) +{ + long n; + + if (strict_strtol(buf, 10, &n)) + return -EINVAL; + + if (n != 0 && n != 1) + return -EINVAL; + + mddev->parallel_resync = n; + + if (mddev->sync_thread) + wake_up(&resync_wait); + + return len; +} + +/* force parallel resync, even with shared block devices */ +static struct md_sysfs_entry md_sync_force_parallel = +__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, + sync_force_parallel_show, sync_force_parallel_store); + +static ssize_t sync_speed_show(mddev_t *mddev, char *page) { unsigned long resync, dt, db; @@ -3187,6 +3220,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_min.attr, &md_sync_max.attr, &md_sync_speed.attr, + &md_sync_force_parallel.attr, &md_sync_completed.attr, &md_max_sync.attr, &md_suspend_lo.attr, @@ -3691,6 +3725,8 @@ static int do_md_stop(mddev_t * mddev, int mode) module_put(mddev->pers->owner); mddev->pers = NULL; + /* tell userspace to handle 'inactive' */ + sysfs_notify(&mddev->kobj, NULL, "array_state"); set_capacity(disk, 0); mddev->changed = 1; @@ -3861,8 +3897,10 @@ static void autorun_devices(int part) md_probe(dev, NULL, NULL); mddev = mddev_find(dev); - if (!mddev) { - printk(KERN_ERR + if (!mddev || !mddev->gendisk) { + if (mddev) + mddev_put(mddev); + printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); break; } @@ -3987,8 +4025,8 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) if (!buf) goto out; - ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); - if (!ptr) + ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); + if (IS_ERR(ptr)) goto out; strcpy(file->pathname, ptr); @@ -5399,7 +5437,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) atomic_sub(blocks, &mddev->recovery_active); wake_up(&mddev->recovery_wait); if (!ok) { - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_wakeup_thread(mddev->thread); // stop recovery, signal do_sync .... } @@ -5435,8 +5473,11 @@ void md_write_start(mddev_t *mddev, struct bio *bi) md_wakeup_thread(mddev->thread); } spin_unlock_irq(&mddev->write_lock); + sysfs_notify(&mddev->kobj, NULL, "array_state"); } - wait_event(mddev->sb_wait, mddev->flags==0); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } void md_write_end(mddev_t *mddev) @@ -5471,13 +5512,17 @@ void md_allow_write(mddev_t *mddev) mddev->safemode = 1; spin_unlock_irq(&mddev->write_lock); md_update_sb(mddev, 0); + + sysfs_notify(&mddev->kobj, NULL, "array_state"); + /* wait for the dirty state to be recorded in the metadata */ + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } else spin_unlock_irq(&mddev->write_lock); } EXPORT_SYMBOL_GPL(md_allow_write); -static DECLARE_WAIT_QUEUE_HEAD(resync_wait); - #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) void md_do_sync(mddev_t *mddev) @@ -5541,8 +5586,9 @@ void md_do_sync(mddev_t *mddev) for_each_mddev(mddev2, tmp) { if (mddev2 == mddev) continue; - if (mddev2->curr_resync && - match_mddev_units(mddev,mddev2)) { + if (!mddev->parallel_resync + && mddev2->curr_resync + && match_mddev_units(mddev, mddev2)) { DEFINE_WAIT(wq); if (mddev < mddev2 && mddev->curr_resync == 2) { /* arbitrarily yield */ @@ -5622,7 +5668,6 @@ void md_do_sync(mddev_t *mddev) window/2,(unsigned long long) max_sectors/2); atomic_set(&mddev->recovery_active, 0); - init_waitqueue_head(&mddev->recovery_wait); last_check = 0; if (j>2) { @@ -5647,7 +5692,7 @@ void md_do_sync(mddev_t *mddev) sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); goto out; } @@ -5670,8 +5715,7 @@ void md_do_sync(mddev_t *mddev) last_check = io_sectors; - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || - test_bit(MD_RECOVERY_ERR, &mddev->recovery)) + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; repeat: @@ -5725,8 +5769,7 @@ void md_do_sync(mddev_t *mddev) /* tell personality that we are finished */ mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); - if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && - !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && + if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -5795,7 +5838,10 @@ static int remove_and_add_spares(mddev_t *mddev) } if (mddev->degraded) { - rdev_for_each(rdev, rtmp, mddev) + rdev_for_each(rdev, rtmp, mddev) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags)) + spares++; if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { rdev->recovery_offset = 0; @@ -5813,6 +5859,7 @@ static int remove_and_add_spares(mddev_t *mddev) } else break; } + } } return spares; } @@ -5826,7 +5873,7 @@ static int remove_and_add_spares(mddev_t *mddev) * to do that as needed. * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in * "->recovery" and create a thread at ->sync_thread. - * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) + * When the thread finishes it sets MD_RECOVERY_DONE * and wakeups up this thread which will reap the thread and finish up. * This thread also removes any faulty devices (with nr_pending == 0). * @@ -5901,8 +5948,7 @@ void md_check_recovery(mddev_t *mddev) /* resync has finished, collect result */ md_unregister_thread(mddev->sync_thread); mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && - !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* success...*/ /* activate any spares */ mddev->pers->spare_active(mddev); @@ -5926,7 +5972,6 @@ void md_check_recovery(mddev_t *mddev) * might be left set */ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - clear_bit(MD_RECOVERY_ERR, &mddev->recovery); clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 4f4d1f38384..e968116e0de 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -327,7 +327,8 @@ static int multipath_remove_disk(mddev_t *mddev, int number) if (rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { - printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); + printk(KERN_ERR "hot-remove-disk, slot %d is identified" + " but is still operational!\n", number); err = -EBUSY; goto abort; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 914c04ddec7..bcbb82594a1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -241,18 +241,20 @@ static int create_strip_zones (mddev_t *mddev) /** * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid0_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ac409b7d83f..c610b947218 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -773,7 +773,7 @@ static int make_request(struct request_queue *q, struct bio * bio) r1bio_t *r1_bio; struct bio *read_bio; int i, targets = 0, disks; - struct bitmap *bitmap = mddev->bitmap; + struct bitmap *bitmap; unsigned long flags; struct bio_list bl; struct page **behind_pages = NULL; @@ -802,6 +802,8 @@ static int make_request(struct request_queue *q, struct bio * bio) wait_barrier(conf); + bitmap = mddev->bitmap; + disk_stat_inc(mddev->gendisk, ios[rw]); disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); @@ -1025,7 +1027,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery is running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1146,6 +1148,14 @@ static int raid1_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices is recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded < conf->raid_disks) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -1282,6 +1292,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) rdev_dec_pending(conf->mirrors[i].rdev, mddev); } else { /* fixup the bio for reuse */ + int size; sbio->bi_vcnt = vcnt; sbio->bi_size = r1_bio->sectors << 9; sbio->bi_idx = 0; @@ -1295,10 +1306,20 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) sbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; sbio->bi_bdev = conf->mirrors[i].rdev->bdev; - for (j = 0; j < vcnt ; j++) - memcpy(page_address(sbio->bi_io_vec[j].bv_page), + size = sbio->bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &sbio->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; + memcpy(page_address(bi->bv_page), page_address(pbio->bi_io_vec[j].bv_page), PAGE_SIZE); + } } } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8536ede1e71..22bb2b1b886 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -439,26 +439,27 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset * If near_copies == raid_disk, there are no striping issues, * but in that case, the function isn't called at all. */ -static int raid10_mergeable_bvec(struct request_queue *q, struct bio *bio, - struct bio_vec *bio_vec) +static int raid10_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= bio_vec->bv_len && bio_sectors == 0) - return bio_vec->bv_len; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; else return max; } @@ -1020,7 +1021,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery is running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1171,6 +1172,14 @@ static int raid10_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove faulty devices in recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + enough(conf)) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -1237,6 +1246,7 @@ static void end_sync_write(struct bio *bio, int error) if (!uptodate) md_error(mddev, conf->mirrors[d].rdev); + update_head_pos(i, r10_bio); while (atomic_dec_and_test(&r10_bio->remaining)) { @@ -1844,7 +1854,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; - if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", mdname(mddev)); break; @@ -2127,6 +2138,8 @@ static int run(mddev_t *mddev) !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + if (disk->rdev) + conf->fullsync = 1; } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 93fde48c0f4..9ce7154845c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -94,6 +94,8 @@ #define __inline__ #endif +#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) + #if !RAID6_USE_EMPTY_ZERO_PAGE /* In .bss so it's zeroed */ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); @@ -1143,10 +1145,12 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector + rdev->data_offset), - bdevname(rdev->bdev, b)); + printk_rl(KERN_INFO "raid5:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -1160,16 +1164,22 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (conf->mddev->degraded) - printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error NOT corrected!! " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING @@ -1258,7 +1268,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery was running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); printk (KERN_ALERT @@ -1992,6 +2002,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * have quiesced. */ if ((s->uptodate == disks - 1) && + (s->failed && disk_idx == s->failed_num) && !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); @@ -2006,12 +2017,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, */ s->uptodate++; return 0; /* uptodate + compute == disks */ - } else if ((s->uptodate < disks - 1) && - test_bit(R5_Insync, &dev->flags)) { - /* Note: we hold off compute operations while checks are - * in flight, but we still prefer 'compute' over 'read' - * hence we only read if (uptodate < * disks-1) - */ + } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) @@ -2077,7 +2083,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, /* we would like to get this block, possibly * by computing it, but we might not be able to */ - if (s->uptodate == disks-1) { + if ((s->uptodate == disks - 1) && + (s->failed && (i == r6s->failed_num[0] || + i == r6s->failed_num[1]))) { pr_debug("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); compute_block_1(sh, i, 0); @@ -2635,6 +2643,7 @@ static void handle_stripe5(struct stripe_head *sh) struct r5dev *dev; unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; + int prexor; memset(&s, 0, sizeof(s)); pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " @@ -2764,9 +2773,11 @@ static void handle_stripe5(struct stripe_head *sh) /* leave prexor set until postxor is done, allows us to distinguish * a rmw from a rcw during biodrain */ + prexor = 0; if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + prexor = 1; clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); @@ -2800,6 +2811,8 @@ static void handle_stripe5(struct stripe_head *sh) if (!test_and_set_bit( STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; + if (prexor) + continue; if (!test_bit(R5_Insync, &dev->flags) || (i == sh->pd_idx && s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); @@ -2880,6 +2893,8 @@ static void handle_stripe5(struct stripe_head *sh) for (i = conf->raid_disks; i--; ) { set_bit(R5_Wantwrite, &sh->dev[i].flags); + set_bit(R5_LOCKED, &dev->flags); + s.locked++; if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; } @@ -2893,6 +2908,7 @@ static void handle_stripe5(struct stripe_head *sh) conf->raid_disks); s.locked += handle_write_operations5(sh, 1, 1); } else if (s.expanded && + s.locked == 0 && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); @@ -3298,15 +3314,17 @@ static int raid5_congested(void *data, int bits) /* We want read requests to align with chunks where possible, * but write requests don't need to. */ -static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid5_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; - if (bio_data_dir(bio) == WRITE) + if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; @@ -4287,7 +4305,9 @@ static int run(mddev_t *mddev) " disk %d\n", bdevname(rdev->bdev,b), raid_disk); working_disks++; - } + } else + /* Cannot rely on bitmap to complete recovery */ + conf->fullsync = 1; } /* @@ -4564,6 +4584,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices if recovery + * isn't possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded <= conf->max_degraded) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { |