Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 242 |
1 file changed, 177 insertions, 65 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d60412c7f99..4a6ca1cb2e7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector);
 static void lower_barrier(struct r1conf *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 }
 
 #define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
+#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct bio *bio = r1_bio->master_bio;
 	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
+	sector_t start_next_window = r1_bio->start_next_window;
+	sector_t bi_sector = bio->bi_iter.bi_sector;
 
 	if (bio->bi_phys_segments) {
 		unsigned long flags;
@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		bio->bi_phys_segments--;
 		done = (bio->bi_phys_segments == 0);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/*
+		 * make_request() might be waiting for
+		 * bi_phys_segments to decrease
+		 */
+		wake_up(&conf->wait_barrier);
 	} else
 		done = 1;
 
@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
 		 */
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bi_sector);
 	}
 }
 
@@ -255,9 +265,8 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
 	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
 		pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
 			 (bio_data_dir(bio) == WRITE) ? "write" : "read",
-			 (unsigned long long) bio->bi_sector,
-			 (unsigned long long) bio->bi_sector +
-			 bio_sectors(bio) - 1);
+			 (unsigned long long) bio->bi_iter.bi_sector,
+			 (unsigned long long) bio_end_sector(bio) - 1);
 
 		call_bio_endio(r1_bio);
 	}
@@ -456,9 +465,8 @@ static void raid1_end_write_request(struct bio *bio, int error)
 			struct bio *mbio = r1_bio->master_bio;
 			pr_debug("raid1: behind end write sectors"
 				 " %llu-%llu\n",
-				 (unsigned long long) mbio->bi_sector,
-				 (unsigned long long) mbio->bi_sector +
-				 bio_sectors(mbio) - 1);
+				 (unsigned long long) mbio->bi_iter.bi_sector,
+				 (unsigned long long) bio_end_sector(mbio) - 1);
 			call_bio_endio(r1_bio);
 		}
 	}
@@ -814,8 +822,6 @@ static void flush_pending_writes(struct r1conf *conf)
  * there is no normal IO happeing.  It must arrange to call
  * lower_barrier when the particular background IO completes.
  */
-#define RESYNC_DEPTH 32
-
 static void raise_barrier(struct r1conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
@@ -827,9 +833,19 @@ static void raise_barrier(struct r1conf *conf)
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* Now wait for all pending IO to complete */
+	/* For these conditions we must wait:
+	 * A: while the array is in frozen state
+	 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
+	 *    the max count which allowed.
+	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
+	 *    next resync will reach to the window which normal bios are
+	 *    handling.
+	 */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    !conf->array_frozen &&
+			    conf->barrier < RESYNC_DEPTH &&
+			    (conf->start_next_window >=
+			     conf->next_resync + RESYNC_SECTORS),
 			    conf->resync_lock);
 
 	spin_unlock_irq(&conf->resync_lock);
@@ -845,10 +861,33 @@ static void lower_barrier(struct r1conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r1conf *conf)
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
 {
+	bool wait = false;
+
+	if (conf->array_frozen || !bio)
+		wait = true;
+	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync < RESYNC_WINDOW_SECTORS)
+			wait = true;
+		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
+				>= bio_end_sector(bio)) ||
+			 (conf->next_resync + NEXT_NORMALIO_DISTANCE
+				<= bio->bi_iter.bi_sector))
+			wait = false;
+		else
+			wait = true;
+	}
+
+	return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+{
+	sector_t sector = 0;
+
 	spin_lock_irq(&conf->resync_lock);
-	if (conf->barrier) {
+	if (need_to_wait_for_sync(conf, bio)) {
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -860,22 +899,66 @@ static void wait_barrier(struct r1conf *conf)
 		 * count down.
 		 */
 		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (conf->nr_pending &&
+				    !conf->array_frozen &&
+				    (!conf->barrier ||
+				    ((conf->start_next_window <
+				      conf->next_resync + RESYNC_SECTORS) &&
 				     current->bio_list &&
-				     !bio_list_empty(current->bio_list)),
+				     !bio_list_empty(current->bio_list))),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 	}
+
+	if (bio && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync + NEXT_NORMALIO_DISTANCE
+		    <= bio->bi_iter.bi_sector) {
+			if (conf->start_next_window == MaxSector)
+				conf->start_next_window =
+					conf->next_resync +
+					NEXT_NORMALIO_DISTANCE;
+
+			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+			    <= bio->bi_iter.bi_sector)
+				conf->next_window_requests++;
+			else
+				conf->current_window_requests++;
+			sector = conf->start_next_window;
+		}
+	}
+
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
+	return sector;
 }
 
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector)
 {
 	unsigned long flags;
+
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->nr_pending--;
+	if (start_next_window) {
+		if (start_next_window == conf->start_next_window) {
+			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+			    <= bi_sector)
+				conf->next_window_requests--;
+			else
+				conf->current_window_requests--;
+		} else
+			conf->current_window_requests--;
+
+		if (!conf->current_window_requests) {
+			if (conf->next_window_requests) {
+				conf->current_window_requests =
+					conf->next_window_requests;
+				conf->next_window_requests = 0;
+				conf->start_next_window +=
+					NEXT_NORMALIO_DISTANCE;
+			} else
+				conf->start_next_window = MaxSector;
+		}
+	}
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
@@ -884,8 +967,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quite.
-	 * We increment barrier and nr_waiting, and then
-	 * wait until nr_pending match nr_queued+extra
+	 * We wait until nr_pending match nr_queued+extra
 	 * This is called in the context of one normal IO request
 	 * that has failed. Thus any sync request that might be pending
 	 * will be blocked by nr_pending, and we need to wait for
@@ -895,8 +977,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier++;
-	conf->nr_waiting++;
+	conf->array_frozen = 1;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
 				conf->nr_pending == conf->nr_queued+extra,
 				conf->resync_lock,
@@ -907,8 +988,7 @@ static void unfreeze_array(struct r1conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
-	conf->nr_waiting--;
+	conf->array_frozen = 0;
 	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -945,7 +1025,8 @@ do_sync_io:
 		if (bvecs[i].bv_page)
 			put_page(bvecs[i].bv_page);
 	kfree(bvecs);
-	pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+	pr_debug("%dB behind alloc failed, doing sync I/O\n",
+		 bio->bi_iter.bi_size);
 }
 
 struct raid1_plug_cb {
@@ -1013,6 +1094,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
+	sector_t start_next_window;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1024,7 +1106,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 
 	if (bio_data_dir(bio) == WRITE &&
 	    bio_end_sector(bio) > mddev->suspend_lo &&
-	    bio->bi_sector < mddev->suspend_hi) {
+	    bio->bi_iter.bi_sector < mddev->suspend_hi) {
 		/* As the suspend_* range is controlled by
 		 * userspace, we want an interruptible
 		 * wait.
@@ -1035,14 +1117,14 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_INTERRUPTIBLE);
 			if (bio_end_sector(bio) <= mddev->suspend_lo ||
-			    bio->bi_sector >= mddev->suspend_hi)
+			    bio->bi_iter.bi_sector >= mddev->suspend_hi)
 				break;
 			schedule();
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
 
-	wait_barrier(conf);
+	start_next_window = wait_barrier(conf, bio);
 
 	bitmap = mddev->bitmap;
 
@@ -1057,7 +1139,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	r1_bio->sectors = bio_sectors(bio);
 	r1_bio->state = 0;
 	r1_bio->mddev = mddev;
-	r1_bio->sector = bio->bi_sector;
+	r1_bio->sector = bio->bi_iter.bi_sector;
 
 	/* We might need to issue multiple reads to different
 	 * devices if there are bad blocks around, so we keep
@@ -1097,12 +1179,13 @@ read_again:
 		r1_bio->read_disk = rdisk;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
-			    max_sectors);
+		bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
+			 max_sectors);
 
 		r1_bio->bios[rdisk] = read_bio;
 
-		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
+		read_bio->bi_iter.bi_sector = r1_bio->sector +
+			mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid1_end_read_request;
 		read_bio->bi_rw = READ | do_sync;
@@ -1114,7 +1197,7 @@ read_again:
 			 */
 
 			sectors_handled = (r1_bio->sector + max_sectors
-					   - bio->bi_sector);
+					   - bio->bi_iter.bi_sector);
 			r1_bio->sectors = max_sectors;
 			spin_lock_irq(&conf->device_lock);
 			if (bio->bi_phys_segments == 0)
@@ -1135,7 +1218,8 @@ read_again:
 			r1_bio->sectors = bio_sectors(bio) - sectors_handled;
 			r1_bio->state = 0;
 			r1_bio->mddev = mddev;
-			r1_bio->sector = bio->bi_sector + sectors_handled;
+			r1_bio->sector = bio->bi_iter.bi_sector +
+				sectors_handled;
 			goto read_again;
 		} else
 			generic_make_request(read_bio);
@@ -1163,6 +1247,7 @@ read_again:
 
 	disks = conf->raid_disks * 2;
  retry_write:
+	r1_bio->start_next_window = start_next_window;
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r1_bio->sectors;
@@ -1231,14 +1316,24 @@ read_again:
 	if (unlikely(blocked_rdev)) {
 		/* Wait for this device to become unblocked */
 		int j;
+		sector_t old = start_next_window;
 
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
+		start_next_window = wait_barrier(conf, bio);
+		/*
+		 * We must make sure the multi r1bios of bio have
+		 * the same value of bi_phys_segments
+		 */
+		if (bio->bi_phys_segments && old &&
+		    old != start_next_window)
+			/* Wait for the former r1bio(s) to complete */
+			wait_event(conf->wait_barrier,
+				   bio->bi_phys_segments == 1);
 		goto retry_write;
 	}
 
@@ -1254,7 +1349,7 @@ read_again:
 			bio->bi_phys_segments++;
 		spin_unlock_irq(&conf->device_lock);
 	}
-	sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
+	sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
@@ -1266,7 +1361,7 @@ read_again:
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+		bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
 
 		if (first_clone) {
 			/* do behind I/O ?
@@ -1300,7 +1395,7 @@ read_again:
 
 		r1_bio->bios[i] = mbio;
 
-		mbio->bi_sector	= (r1_bio->sector +
+		mbio->bi_iter.bi_sector	= (r1_bio->sector +
 				   conf->mirrors[i].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
@@ -1340,7 +1435,7 @@ read_again:
 		r1_bio->sectors = bio_sectors(bio) - sectors_handled;
 		r1_bio->state = 0;
 		r1_bio->mddev = mddev;
-		r1_bio->sector = bio->bi_sector + sectors_handled;
+		r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
 		goto retry_write;
 	}
 
@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-	wait_barrier(conf);
-	allow_barrier(conf);
+	wait_barrier(conf, NULL);
+	allow_barrier(conf, 0, 0);
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
+
+	conf->next_resync = 0;
+	conf->start_next_window = MaxSector;
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -1479,6 +1577,7 @@ static int raid1_spare_active(struct mddev *mddev)
 			}
 		}
 		if (rdev
+		    && rdev->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &rdev->flags)
 		    && !test_and_set_bit(In_sync, &rdev->flags)) {
 			count++;
@@ -1854,20 +1953,24 @@ static int process_checks(struct r1bio *r1_bio)
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		int j;
 		int size;
+		int uptodate;
 		struct bio *b = r1_bio->bios[i];
 		if (b->bi_end_io != end_sync_read)
 			continue;
-		/* fixup the bio for reuse */
+		/* fixup the bio for reuse, but preserve BIO_UPTODATE */
+		uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
 		bio_reset(b);
+		if (!uptodate)
+			clear_bit(BIO_UPTODATE, &b->bi_flags);
 		b->bi_vcnt = vcnt;
-		b->bi_size = r1_bio->sectors << 9;
-		b->bi_sector = r1_bio->sector +
+		b->bi_iter.bi_size = r1_bio->sectors << 9;
+		b->bi_iter.bi_sector = r1_bio->sector +
 			conf->mirrors[i].rdev->data_offset;
 		b->bi_bdev = conf->mirrors[i].rdev->bdev;
 		b->bi_end_io = end_sync_read;
 		b->bi_private = r1_bio;
 
-		size = b->bi_size;
+		size = b->bi_iter.bi_size;
 		for (j = 0; j < vcnt ; j++) {
 			struct bio_vec *bi;
 			bi = &b->bi_io_vec[j];
@@ -1891,11 +1994,14 @@ static int process_checks(struct r1bio *r1_bio)
 		int j;
 		struct bio *pbio = r1_bio->bios[primary];
 		struct bio *sbio = r1_bio->bios[i];
+		int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
 
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
+		/* Now we can 'fixup' the BIO_UPTODATE flag */
+		set_bit(BIO_UPTODATE, &sbio->bi_flags);
 
-		if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+		if (uptodate) {
 			for (j = vcnt; j-- ; ) {
 				struct page *p, *s;
 				p = pbio->bi_io_vec[j].bv_page;
@@ -1910,7 +2016,7 @@ static int process_checks(struct r1bio *r1_bio)
 		if (j >= 0)
 			atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
 		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
-			      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+			      && uptodate)) {
 			/* No need to write to this device. */
 			sbio->bi_end_io = NULL;
 			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
@@ -2122,11 +2228,11 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 		}
 
 		wbio->bi_rw = WRITE;
-		wbio->bi_sector = r1_bio->sector;
-		wbio->bi_size = r1_bio->sectors << 9;
+		wbio->bi_iter.bi_sector = r1_bio->sector;
+		wbio->bi_iter.bi_size = r1_bio->sectors << 9;
 
-		md_trim_bio(wbio, sector - r1_bio->sector, sectors);
-		wbio->bi_sector += rdev->data_offset;
+		bio_trim(wbio, sector - r1_bio->sector, sectors);
+		wbio->bi_iter.bi_sector += rdev->data_offset;
 		wbio->bi_bdev = rdev->bdev;
 		if (submit_bio_wait(WRITE, wbio) == 0)
 			/* failure! */
@@ -2240,7 +2346,8 @@ read_more:
 		}
 		r1_bio->read_disk = disk;
 		bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
-		md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors);
+		bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
+			 max_sectors);
 		r1_bio->bios[r1_bio->read_disk] = bio;
 		rdev = conf->mirrors[disk].rdev;
 		printk_ratelimited(KERN_ERR
@@ -2249,7 +2356,7 @@ read_more:
 				   mdname(mddev),
 				   (unsigned long long)r1_bio->sector,
 				   bdevname(rdev->bdev, b));
-		bio->bi_sector = r1_bio->sector + rdev->data_offset;
+		bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
 		bio->bi_bdev = rdev->bdev;
 		bio->bi_end_io = raid1_end_read_request;
 		bio->bi_rw = READ | do_sync;
@@ -2258,7 +2365,7 @@ read_more:
 		/* Drat - have to split this up more */
 		struct bio *mbio = r1_bio->master_bio;
 		int sectors_handled = (r1_bio->sector + max_sectors
-				       - mbio->bi_sector);
+				       - mbio->bi_iter.bi_sector);
 		r1_bio->sectors = max_sectors;
 		spin_lock_irq(&conf->device_lock);
 		if (mbio->bi_phys_segments == 0)
@@ -2276,7 +2383,8 @@ read_more:
 		r1_bio->state = 0;
 		set_bit(R1BIO_ReadError, &r1_bio->state);
 		r1_bio->mddev = mddev;
-		r1_bio->sector = mbio->bi_sector + sectors_handled;
+		r1_bio->sector = mbio->bi_iter.bi_sector +
+			sectors_handled;
 
 		goto read_more;
 	} else
@@ -2500,7 +2608,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 		}
 		if (bio->bi_end_io) {
 			atomic_inc(&rdev->nr_pending);
-			bio->bi_sector = sector_nr + rdev->data_offset;
+			bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
 			bio->bi_bdev = rdev->bdev;
 			bio->bi_private = r1_bio;
 		}
@@ -2600,7 +2708,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 					continue;
 				/* remove last page from this bio */
 				bio->bi_vcnt--;
-				bio->bi_size -= len;
+				bio->bi_iter.bi_size -= len;
 				bio->bi_flags &= ~(1<< BIO_SEG_VALID);
 			}
 			goto bio_full;
@@ -2713,6 +2821,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	conf->start_next_window = MaxSector;
+	conf->current_window_requests = conf->next_window_requests = 0;
+
 	err = -EIO;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
@@ -2870,8 +2981,8 @@ static int stop(struct mddev *mddev)
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
 
-	raise_barrier(conf);
-	lower_barrier(conf);
+	freeze_array(conf, 0);
+	unfreeze_array(conf);
 
 	md_unregister_thread(&mddev->thread);
 	if (conf->r1bio_pool)
@@ -3030,10 +3141,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
 		wake_up(&conf->wait_barrier);
 		break;
 	case 1:
-		raise_barrier(conf);
+		freeze_array(conf, 0);
 		break;
 	case 0:
-		lower_barrier(conf);
+		unfreeze_array(conf);
 		break;
 	}
 }
@@ -3050,7 +3161,8 @@ static void *raid1_takeover(struct mddev *mddev)
 		mddev->new_chunk_sectors = 0;
 		conf = setup_conf(mddev);
 		if (!IS_ERR(conf))
-			conf->barrier = 1;
+			/* Array must appear to be quiesced */
+			conf->array_frozen = 1;
 		return conf;
 	}
 	return ERR_PTR(-EINVAL);
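The sliding-window accounting that the new wait_barrier()/allow_barrier() pair introduces can be modelled outside the kernel. The sketch below is only an illustration under simplifying assumptions (single-threaded, no locking, no resync progress); struct window_state, window_enter(), window_exit() and MAX_SECTOR are invented stand-ins for the r1conf fields and MaxSector used in the patch, not kernel interfaces.

#include <stdint.h>
#include <stdio.h>

#define RESYNC_BLOCK_SIZE	(64 * 1024)
#define RESYNC_DEPTH		32
#define RESYNC_WINDOW		(RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS	(RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE	(3 * RESYNC_WINDOW_SECTORS)
#define MAX_SECTOR		UINT64_MAX	/* stand-in for MaxSector */

typedef uint64_t sector_t;

/* Minimal stand-in for the window fields this patch adds to struct r1conf. */
struct window_state {
	sector_t next_resync;		/* where resync will continue */
	sector_t start_next_window;	/* boundary between the two windows */
	int current_window_requests;
	int next_window_requests;
};

/* Model of the bookkeeping wait_barrier() does for a WRITE at 'sector'.
 * Returns the window boundary the request was accounted against,
 * or 0 when the write is too close to the resync point (as in the patch). */
static sector_t window_enter(struct window_state *w, sector_t sector)
{
	if (w->next_resync + NEXT_NORMALIO_DISTANCE > sector)
		return 0;

	if (w->start_next_window == MAX_SECTOR)
		w->start_next_window = w->next_resync + NEXT_NORMALIO_DISTANCE;

	if (w->start_next_window + NEXT_NORMALIO_DISTANCE <= sector)
		w->next_window_requests++;
	else
		w->current_window_requests++;

	return w->start_next_window;
}

/* Model of allow_barrier(): drop the request, and once the current window
 * drains either slide the boundary forward or reset it to MAX_SECTOR. */
static void window_exit(struct window_state *w, sector_t start_next_window,
			sector_t sector)
{
	if (!start_next_window)
		return;

	if (start_next_window == w->start_next_window &&
	    w->start_next_window + NEXT_NORMALIO_DISTANCE <= sector)
		w->next_window_requests--;
	else
		w->current_window_requests--;

	if (!w->current_window_requests) {
		if (w->next_window_requests) {
			w->current_window_requests = w->next_window_requests;
			w->next_window_requests = 0;
			w->start_next_window += NEXT_NORMALIO_DISTANCE;
		} else {
			w->start_next_window = MAX_SECTOR;
		}
	}
}

int main(void)
{
	struct window_state w = {
		.next_resync = 0,
		.start_next_window = MAX_SECTOR,
	};
	/* one write in the current window, one far enough ahead for the next */
	sector_t win1 = window_enter(&w, NEXT_NORMALIO_DISTANCE + 100);
	sector_t win2 = window_enter(&w, 3 * NEXT_NORMALIO_DISTANCE);

	printf("current=%d next=%d start=%llu\n",
	       w.current_window_requests, w.next_window_requests,
	       (unsigned long long)w.start_next_window);

	window_exit(&w, win1, NEXT_NORMALIO_DISTANCE + 100);
	window_exit(&w, win2, 3 * NEXT_NORMALIO_DISTANCE);

	printf("start_next_window back to MAX_SECTOR: %d\n",
	       w.start_next_window == MAX_SECTOR);
	return 0;
}

Run as a plain C program, the demo shows one write counted against the current window and one against the next; when the current window drains the boundary advances by NEXT_NORMALIO_DISTANCE, and once both windows are empty start_next_window falls back to MAX_SECTOR, mirroring the reset the patch performs in close_sync().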