Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--	drivers/md/raid1.c	234
1 file changed, 203 insertions, 31 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 51d9645ed09..a93ca478142 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
 
-	bio_endio(bio, bio->bi_size,
-		test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+	/* if nobody has done the final endio yet, do it now */
+	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+		PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
+			(bio_data_dir(bio) == WRITE) ? "write" : "read",
+			(unsigned long long) bio->bi_sector,
+			(unsigned long long) bio->bi_sector +
+				(bio->bi_size >> 9) - 1);
+
+		bio_endio(bio, bio->bi_size,
+			test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+	}
 	free_r1bio(r1_bio);
 }
 
@@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
-	int mirror;
+	int mirror, behind;
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
 
 	if (bio->bi_size)
@@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 
 	update_head_pos(mirror, r1_bio);
 
+	behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
+	if (behind) {
+		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+			atomic_dec(&r1_bio->behind_remaining);
+
+		/* In behind mode, we ACK the master bio once the I/O has safely
+		 * reached all non-writemostly disks. Setting the Returned bit
+		 * ensures that this gets done only once -- we don't ever want to
+		 * return -EIO here, instead we'll wait */
+
+		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+			/* Maybe we can return now */
+			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+				struct bio *mbio = r1_bio->master_bio;
+				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+				       (unsigned long long) mbio->bi_sector,
+				       (unsigned long long) mbio->bi_sector +
+				       (mbio->bi_size >> 9) - 1);
+				bio_endio(mbio, mbio->bi_size, 0);
+			}
+		}
+	}
 	/*
 	 *
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
+		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+			/* free extra copy of the data pages */
+			int i = bio->bi_vcnt;
+			while (i--)
+				__free_page(bio->bi_io_vec[i].bv_page);
+		}
 		/* clear the bitmap if all writes complete successfully */
 		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
 				r1_bio->sectors,
-				!test_bit(R1BIO_Degraded, &r1_bio->state));
+				!test_bit(R1BIO_Degraded, &r1_bio->state),
+				behind);
 		md_write_end(r1_bio->mddev);
 		raid_end_bio_io(r1_bio);
 	}
@@ -360,13 +399,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 {
 	const unsigned long this_sector = r1_bio->sector;
 	int new_disk = conf->last_used, disk = new_disk;
+	int wonly_disk = -1;
 	const int sectors = r1_bio->sectors;
 	sector_t new_distance, current_distance;
-	mdk_rdev_t *new_rdev, *rdev;
+	mdk_rdev_t *rdev;
 
 	rcu_read_lock();
 	/*
-	 * Check if it if we can balance. We can balance on the whole
+	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on, or below the resync window.
 	 * We take the first readable disk when above the resync window.
 	 */
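The raid1_end_write_request() hunks above carry the heart of write-behind: each r1bio tracks two counters, and the master bio may be acknowledged as soon as every non-write-mostly disk has the data, with the R1BIO_Returned bit guaranteeing that bio_endio() runs exactly once no matter which path gets there first. The following is a minimal userspace C sketch of that once-only completion pattern; the struct, the harness, and all names outside the patch's own vocabulary are invented for illustration, and this is not the kernel code.

/* Userspace sketch of the once-only early-acknowledgement pattern.
 * "remaining" counts writes still in flight, "behind_remaining"
 * counts those in flight on write-mostly disks, and "returned"
 * plays the role of the R1BIO_Returned bit.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct r1bio_sketch {
	atomic_int remaining;         /* writes still in flight */
	atomic_int behind_remaining;  /* ... of which on write-mostly disks */
	atomic_flag returned;         /* has the master bio been acked? */
	bool uptodate;
};

/* Called once per completed mirror write. */
static void end_write(struct r1bio_sketch *r1, bool write_mostly)
{
	if (write_mostly)
		atomic_fetch_sub(&r1->behind_remaining, 1);

	/* If every write still pending is on a write-mostly disk, the
	 * data is safe on all fast disks: acknowledge the caller early.
	 * "remaining - 1" because this write's own decrement of
	 * remaining has not happened yet. */
	if (atomic_load(&r1->behind_remaining) >=
	    atomic_load(&r1->remaining) - 1 && r1->uptodate) {
		if (!atomic_flag_test_and_set(&r1->returned))
			printf("early ack of master bio\n");
	}

	/* Last write overall: ack now if nobody did it earlier. */
	if (atomic_fetch_sub(&r1->remaining, 1) == 1)
		if (!atomic_flag_test_and_set(&r1->returned))
			printf("final ack of master bio\n");
}

int main(void)
{
	struct r1bio_sketch r1 = { .returned = ATOMIC_FLAG_INIT,
				   .uptodate = true };

	atomic_init(&r1.remaining, 2);        /* one fast + one write-mostly */
	atomic_init(&r1.behind_remaining, 1); /* the write-mostly one */

	end_write(&r1, false); /* fast disk done: early ack fires */
	end_write(&r1, true);  /* write-mostly done: no second ack */
	return 0;
}

Exactly one "ack" line is printed however the two completions are ordered, which is the property the R1BIO_Returned test_and_set_bit() buys in the patch.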
@@ -376,11 +416,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		/* Choose the first operational device, for consistency */
 		new_disk = 0;
 
-		while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
-		       !new_rdev->in_sync) {
-			new_disk++;
-			if (new_disk == conf->raid_disks) {
-				new_disk = -1;
+		for (rdev = conf->mirrors[new_disk].rdev;
+		     !rdev || !rdev->in_sync
+			     || test_bit(WriteMostly, &rdev->flags);
+		     rdev = conf->mirrors[++new_disk].rdev) {
+
+			if (rdev && rdev->in_sync)
+				wonly_disk = new_disk;
+
+			if (new_disk == conf->raid_disks - 1) {
+				new_disk = wonly_disk;
 				break;
 			}
 		}
@@ -389,16 +434,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
 	/* make sure the disk is operational */
-	while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
-	       !new_rdev->in_sync) {
+	for (rdev = conf->mirrors[new_disk].rdev;
+	     !rdev || !rdev->in_sync ||
+		     test_bit(WriteMostly, &rdev->flags);
+	     rdev = conf->mirrors[new_disk].rdev) {
+
+		if (rdev && rdev->in_sync)
+			wonly_disk = new_disk;
+
 		if (new_disk <= 0)
 			new_disk = conf->raid_disks;
 		new_disk--;
 		if (new_disk == disk) {
-			new_disk = -1;
-			goto rb_out;
+			new_disk = wonly_disk;
+			break;
 		}
 	}
+
+	if (new_disk < 0)
+		goto rb_out;
+
 	disk = new_disk;
 	/* now disk == new_disk == starting point for search */
@@ -419,37 +474,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 			disk = conf->raid_disks;
 		disk--;
 
-		if ((rdev=conf->mirrors[disk].rdev) == NULL ||
-		    !rdev->in_sync)
+		rdev = conf->mirrors[disk].rdev;
+
+		if (!rdev ||
+		    !rdev->in_sync ||
+		    test_bit(WriteMostly, &rdev->flags))
 			continue;
 
 		if (!atomic_read(&rdev->nr_pending)) {
 			new_disk = disk;
-			new_rdev = rdev;
 			break;
 		}
 		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
 		if (new_distance < current_distance) {
 			current_distance = new_distance;
 			new_disk = disk;
-			new_rdev = rdev;
 		}
 	} while (disk != conf->last_used);
 
-rb_out:
+ rb_out:
 
 	if (new_disk >= 0) {
-		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = new_disk;
-		atomic_inc(&new_rdev->nr_pending);
-		if (!new_rdev->in_sync) {
+		rdev = conf->mirrors[new_disk].rdev;
+		if (!rdev)
+			goto retry;
+		atomic_inc(&rdev->nr_pending);
+		if (!rdev->in_sync) {
 			/* cannot risk returning a device that failed
 			 * before we inc'ed nr_pending
 			 */
-			atomic_dec(&new_rdev->nr_pending);
+			atomic_dec(&rdev->nr_pending);
 			goto retry;
 		}
+		conf->next_seq_sect = this_sector + sectors;
+		conf->last_used = new_disk;
 	}
 	rcu_read_unlock();
@@ -542,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect)
 	spin_unlock_irq(&conf->resync_lock);
 }
 
+/* duplicate the data pages for behind I/O */
+static struct page **alloc_behind_pages(struct bio *bio)
+{
+	int i;
+	struct bio_vec *bvec;
+	struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *),
+				      GFP_NOIO);
+	if (unlikely(!pages))
+		goto do_sync_io;
+
+	memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
+
+	bio_for_each_segment(bvec, bio, i) {
+		pages[i] = alloc_page(GFP_NOIO);
+		if (unlikely(!pages[i]))
+			goto do_sync_io;
+		memcpy(kmap(pages[i]) + bvec->bv_offset,
+		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
+		kunmap(pages[i]);
+		kunmap(bvec->bv_page);
+	}
+
+	return pages;
+
+do_sync_io:
+	if (pages)
+		for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
+			__free_page(pages[i]);
+	kfree(pages);
+	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+	return NULL;
+}
+
 static int make_request(request_queue_t *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
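alloc_behind_pages() above is what makes the early acknowledgement safe: once the caller has been told the write is done it may reuse its pages, so the behind writes must run against private copies, and any allocation failure quietly falls back to ordinary synchronous completion. A userspace C sketch of the same duplicate-or-fall-back shape follows; the seg struct and function names are invented for the sketch.

/* Copy each segment of a request so the caller's buffers can be
 * released early; return NULL to mean "do ordinary sync I/O".
 */
#include <stdlib.h>
#include <string.h>

struct seg { void *buf; size_t len; };

static void **dup_segments(const struct seg *segs, int nsegs)
{
	void **copies = calloc(nsegs, sizeof(*copies));
	int i;

	if (!copies)
		return NULL;

	for (i = 0; i < nsegs; i++) {
		copies[i] = malloc(segs[i].len);
		if (!copies[i])
			goto fail;
		memcpy(copies[i], segs[i].buf, segs[i].len);
	}
	return copies;

fail:	/* mirrors the do_sync_io: label -- free what we managed to copy */
	while (i--)
		free(copies[i]);
	free(copies);
	return NULL;
}

As in the patch, the copies array is zero-filled up front so cleanup can stop at the first NULL slot, and the caller frees the array itself once the copies are attached elsewhere (the kfree(behind_pages) in make_request() below).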
@@ -554,7 +646,12 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	struct bitmap *bitmap = mddev->bitmap;
 	unsigned long flags;
 	struct bio_list bl;
+	struct page **behind_pages = NULL;
 
+	if (unlikely(bio_barrier(bio))) {
+		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+		return 0;
+	}
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -589,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio)
 
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
-	r1_bio->state = 0;
-
 	if (bio_data_dir(bio) == READ) {
 		/*
 		 * read balancing logic:
@@ -651,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	}
 	rcu_read_unlock();
 
+	BUG_ON(targets == 0); /* we never fail the last device */
+
 	if (targets < conf->raid_disks) {
 		/* array is degraded, we will not clear the bitmap
 		 * on I/O completion (see raid1_end_write_request) */
 		set_bit(R1BIO_Degraded, &r1_bio->state);
 	}
 
+	/* do behind I/O? */
+	if (bitmap &&
+	    atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
+	    (behind_pages = alloc_behind_pages(bio)) != NULL)
+		set_bit(R1BIO_BehindIO, &r1_bio->state);
+
 	atomic_set(&r1_bio->remaining, 0);
+	atomic_set(&r1_bio->behind_remaining, 0);
 
 	bio_list_init(&bl);
 	for (i = 0; i < disks; i++) {
@@ -674,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		mbio->bi_rw = WRITE;
 		mbio->bi_private = r1_bio;
 
+		if (behind_pages) {
+			struct bio_vec *bvec;
+			int j;
+
+			/* Yes, I really want the '__' version so that
+			 * we clear any unused pointer in the io_vec, rather
+			 * than leave them unchanged. This is important
+			 * because when we come to free the pages, we won't
+			 * know the original bi_idx, so we just free
+			 * them all
+			 */
+			__bio_for_each_segment(bvec, mbio, j, 0)
+				bvec->bv_page = behind_pages[j];
+			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+				atomic_inc(&r1_bio->behind_remaining);
+		}
+
 		atomic_inc(&r1_bio->remaining);
 
 		bio_list_add(&bl, mbio);
 	}
 
+	kfree(behind_pages); /* the behind pages are attached to the bios now */
+
-	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors);
+	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
+				test_bit(R1BIO_BehindIO, &r1_bio->state));
 	spin_lock_irqsave(&conf->device_lock, flags);
 	bio_list_merge(&conf->pending_bio_list, &bl);
 	bio_list_init(&bl);
@@ -1105,6 +1228,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	sector_t max_sector, nr_sectors;
 	int disk;
 	int i;
+	int wonly;
 	int write_targets = 0;
 	int sync_blocks;
 	int still_degraded = 0;
@@ -1160,14 +1284,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	 */
 	disk = conf->last_used;
 	/* make sure disk is operational */
-
+	wonly = disk;
 	while (conf->mirrors[disk].rdev == NULL ||
-	       !conf->mirrors[disk].rdev->in_sync) {
+	       !conf->mirrors[disk].rdev->in_sync ||
+	       test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
+		) {
+		if (conf->mirrors[disk].rdev &&
+		    conf->mirrors[disk].rdev->in_sync)
+			wonly = disk;
 		if (disk <= 0)
 			disk = conf->raid_disks;
 		disk--;
-		if (disk == conf->last_used)
+		if (disk == conf->last_used) {
+			disk = wonly;
 			break;
+		}
 	}
 	conf->last_used = disk;
 	atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
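A recurring pattern in the read_balance() and sync_request() hunks above is the treatment of WriteMostly members: they are skipped as read (and resync-read) sources whenever possible, but the last in-sync one seen is remembered as a fallback so the array still has a read source when no ordinary disk survives. The standalone sketch below captures just that selection rule; the mirror_sketch array is invented for illustration and is not the kernel's data structure.

/* Pick a read source: prefer any ordinary in-sync disk, fall back
 * to an in-sync write-mostly disk only if nothing else is usable.
 */
#include <stdbool.h>

struct mirror_sketch { bool present; bool in_sync; bool write_mostly; };

/* Returns the chosen disk index, or -1 if nothing is readable. */
static int pick_read_disk(const struct mirror_sketch *m, int ndisks)
{
	int d, wonly = -1;

	for (d = 0; d < ndisks; d++) {
		if (!m[d].present || !m[d].in_sync)
			continue;
		if (m[d].write_mostly) {
			wonly = d;	/* usable, but only as a last resort */
			continue;
		}
		return d;		/* first ordinary in-sync disk wins */
	}
	return wonly;
}

The kernel versions look more involved only because they walk a circular list starting from conf->last_used and must re-check rdev pointers under RCU; the wonly_disk / wonly fallback logic is the same.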
@@ -1439,6 +1570,17 @@ out:
 static int stop(mddev_t *mddev)
 {
 	conf_t *conf = mddev_to_conf(mddev);
+	struct bitmap *bitmap = mddev->bitmap;
+	int behind_wait = 0;
+
+	/* wait for behind writes to complete */
+	while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+		behind_wait++;
+		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(HZ); /* wait a second */
+		/* need to kick something here to make sure I/O goes? */
+	}
 
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
@@ -1561,6 +1703,35 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	return 0;
 }
 
+static void raid1_quiesce(mddev_t *mddev, int state)
+{
+	conf_t *conf = mddev_to_conf(mddev);
+
+	switch(state) {
+	case 1:
+		spin_lock_irq(&conf->resync_lock);
+		conf->barrier++;
+		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
+				    conf->resync_lock, raid1_unplug(mddev->queue));
+		spin_unlock_irq(&conf->resync_lock);
+		break;
+	case 0:
+		spin_lock_irq(&conf->resync_lock);
+		conf->barrier--;
+		spin_unlock_irq(&conf->resync_lock);
+		wake_up(&conf->wait_resume);
+		wake_up(&conf->wait_idle);
+		break;
+	}
+	if (mddev->thread) {
+		if (mddev->bitmap)
+			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+		else
+			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		md_wakeup_thread(mddev->thread);
+	}
+}
+
 static mdk_personality_t raid1_personality =
 {
@@ -1577,6 +1748,7 @@ static mdk_personality_t raid1_personality =
 	.sync_request	= sync_request,
 	.resize		= raid1_resize,
 	.reshape	= raid1_reshape,
+	.quiesce	= raid1_quiesce,
 };
 
 static int __init raid_init(void)
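The new raid1_quiesce() reuses the existing resync barrier: raising the barrier stops new requests from entering make_request() while wait_event_lock_irq() drains nr_pending, and lowering it wakes both wait queues. A userspace sketch of that handshake, with pthread primitives standing in for the kernel's resync_lock and wait queues (the struct and names are invented for illustration):

/* Quiesce handshake: state==1 raises the barrier and drains
 * in-flight I/O; state==0 lowers it and wakes everyone up.
 */
#include <pthread.h>

struct quiesce_sketch {
	pthread_mutex_t lock;        /* stands in for resync_lock   */
	pthread_cond_t wait_idle;    /* signalled as nr_pending drops */
	pthread_cond_t wait_resume;  /* signalled when barrier lifts  */
	int barrier;
	int nr_pending;
};

static struct quiesce_sketch q = {
	.lock        = PTHREAD_MUTEX_INITIALIZER,
	.wait_idle   = PTHREAD_COND_INITIALIZER,
	.wait_resume = PTHREAD_COND_INITIALIZER,
};

static void quiesce(struct quiesce_sketch *s, int state)
{
	pthread_mutex_lock(&s->lock);
	if (state) {
		s->barrier++;            /* new I/O must now wait */
		while (s->nr_pending)    /* drain what is in flight */
			pthread_cond_wait(&s->wait_idle, &s->lock);
	} else {
		s->barrier--;            /* let I/O flow again */
		pthread_cond_broadcast(&s->wait_resume);
		pthread_cond_broadcast(&s->wait_idle);
	}
	pthread_mutex_unlock(&s->lock);
}

The I/O submission path, in this sketch, would increment nr_pending only after waiting on wait_resume while barrier is non-zero, and signal wait_idle when decrementing it to zero; that is the same contract the md bitmap daemon relies on when it quiesces the array.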