diff options
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 744 |
1 files changed, 331 insertions, 413 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 378a25894c5..5d096096f95 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -52,23 +52,16 @@ #define NR_RAID1_BIOS 256 -static void unplug_slaves(mddev_t *mddev); - static void allow_barrier(conf_t *conf); static void lower_barrier(conf_t *conf); static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) { struct pool_info *pi = data; - r1bio_t *r1_bio; int size = offsetof(r1bio_t, bios[pi->raid_disks]); /* allocate a r1bio with room for raid_disks entries in the bios array */ - r1_bio = kzalloc(size, gfp_flags); - if (!r1_bio && pi->mddev) - unplug_slaves(pi->mddev); - - return r1_bio; + return kzalloc(size, gfp_flags); } static void r1bio_pool_free(void *r1_bio, void *data) @@ -91,16 +84,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) int i, j; r1_bio = r1bio_pool_alloc(gfp_flags, pi); - if (!r1_bio) { - unplug_slaves(pi->mddev); + if (!r1_bio) return NULL; - } /* * Allocate bios : 1 for reading, n-1 for writing */ for (j = pi->raid_disks ; j-- ; ) { - bio = bio_alloc(gfp_flags, RESYNC_PAGES); + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); if (!bio) goto out_free_bio; r1_bio->bios[j] = bio; @@ -306,6 +297,29 @@ static void raid1_end_read_request(struct bio *bio, int error) rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } +static void r1_bio_write_done(r1bio_t *r1_bio) +{ + if (atomic_dec_and_test(&r1_bio->remaining)) + { + /* it really is the end of this request */ + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = r1_bio->behind_page_count; + while (i--) + safe_put_page(r1_bio->behind_pages[i]); + kfree(r1_bio->behind_pages); + r1_bio->behind_pages = NULL; + } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + test_bit(R1BIO_BehindIO, &r1_bio->state)); + md_write_end(r1_bio->mddev); + raid_end_bio_io(r1_bio); + } +} + static void raid1_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -373,21 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error) * Let's see if all mirrored write operations have finished * already. */ - if (atomic_dec_and_test(&r1_bio->remaining)) { - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = bio->bi_vcnt; - while (i--) - safe_put_page(bio->bi_io_vec[i].bv_page); - } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); - md_write_end(r1_bio->mddev); - raid_end_bio_io(r1_bio); - } + r1_bio_write_done(r1_bio); if (to_put) bio_put(to_put); @@ -411,11 +411,13 @@ static void raid1_end_write_request(struct bio *bio, int error) static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const sector_t this_sector = r1_bio->sector; - int new_disk = conf->last_used, disk = new_disk; - int wonly_disk = -1; const int sectors = r1_bio->sectors; - sector_t new_distance, current_distance; + int start_disk; + int best_disk; + int i; + sector_t best_dist; mdk_rdev_t *rdev; + int choose_first; rcu_read_lock(); /* @@ -424,100 +426,63 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) * We take the first readable disk when above the resync window. */ retry: + best_disk = -1; + best_dist = MaxSector; if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { - /* Choose the first operational device, for consistancy */ - new_disk = 0; - - for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); - r1_bio->bios[new_disk] == IO_BLOCKED || - !rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(WriteMostly, &rdev->flags); - rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { - - if (rdev && test_bit(In_sync, &rdev->flags) && - r1_bio->bios[new_disk] != IO_BLOCKED) - wonly_disk = new_disk; - - if (new_disk == conf->raid_disks - 1) { - new_disk = wonly_disk; - break; - } - } - goto rb_out; - } - - - /* make sure the disk is operational */ - for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); - r1_bio->bios[new_disk] == IO_BLOCKED || - !rdev || !test_bit(In_sync, &rdev->flags) || - test_bit(WriteMostly, &rdev->flags); - rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { - - if (rdev && test_bit(In_sync, &rdev->flags) && - r1_bio->bios[new_disk] != IO_BLOCKED) - wonly_disk = new_disk; - - if (new_disk <= 0) - new_disk = conf->raid_disks; - new_disk--; - if (new_disk == disk) { - new_disk = wonly_disk; - break; - } + choose_first = 1; + start_disk = 0; + } else { + choose_first = 0; + start_disk = conf->last_used; } - if (new_disk < 0) - goto rb_out; - - disk = new_disk; - /* now disk == new_disk == starting point for search */ - - /* - * Don't change to another disk for sequential reads: - */ - if (conf->next_seq_sect == this_sector) - goto rb_out; - if (this_sector == conf->mirrors[new_disk].head_position) - goto rb_out; - - current_distance = abs(this_sector - conf->mirrors[disk].head_position); - - /* Find the disk whose head is closest */ - - do { - if (disk <= 0) - disk = conf->raid_disks; - disk--; + for (i = 0 ; i < conf->raid_disks ; i++) { + sector_t dist; + int disk = start_disk + i; + if (disk >= conf->raid_disks) + disk -= conf->raid_disks; rdev = rcu_dereference(conf->mirrors[disk].rdev); - - if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags) || - test_bit(WriteMostly, &rdev->flags)) + if (r1_bio->bios[disk] == IO_BLOCKED + || rdev == NULL + || test_bit(Faulty, &rdev->flags)) continue; - - if (!atomic_read(&rdev->nr_pending)) { - new_disk = disk; + if (!test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < this_sector + sectors) + continue; + if (test_bit(WriteMostly, &rdev->flags)) { + /* Don't balance among write-mostly, just + * use the first as a last resort */ + if (best_disk < 0) + best_disk = disk; + continue; + } + /* This is a reasonable device to use. It might + * even be best. + */ + dist = abs(this_sector - conf->mirrors[disk].head_position); + if (choose_first + /* Don't change to another disk for sequential reads */ + || conf->next_seq_sect == this_sector + || dist == 0 + /* If device is idle, use it */ + || atomic_read(&rdev->nr_pending) == 0) { + best_disk = disk; break; } - new_distance = abs(this_sector - conf->mirrors[disk].head_position); - if (new_distance < current_distance) { - current_distance = new_distance; - new_disk = disk; + if (dist < best_dist) { + best_dist = dist; + best_disk = disk; } - } while (disk != conf->last_used); - - rb_out: - + } - if (new_disk >= 0) { - rdev = rcu_dereference(conf->mirrors[new_disk].rdev); + if (best_disk >= 0) { + rdev = rcu_dereference(conf->mirrors[best_disk].rdev); if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); - if (!test_bit(In_sync, &rdev->flags)) { + if (test_bit(Faulty, &rdev->flags)) { /* cannot risk returning a device that failed * before we inc'ed nr_pending */ @@ -525,42 +490,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) goto retry; } conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; - } - rcu_read_unlock(); - - return new_disk; -} - -static void unplug_slaves(mddev_t *mddev) -{ - conf_t *conf = mddev->private; - int i; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - struct request_queue *r_queue = bdev_get_queue(rdev->bdev); - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - blk_unplug(r_queue); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } + conf->last_used = best_disk; } rcu_read_unlock(); -} -static void raid1_unplug(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; - - unplug_slaves(mddev); - md_wakeup_thread(mddev->thread); + return best_disk; } static int raid1_congested(void *data, int bits) @@ -592,20 +526,16 @@ static int raid1_congested(void *data, int bits) } -static int flush_pending_writes(conf_t *conf) +static void flush_pending_writes(conf_t *conf) { /* Any writes that have been queued but are awaiting * bitmap updates get flushed here. - * We return 1 if any requests were actually submitted. */ - int rv = 0; - spin_lock_irq(&conf->device_lock); if (conf->pending_bio_list.head) { struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(conf->mddev->queue); spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to * disk before proceeding w/ I/O */ @@ -617,10 +547,8 @@ static int flush_pending_writes(conf_t *conf) generic_make_request(bio); bio = next; } - rv = 1; } else spin_unlock_irq(&conf->device_lock); - return rv; } /* Barriers.... @@ -652,17 +580,15 @@ static void raise_barrier(conf_t *conf) /* Wait until no block IO is waiting */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, - conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + conf->resync_lock, ); /* block any new IO from starting */ conf->barrier++; - /* No wait for all pending IO to complete */ + /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + conf->resync_lock, ); spin_unlock_irq(&conf->resync_lock); } @@ -684,7 +610,7 @@ static void wait_barrier(conf_t *conf) conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, !conf->barrier, conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + ); conf->nr_waiting--; } conf->nr_pending++; @@ -720,8 +646,7 @@ static void freeze_array(conf_t *conf) wait_event_lock_irq(conf->wait_barrier, conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - ({ flush_pending_writes(conf); - raid1_unplug(conf->mddev->queue); })); + flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(conf_t *conf) @@ -735,15 +660,16 @@ static void unfreeze_array(conf_t *conf) } -/* duplicate the data pages for behind I/O */ -static struct page **alloc_behind_pages(struct bio *bio) +/* duplicate the data pages for behind I/O + */ +static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) { int i; struct bio_vec *bvec; - struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), + struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), GFP_NOIO); if (unlikely(!pages)) - goto do_sync_io; + return; bio_for_each_segment(bvec, bio, i) { pages[i] = alloc_page(GFP_NOIO); @@ -754,16 +680,17 @@ static struct page **alloc_behind_pages(struct bio *bio) kunmap(pages[i]); kunmap(bvec->bv_page); } - - return pages; + r1_bio->behind_pages = pages; + r1_bio->behind_page_count = bio->bi_vcnt; + set_bit(R1BIO_BehindIO, &r1_bio->state); + return; do_sync_io: - if (pages) - for (i = 0; i < bio->bi_vcnt && pages[i]; i++) + for (i = 0; i < bio->bi_vcnt; i++) + if (pages[i]) put_page(pages[i]); kfree(pages); PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); - return NULL; } static int make_request(mddev_t *mddev, struct bio * bio) @@ -775,12 +702,11 @@ static int make_request(mddev_t *mddev, struct bio * bio) int i, targets = 0, disks; struct bitmap *bitmap; unsigned long flags; - struct bio_list bl; - struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); mdk_rdev_t *blocked_rdev; + int plugged; /* * Register the new request and wait if the reconstruction @@ -851,7 +777,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) } r1_bio->read_disk = rdisk; - read_bio = bio_clone(bio, GFP_NOIO); + read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); r1_bio->bios[rdisk] = read_bio; @@ -872,14 +798,9 @@ static int make_request(mddev_t *mddev, struct bio * bio) * inc refcount on their rdev. Record them by setting * bios[x] to bio */ + plugged = mddev_check_plugged(mddev); + disks = conf->raid_disks; -#if 0 - { static int first=1; - if (first) printk("First Write sector %llu disks %d\n", - (unsigned long long)r1_bio->sector, disks); - first = 0; - } -#endif retry_write: blocked_rdev = NULL; rcu_read_lock(); @@ -933,20 +854,20 @@ static int make_request(mddev_t *mddev, struct bio * bio) if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait) && - (behind_pages = alloc_behind_pages(bio)) != NULL) - set_bit(R1BIO_BehindIO, &r1_bio->state); + !waitqueue_active(&bitmap->behind_wait)) + alloc_behind_pages(bio, r1_bio); - atomic_set(&r1_bio->remaining, 0); + atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); - bio_list_init(&bl); + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); for (i = 0; i < disks; i++) { struct bio *mbio; if (!r1_bio->bios[i]) continue; - mbio = bio_clone(bio, GFP_NOIO); + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); r1_bio->bios[i] = mbio; mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; @@ -955,7 +876,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) mbio->bi_rw = WRITE | do_flush_fua | do_sync; mbio->bi_private = r1_bio; - if (behind_pages) { + if (r1_bio->behind_pages) { struct bio_vec *bvec; int j; @@ -963,39 +884,27 @@ static int make_request(mddev_t *mddev, struct bio * bio) * we clear any unused pointer in the io_vec, rather * than leave them unchanged. This is important * because when we come to free the pages, we won't - * know the originial bi_idx, so we just free + * know the original bi_idx, so we just free * them all */ __bio_for_each_segment(bvec, mbio, j, 0) - bvec->bv_page = behind_pages[j]; + bvec->bv_page = r1_bio->behind_pages[j]; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } atomic_inc(&r1_bio->remaining); - - bio_list_add(&bl, mbio); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + spin_unlock_irqrestore(&conf->device_lock, flags); } - kfree(behind_pages); /* the behind pages are attached to the bios now */ + r1_bio_write_done(r1_bio); - bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, - test_bit(R1BIO_BehindIO, &r1_bio->state)); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_merge(&conf->pending_bio_list, &bl); - bio_list_init(&bl); - - blk_plug_device(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - - /* In case raid1d snuck into freeze_array */ + /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - if (do_sync) + if (do_sync || !bitmap || !plugged) md_wakeup_thread(mddev->thread); -#if 0 - while ((bio = bio_list_pop(&bl)) != NULL) - generic_make_request(bio); -#endif return 0; } @@ -1053,8 +962,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid1:%s: Disk failure on %s, disabling device.\n" + "md/raid1:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } @@ -1183,10 +1093,11 @@ static int raid1_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } - /* Only remove non-faulty devices is recovery + /* Only remove non-faulty devices if recovery * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && + !mddev->recovery_disabled && mddev->degraded < conf->raid_disks) { err = -EBUSY; goto abort; @@ -1199,7 +1110,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) p->rdev = rdev; goto abort; } - md_integrity_register(mddev); + err = md_integrity_register(mddev); } abort: @@ -1245,7 +1156,7 @@ static void end_sync_write(struct bio *bio, int error) break; } if (!uptodate) { - int sync_blocks = 0; + sector_t sync_blocks = 0; sector_t s = r1_bio->sector; long sectors_to_go = r1_bio->sectors; /* make sure these bits doesn't get cleared. */ @@ -1267,194 +1178,210 @@ static void end_sync_write(struct bio *bio, int error) } } -static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) +static int fix_sync_read_error(r1bio_t *r1_bio) { + /* Try some synchronous reads of other devices to get + * good data, much like with normal read errors. Only + * read into the pages we already have so we don't + * need to re-issue the read request. + * We don't need to freeze the array, because being in an + * active sync request, there is no normal IO, and + * no overlapping syncs. + */ + mddev_t *mddev = r1_bio->mddev; conf_t *conf = mddev->private; - int i; - int disks = conf->raid_disks; - struct bio *bio, *wbio; - - bio = r1_bio->bios[r1_bio->read_disk]; + struct bio *bio = r1_bio->bios[r1_bio->read_disk]; + sector_t sect = r1_bio->sector; + int sectors = r1_bio->sectors; + int idx = 0; + while(sectors) { + int s = sectors; + int d = r1_bio->read_disk; + int success = 0; + mdk_rdev_t *rdev; + int start; - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* We have read all readable devices. If we haven't - * got the block, then there is no hope left. - * If we have, then we want to do a comparison - * and skip the write if everything is the same. - * If any blocks failed to read, then we need to - * attempt an over-write - */ - int primary; - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { - for (i=0; i<mddev->raid_disks; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_read) - md_error(mddev, conf->mirrors[i].rdev); + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + do { + if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + READ, false)) { + success = 1; + break; + } + } + d++; + if (d == conf->raid_disks) + d = 0; + } while (!success && d != r1_bio->read_disk); - md_done_sync(mddev, r1_bio->sectors, 1); + if (!success) { + char b[BDEVNAME_SIZE]; + /* Cannot read from anywhere, array is toast */ + md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" + " for block %llu\n", + mdname(mddev), + bdevname(bio->bi_bdev, b), + (unsigned long long)r1_bio->sector); + md_done_sync(mddev, r1_bio->sectors, 0); put_buf(r1_bio); - return; + return 0; } - for (primary=0; primary<mddev->raid_disks; primary++) - if (r1_bio->bios[primary]->bi_end_io == end_sync_read && - test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { - r1_bio->bios[primary]->bi_end_io = NULL; - rdev_dec_pending(conf->mirrors[primary].rdev, mddev); - break; - } - r1_bio->read_disk = primary; - for (i=0; i<mddev->raid_disks; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_read) { - int j; - int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); - struct bio *pbio = r1_bio->bios[primary]; - struct bio *sbio = r1_bio->bios[i]; - - if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { - for (j = vcnt; j-- ; ) { - struct page *p, *s; - p = pbio->bi_io_vec[j].bv_page; - s = sbio->bi_io_vec[j].bv_page; - if (memcmp(page_address(p), - page_address(s), - PAGE_SIZE)) - break; - } - } else - j = 0; - if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; - if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { - sbio->bi_end_io = NULL; - rdev_dec_pending(conf->mirrors[i].rdev, mddev); - } else { - /* fixup the bio for reuse */ - int size; - sbio->bi_vcnt = vcnt; - sbio->bi_size = r1_bio->sectors << 9; - sbio->bi_idx = 0; - sbio->bi_phys_segments = 0; - sbio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bi_next = NULL; - sbio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - sbio->bi_bdev = conf->mirrors[i].rdev->bdev; - size = sbio->bi_size; - for (j = 0; j < vcnt ; j++) { - struct bio_vec *bi; - bi = &sbio->bi_io_vec[j]; - bi->bv_offset = 0; - if (size > PAGE_SIZE) - bi->bv_len = PAGE_SIZE; - else - bi->bv_len = size; - size -= PAGE_SIZE; - memcpy(page_address(bi->bv_page), - page_address(pbio->bi_io_vec[j].bv_page), - PAGE_SIZE); - } - } - } + start = d; + /* write it back and re-read */ + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + WRITE, false) == 0) { + r1_bio->bios[d]->bi_end_io = NULL; + rdev_dec_pending(rdev, mddev); + md_error(mddev, rdev); + } else + atomic_add(s, &rdev->corrected_errors); + } + d = start; + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + READ, false) == 0) + md_error(mddev, rdev); + } + sectors -= s; + sect += s; + idx ++; } - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* ouch - failed to read all of that. - * Try some synchronous reads of other devices to get - * good data, much like with normal read errors. Only - * read into the pages we already have so we don't - * need to re-issue the read request. - * We don't need to freeze the array, because being in an - * active sync request, there is no normal IO, and - * no overlapping syncs. - */ - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - int idx = 0; - - while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - mdk_rdev_t *rdev; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - do { - if (r1_bio->bios[d]->bi_end_io == end_sync_read) { - /* No rcu protection needed here devices - * can only be removed when no resync is - * active, and resync is currently active - */ - rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - bio->bi_io_vec[idx].bv_page, - READ)) { - success = 1; - break; - } - } - d++; - if (d == conf->raid_disks) - d = 0; - } while (!success && d != r1_bio->read_disk); - - if (success) { - int start = d; - /* write it back and re-read */ - set_bit(R1BIO_Uptodate, &r1_bio->state); - while (d != r1_bio->read_disk) { - if (d == 0) - d = conf->raid_disks; - d--; - if (r1_bio->bios[d]->bi_end_io != end_sync_read) - continue; - rdev = conf->mirrors[d].rdev; - atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - bio->bi_io_vec[idx].bv_page, - WRITE) == 0) - md_error(mddev, rdev); - } - d = start; - while (d != r1_bio->read_disk) { - if (d == 0) - d = conf->raid_disks; - d--; - if (r1_bio->bios[d]->bi_end_io != end_sync_read) - continue; - rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - bio->bi_io_vec[idx].bv_page, - READ) == 0) - md_error(mddev, rdev); - } - } else { - char b[BDEVNAME_SIZE]; - /* Cannot read from anywhere, array is toast */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" - " for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev, b), - (unsigned long long)r1_bio->sector); - md_done_sync(mddev, r1_bio->sectors, 0); - put_buf(r1_bio); - return; + set_bit(R1BIO_Uptodate, &r1_bio->state); + set_bit(BIO_UPTODATE, &bio->bi_flags); + return 1; +} + +static int process_checks(r1bio_t *r1_bio) +{ + /* We have read all readable devices. If we haven't + * got the block, then there is no hope left. + * If we have, then we want to do a comparison + * and skip the write if everything is the same. + * If any blocks failed to read, then we need to + * attempt an over-write + */ + mddev_t *mddev = r1_bio->mddev; + conf_t *conf = mddev->private; + int primary; + int i; + + for (primary = 0; primary < conf->raid_disks; primary++) + if (r1_bio->bios[primary]->bi_end_io == end_sync_read && + test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { + r1_bio->bios[primary]->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[primary].rdev, mddev); + break; + } + r1_bio->read_disk = primary; + for (i = 0; i < conf->raid_disks; i++) { + int j; + int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); + struct bio *pbio = r1_bio->bios[primary]; + struct bio *sbio = r1_bio->bios[i]; + int size; + + if (r1_bio->bios[i]->bi_end_io != end_sync_read) + continue; + + if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { + for (j = vcnt; j-- ; ) { + struct page *p, *s; + p = pbio->bi_io_vec[j].bv_page; + s = sbio->bi_io_vec[j].bv_page; + if (memcmp(page_address(p), + page_address(s), + PAGE_SIZE)) + break; } - sectors -= s; - sect += s; - idx ++; + } else + j = 0; + if (j >= 0) + mddev->resync_mismatches += r1_bio->sectors; + if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) + && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { + /* No need to write to this device. */ + sbio->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[i].rdev, mddev); + continue; + } + /* fixup the bio for reuse */ + sbio->bi_vcnt = vcnt; + sbio->bi_size = r1_bio->sectors << 9; + sbio->bi_idx = 0; + sbio->bi_phys_segments = 0; + sbio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bi_next = NULL; + sbio->bi_sector = r1_bio->sector + + conf->mirrors[i].rdev->data_offset; + sbio->bi_bdev = conf->mirrors[i].rdev->bdev; + size = sbio->bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &sbio->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; + memcpy(page_address(bi->bv_page), + page_address(pbio->bi_io_vec[j].bv_page), + PAGE_SIZE); } } + return 0; +} +static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) +{ + conf_t *conf = mddev->private; + int i; + int disks = conf->raid_disks; + struct bio *bio, *wbio; + + bio = r1_bio->bios[r1_bio->read_disk]; + + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) + /* ouch - failed to read all of that. */ + if (!fix_sync_read_error(r1_bio)) + return; + + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (process_checks(r1_bio) < 0) + return; /* * schedule writes */ @@ -1513,10 +1440,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) + sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false)) success = 1; else { d++; @@ -1539,9 +1464,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, WRITE, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); @@ -1556,9 +1480,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); @@ -1586,15 +1509,17 @@ static void raid1d(mddev_t *mddev) unsigned long flags; conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; - int unplug=0; mdk_rdev_t *rdev; + struct blk_plug plug; md_check_recovery(mddev); - + + blk_start_plug(&plug); for (;;) { char b[BDEVNAME_SIZE]; - unplug += flush_pending_writes(conf); + if (atomic_read(&mddev->plug_cnt) == 0) + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { @@ -1608,10 +1533,9 @@ static void raid1d(mddev_t *mddev) mddev = r1_bio->mddev; conf = mddev->private; - if (test_bit(R1BIO_IsSync, &r1_bio->state)) { + if (test_bit(R1BIO_IsSync, &r1_bio->state)) sync_request_write(mddev, r1_bio); - unplug = 1; - } else { + else { int disk; /* we got a read error. Maybe the drive is bad. Maybe just @@ -1646,7 +1570,8 @@ static void raid1d(mddev_t *mddev) mddev->ro ? IO_BLOCKED : NULL; r1_bio->read_disk = disk; bio_put(bio); - bio = bio_clone(r1_bio->master_bio, GFP_NOIO); + bio = bio_clone_mddev(r1_bio->master_bio, + GFP_NOIO, mddev); r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; if (printk_ratelimit()) @@ -1660,14 +1585,12 @@ static void raid1d(mddev_t *mddev) bio->bi_end_io = raid1_end_read_request; bio->bi_rw = READ | do_sync; bio->bi_private = r1_bio; - unplug = 1; generic_make_request(bio); } } cond_resched(); } - if (unplug) - unplug_slaves(mddev); + blk_finish_plug(&plug); } @@ -1705,7 +1628,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i int i; int wonly = -1; int write_targets = 0, read_targets = 0; - int sync_blocks; + sector_t sync_blocks; int still_degraded = 0; if (!conf->r1buf_pool) @@ -1755,11 +1678,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i msleep_interruptible(1000); bitmap_cond_end_sync(mddev->bitmap, sector_nr); + r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); raise_barrier(conf); conf->next_resync = sector_nr; - r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); rcu_read_lock(); /* * If we get a correctably read error during resync or recovery, @@ -1971,7 +1894,6 @@ static conf_t *setup_conf(mddev_t *mddev) init_waitqueue_head(&conf->wait_barrier); bio_list_init(&conf->pending_bio_list); - bio_list_init(&conf->flushing_bio_list); conf->last_used = -1; for (i = 0; i < conf->raid_disks; i++) { @@ -2049,7 +1971,6 @@ static int run(mddev_t *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - mddev->queue->queue_lock = &conf->device_lock; list_for_each_entry(rdev, &mddev->disks, same_set) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); @@ -2092,11 +2013,9 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); - mddev->queue->unplug_fn = raid1_unplug; mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; - md_integrity_register(mddev); - return 0; + return md_integrity_register(mddev); } static int stop(mddev_t *mddev) @@ -2118,7 +2037,6 @@ static int stop(mddev_t *mddev) md_unregister_thread(mddev->thread); mddev->thread = NULL; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); @@ -2143,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && - mddev->recovery_cp == MaxSector) { + mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } |