summaryrefslogtreecommitdiffstats
path: root/drivers/md/raid1.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--drivers/md/raid1.c744
1 files changed, 331 insertions, 413 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 378a25894c5..5d096096f95 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,23 +52,16 @@
#define NR_RAID1_BIOS 256
-static void unplug_slaves(mddev_t *mddev);
-
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
- r1bio_t *r1_bio;
int size = offsetof(r1bio_t, bios[pi->raid_disks]);
/* allocate a r1bio with room for raid_disks entries in the bios array */
- r1_bio = kzalloc(size, gfp_flags);
- if (!r1_bio && pi->mddev)
- unplug_slaves(pi->mddev);
-
- return r1_bio;
+ return kzalloc(size, gfp_flags);
}
static void r1bio_pool_free(void *r1_bio, void *data)
@@ -91,16 +84,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
int i, j;
r1_bio = r1bio_pool_alloc(gfp_flags, pi);
- if (!r1_bio) {
- unplug_slaves(pi->mddev);
+ if (!r1_bio)
return NULL;
- }
/*
* Allocate bios : 1 for reading, n-1 for writing
*/
for (j = pi->raid_disks ; j-- ; ) {
- bio = bio_alloc(gfp_flags, RESYNC_PAGES);
+ bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
if (!bio)
goto out_free_bio;
r1_bio->bios[j] = bio;
@@ -306,6 +297,29 @@ static void raid1_end_read_request(struct bio *bio, int error)
rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
}
+static void r1_bio_write_done(r1bio_t *r1_bio)
+{
+ if (atomic_dec_and_test(&r1_bio->remaining))
+ {
+ /* it really is the end of this request */
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ /* free extra copy of the data pages */
+ int i = r1_bio->behind_page_count;
+ while (i--)
+ safe_put_page(r1_bio->behind_pages[i]);
+ kfree(r1_bio->behind_pages);
+ r1_bio->behind_pages = NULL;
+ }
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ !test_bit(R1BIO_Degraded, &r1_bio->state),
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ md_write_end(r1_bio->mddev);
+ raid_end_bio_io(r1_bio);
+ }
+}
+
static void raid1_end_write_request(struct bio *bio, int error)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -373,21 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
* Let's see if all mirrored write operations have finished
* already.
*/
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- /* free extra copy of the data pages */
- int i = bio->bi_vcnt;
- while (i--)
- safe_put_page(bio->bi_io_vec[i].bv_page);
- }
- /* clear the bitmap if all writes complete successfully */
- bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
- r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state),
- behind);
- md_write_end(r1_bio->mddev);
- raid_end_bio_io(r1_bio);
- }
+ r1_bio_write_done(r1_bio);
if (to_put)
bio_put(to_put);
@@ -411,11 +411,13 @@ static void raid1_end_write_request(struct bio *bio, int error)
static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
const sector_t this_sector = r1_bio->sector;
- int new_disk = conf->last_used, disk = new_disk;
- int wonly_disk = -1;
const int sectors = r1_bio->sectors;
- sector_t new_distance, current_distance;
+ int start_disk;
+ int best_disk;
+ int i;
+ sector_t best_dist;
mdk_rdev_t *rdev;
+ int choose_first;
rcu_read_lock();
/*
@@ -424,100 +426,63 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
* We take the first readable disk when above the resync window.
*/
retry:
+ best_disk = -1;
+ best_dist = MaxSector;
if (conf->mddev->recovery_cp < MaxSector &&
(this_sector + sectors >= conf->next_resync)) {
- /* Choose the first operational device, for consistancy */
- new_disk = 0;
-
- for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
- r1_bio->bios[new_disk] == IO_BLOCKED ||
- !rdev || !test_bit(In_sync, &rdev->flags)
- || test_bit(WriteMostly, &rdev->flags);
- rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
-
- if (rdev && test_bit(In_sync, &rdev->flags) &&
- r1_bio->bios[new_disk] != IO_BLOCKED)
- wonly_disk = new_disk;
-
- if (new_disk == conf->raid_disks - 1) {
- new_disk = wonly_disk;
- break;
- }
- }
- goto rb_out;
- }
-
-
- /* make sure the disk is operational */
- for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
- r1_bio->bios[new_disk] == IO_BLOCKED ||
- !rdev || !test_bit(In_sync, &rdev->flags) ||
- test_bit(WriteMostly, &rdev->flags);
- rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
-
- if (rdev && test_bit(In_sync, &rdev->flags) &&
- r1_bio->bios[new_disk] != IO_BLOCKED)
- wonly_disk = new_disk;
-
- if (new_disk <= 0)
- new_disk = conf->raid_disks;
- new_disk--;
- if (new_disk == disk) {
- new_disk = wonly_disk;
- break;
- }
+ choose_first = 1;
+ start_disk = 0;
+ } else {
+ choose_first = 0;
+ start_disk = conf->last_used;
}
- if (new_disk < 0)
- goto rb_out;
-
- disk = new_disk;
- /* now disk == new_disk == starting point for search */
-
- /*
- * Don't change to another disk for sequential reads:
- */
- if (conf->next_seq_sect == this_sector)
- goto rb_out;
- if (this_sector == conf->mirrors[new_disk].head_position)
- goto rb_out;
-
- current_distance = abs(this_sector - conf->mirrors[disk].head_position);
-
- /* Find the disk whose head is closest */
-
- do {
- if (disk <= 0)
- disk = conf->raid_disks;
- disk--;
+ for (i = 0 ; i < conf->raid_disks ; i++) {
+ sector_t dist;
+ int disk = start_disk + i;
+ if (disk >= conf->raid_disks)
+ disk -= conf->raid_disks;
rdev = rcu_dereference(conf->mirrors[disk].rdev);
-
- if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
- !test_bit(In_sync, &rdev->flags) ||
- test_bit(WriteMostly, &rdev->flags))
+ if (r1_bio->bios[disk] == IO_BLOCKED
+ || rdev == NULL
+ || test_bit(Faulty, &rdev->flags))
continue;
-
- if (!atomic_read(&rdev->nr_pending)) {
- new_disk = disk;
+ if (!test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < this_sector + sectors)
+ continue;
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ /* Don't balance among write-mostly, just
+ * use the first as a last resort */
+ if (best_disk < 0)
+ best_disk = disk;
+ continue;
+ }
+ /* This is a reasonable device to use. It might
+ * even be best.
+ */
+ dist = abs(this_sector - conf->mirrors[disk].head_position);
+ if (choose_first
+ /* Don't change to another disk for sequential reads */
+ || conf->next_seq_sect == this_sector
+ || dist == 0
+ /* If device is idle, use it */
+ || atomic_read(&rdev->nr_pending) == 0) {
+ best_disk = disk;
break;
}
- new_distance = abs(this_sector - conf->mirrors[disk].head_position);
- if (new_distance < current_distance) {
- current_distance = new_distance;
- new_disk = disk;
+ if (dist < best_dist) {
+ best_dist = dist;
+ best_disk = disk;
}
- } while (disk != conf->last_used);
-
- rb_out:
-
+ }
- if (new_disk >= 0) {
- rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+ if (best_disk >= 0) {
+ rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
if (!rdev)
goto retry;
atomic_inc(&rdev->nr_pending);
- if (!test_bit(In_sync, &rdev->flags)) {
+ if (test_bit(Faulty, &rdev->flags)) {
/* cannot risk returning a device that failed
* before we inc'ed nr_pending
*/
@@ -525,42 +490,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
goto retry;
}
conf->next_seq_sect = this_sector + sectors;
- conf->last_used = new_disk;
- }
- rcu_read_unlock();
-
- return new_disk;
-}
-
-static void unplug_slaves(mddev_t *mddev)
-{
- conf_t *conf = mddev->private;
- int i;
-
- rcu_read_lock();
- for (i=0; i<mddev->raid_disks; i++) {
- mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
- struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
-
- blk_unplug(r_queue);
-
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- }
+ conf->last_used = best_disk;
}
rcu_read_unlock();
-}
-static void raid1_unplug(struct request_queue *q)
-{
- mddev_t *mddev = q->queuedata;
-
- unplug_slaves(mddev);
- md_wakeup_thread(mddev->thread);
+ return best_disk;
}
static int raid1_congested(void *data, int bits)
@@ -592,20 +526,16 @@ static int raid1_congested(void *data, int bits)
}
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
{
/* Any writes that have been queued but are awaiting
* bitmap updates get flushed here.
- * We return 1 if any requests were actually submitted.
*/
- int rv = 0;
-
spin_lock_irq(&conf->device_lock);
if (conf->pending_bio_list.head) {
struct bio *bio;
bio = bio_list_get(&conf->pending_bio_list);
- blk_remove_plug(conf->mddev->queue);
spin_unlock_irq(&conf->device_lock);
/* flush any pending bitmap writes to
* disk before proceeding w/ I/O */
@@ -617,10 +547,8 @@ static int flush_pending_writes(conf_t *conf)
generic_make_request(bio);
bio = next;
}
- rv = 1;
} else
spin_unlock_irq(&conf->device_lock);
- return rv;
}
/* Barriers....
@@ -652,17 +580,15 @@ static void raise_barrier(conf_t *conf)
/* Wait until no block IO is waiting */
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
- conf->resync_lock,
- raid1_unplug(conf->mddev->queue));
+ conf->resync_lock, );
/* block any new IO from starting */
conf->barrier++;
- /* No wait for all pending IO to complete */
+ /* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
- conf->resync_lock,
- raid1_unplug(conf->mddev->queue));
+ conf->resync_lock, );
spin_unlock_irq(&conf->resync_lock);
}
@@ -684,7 +610,7 @@ static void wait_barrier(conf_t *conf)
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
conf->resync_lock,
- raid1_unplug(conf->mddev->queue));
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -720,8 +646,7 @@ static void freeze_array(conf_t *conf)
wait_event_lock_irq(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
- ({ flush_pending_writes(conf);
- raid1_unplug(conf->mddev->queue); }));
+ flush_pending_writes(conf));
spin_unlock_irq(&conf->resync_lock);
}
static void unfreeze_array(conf_t *conf)
@@ -735,15 +660,16 @@ static void unfreeze_array(conf_t *conf)
}
-/* duplicate the data pages for behind I/O */
-static struct page **alloc_behind_pages(struct bio *bio)
+/* duplicate the data pages for behind I/O
+ */
+static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
{
int i;
struct bio_vec *bvec;
- struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
+ struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
GFP_NOIO);
if (unlikely(!pages))
- goto do_sync_io;
+ return;
bio_for_each_segment(bvec, bio, i) {
pages[i] = alloc_page(GFP_NOIO);
@@ -754,16 +680,17 @@ static struct page **alloc_behind_pages(struct bio *bio)
kunmap(pages[i]);
kunmap(bvec->bv_page);
}
-
- return pages;
+ r1_bio->behind_pages = pages;
+ r1_bio->behind_page_count = bio->bi_vcnt;
+ set_bit(R1BIO_BehindIO, &r1_bio->state);
+ return;
do_sync_io:
- if (pages)
- for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
+ for (i = 0; i < bio->bi_vcnt; i++)
+ if (pages[i])
put_page(pages[i]);
kfree(pages);
PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
- return NULL;
}
static int make_request(mddev_t *mddev, struct bio * bio)
@@ -775,12 +702,11 @@ static int make_request(mddev_t *mddev, struct bio * bio)
int i, targets = 0, disks;
struct bitmap *bitmap;
unsigned long flags;
- struct bio_list bl;
- struct page **behind_pages = NULL;
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
mdk_rdev_t *blocked_rdev;
+ int plugged;
/*
* Register the new request and wait if the reconstruction
@@ -851,7 +777,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
}
r1_bio->read_disk = rdisk;
- read_bio = bio_clone(bio, GFP_NOIO);
+ read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
r1_bio->bios[rdisk] = read_bio;
@@ -872,14 +798,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
*/
+ plugged = mddev_check_plugged(mddev);
+
disks = conf->raid_disks;
-#if 0
- { static int first=1;
- if (first) printk("First Write sector %llu disks %d\n",
- (unsigned long long)r1_bio->sector, disks);
- first = 0;
- }
-#endif
retry_write:
blocked_rdev = NULL;
rcu_read_lock();
@@ -933,20 +854,20 @@ static int make_request(mddev_t *mddev, struct bio * bio)
if (bitmap &&
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
- !waitqueue_active(&bitmap->behind_wait) &&
- (behind_pages = alloc_behind_pages(bio)) != NULL)
- set_bit(R1BIO_BehindIO, &r1_bio->state);
+ !waitqueue_active(&bitmap->behind_wait))
+ alloc_behind_pages(bio, r1_bio);
- atomic_set(&r1_bio->remaining, 0);
+ atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
- bio_list_init(&bl);
+ bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
for (i = 0; i < disks; i++) {
struct bio *mbio;
if (!r1_bio->bios[i])
continue;
- mbio = bio_clone(bio, GFP_NOIO);
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
r1_bio->bios[i] = mbio;
mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
@@ -955,7 +876,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
mbio->bi_rw = WRITE | do_flush_fua | do_sync;
mbio->bi_private = r1_bio;
- if (behind_pages) {
+ if (r1_bio->behind_pages) {
struct bio_vec *bvec;
int j;
@@ -963,39 +884,27 @@ static int make_request(mddev_t *mddev, struct bio * bio)
* we clear any unused pointer in the io_vec, rather
* than leave them unchanged. This is important
* because when we come to free the pages, we won't
- * know the originial bi_idx, so we just free
+ * know the original bi_idx, so we just free
* them all
*/
__bio_for_each_segment(bvec, mbio, j, 0)
- bvec->bv_page = behind_pages[j];
+ bvec->bv_page = r1_bio->behind_pages[j];
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
atomic_inc(&r1_bio->remaining);
-
- bio_list_add(&bl, mbio);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_add(&conf->pending_bio_list, mbio);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
- kfree(behind_pages); /* the behind pages are attached to the bios now */
+ r1_bio_write_done(r1_bio);
- bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
- test_bit(R1BIO_BehindIO, &r1_bio->state));
- spin_lock_irqsave(&conf->device_lock, flags);
- bio_list_merge(&conf->pending_bio_list, &bl);
- bio_list_init(&bl);
-
- blk_plug_device(mddev->queue);
- spin_unlock_irqrestore(&conf->device_lock, flags);
-
- /* In case raid1d snuck into freeze_array */
+ /* In case raid1d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
- if (do_sync)
+ if (do_sync || !bitmap || !plugged)
md_wakeup_thread(mddev->thread);
-#if 0
- while ((bio = bio_list_pop(&bl)) != NULL)
- generic_make_request(bio);
-#endif
return 0;
}
@@ -1053,8 +962,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
} else
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n"
- KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n",
+ printk(KERN_ALERT
+ "md/raid1:%s: Disk failure on %s, disabling device.\n"
+ "md/raid1:%s: Operation continuing on %d devices.\n",
mdname(mddev), bdevname(rdev->bdev, b),
mdname(mddev), conf->raid_disks - mddev->degraded);
}
@@ -1183,10 +1093,11 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
err = -EBUSY;
goto abort;
}
- /* Only remove non-faulty devices is recovery
+ /* Only remove non-faulty devices if recovery
* is not possible.
*/
if (!test_bit(Faulty, &rdev->flags) &&
+ !mddev->recovery_disabled &&
mddev->degraded < conf->raid_disks) {
err = -EBUSY;
goto abort;
@@ -1199,7 +1110,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
p->rdev = rdev;
goto abort;
}
- md_integrity_register(mddev);
+ err = md_integrity_register(mddev);
}
abort:
@@ -1245,7 +1156,7 @@ static void end_sync_write(struct bio *bio, int error)
break;
}
if (!uptodate) {
- int sync_blocks = 0;
+ sector_t sync_blocks = 0;
sector_t s = r1_bio->sector;
long sectors_to_go = r1_bio->sectors;
/* make sure these bits doesn't get cleared. */
@@ -1267,194 +1178,210 @@ static void end_sync_write(struct bio *bio, int error)
}
}
-static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+static int fix_sync_read_error(r1bio_t *r1_bio)
{
+ /* Try some synchronous reads of other devices to get
+ * good data, much like with normal read errors. Only
+ * read into the pages we already have so we don't
+ * need to re-issue the read request.
+ * We don't need to freeze the array, because being in an
+ * active sync request, there is no normal IO, and
+ * no overlapping syncs.
+ */
+ mddev_t *mddev = r1_bio->mddev;
conf_t *conf = mddev->private;
- int i;
- int disks = conf->raid_disks;
- struct bio *bio, *wbio;
-
- bio = r1_bio->bios[r1_bio->read_disk];
+ struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+ sector_t sect = r1_bio->sector;
+ int sectors = r1_bio->sectors;
+ int idx = 0;
+ while(sectors) {
+ int s = sectors;
+ int d = r1_bio->read_disk;
+ int success = 0;
+ mdk_rdev_t *rdev;
+ int start;
- if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
- /* We have read all readable devices. If we haven't
- * got the block, then there is no hope left.
- * If we have, then we want to do a comparison
- * and skip the write if everything is the same.
- * If any blocks failed to read, then we need to
- * attempt an over-write
- */
- int primary;
- if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
- for (i=0; i<mddev->raid_disks; i++)
- if (r1_bio->bios[i]->bi_end_io == end_sync_read)
- md_error(mddev, conf->mirrors[i].rdev);
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+ do {
+ if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
+ /* No rcu protection needed here devices
+ * can only be removed when no resync is
+ * active, and resync is currently active
+ */
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false)) {
+ success = 1;
+ break;
+ }
+ }
+ d++;
+ if (d == conf->raid_disks)
+ d = 0;
+ } while (!success && d != r1_bio->read_disk);
- md_done_sync(mddev, r1_bio->sectors, 1);
+ if (!success) {
+ char b[BDEVNAME_SIZE];
+ /* Cannot read from anywhere, array is toast */
+ md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
+ " for block %llu\n",
+ mdname(mddev),
+ bdevname(bio->bi_bdev, b),
+ (unsigned long long)r1_bio->sector);
+ md_done_sync(mddev, r1_bio->sectors, 0);
put_buf(r1_bio);
- return;
+ return 0;
}
- for (primary=0; primary<mddev->raid_disks; primary++)
- if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
- test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
- r1_bio->bios[primary]->bi_end_io = NULL;
- rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
- break;
- }
- r1_bio->read_disk = primary;
- for (i=0; i<mddev->raid_disks; i++)
- if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
- int j;
- int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
- struct bio *pbio = r1_bio->bios[primary];
- struct bio *sbio = r1_bio->bios[i];
-
- if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
- for (j = vcnt; j-- ; ) {
- struct page *p, *s;
- p = pbio->bi_io_vec[j].bv_page;
- s = sbio->bi_io_vec[j].bv_page;
- if (memcmp(page_address(p),
- page_address(s),
- PAGE_SIZE))
- break;
- }
- } else
- j = 0;
- if (j >= 0)
- mddev->resync_mismatches += r1_bio->sectors;
- if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
- && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
- sbio->bi_end_io = NULL;
- rdev_dec_pending(conf->mirrors[i].rdev, mddev);
- } else {
- /* fixup the bio for reuse */
- int size;
- sbio->bi_vcnt = vcnt;
- sbio->bi_size = r1_bio->sectors << 9;
- sbio->bi_idx = 0;
- sbio->bi_phys_segments = 0;
- sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
- sbio->bi_flags |= 1 << BIO_UPTODATE;
- sbio->bi_next = NULL;
- sbio->bi_sector = r1_bio->sector +
- conf->mirrors[i].rdev->data_offset;
- sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
- size = sbio->bi_size;
- for (j = 0; j < vcnt ; j++) {
- struct bio_vec *bi;
- bi = &sbio->bi_io_vec[j];
- bi->bv_offset = 0;
- if (size > PAGE_SIZE)
- bi->bv_len = PAGE_SIZE;
- else
- bi->bv_len = size;
- size -= PAGE_SIZE;
- memcpy(page_address(bi->bv_page),
- page_address(pbio->bi_io_vec[j].bv_page),
- PAGE_SIZE);
- }
- }
- }
+ start = d;
+ /* write it back and re-read */
+ while (d != r1_bio->read_disk) {
+ if (d == 0)
+ d = conf->raid_disks;
+ d--;
+ if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+ continue;
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ WRITE, false) == 0) {
+ r1_bio->bios[d]->bi_end_io = NULL;
+ rdev_dec_pending(rdev, mddev);
+ md_error(mddev, rdev);
+ } else
+ atomic_add(s, &rdev->corrected_errors);
+ }
+ d = start;
+ while (d != r1_bio->read_disk) {
+ if (d == 0)
+ d = conf->raid_disks;
+ d--;
+ if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+ continue;
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false) == 0)
+ md_error(mddev, rdev);
+ }
+ sectors -= s;
+ sect += s;
+ idx ++;
}
- if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
- /* ouch - failed to read all of that.
- * Try some synchronous reads of other devices to get
- * good data, much like with normal read errors. Only
- * read into the pages we already have so we don't
- * need to re-issue the read request.
- * We don't need to freeze the array, because being in an
- * active sync request, there is no normal IO, and
- * no overlapping syncs.
- */
- sector_t sect = r1_bio->sector;
- int sectors = r1_bio->sectors;
- int idx = 0;
-
- while(sectors) {
- int s = sectors;
- int d = r1_bio->read_disk;
- int success = 0;
- mdk_rdev_t *rdev;
-
- if (s > (PAGE_SIZE>>9))
- s = PAGE_SIZE >> 9;
- do {
- if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
- /* No rcu protection needed here devices
- * can only be removed when no resync is
- * active, and resync is currently active
- */
- rdev = conf->mirrors[d].rdev;
- if (sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9,
- bio->bi_io_vec[idx].bv_page,
- READ)) {
- success = 1;
- break;
- }
- }
- d++;
- if (d == conf->raid_disks)
- d = 0;
- } while (!success && d != r1_bio->read_disk);
-
- if (success) {
- int start = d;
- /* write it back and re-read */
- set_bit(R1BIO_Uptodate, &r1_bio->state);
- while (d != r1_bio->read_disk) {
- if (d == 0)
- d = conf->raid_disks;
- d--;
- if (r1_bio->bios[d]->bi_end_io != end_sync_read)
- continue;
- rdev = conf->mirrors[d].rdev;
- atomic_add(s, &rdev->corrected_errors);
- if (sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9,
- bio->bi_io_vec[idx].bv_page,
- WRITE) == 0)
- md_error(mddev, rdev);
- }
- d = start;
- while (d != r1_bio->read_disk) {
- if (d == 0)
- d = conf->raid_disks;
- d--;
- if (r1_bio->bios[d]->bi_end_io != end_sync_read)
- continue;
- rdev = conf->mirrors[d].rdev;
- if (sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9,
- bio->bi_io_vec[idx].bv_page,
- READ) == 0)
- md_error(mddev, rdev);
- }
- } else {
- char b[BDEVNAME_SIZE];
- /* Cannot read from anywhere, array is toast */
- md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
- printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
- " for block %llu\n",
- mdname(mddev),
- bdevname(bio->bi_bdev, b),
- (unsigned long long)r1_bio->sector);
- md_done_sync(mddev, r1_bio->sectors, 0);
- put_buf(r1_bio);
- return;
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ return 1;
+}
+
+static int process_checks(r1bio_t *r1_bio)
+{
+ /* We have read all readable devices. If we haven't
+ * got the block, then there is no hope left.
+ * If we have, then we want to do a comparison
+ * and skip the write if everything is the same.
+ * If any blocks failed to read, then we need to
+ * attempt an over-write
+ */
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+ int primary;
+ int i;
+
+ for (primary = 0; primary < conf->raid_disks; primary++)
+ if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+ test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+ r1_bio->bios[primary]->bi_end_io = NULL;
+ rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
+ break;
+ }
+ r1_bio->read_disk = primary;
+ for (i = 0; i < conf->raid_disks; i++) {
+ int j;
+ int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
+ struct bio *pbio = r1_bio->bios[primary];
+ struct bio *sbio = r1_bio->bios[i];
+ int size;
+
+ if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+ continue;
+
+ if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+ for (j = vcnt; j-- ; ) {
+ struct page *p, *s;
+ p = pbio->bi_io_vec[j].bv_page;
+ s = sbio->bi_io_vec[j].bv_page;
+ if (memcmp(page_address(p),
+ page_address(s),
+ PAGE_SIZE))
+ break;
}
- sectors -= s;
- sect += s;
- idx ++;
+ } else
+ j = 0;
+ if (j >= 0)
+ mddev->resync_mismatches += r1_bio->sectors;
+ if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
+ && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+ /* No need to write to this device. */
+ sbio->bi_end_io = NULL;
+ rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+ continue;
+ }
+ /* fixup the bio for reuse */
+ sbio->bi_vcnt = vcnt;
+ sbio->bi_size = r1_bio->sectors << 9;
+ sbio->bi_idx = 0;
+ sbio->bi_phys_segments = 0;
+ sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ sbio->bi_flags |= 1 << BIO_UPTODATE;
+ sbio->bi_next = NULL;
+ sbio->bi_sector = r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset;
+ sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ size = sbio->bi_size;
+ for (j = 0; j < vcnt ; j++) {
+ struct bio_vec *bi;
+ bi = &sbio->bi_io_vec[j];
+ bi->bv_offset = 0;
+ if (size > PAGE_SIZE)
+ bi->bv_len = PAGE_SIZE;
+ else
+ bi->bv_len = size;
+ size -= PAGE_SIZE;
+ memcpy(page_address(bi->bv_page),
+ page_address(pbio->bi_io_vec[j].bv_page),
+ PAGE_SIZE);
}
}
+ return 0;
+}
+static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+{
+ conf_t *conf = mddev->private;
+ int i;
+ int disks = conf->raid_disks;
+ struct bio *bio, *wbio;
+
+ bio = r1_bio->bios[r1_bio->read_disk];
+
+ if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+ /* ouch - failed to read all of that. */
+ if (!fix_sync_read_error(r1_bio))
+ return;
+
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ if (process_checks(r1_bio) < 0)
+ return;
/*
* schedule writes
*/
@@ -1513,10 +1440,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
- sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9,
- conf->tmppage, READ))
+ sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false))
success = 1;
else {
d++;
@@ -1539,9 +1464,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE)
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, WRITE, false)
== 0)
/* Well, this device is dead */
md_error(mddev, rdev);
@@ -1556,9 +1480,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, READ)
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false)
== 0)
/* Well, this device is dead */
md_error(mddev, rdev);
@@ -1586,15 +1509,17 @@ static void raid1d(mddev_t *mddev)
unsigned long flags;
conf_t *conf = mddev->private;
struct list_head *head = &conf->retry_list;
- int unplug=0;
mdk_rdev_t *rdev;
+ struct blk_plug plug;
md_check_recovery(mddev);
-
+
+ blk_start_plug(&plug);
for (;;) {
char b[BDEVNAME_SIZE];
- unplug += flush_pending_writes(conf);
+ if (atomic_read(&mddev->plug_cnt) == 0)
+ flush_pending_writes(conf);
spin_lock_irqsave(&conf->device_lock, flags);
if (list_empty(head)) {
@@ -1608,10 +1533,9 @@ static void raid1d(mddev_t *mddev)
mddev = r1_bio->mddev;
conf = mddev->private;
- if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
+ if (test_bit(R1BIO_IsSync, &r1_bio->state))
sync_request_write(mddev, r1_bio);
- unplug = 1;
- } else {
+ else {
int disk;
/* we got a read error. Maybe the drive is bad. Maybe just
@@ -1646,7 +1570,8 @@ static void raid1d(mddev_t *mddev)
mddev->ro ? IO_BLOCKED : NULL;
r1_bio->read_disk = disk;
bio_put(bio);
- bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
+ bio = bio_clone_mddev(r1_bio->master_bio,
+ GFP_NOIO, mddev);
r1_bio->bios[r1_bio->read_disk] = bio;
rdev = conf->mirrors[disk].rdev;
if (printk_ratelimit())
@@ -1660,14 +1585,12 @@ static void raid1d(mddev_t *mddev)
bio->bi_end_io = raid1_end_read_request;
bio->bi_rw = READ | do_sync;
bio->bi_private = r1_bio;
- unplug = 1;
generic_make_request(bio);
}
}
cond_resched();
}
- if (unplug)
- unplug_slaves(mddev);
+ blk_finish_plug(&plug);
}
@@ -1705,7 +1628,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
int i;
int wonly = -1;
int write_targets = 0, read_targets = 0;
- int sync_blocks;
+ sector_t sync_blocks;
int still_degraded = 0;
if (!conf->r1buf_pool)
@@ -1755,11 +1678,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
msleep_interruptible(1000);
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
raise_barrier(conf);
conf->next_resync = sector_nr;
- r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
rcu_read_lock();
/*
* If we get a correctably read error during resync or recovery,
@@ -1971,7 +1894,6 @@ static conf_t *setup_conf(mddev_t *mddev)
init_waitqueue_head(&conf->wait_barrier);
bio_list_init(&conf->pending_bio_list);
- bio_list_init(&conf->flushing_bio_list);
conf->last_used = -1;
for (i = 0; i < conf->raid_disks; i++) {
@@ -2049,7 +1971,6 @@ static int run(mddev_t *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
- mddev->queue->queue_lock = &conf->device_lock;
list_for_each_entry(rdev, &mddev->disks, same_set) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -2092,11 +2013,9 @@ static int run(mddev_t *mddev)
md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
- mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
- md_integrity_register(mddev);
- return 0;
+ return md_integrity_register(mddev);
}
static int stop(mddev_t *mddev)
@@ -2118,7 +2037,6 @@ static int stop(mddev_t *mddev)
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
@@ -2143,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
- mddev->recovery_cp == MaxSector) {
+ mddev->recovery_cp > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}