summaryrefslogtreecommitdiffstats
path: root/drivers/md/raid1.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--drivers/md/raid1.c240
1 files changed, 144 insertions, 96 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a93ca478142..3066c587b53 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,7 +52,7 @@ static mdk_personality_t raid1_personality;
static void unplug_slaves(mddev_t *mddev);
-static void * r1bio_pool_alloc(unsigned int __nocast gfp_flags, void *data)
+static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
r1bio_t *r1_bio;
@@ -79,7 +79,7 @@ static void r1bio_pool_free(void *r1_bio, void *data)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)
-static void * r1buf_pool_alloc(unsigned int __nocast gfp_flags, void *data)
+static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
struct page *page;
@@ -301,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
- int mirror, behind;
+ int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
conf_t *conf = mddev_to_conf(r1_bio->mddev);
if (bio->bi_size)
@@ -311,47 +311,54 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
if (r1_bio->bios[mirror] == bio)
break;
- /*
- * this branch is our 'one mirror IO has finished' event handler:
- */
- if (!uptodate) {
- md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
- /* an I/O failed, we can't clear the bitmap */
- set_bit(R1BIO_Degraded, &r1_bio->state);
- } else
+ if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
+ set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
+ set_bit(R1BIO_BarrierRetry, &r1_bio->state);
+ r1_bio->mddev->barriers_work = 0;
+ } else {
/*
- * Set R1BIO_Uptodate in our master bio, so that
- * we will return a good error code for to the higher
- * levels even if IO on some other mirrored buffer fails.
- *
- * The 'master' represents the composite IO operation to
- * user-side. So if something waits for IO, then it will
- * wait for the 'master' bio.
+ * this branch is our 'one mirror IO has finished' event handler:
*/
- set_bit(R1BIO_Uptodate, &r1_bio->state);
-
- update_head_pos(mirror, r1_bio);
-
- behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
- if (behind) {
- if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
- atomic_dec(&r1_bio->behind_remaining);
-
- /* In behind mode, we ACK the master bio once the I/O has safely
- * reached all non-writemostly disks. Setting the Returned bit
- * ensures that this gets done only once -- we don't ever want to
- * return -EIO here, instead we'll wait */
-
- if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
- test_bit(R1BIO_Uptodate, &r1_bio->state)) {
- /* Maybe we can return now */
- if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
- struct bio *mbio = r1_bio->master_bio;
- PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
- (unsigned long long) mbio->bi_sector,
- (unsigned long long) mbio->bi_sector +
- (mbio->bi_size >> 9) - 1);
- bio_endio(mbio, mbio->bi_size, 0);
+ r1_bio->bios[mirror] = NULL;
+ bio_put(bio);
+ if (!uptodate) {
+ md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+ /* an I/O failed, we can't clear the bitmap */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ } else
+ /*
+ * Set R1BIO_Uptodate in our master bio, so that
+ * we will return a good error code for to the higher
+ * levels even if IO on some other mirrored buffer fails.
+ *
+ * The 'master' represents the composite IO operation to
+ * user-side. So if something waits for IO, then it will
+ * wait for the 'master' bio.
+ */
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+ update_head_pos(mirror, r1_bio);
+
+ if (behind) {
+ if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+ atomic_dec(&r1_bio->behind_remaining);
+
+ /* In behind mode, we ACK the master bio once the I/O has safely
+ * reached all non-writemostly disks. Setting the Returned bit
+ * ensures that this gets done only once -- we don't ever want to
+ * return -EIO here, instead we'll wait */
+
+ if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+ test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+ /* Maybe we can return now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ struct bio *mbio = r1_bio->master_bio;
+ PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+ (unsigned long long) mbio->bi_sector,
+ (unsigned long long) mbio->bi_sector +
+ (mbio->bi_size >> 9) - 1);
+ bio_endio(mbio, mbio->bi_size, 0);
+ }
}
}
}
@@ -361,8 +368,16 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
* already.
*/
if (atomic_dec_and_test(&r1_bio->remaining)) {
+ if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+ reschedule_retry(r1_bio);
+ /* Don't dec_pending yet, we want to hold
+ * the reference over the retry
+ */
+ return 0;
+ }
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
/* free extra copy of the data pages */
+/* FIXME bio has been freed!!! */
int i = bio->bi_vcnt;
while (i--)
__free_page(bio->bi_io_vec[i].bv_page);
@@ -416,12 +431,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
/* Choose the first operation device, for consistancy */
new_disk = 0;
- for (rdev = conf->mirrors[new_disk].rdev;
- !rdev || !rdev->in_sync
+ for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+ !rdev || !test_bit(In_sync, &rdev->flags)
|| test_bit(WriteMostly, &rdev->flags);
- rdev = conf->mirrors[++new_disk].rdev) {
+ rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
- if (rdev && rdev->in_sync)
+ if (rdev && test_bit(In_sync, &rdev->flags))
wonly_disk = new_disk;
if (new_disk == conf->raid_disks - 1) {
@@ -434,12 +449,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
/* make sure the disk is operational */
- for (rdev = conf->mirrors[new_disk].rdev;
- !rdev || !rdev->in_sync ||
+ for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+ !rdev || !test_bit(In_sync, &rdev->flags) ||
test_bit(WriteMostly, &rdev->flags);
- rdev = conf->mirrors[new_disk].rdev) {
+ rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
- if (rdev && rdev->in_sync)
+ if (rdev && test_bit(In_sync, &rdev->flags))
wonly_disk = new_disk;
if (new_disk <= 0)
@@ -474,10 +489,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
disk = conf->raid_disks;
disk--;
- rdev = conf->mirrors[disk].rdev;
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (!rdev ||
- !rdev->in_sync ||
+ !test_bit(In_sync, &rdev->flags) ||
test_bit(WriteMostly, &rdev->flags))
continue;
@@ -496,11 +511,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
if (new_disk >= 0) {
- rdev = conf->mirrors[new_disk].rdev;
+ rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
if (!rdev)
goto retry;
atomic_inc(&rdev->nr_pending);
- if (!rdev->in_sync) {
+ if (!test_bit(In_sync, &rdev->flags)) {
/* cannot risk returning a device that failed
* before we inc'ed nr_pending
*/
@@ -522,8 +537,8 @@ static void unplug_slaves(mddev_t *mddev)
rcu_read_lock();
for (i=0; i<mddev->raid_disks; i++) {
- mdk_rdev_t *rdev = conf->mirrors[i].rdev;
- if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
atomic_inc(&rdev->nr_pending);
@@ -556,8 +571,8 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
rcu_read_lock();
for (i=0; i<mddev->raid_disks && ret == 0; i++) {
- mdk_rdev_t *rdev = conf->mirrors[i].rdev;
- if (rdev && !rdev->faulty) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct block_device *bdev = rdev->bdev;
request_queue_t *r_queue = bdev_get_queue(bdev);
@@ -647,8 +662,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
unsigned long flags;
struct bio_list bl;
struct page **behind_pages = NULL;
+ const int rw = bio_data_dir(bio);
+ int do_barriers;
- if (unlikely(bio_barrier(bio))) {
+ if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
return 0;
}
@@ -665,13 +682,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
conf->nr_pending++;
spin_unlock_irq(&conf->resync_lock);
- if (bio_data_dir(bio)==WRITE) {
- disk_stat_inc(mddev->gendisk, writes);
- disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
- } else {
- disk_stat_inc(mddev->gendisk, reads);
- disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio));
- }
+ disk_stat_inc(mddev->gendisk, ios[rw]);
+ disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
/*
* make_request() can abort the operation when READA is being
@@ -686,7 +698,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector;
- if (bio_data_dir(bio) == READ) {
+ if (rw == READ) {
/*
* read balancing logic:
*/
@@ -732,10 +744,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
#endif
rcu_read_lock();
for (i = 0; i < disks; i++) {
- if ((rdev=conf->mirrors[i].rdev) != NULL &&
- !rdev->faulty) {
+ if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
+ !test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
- if (rdev->faulty) {
+ if (test_bit(Faulty, &rdev->flags)) {
atomic_dec(&rdev->nr_pending);
r1_bio->bios[i] = NULL;
} else
@@ -763,6 +775,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
atomic_set(&r1_bio->remaining, 0);
atomic_set(&r1_bio->behind_remaining, 0);
+ do_barriers = bio->bi_rw & BIO_RW_BARRIER;
+ if (do_barriers)
+ set_bit(R1BIO_Barrier, &r1_bio->state);
+
bio_list_init(&bl);
for (i = 0; i < disks; i++) {
struct bio *mbio;
@@ -775,7 +791,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_rw = WRITE;
+ mbio->bi_rw = WRITE | do_barriers;
mbio->bi_private = r1_bio;
if (behind_pages) {
@@ -828,7 +844,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
for (i = 0; i < conf->raid_disks; i++)
seq_printf(seq, "%s",
conf->mirrors[i].rdev &&
- conf->mirrors[i].rdev->in_sync ? "U" : "_");
+ test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
seq_printf(seq, "]");
}
@@ -844,14 +860,14 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* next level up know.
* else mark the drive as failed
*/
- if (rdev->in_sync
+ if (test_bit(In_sync, &rdev->flags)
&& conf->working_disks == 1)
/*
* Don't fail the drive, act as though we were just a
* normal single drive
*/
return;
- if (rdev->in_sync) {
+ if (test_bit(In_sync, &rdev->flags)) {
mddev->degraded++;
conf->working_disks--;
/*
@@ -859,8 +875,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
*/
set_bit(MD_RECOVERY_ERR, &mddev->recovery);
}
- rdev->in_sync = 0;
- rdev->faulty = 1;
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Faulty, &rdev->flags);
mddev->sb_dirty = 1;
printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
" Operation continuing on %d devices\n",
@@ -885,7 +901,7 @@ static void print_conf(conf_t *conf)
tmp = conf->mirrors + i;
if (tmp->rdev)
printk(" disk %d, wo:%d, o:%d, dev:%s\n",
- i, !tmp->rdev->in_sync, !tmp->rdev->faulty,
+ i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags),
bdevname(tmp->rdev->bdev,b));
}
}
@@ -917,11 +933,11 @@ static int raid1_spare_active(mddev_t *mddev)
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->mirrors + i;
if (tmp->rdev
- && !tmp->rdev->faulty
- && !tmp->rdev->in_sync) {
+ && !test_bit(Faulty, &tmp->rdev->flags)
+ && !test_bit(In_sync, &tmp->rdev->flags)) {
conf->working_disks++;
mddev->degraded--;
- tmp->rdev->in_sync = 1;
+ set_bit(In_sync, &tmp->rdev->flags);
}
}
@@ -937,9 +953,6 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
int mirror = 0;
mirror_info_t *p;
- if (rdev->saved_raid_disk >= 0 &&
- conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
- mirror = rdev->saved_raid_disk;
for (mirror=0; mirror < mddev->raid_disks; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
@@ -956,9 +969,12 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
p->head_position = 0;
rdev->raid_disk = mirror;
found = 1;
- if (rdev->saved_raid_disk != mirror)
+ /* As all devices are equivalent, we don't need a full recovery
+ * if this was recently any drive of the array
+ */
+ if (rdev->saved_raid_disk < 0)
conf->fullsync = 1;
- p->rdev = rdev;
+ rcu_assign_pointer(p->rdev, rdev);
break;
}
@@ -976,7 +992,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
print_conf(conf);
rdev = p->rdev;
if (rdev) {
- if (rdev->in_sync ||
+ if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
err = -EBUSY;
goto abort;
@@ -1157,6 +1173,36 @@ static void raid1d(mddev_t *mddev)
if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
sync_request_write(mddev, r1_bio);
unplug = 1;
+ } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+ /* some requests in the r1bio were BIO_RW_BARRIER
+ * requests which failed with -ENOTSUPP. Hohumm..
+ * Better resubmit without the barrier.
+ * We know which devices to resubmit for, because
+ * all others have had their bios[] entry cleared.
+ */
+ int i;
+ clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
+ clear_bit(R1BIO_Barrier, &r1_bio->state);
+ for (i=0; i < conf->raid_disks; i++)
+ if (r1_bio->bios[i]) {
+ struct bio_vec *bvec;
+ int j;
+
+ bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
+ /* copy pages from the failed bio, as
+ * this might be a write-behind device */
+ __bio_for_each_segment(bvec, bio, j, 0)
+ bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
+ bio_put(r1_bio->bios[i]);
+ bio->bi_sector = r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset;
+ bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ bio->bi_end_io = raid1_end_write_request;
+ bio->bi_rw = WRITE;
+ bio->bi_private = r1_bio;
+ r1_bio->bios[i] = bio;
+ generic_make_request(bio);
+ }
} else {
int disk;
bio = r1_bio->bios[r1_bio->read_disk];
@@ -1264,7 +1310,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
* This call the bitmap_start_sync doesn't actually record anything
*/
if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
- !conf->fullsync) {
+ !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We can skip this block, and probably several more */
*skipped = 1;
return sync_blocks;
@@ -1286,11 +1332,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
/* make sure disk is operational */
wonly = disk;
while (conf->mirrors[disk].rdev == NULL ||
- !conf->mirrors[disk].rdev->in_sync ||
+ !test_bit(In_sync, &conf->mirrors[disk].rdev->flags) ||
test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
) {
if (conf->mirrors[disk].rdev &&
- conf->mirrors[disk].rdev->in_sync)
+ test_bit(In_sync, &conf->mirrors[disk].rdev->flags))
wonly = disk;
if (disk <= 0)
disk = conf->raid_disks;
@@ -1337,11 +1383,12 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
bio->bi_rw = READ;
bio->bi_end_io = end_sync_read;
} else if (conf->mirrors[i].rdev == NULL ||
- conf->mirrors[i].rdev->faulty) {
+ test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
still_degraded = 1;
continue;
- } else if (!conf->mirrors[i].rdev->in_sync ||
- sector_nr + RESYNC_SECTORS > mddev->recovery_cp) {
+ } else if (!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
+ sector_nr + RESYNC_SECTORS > mddev->recovery_cp ||
+ test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
bio->bi_rw = WRITE;
bio->bi_end_io = end_sync_write;
write_targets ++;
@@ -1375,8 +1422,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
break;
if (sync_blocks == 0) {
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
- &sync_blocks, still_degraded) &&
- !conf->fullsync)
+ &sync_blocks, still_degraded) &&
+ !conf->fullsync &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
break;
if (sync_blocks < (PAGE_SIZE>>9))
BUG();
@@ -1482,7 +1530,7 @@ static int run(mddev_t *mddev)
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
disk->head_position = 0;
- if (!rdev->faulty && rdev->in_sync)
+ if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
conf->working_disks++;
}
conf->raid_disks = mddev->raid_disks;
@@ -1522,7 +1570,7 @@ static int run(mddev_t *mddev)
*/
for (j = 0; j < conf->raid_disks &&
(!conf->mirrors[j].rdev ||
- !conf->mirrors[j].rdev->in_sync) ; j++)
+ !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++)
/* nothing */;
conf->last_used = j;