summaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c192
1 files changed, 127 insertions, 65 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3ee2912889e..9359828ffe2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -90,7 +90,7 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
*/
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
- int sectors = bio->bi_size >> 9;
+ int sectors = bio_sectors(bio);
if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
return bio->bi_next;
else
@@ -184,6 +184,8 @@ static void return_io(struct bio *return_bi)
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
bio_endio(bi, 0);
bi = return_bi;
}
@@ -567,14 +569,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi = &sh->dev[i].req;
rbi = &sh->dev[i].rreq; /* For writing to replacement */
- bi->bi_rw = rw;
- rbi->bi_rw = rw;
- if (rw & WRITE) {
- bi->bi_end_io = raid5_end_write_request;
- rbi->bi_end_io = raid5_end_write_request;
- } else
- bi->bi_end_io = raid5_end_read_request;
-
rcu_read_lock();
rrdev = rcu_dereference(conf->disks[i].replacement);
smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
@@ -649,7 +643,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
set_bit(STRIPE_IO_STARTED, &sh->state);
+ bio_reset(bi);
bi->bi_bdev = rdev->bdev;
+ bi->bi_rw = rw;
+ bi->bi_end_io = (rw & WRITE)
+ ? raid5_end_write_request
+ : raid5_end_read_request;
+ bi->bi_private = sh;
+
pr_debug("%s: for %llu schedule op %ld on disc %d\n",
__func__, (unsigned long long)sh->sector,
bi->bi_rw, i);
@@ -663,17 +664,16 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
bi->bi_rw |= REQ_FLUSH;
- bi->bi_flags = 1 << BIO_UPTODATE;
- bi->bi_idx = 0;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
bi->bi_size = STRIPE_SIZE;
- bi->bi_next = NULL;
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
- trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
- bi, disk_devt(conf->mddev->gendisk),
- sh->dev[i].sector);
+
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
+ bi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
generic_make_request(bi);
}
if (rrdev) {
@@ -683,7 +683,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
set_bit(STRIPE_IO_STARTED, &sh->state);
+ bio_reset(rbi);
rbi->bi_bdev = rrdev->bdev;
+ rbi->bi_rw = rw;
+ BUG_ON(!(rw & WRITE));
+ rbi->bi_end_io = raid5_end_write_request;
+ rbi->bi_private = sh;
+
pr_debug("%s: for %llu schedule op %ld on "
"replacement disc %d\n",
__func__, (unsigned long long)sh->sector,
@@ -695,15 +701,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
else
rbi->bi_sector = (sh->sector
+ rrdev->data_offset);
- rbi->bi_flags = 1 << BIO_UPTODATE;
- rbi->bi_idx = 0;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_size = STRIPE_SIZE;
- rbi->bi_next = NULL;
- trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
- rbi, disk_devt(conf->mddev->gendisk),
- sh->dev[i].sector);
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
+ rbi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
generic_make_request(rbi);
}
if (!rdev && !rrdev) {
@@ -1882,8 +1886,15 @@ static void raid5_end_write_request(struct bio *bi, int error)
&rdev->mddev->recovery);
} else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
- &first_bad, &bad_sectors))
+ &first_bad, &bad_sectors)) {
set_bit(R5_MadeGood, &sh->dev[i].flags);
+ if (test_bit(R5_ReadError, &sh->dev[i].flags))
+ /* That was a successful write so make
+ * sure it looks like we already did
+ * a re-write.
+ */
+ set_bit(R5_ReWrite, &sh->dev[i].flags);
+ }
}
rdev_dec_pending(rdev, conf->mddev);
@@ -2280,17 +2291,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
int level = conf->level;
if (rcw) {
- /* if we are not expanding this is a proper write request, and
- * there will be bios with new data to be drained into the
- * stripe cache
- */
- if (!expand) {
- sh->reconstruct_state = reconstruct_state_drain_run;
- set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
- } else
- sh->reconstruct_state = reconstruct_state_run;
-
- set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -2303,6 +2303,21 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
}
+ /* if we are not expanding this is a proper write request, and
+ * there will be bios with new data to be drained into the
+ * stripe cache
+ */
+ if (!expand) {
+ if (!s->locked)
+ /* False alarm, nothing to do */
+ return;
+ sh->reconstruct_state = reconstruct_state_drain_run;
+ set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+ } else
+ sh->reconstruct_state = reconstruct_state_run;
+
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
+
if (s->locked + conf->max_degraded == disks)
if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
atomic_inc(&conf->pending_full_writes);
@@ -2311,11 +2326,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
- sh->reconstruct_state = reconstruct_state_prexor_drain_run;
- set_bit(STRIPE_OP_PREXOR, &s->ops_request);
- set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
- set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
-
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (i == pd_idx)
@@ -2330,6 +2340,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
}
+ if (!s->locked)
+ /* False alarm - nothing to do */
+ return;
+ sh->reconstruct_state = reconstruct_state_prexor_drain_run;
+ set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+ set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
}
/* keep the parity disk(s) locked while asynchronous operations
@@ -2384,11 +2401,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
} else
bip = &sh->dev[dd_idx].toread;
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
- if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
+ if (bio_end_sector(*bip) > bi->bi_sector)
goto overlap;
bip = & (*bip)->bi_next;
}
- if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
+ if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
goto overlap;
BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
@@ -2404,8 +2421,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
bi && bi->bi_sector <= sector;
bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
- if (bi->bi_sector + (bi->bi_size>>9) >= sector)
- sector = bi->bi_sector + (bi->bi_size>>9);
+ if (bio_end_sector(bi) >= sector)
+ sector = bio_end_sector(bi);
}
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
@@ -2564,6 +2581,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
int i;
clear_bit(STRIPE_SYNCING, &sh->state);
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+ wake_up(&conf->wait_for_overlap);
s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair.
@@ -2737,6 +2756,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
{
int i;
struct r5dev *dev;
+ int discard_pending = 0;
for (i = disks; i--; )
if (sh->dev[i].written) {
@@ -2765,9 +2785,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
- }
- } else if (test_bit(R5_Discard, &sh->dev[i].flags))
- clear_bit(R5_Discard, &sh->dev[i].flags);
+ } else if (test_bit(R5_Discard, &dev->flags))
+ discard_pending = 1;
+ }
+ if (!discard_pending &&
+ test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+ clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+ if (sh->qd_idx >= 0) {
+ clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
+ }
+ /* now that discard is done we can proceed with any sync */
+ clear_bit(STRIPE_DISCARD, &sh->state);
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+ set_bit(STRIPE_HANDLE, &sh->state);
+
+ }
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -2826,8 +2860,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state);
if (rmw < rcw && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
- blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
- (unsigned long long)sh->sector, rmw);
+ if (conf->mddev->queue)
+ blk_add_trace_msg(conf->mddev->queue,
+ "raid5 rmw %llu %d",
+ (unsigned long long)sh->sector, rmw);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
@@ -2877,7 +2913,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
}
}
}
- if (rcw)
+ if (rcw && conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector,
rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
@@ -3417,9 +3453,15 @@ static void handle_stripe(struct stripe_head *sh)
return;
}
- if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
- set_bit(STRIPE_SYNCING, &sh->state);
- clear_bit(STRIPE_INSYNC, &sh->state);
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ spin_lock(&sh->stripe_lock);
+ /* Cannot process 'sync' concurrently with 'discard' */
+ if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+ test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ set_bit(STRIPE_SYNCING, &sh->state);
+ clear_bit(STRIPE_INSYNC, &sh->state);
+ }
+ spin_unlock(&sh->stripe_lock);
}
clear_bit(STRIPE_DELAYED, &sh->state);
@@ -3579,6 +3621,8 @@ static void handle_stripe(struct stripe_head *sh)
test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
clear_bit(STRIPE_SYNCING, &sh->state);
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+ wake_up(&conf->wait_for_overlap);
}
/* If the failed drives are just a ReadError, then we might need
@@ -3804,7 +3848,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
unsigned int chunk_sectors = mddev->chunk_sectors;
- unsigned int bio_sectors = bio->bi_size >> 9;
+ unsigned int bio_sectors = bio_sectors(bio);
if (mddev->new_chunk_sectors < mddev->chunk_sectors)
chunk_sectors = mddev->new_chunk_sectors;
@@ -3878,6 +3922,8 @@ static void raid5_align_endio(struct bio *bi, int error)
rdev_dec_pending(rdev, conf->mddev);
if (!error && uptodate) {
+ trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
+ raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe);
@@ -3894,7 +3940,7 @@ static int bio_fits_rdev(struct bio *bi)
{
struct request_queue *q = bdev_get_queue(bi->bi_bdev);
- if ((bi->bi_size>>9) > queue_max_sectors(q))
+ if (bio_sectors(bi) > queue_max_sectors(q))
return 0;
blk_recount_segments(q, bi);
if (bi->bi_phys_segments > queue_max_segments(q))
@@ -3941,7 +3987,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
0,
&dd_idx, NULL);
- end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
+ end_sector = bio_end_sector(align_bi);
rcu_read_lock();
rdev = rcu_dereference(conf->disks[dd_idx].replacement);
if (!rdev || test_bit(Faulty, &rdev->flags) ||
@@ -3964,7 +4010,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
if (!bio_fits_rdev(align_bi) ||
- is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+ is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
&first_bad, &bad_sectors)) {
/* too big in some way, or has a known bad block */
bio_put(align_bi);
@@ -3982,9 +4028,10 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
atomic_inc(&conf->active_aligned_reads);
spin_unlock_irq(&conf->device_lock);
- trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
- align_bi, disk_devt(mddev->gendisk),
- raid_bio->bi_sector);
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+ align_bi, disk_devt(mddev->gendisk),
+ raid_bio->bi_sector);
generic_make_request(align_bi);
return 1;
} else {
@@ -4078,7 +4125,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
}
spin_unlock_irq(&conf->device_lock);
}
- trace_block_unplug(mddev->queue, cnt, !from_schedule);
+ if (mddev->queue)
+ trace_block_unplug(mddev->queue, cnt, !from_schedule);
kfree(cb);
}
@@ -4141,6 +4189,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
+ set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
+ if (test_bit(STRIPE_SYNCING, &sh->state)) {
+ release_stripe(sh);
+ schedule();
+ goto again;
+ }
+ clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
spin_lock_irq(&sh->stripe_lock);
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4153,6 +4208,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
goto again;
}
}
+ set_bit(STRIPE_DISCARD, &sh->state);
finish_wait(&conf->wait_for_overlap, &w);
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4216,7 +4272,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
}
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_sector + (bi->bi_size>>9);
+ last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
@@ -4336,6 +4392,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
if ( rw == WRITE )
md_write_end(mddev);
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
bio_endio(bi, 0);
}
}
@@ -4620,9 +4678,10 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
*skipped = 1;
return rv;
}
- if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
- !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+ if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ !conf->fullsync &&
+ !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ sync_blocks >= STRIPE_SECTORS) {
/* we can skip this block, and probably more */
sync_blocks /= STRIPE_SECTORS;
*skipped = 1;
@@ -4679,7 +4738,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
sector = raid5_compute_sector(conf, logical_sector,
0, &dd_idx, NULL);
- last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+ last_sector = bio_end_sector(raid_bio);
for (; logical_sector < last_sector;
logical_sector += STRIPE_SECTORS,
@@ -4712,8 +4771,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
handled++;
}
remaining = raid5_dec_bi_active_stripes(raid_bio);
- if (remaining == 0)
+ if (remaining == 0) {
+ trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
+ raid_bio, 0);
bio_endio(raid_bio, 0);
+ }
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe);
return handled;