From 6be9d4940134b36f9ed020aead36f831f19b49f1 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 23 May 2008 13:04:34 -0700 Subject: md: md: raid5 rate limit error printk Last night we had scsi problems and a hardware raid unit was offlined during heavy i/o. While this happened we got, for about 3 minutes, a huge number of messages like these: Apr 12 03:36:07 pfs1n14 kernel: [197510.696595] raid5:md7: read error not correctable (sector 2993096568 on sdj2). I guess the high error rate is responsible for other events not being scheduled - during this time the system was not pingable, and in the end other devices also ran into scsi command timeouts, causing problems on these unrelated devices as well. Signed-off-by: Bernd Schubert Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 93fde48c0f4..2f28745dacf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -94,6 +94,8 @@ #define __inline__ #endif +#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) + #if !RAID6_USE_EMPTY_ZERO_PAGE /* In .bss so it's zeroed */ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); @@ -1143,10 +1145,12 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector + rdev->data_offset), - bdevname(rdev->bdev, b)); + printk_rl(KERN_INFO "raid5:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -1160,16 +1164,22 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (conf->mddev->degraded) - printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error NOT corrected!! " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING -- cgit v1.2.3-70-g09d2 From dfc7064500061677720fa26352963c772d3ebe6b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 May 2008 13:04:39 -0700 Subject: md: restart recovery cleanly after device failure. When we get any IO error during a recovery (rebuilding a spare), we abort the recovery and restart it.
For RAID6 (and multi-drive RAID1) it may not be best to restart at the beginning: when multiple failures can be tolerated, the recovery may be able to continue and re-doing all that has already been done doesn't make sense. We already have the infrastructure to record where a recovery is up to and restart from there, but it is not being used properly. This is because: - We sometimes abort with MD_RECOVERY_ERR rather than just MD_RECOVERY_INTR, which causes the recovery not to be checkpointed. - We remove spares and then re-add them, which loses important state information. The distinction between MD_RECOVERY_ERR and MD_RECOVERY_INTR really isn't needed. If there is an error, the relevant drive will be marked as Faulty, and that is enough to ensure correct handling of the error. So we first remove MD_RECOVERY_ERR, changing some of the uses of it to MD_RECOVERY_INTR. Then we cause the attempt to remove a non-faulty device from an array to fail (unless recovery is impossible as the array is too degraded). Then when remove_and_add_spares attempts to remove the devices on which recovery can continue, it will fail, they will remain in place, and recovery will continue on them as desired. Issue: If we are halfway through rebuilding a spare and another drive fails, and a new spare is immediately available, do we want to: 1/ complete the current rebuild, then go back and rebuild the new spare or 2/ restart the rebuild from the start and rebuild both devices in parallel. Both options can be argued for. The code currently takes option 2 as a/ this requires least code change b/ this results in a minimally-degraded array in minimal time. Cc: "Eivind Sarto" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 22 +++++++++++----------- drivers/md/multipath.c | 3 ++- drivers/md/raid1.c | 10 +++++++++- drivers/md/raid10.c | 14 ++++++++++++-- drivers/md/raid5.c | 10 +++++++++- include/linux/raid/md_k.h | 4 +--- 6 files changed, 44 insertions(+), 19 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 295be1a6880..51c19f86ff9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5434,7 +5434,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) atomic_sub(blocks, &mddev->recovery_active); wake_up(&mddev->recovery_wait); if (!ok) { - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_wakeup_thread(mddev->thread); // stop recovery, signal do_sync ....
} @@ -5690,7 +5690,7 @@ void md_do_sync(mddev_t *mddev) sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); goto out; } @@ -5713,8 +5713,7 @@ void md_do_sync(mddev_t *mddev) last_check = io_sectors; - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || - test_bit(MD_RECOVERY_ERR, &mddev->recovery)) + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; repeat: @@ -5768,8 +5767,7 @@ void md_do_sync(mddev_t *mddev) /* tell personality that we are finished */ mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); - if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && - !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && + if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -5838,7 +5836,10 @@ static int remove_and_add_spares(mddev_t *mddev) } if (mddev->degraded) { - rdev_for_each(rdev, rtmp, mddev) + rdev_for_each(rdev, rtmp, mddev) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags)) + spares++; if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { rdev->recovery_offset = 0; @@ -5856,6 +5857,7 @@ static int remove_and_add_spares(mddev_t *mddev) } else break; } + } } return spares; } @@ -5869,7 +5871,7 @@ static int remove_and_add_spares(mddev_t *mddev) * to do that as needed. * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in * "->recovery" and create a thread at ->sync_thread. - * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) + * When the thread finishes it sets MD_RECOVERY_DONE * and wakeups up this thread which will reap the thread and finish up. * This thread also removes any faulty devices (with nr_pending == 0). * @@ -5944,8 +5946,7 @@ void md_check_recovery(mddev_t *mddev) /* resync has finished, collect result */ md_unregister_thread(mddev->sync_thread); mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && - !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* success...*/ /* activate any spares */ mddev->pers->spare_active(mddev); @@ -5969,7 +5970,6 @@ void md_check_recovery(mddev_t *mddev) * might be left set */ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - clear_bit(MD_RECOVERY_ERR, &mddev->recovery); clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 4f4d1f38384..e968116e0de 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -327,7 +327,8 @@ static int multipath_remove_disk(mddev_t *mddev, int number) if (rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { - printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); + printk(KERN_ERR "hot-remove-disk, slot %d is identified" + " but is still operational!\n", number); err = -EBUSY; goto abort; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d0f4021bbc2..c610b947218 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1027,7 +1027,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery is running, make sure it aborts. 
*/ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1148,6 +1148,14 @@ static int raid1_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices is recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded < conf->raid_disks) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8536ede1e71..1de17da34a9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1020,7 +1020,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery is running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1171,6 +1171,14 @@ static int raid10_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove faulty devices in recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + enough(conf)) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -1237,6 +1245,7 @@ static void end_sync_write(struct bio *bio, int error) if (!uptodate) md_error(mddev, conf->mirrors[d].rdev); + update_head_pos(i, r10_bio); while (atomic_dec_and_test(&r10_bio->remaining)) { @@ -1844,7 +1853,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; - if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", mdname(mddev)); break; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2f28745dacf..425958a76b8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1268,7 +1268,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery was running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); printk (KERN_ALERT @@ -4574,6 +4574,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices if recovery + * isn't possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded <= conf->max_degraded) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index a6d7ab688ed..3dea9f545c8 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -188,8 +188,7 @@ struct mddev_s * NEEDED: we might need to start a resync/recover * RUNNING: a thread is running, or about to be started * SYNC: actually doing a resync, not a recovery - * ERR: and IO error was detected - abort the resync/recovery - * INTR: someone requested a (clean) early abort. 
+ * INTR: resync needs to be aborted for some reason * DONE: thread is done and is waiting to be reaped * REQUEST: user-space has requested a sync (used with SYNC) * CHECK: user-space request for for check-only, no repair @@ -199,7 +198,6 @@ struct mddev_s */ #define MD_RECOVERY_RUNNING 0 #define MD_RECOVERY_SYNC 1 -#define MD_RECOVERY_ERR 2 #define MD_RECOVERY_INTR 3 #define MD_RECOVERY_DONE 4 #define MD_RECOVERY_NEEDED 5 -- cgit v1.2.3-70-g09d2 From e0a115e5aa554b93150a8dc1c3fe15467708abb2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Jun 2008 22:45:52 -0700 Subject: md: fix prexor vs sync_request race During the initial array synchronization process there is a window between when a prexor operation is scheduled to a specific stripe and when it completes for a sync_request to be scheduled to the same stripe. When this happens the prexor completes and the stripe is unconditionally marked "insync", effectively canceling the sync_request for the stripe. Prior to 2.6.23 this was not a problem because the prexor operation was done under sh->lock. The effect in older kernels being that the prexor would still erroneously mark the stripe "insync", but sync_request would be held off and re-mark the stripe as "!in_sync". Change the write completion logic to not mark the stripe "in_sync" if a prexor was performed. The effect of the change is to sometimes not set STRIPE_INSYNC. The worst this can do is cause the resync to stall waiting for STRIPE_INSYNC to be set. If this were happening, then STRIPE_SYNCING would be set and handle_issuing_new_read_requests would cause all available blocks to eventually be read, at which point prexor would never be used on that stripe any more and STRIPE_INSYNC would eventually be set. echo repair > /sys/block/mdN/md/sync_action will correct arrays that may have lost this race. 
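In outline, the change makes the write-completion path remember whether the write used a prexor (read-modify-write) and, if so, refuse to mark the stripe in-sync. The following is a simplified, stand-alone model of that decision only - the names are abbreviations of the raid5 ones and this is not the driver code, which follows in the diff below:

/* Simplified model of the corrected completion logic.  "prexor" means the
 * write was a read-modify-write: parity was updated from the old parity
 * rather than recomputed from every data block, so it is not evidence
 * that the whole stripe is consistent.
 */
#include <stdio.h>

struct dev_state {
    int insync;     /* models R5_Insync: device holds valid data    */
    int is_parity;  /* this device holds the stripe's parity block  */
};

/* Return 1 if the stripe may be marked STRIPE_INSYNC after this write. */
static int may_mark_stripe_insync(const struct dev_state *dev,
                                  int failed, int prexor)
{
    if (prexor)         /* the fix: an rmw write never implies */
        return 0;       /* STRIPE_INSYNC                       */
    return !dev->insync || (dev->is_parity && failed == 0);
}

int main(void)
{
    struct dev_state parity = { .insync = 1, .is_parity = 1 };

    /* Before the fix, a read-modify-write could mark the stripe
     * in-sync and silently cancel a pending sync_request.        */
    printf("rcw write: insync=%d\n", may_mark_stripe_insync(&parity, 0, 0));
    printf("rmw write: insync=%d\n", may_mark_stripe_insync(&parity, 0, 1));
    return 0;
}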
Cc: Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 425958a76b8..f0f0585c107 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2645,6 +2645,7 @@ static void handle_stripe5(struct stripe_head *sh) struct r5dev *dev; unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; + int prexor; memset(&s, 0, sizeof(s)); pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " @@ -2774,9 +2775,11 @@ static void handle_stripe5(struct stripe_head *sh) /* leave prexor set until postxor is done, allows us to distinguish * a rmw from a rcw during biodrain */ + prexor = 0; if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + prexor = 1; clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); @@ -2810,6 +2813,8 @@ static void handle_stripe5(struct stripe_head *sh) if (!test_and_set_bit( STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; + if (prexor) + continue; if (!test_bit(R5_Insync, &dev->flags) || (i == sh->pd_idx && s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); -- cgit v1.2.3-70-g09d2 From c337869d95011495fa181536786e74aa2d7ff031 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Jun 2008 22:45:54 -0700 Subject: md: do not compute parity unless it is on a failed drive If a block is computed (rather than read) then a check/repair operation may be led to believe that the data on disk is correct, when in fact it isn't. So only compute blocks for failed devices. This issue has been around since at least 2.6.12, but has become harder to hit in recent kernels since most reads bypass the cache. echo repair > /sys/block/mdN/md/sync_action will set the parity blocks to the correct state. Cc: Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f0f0585c107..c37e256b117 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2002,6 +2002,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * have quiesced. */ if ((s->uptodate == disks - 1) && + (s->failed && disk_idx == s->failed_num) && !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); @@ -2087,7 +2088,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, /* we would like to get this block, possibly * by computing it, but we might not be able to */ - if (s->uptodate == disks-1) { + if ((s->uptodate == disks - 1) && + (s->failed && (i == r6s->failed_num[0] || + i == r6s->failed_num[1]))) { pr_debug("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); compute_block_1(sh, i, 0); -- cgit v1.2.3-70-g09d2 From 8c2e870a625bd336b2e7a65a97c1836acef07322 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:30:52 +1000 Subject: Ensure interrupted recovery completed properly (v1 metadata plus bitmap) If, while assembling an array, we find a device which is not fully in-sync with the array, it is important to set the "fullsync" flags.
This is an exact analog to the setting of this flag in hot_add_disk methods. Currently, only v1.x metadata supports having devices in an array which are not fully in-sync (it keeps track of how in-sync they are). The 'fullsync' flag only makes a difference when a write-intent bitmap is being used. In this case it tells recovery to ignore the bitmap and recover all blocks. This fix is already in place for raid1, but not raid5/6 or raid10. So without this fix, a raid1 or raid4/5/6 array with version 1.x metadata and a write-intent bitmap, that is stopped in the middle of a recovery, will appear to complete the recovery instantly after it is reassembled, but the recovery will not be correct. If you might have an array like that, issuing echo repair > /sys/block/mdXX/md/sync_action will make sure recovery completes properly. Cc: Signed-off-by: Neil Brown --- drivers/md/raid10.c | 2 ++ drivers/md/raid5.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1de17da34a9..a71277b640a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2137,6 +2137,8 @@ static int run(mddev_t *mddev) !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + if (disk->rdev) + conf->fullsync = 1; } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c37e256b117..475fba4d371 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4305,7 +4305,9 @@ static int run(mddev_t *mddev) " disk %d\n", bdevname(rdev->bdev,b), raid_disk); working_disks++; - } + } else + /* Cannot rely on bitmap to complete recovery */ + conf->fullsync = 1; } /* -- cgit v1.2.3-70-g09d2 From efe311431869b40d67911820a309f9a1a41306f3 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:14 +1000 Subject: Don't acknowledge that stripe-expand is complete until it really is. We shouldn't acknowledge that a stripe has been expanded (when reshaping a raid5 by adding a device) until the moved data has actually been written out. However we are currently acknowledging (by calling md_done_sync) when the POST_XOR is complete and before the write. So track in s.locked whether there are pending writes, and don't call md_done_sync yet if there are. Note: we also set R5_LOCKED on devices which we are about to read from. This probably isn't technically necessary, but is usually done when writing a block, and justifies the use of s.locked here. This bug can lead to a crash if an array is stopped while a reshape is in progress.
Cc: Signed-off-by: Neil Brown --- drivers/md/raid5.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 475fba4d371..54c8ee28fcc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2898,6 +2898,8 @@ static void handle_stripe5(struct stripe_head *sh) for (i = conf->raid_disks; i--; ) { set_bit(R5_Wantwrite, &sh->dev[i].flags); + set_bit(R5_LOCKED, &dev->flags); + s.locked++; if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; } @@ -2911,6 +2913,7 @@ static void handle_stripe5(struct stripe_head *sh) conf->raid_disks); s.locked += handle_write_operations5(sh, 1, 1); } else if (s.expanded && + s.locked == 0 && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); -- cgit v1.2.3-70-g09d2 From 7a1fc53c5adb910751a9b212af90302eb4ffb527 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 10 Jul 2008 04:54:57 -0700 Subject: md: ensure all blocks are uptodate or locked when syncing Remove the dubious attempt to prefer 'compute' over 'read'. Not only is it wrong given commit c337869d (md: do not compute parity unless it is on a failed drive), but it can trigger a BUG_ON in handle_parity_checks5(). Cc: Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54c8ee28fcc..3b27df52456 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2017,12 +2017,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, */ s->uptodate++; return 0; /* uptodate + compute == disks */ - } else if ((s->uptodate < disks - 1) && - test_bit(R5_Insync, &dev->flags)) { - /* Note: we hold off compute operations while checks are - * in flight, but we still prefer 'compute' over 'read' - * hence we only read if (uptodate < * disks-1) - */ + } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) -- cgit v1.2.3-70-g09d2
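The printk_rl() wrapper introduced by the first patch in this series drops a message whenever printk_ratelimit() says the budget for the current window is exhausted (by default roughly 10 messages every 5 seconds, tunable via sysctl). A rough userspace model of the same idea - the constants and names here are illustrative, not the kernel's implementation - behaves like this:

/* Userspace sketch of rate-limited logging in the spirit of printk_rl().
 * Assumed, illustrative limits: at most RL_BURST messages per
 * RL_INTERVAL_SEC-second window; everything above that is counted and
 * reported as "suppressed" when the next window opens.
 */
#include <stdio.h>
#include <time.h>

#define RL_INTERVAL_SEC 5
#define RL_BURST        10

static int log_ratelimit(void)
{
    static time_t window_start;
    static int printed, missed;
    time_t now = time(NULL);

    if (now - window_start >= RL_INTERVAL_SEC) {
        if (missed)
            fprintf(stderr, "%d messages suppressed\n", missed);
        window_start = now;
        printed = 0;
        missed = 0;
    }
    if (printed < RL_BURST) {
        printed++;
        return 1;
    }
    missed++;
    return 0;
}

#define log_rl(...) ((void)(log_ratelimit() && fprintf(stderr, __VA_ARGS__)))

int main(void)
{
    /* Simulate the error storm described in the first commit message. */
    for (int i = 0; i < 1000; i++)
        log_rl("raid5: read error not correctable (sector %d)\n", i);
    return 0;
}

Only the first RL_BURST messages of a burst reach the log, so a flood of media errors cannot monopolize the console the way the original report describes.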