From a362357b6cd62643d4dda3b152639303d78473da Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Nov 2005 09:26:16 +0100 Subject: [BLOCK] Unify the seperate read/write io stat fields into arrays Instead of having ->read_sectors and ->write_sectors, combine the two into ->sectors[2] and similar for the other fields. This saves a branch several places in the io path, since we don't have to care for what the actual io direction is. On my x86-64 box, that's 200 bytes less text in just the core (not counting the various drivers). Signed-off-by: Jens Axboe --- drivers/md/raid5.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4683ca24c04..6497295ebfb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1462,6 +1462,7 @@ static int make_request (request_queue_t *q, struct bio * bi) sector_t new_sector; sector_t logical_sector, last_sector; struct stripe_head *sh; + const int rw = bio_data_dir(bi); if (unlikely(bio_barrier(bi))) { bio_endio(bi, bi->bi_size, -EOPNOTSUPP); @@ -1470,13 +1471,8 @@ static int make_request (request_queue_t *q, struct bio * bi) md_write_start(mddev, bi); - if (bio_data_dir(bi)==WRITE) { - disk_stat_inc(mddev->gendisk, writes); - disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); - } else { - disk_stat_inc(mddev->gendisk, reads); - disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi)); - } + disk_stat_inc(mddev->gendisk, ios[rw]); + disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); -- cgit v1.2.3-70-g09d2 From 66c006a55137cc51f8761549e2024a7593180d27 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Mon, 7 Nov 2005 01:01:17 -0800 Subject: [PATCH] drivers/md: fix-up schedule_timeout() usage Use schedule_timeout_interruptible() instead of set_current_state()/schedule_timeout() to reduce kernel size. Signed-off-by: Nishanth Aravamudan Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 3 +-- drivers/md/raid6main.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6497295ebfb..1223e98ecd7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1587,8 +1587,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(1); + schedule_timeout_uninterruptible(1); } bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); spin_lock(&sh->lock); diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 6437a95ffc1..77578694770 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -1746,8 +1746,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(1); + schedule_timeout_uninterruptible(1); } bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); spin_lock(&sh->lock); -- cgit v1.2.3-70-g09d2 From 4e5314b56a7ea11c7a5f2b8418992b2f49648a25 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:22 -0800 Subject: [PATCH] md: better handling of readerrors with raid5. This patch changes the behaviour of raid5 when it gets a read error. Instead of just failing the device, it tried to find out what should have been there, and writes it over the bad block. For some media-errors, this has a reasonable chance of fixing the error. If the write succeeds, and a subsequent read succeeds as well, raid5 decided the address is OK and conitnues. Instead of failing a drive on read-error, we attempt to re-write the block, and then re-read. If that all works, we allow the device to remain in the array. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 61 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/raid/raid5.h | 2 ++ 2 files changed, 58 insertions(+), 5 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1223e98ecd7..8cf1ae8b8a7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -349,7 +349,7 @@ static void shrink_stripes(raid5_conf_t *conf) conf->slab_cache = NULL; } -static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done, +static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error) { struct stripe_head *sh = bi->bi_private; @@ -401,10 +401,27 @@ static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done, } #else set_bit(R5_UPTODATE, &sh->dev[i].flags); -#endif +#endif + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + printk("R5: read error corrected!!\n"); + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + } } else { - md_error(conf->mddev, conf->disks[i].rdev); clear_bit(R5_UPTODATE, &sh->dev[i].flags); + if (conf->mddev->degraded) { + printk("R5: read error not correctable.\n"); + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + md_error(conf->mddev, conf->disks[i].rdev); + } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { + /* Oh, no!!! */ + printk("R5: read error NOT corrected!!\n"); + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + md_error(conf->mddev, conf->disks[i].rdev); + } else + set_bit(R5_ReadError, &sh->dev[i].flags); } rdev_dec_pending(conf->disks[i].rdev, conf->mddev); #if 0 @@ -966,6 +983,12 @@ static void handle_stripe(struct stripe_head *sh) if (dev->written) written++; rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ if (!rdev || !rdev->in_sync) { + /* The ReadError flag wil just be confusing now */ + clear_bit(R5_ReadError, &dev->flags); + clear_bit(R5_ReWrite, &dev->flags); + } + if (!rdev || !rdev->in_sync + || test_bit(R5_ReadError, &dev->flags)) { failed++; failed_num = i; } else @@ -980,6 +1003,14 @@ static void handle_stripe(struct stripe_head *sh) if (failed > 1 && to_read+to_write+written) { for (i=disks; i--; ) { int bitmap_end = 0; + + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (rdev && rdev->in_sync) + /* multiple read failures in one stripe */ + md_error(conf->mddev, rdev); + } + spin_lock_irq(&conf->device_lock); /* fail all writes first */ bi = sh->dev[i].towrite; @@ -1015,7 +1046,8 @@ static void handle_stripe(struct stripe_head *sh) } /* fail any reads if this device is non-operational */ - if (!test_bit(R5_Insync, &sh->dev[i].flags)) { + if (!test_bit(R5_Insync, &sh->dev[i].flags) || + test_bit(R5_ReadError, &sh->dev[i].flags)) { bi = sh->dev[i].toread; sh->dev[i].toread = NULL; if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) @@ -1274,7 +1306,26 @@ static void handle_stripe(struct stripe_head *sh) md_done_sync(conf->mddev, STRIPE_SECTORS,1); clear_bit(STRIPE_SYNCING, &sh->state); } - + + /* If the failed drive is just a ReadError, then we might need to progress + * the repair/check process + */ + if (failed == 1 && test_bit(R5_ReadError, &sh->dev[failed_num].flags) + && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags) + && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags) + ) { + dev = &sh->dev[failed_num]; + if (!test_bit(R5_ReWrite, &dev->flags)) { + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_ReWrite, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + } else { + /* let's read it back */ + set_bit(R5_Wantread, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + } + } + spin_unlock(&sh->lock); while ((bi=return_bi)) { diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 176fc653c28..f025ba6fb14 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -154,6 +154,8 @@ struct stripe_head { #define R5_Wantwrite 5 #define R5_Syncio 6 /* this io need to be accounted as resync io */ #define R5_Overlap 7 /* There is a pending overlapping request on this block */ +#define R5_ReadError 8 /* seen a read error here recently */ +#define R5_ReWrite 9 /* have tried to over-write the readerror */ /* * Write method -- cgit v1.2.3-70-g09d2 From 3f294f4fb6f2ba887b717674da26c21f3d57f3fc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:25 -0800 Subject: [PATCH] md: add kobject/sysfs support to raid5 /sys/block/mdX/md/raid5/ contains raid5-related attributes. Currently stripe_cache_size is number of entries in stripe cache, and is settable. stripe_cache_active is number of active entries, and in only readable. Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 183 +++++++++++++++++++++++++++++++++++++-------- include/linux/raid/raid5.h | 1 + 2 files changed, 152 insertions(+), 32 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8cf1ae8b8a7..121fbaa9ed5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -293,9 +293,31 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } -static int grow_stripes(raid5_conf_t *conf, int num) +static int grow_one_stripe(raid5_conf_t *conf) { struct stripe_head *sh; + sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); + if (!sh) + return 0; + memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); + sh->raid_conf = conf; + spin_lock_init(&sh->lock); + + if (grow_buffers(sh, conf->raid_disks)) { + shrink_buffers(sh, conf->raid_disks); + kmem_cache_free(conf->slab_cache, sh); + return 0; + } + /* we just created an active stripe so... */ + atomic_set(&sh->count, 1); + atomic_inc(&conf->active_stripes); + INIT_LIST_HEAD(&sh->lru); + release_stripe(sh); + return 1; +} + +static int grow_stripes(raid5_conf_t *conf, int num) +{ kmem_cache_t *sc; int devs = conf->raid_disks; @@ -308,43 +330,34 @@ static int grow_stripes(raid5_conf_t *conf, int num) return 1; conf->slab_cache = sc; while (num--) { - sh = kmem_cache_alloc(sc, GFP_KERNEL); - if (!sh) - return 1; - memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev)); - sh->raid_conf = conf; - spin_lock_init(&sh->lock); - - if (grow_buffers(sh, conf->raid_disks)) { - shrink_buffers(sh, conf->raid_disks); - kmem_cache_free(sc, sh); + if (!grow_one_stripe(conf)) return 1; - } - /* we just created an active stripe so... */ - atomic_set(&sh->count, 1); - atomic_inc(&conf->active_stripes); - INIT_LIST_HEAD(&sh->lru); - release_stripe(sh); } return 0; } -static void shrink_stripes(raid5_conf_t *conf) +static int drop_one_stripe(raid5_conf_t *conf) { struct stripe_head *sh; - while (1) { - spin_lock_irq(&conf->device_lock); - sh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); - if (!sh) - break; - if (atomic_read(&sh->count)) - BUG(); - shrink_buffers(sh, conf->raid_disks); - kmem_cache_free(conf->slab_cache, sh); - atomic_dec(&conf->active_stripes); - } + spin_lock_irq(&conf->device_lock); + sh = get_free_stripe(conf); + spin_unlock_irq(&conf->device_lock); + if (!sh) + return 0; + if (atomic_read(&sh->count)) + BUG(); + shrink_buffers(sh, conf->raid_disks); + kmem_cache_free(conf->slab_cache, sh); + atomic_dec(&conf->active_stripes); + return 1; +} + +static void shrink_stripes(raid5_conf_t *conf) +{ + while (drop_one_stripe(conf)) + ; + kmem_cache_destroy(conf->slab_cache); conf->slab_cache = NULL; } @@ -1714,6 +1727,108 @@ static void raid5d (mddev_t *mddev) PRINTK("--- raid5d inactive\n"); } +struct raid5_sysfs_entry { + struct attribute attr; + ssize_t (*show)(raid5_conf_t *, char *); + ssize_t (*store)(raid5_conf_t *, const char *, ssize_t); +}; + +static ssize_t +raid5_show_stripe_cache_size(raid5_conf_t *conf, char *page) +{ + return sprintf(page, "%d\n", conf->max_nr_stripes); +} + +static ssize_t +raid5_store_stripe_cache_size(raid5_conf_t *conf, const char *page, ssize_t len) +{ + char *end; + int new; + if (len >= PAGE_SIZE) + return -EINVAL; + + new = simple_strtoul(page, &end, 10); + if (!*page || (*end && *end != '\n') ) + return -EINVAL; + if (new <= 16 || new > 32768) + return -EINVAL; + while (new < conf->max_nr_stripes) { + if (drop_one_stripe(conf)) + conf->max_nr_stripes--; + else + break; + } + while (new > conf->max_nr_stripes) { + if (grow_one_stripe(conf)) + conf->max_nr_stripes++; + else break; + } + return len; +} +static struct raid5_sysfs_entry raid5_stripecache_size = { + .attr = {.name = "stripe_cache_size", .mode = S_IRUGO | S_IWUSR }, + .show = raid5_show_stripe_cache_size, + .store = raid5_store_stripe_cache_size, +}; + +static ssize_t +raid5_show_stripe_cache_active(raid5_conf_t *conf, char *page) +{ + return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); +} + +static struct raid5_sysfs_entry raid5_stripecache_active = { + .attr = {.name = "stripe_cache_active", .mode = S_IRUGO}, + .show = raid5_show_stripe_cache_active, +}; + +static struct attribute *raid5_default_attrs[] = { + &raid5_stripecache_size.attr, + &raid5_stripecache_active.attr, + NULL, +}; + +static ssize_t +raid5_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct raid5_sysfs_entry *entry = container_of(attr, struct raid5_sysfs_entry, attr); + raid5_conf_t *conf = container_of(kobj, raid5_conf_t, kobj); + + if (!entry->show) + return -EIO; + return entry->show(conf, page); +} + +static ssize_t +raid5_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct raid5_sysfs_entry *entry = container_of(attr, struct raid5_sysfs_entry, attr); + raid5_conf_t *conf = container_of(kobj, raid5_conf_t, kobj); + + if (!entry->store) + return -EIO; + return entry->store(conf, page, length); +} + +static void raid5_free(struct kobject *ko) +{ + raid5_conf_t *conf = container_of(ko, raid5_conf_t, kobj); + kfree(conf); +} + + +static struct sysfs_ops raid5_sysfs_ops = { + .show = raid5_attr_show, + .store = raid5_attr_store, +}; + +static struct kobj_type raid5_ktype = { + .release = raid5_free, + .sysfs_ops = &raid5_sysfs_ops, + .default_attrs = raid5_default_attrs, +}; + static int run(mddev_t *mddev) { raid5_conf_t *conf; @@ -1855,6 +1970,10 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + } /* Ok, everything is just fine now */ + conf->kobj.parent = kobject_get(&mddev->kobj); + strcpy(conf->kobj.name, "raid5"); + conf->kobj.ktype = &raid5_ktype; + kobject_register(&conf->kobj); if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; @@ -1879,7 +1998,7 @@ abort: -static int stop (mddev_t *mddev) +static int stop(mddev_t *mddev) { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; @@ -1888,7 +2007,7 @@ static int stop (mddev_t *mddev) shrink_stripes(conf); free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - kfree(conf); + kobject_unregister(&conf->kobj); mddev->private = NULL; return 0; } diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index f025ba6fb14..5f4e945c808 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -228,6 +228,7 @@ struct raid5_private_data { * Cleared when a sync completes. */ + struct kobject kobj; /* * Free stripes pool */ -- cgit v1.2.3-70-g09d2 From 9d88883e68f404d5581bd391713ceef470ea53a9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:26 -0800 Subject: [PATCH] md: teach raid5 the difference between 'check' and 'repair'. With this, raid5 can be asked to check parity without repairing it. It also keeps a count of the number of incorrect parity blocks found (mismatches) and reports them through sysfs. Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 18 ++++++++++++++++-- drivers/md/raid5.c | 5 +++++ include/linux/raid/md_k.h | 4 ++++ 3 files changed, 25 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 37400873b87..e58d61d9f31 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1758,16 +1758,29 @@ md_store_scan(mddev_t *mddev, const char *page, size_t len) return len; } +static ssize_t +md_show_mismatch(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long) mddev->resync_mismatches); +} + static struct md_sysfs_entry md_scan_mode = { .attr = {.name = "scan_mode", .mode = S_IRUGO|S_IWUSR }, .show = md_show_scan, .store = md_store_scan, }; +static struct md_sysfs_entry md_mismatches = { + .attr = {.name = "mismatch_cnt", .mode = S_IRUGO }, + .show = md_show_mismatch, +}; + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_raid_disks.attr, &md_scan_mode.attr, + &md_mismatches.attr, NULL, }; @@ -3888,12 +3901,13 @@ static void md_do_sync(mddev_t *mddev) } } while (mddev->curr_resync < 2); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* resync follows the size requested by the personality, * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - else + mddev->resync_mismatches = 0; + } else /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 121fbaa9ed5..ce154553aca 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1292,6 +1292,11 @@ static void handle_stripe(struct stripe_head *sh) !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { /* parity is correct (on disc, not in buffer any more) */ set_bit(STRIPE_INSYNC, &sh->state); + } else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); } } if (!test_bit(STRIPE_INSYNC, &sh->state)) { diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index efd04dca0ab..cb8b44d1588 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -175,6 +175,10 @@ struct mddev_s sector_t resync_mark_cnt;/* blocks written at resync_mark */ sector_t resync_max_sectors; /* may be set by personality */ + + sector_t resync_mismatches; /* count of sectors where + * parity/replica mismatch found + */ /* recovery/resync flags * NEEDED: we might need to start a resync/recover * RUNNING: a thread is running, or about to be started -- cgit v1.2.3-70-g09d2 From d6065f7bf8bec170c9c56524a250093ce73ca5d9 Mon Sep 17 00:00:00 2001 From: Suzanne Wood Date: Tue, 8 Nov 2005 21:39:27 -0800 Subject: [PATCH] md: provide proper rcu_dereference / rcu_assign_pointer annotations in md Acked-by: Signed-off-by: Suzanne Wood Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/multipath.c | 8 ++++---- drivers/md/raid1.c | 20 ++++++++++---------- drivers/md/raid10.c | 30 ++++++++++++++++-------------- drivers/md/raid5.c | 8 ++++---- drivers/md/raid6main.c | 8 ++++---- 5 files changed, 38 insertions(+), 36 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index c06f4474192..ae2c5fd6105 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -63,7 +63,7 @@ static int multipath_map (multipath_conf_t *conf) rcu_read_lock(); for (i = 0; i < disks; i++) { - mdk_rdev_t *rdev = conf->multipaths[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); if (rdev && rdev->in_sync) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); @@ -139,7 +139,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_lock(); for (i=0; iraid_disks; i++) { - mdk_rdev_t *rdev = conf->multipaths[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { request_queue_t *r_queue = bdev_get_queue(rdev->bdev); @@ -224,7 +224,7 @@ static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_lock(); for (i=0; iraid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = conf->multipaths[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); if (rdev && !rdev->faulty) { struct block_device *bdev = rdev->bdev; request_queue_t *r_queue = bdev_get_queue(bdev); @@ -331,7 +331,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) conf->working_disks++; rdev->raid_disk = path; rdev->in_sync = 1; - p->rdev = rdev; + rcu_assign_pointer(p->rdev, rdev); found = 1; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e16f473bcf4..f12fc288f25 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -416,10 +416,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* Choose the first operation device, for consistancy */ new_disk = 0; - for (rdev = conf->mirrors[new_disk].rdev; + for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); !rdev || !rdev->in_sync || test_bit(WriteMostly, &rdev->flags); - rdev = conf->mirrors[++new_disk].rdev) { + rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { if (rdev && rdev->in_sync) wonly_disk = new_disk; @@ -434,10 +434,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* make sure the disk is operational */ - for (rdev = conf->mirrors[new_disk].rdev; + for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); !rdev || !rdev->in_sync || test_bit(WriteMostly, &rdev->flags); - rdev = conf->mirrors[new_disk].rdev) { + rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { if (rdev && rdev->in_sync) wonly_disk = new_disk; @@ -474,7 +474,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) disk = conf->raid_disks; disk--; - rdev = conf->mirrors[disk].rdev; + rdev = rcu_dereference(conf->mirrors[disk].rdev); if (!rdev || !rdev->in_sync || @@ -496,7 +496,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) if (new_disk >= 0) { - rdev = conf->mirrors[new_disk].rdev; + rdev = rcu_dereference(conf->mirrors[new_disk].rdev); if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); @@ -522,7 +522,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_lock(); for (i=0; iraid_disks; i++) { - mdk_rdev_t *rdev = conf->mirrors[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { request_queue_t *r_queue = bdev_get_queue(rdev->bdev); @@ -556,7 +556,7 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_lock(); for (i=0; iraid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = conf->mirrors[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !rdev->faulty) { struct block_device *bdev = rdev->bdev; request_queue_t *r_queue = bdev_get_queue(bdev); @@ -728,7 +728,7 @@ static int make_request(request_queue_t *q, struct bio * bio) #endif rcu_read_lock(); for (i = 0; i < disks; i++) { - if ((rdev=conf->mirrors[i].rdev) != NULL && + if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && !rdev->faulty) { atomic_inc(&rdev->nr_pending); if (rdev->faulty) { @@ -954,7 +954,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) found = 1; if (rdev->saved_raid_disk != mirror) conf->fullsync = 1; - p->rdev = rdev; + rcu_assign_pointer(p->rdev, rdev); break; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index bbe40e9cf92..26114f40bde 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -496,6 +496,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) int disk, slot, nslot; const int sectors = r10_bio->sectors; sector_t new_distance, current_distance; + mdk_rdev_t *rdev; raid10_find_phys(conf, r10_bio); rcu_read_lock(); @@ -510,8 +511,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) slot = 0; disk = r10_bio->devs[slot].devnum; - while (!conf->mirrors[disk].rdev || - !conf->mirrors[disk].rdev->in_sync) { + while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || + !rdev->in_sync) { slot++; if (slot == conf->copies) { slot = 0; @@ -527,8 +528,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) /* make sure the disk is operational */ slot = 0; disk = r10_bio->devs[slot].devnum; - while (!conf->mirrors[disk].rdev || - !conf->mirrors[disk].rdev->in_sync) { + while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || + !rdev->in_sync) { slot ++; if (slot == conf->copies) { disk = -1; @@ -547,11 +548,11 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) int ndisk = r10_bio->devs[nslot].devnum; - if (!conf->mirrors[ndisk].rdev || - !conf->mirrors[ndisk].rdev->in_sync) + if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || + !rdev->in_sync) continue; - if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) { + if (!atomic_read(&rdev->nr_pending)) { disk = ndisk; slot = nslot; break; @@ -569,7 +570,7 @@ rb_out: r10_bio->read_slot = slot; /* conf->next_seq_sect = this_sector + sectors;*/ - if (disk >= 0 && conf->mirrors[disk].rdev) + if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) atomic_inc(&conf->mirrors[disk].rdev->nr_pending); rcu_read_unlock(); @@ -583,7 +584,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_lock(); for (i=0; iraid_disks; i++) { - mdk_rdev_t *rdev = conf->mirrors[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { request_queue_t *r_queue = bdev_get_queue(rdev->bdev); @@ -614,7 +615,7 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_lock(); for (i=0; iraid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = conf->mirrors[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !rdev->faulty) { struct block_device *bdev = rdev->bdev; request_queue_t *r_queue = bdev_get_queue(bdev); @@ -768,9 +769,10 @@ static int make_request(request_queue_t *q, struct bio * bio) rcu_read_lock(); for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; - if (conf->mirrors[d].rdev && - !conf->mirrors[d].rdev->faulty) { - atomic_inc(&conf->mirrors[d].rdev->nr_pending); + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + !rdev->faulty) { + atomic_inc(&rdev->nr_pending); r10_bio->devs[i].bio = bio; } else r10_bio->devs[i].bio = NULL; @@ -980,7 +982,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) p->head_position = 0; rdev->raid_disk = mirror; found = 1; - p->rdev = rdev; + rcu_assign_pointer(p->rdev, rdev); break; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ce154553aca..061d265ed94 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1374,7 +1374,7 @@ static void handle_stripe(struct stripe_head *sh) bi->bi_end_io = raid5_end_read_request; rcu_read_lock(); - rdev = conf->disks[i].rdev; + rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && rdev->faulty) rdev = NULL; if (rdev) @@ -1448,7 +1448,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_lock(); for (i=0; iraid_disks; i++) { - mdk_rdev_t *rdev = conf->disks[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { request_queue_t *r_queue = bdev_get_queue(rdev->bdev); @@ -1493,7 +1493,7 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_lock(); for (i=0; iraid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = conf->disks[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !rdev->faulty) { struct block_device *bdev = rdev->bdev; request_queue_t *r_queue = bdev_get_queue(bdev); @@ -2165,7 +2165,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) found = 1; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; - p->rdev = rdev; + rcu_assign_pointer(p->rdev, rdev); break; } print_raid5_conf(conf); diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 77578694770..84f3ee01e4c 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -1464,7 +1464,7 @@ static void handle_stripe(struct stripe_head *sh) bi->bi_end_io = raid6_end_read_request; rcu_read_lock(); - rdev = conf->disks[i].rdev; + rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && rdev->faulty) rdev = NULL; if (rdev) @@ -1538,7 +1538,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_lock(); for (i=0; iraid_disks; i++) { - mdk_rdev_t *rdev = conf->disks[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { request_queue_t *r_queue = bdev_get_queue(rdev->bdev); @@ -1583,7 +1583,7 @@ static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_lock(); for (i=0; iraid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = conf->disks[i].rdev; + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !rdev->faulty) { struct block_device *bdev = rdev->bdev; request_queue_t *r_queue = bdev_get_queue(bdev); @@ -2154,7 +2154,7 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) found = 1; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; - p->rdev = rdev; + rcu_assign_pointer(p->rdev, rdev); break; } print_raid6_conf(conf); -- cgit v1.2.3-70-g09d2 From 9c79197761b4c181a143dc6a6044f4e47d44bdcc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:28 -0800 Subject: [PATCH] md: fix ref-counting problems with kobjects in md Thanks Greg. Cc: Greg KH Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 8 ++++++-- drivers/md/raid5.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index e58d61d9f31..fe0137a5b00 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -805,7 +805,11 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (fixdesc & (1<desc_nr)) { snprintf(rdev2->kobj.name, KOBJ_NAME_LEN, "dev%d", rdev2->desc_nr); + /* kobject_add gets a ref on the parent, so + * we have to drop the one we already have + */ kobject_add(&rdev2->kobj); + kobject_put(rdev->kobj.parent); sysfs_create_link(&rdev2->kobj, &rdev2->bdev->bd_disk->kobj, "block"); @@ -1178,7 +1182,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) rdev->kobj.k_name = NULL; snprintf(rdev->kobj.name, KOBJ_NAME_LEN, "dev%d", rdev->desc_nr); - rdev->kobj.parent = kobject_get(&mddev->kobj); + rdev->kobj.parent = &mddev->kobj; kobject_add(&rdev->kobj); sysfs_create_link(&rdev->kobj, &rdev->bdev->bd_disk->kobj, "block"); @@ -1864,7 +1868,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) add_disk(disk); mddev->gendisk = disk; up(&disks_sem); - mddev->kobj.parent = kobject_get(&disk->kobj); + mddev->kobj.parent = &disk->kobj; mddev->kobj.k_name = NULL; snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); mddev->kobj.ktype = &md_ktype; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 061d265ed94..246c9b1cc4a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1975,7 +1975,7 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + } /* Ok, everything is just fine now */ - conf->kobj.parent = kobject_get(&mddev->kobj); + conf->kobj.parent = &mddev->kobj; strcpy(conf->kobj.name, "raid5"); conf->kobj.ktype = &raid5_ktype; kobject_register(&conf->kobj); -- cgit v1.2.3-70-g09d2 From 007583c9253fed363a0bd71b039e9b40a0f6855e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:30 -0800 Subject: [PATCH] md: change raid5 sysfs attribute to not create a new directory There isn't really a need for raid5 attributes to be an a subdirectory, so this patch moves them from /sys/block/mdX/md/raid5/attribute to /sys/block/mdX/md/attribute This suggests that all md personalities should co-operate about namespace usage, but that shouldn't be a problem. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 6 ---- drivers/md/raid5.c | 72 ++++++++++------------------------------------ include/linux/raid/md_k.h | 7 +++++ include/linux/raid/raid5.h | 1 - 4 files changed, 22 insertions(+), 64 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 013f2f27589..3db5c351307 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1683,12 +1683,6 @@ static void analyze_sbs(mddev_t * mddev) } -struct md_sysfs_entry { - struct attribute attr; - ssize_t (*show)(mddev_t *, char *); - ssize_t (*store)(mddev_t *, const char *, size_t); -}; - static ssize_t md_show_level(mddev_t *mddev, char *page) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 246c9b1cc4a..08a1620b9f8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1732,21 +1732,17 @@ static void raid5d (mddev_t *mddev) PRINTK("--- raid5d inactive\n"); } -struct raid5_sysfs_entry { - struct attribute attr; - ssize_t (*show)(raid5_conf_t *, char *); - ssize_t (*store)(raid5_conf_t *, const char *, ssize_t); -}; - static ssize_t -raid5_show_stripe_cache_size(raid5_conf_t *conf, char *page) +raid5_show_stripe_cache_size(mddev_t *mddev, char *page) { + raid5_conf_t *conf = mddev_to_conf(mddev); return sprintf(page, "%d\n", conf->max_nr_stripes); } static ssize_t -raid5_store_stripe_cache_size(raid5_conf_t *conf, const char *page, ssize_t len) +raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) { + raid5_conf_t *conf = mddev_to_conf(mddev); char *end; int new; if (len >= PAGE_SIZE) @@ -1770,68 +1766,33 @@ raid5_store_stripe_cache_size(raid5_conf_t *conf, const char *page, ssize_t len) } return len; } -static struct raid5_sysfs_entry raid5_stripecache_size = { + +static struct md_sysfs_entry raid5_stripecache_size = { .attr = {.name = "stripe_cache_size", .mode = S_IRUGO | S_IWUSR }, .show = raid5_show_stripe_cache_size, .store = raid5_store_stripe_cache_size, }; static ssize_t -raid5_show_stripe_cache_active(raid5_conf_t *conf, char *page) +raid5_show_stripe_cache_active(mddev_t *mddev, char *page) { + raid5_conf_t *conf = mddev_to_conf(mddev); return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); } -static struct raid5_sysfs_entry raid5_stripecache_active = { +static struct md_sysfs_entry raid5_stripecache_active = { .attr = {.name = "stripe_cache_active", .mode = S_IRUGO}, .show = raid5_show_stripe_cache_active, }; -static struct attribute *raid5_default_attrs[] = { +static struct attribute *raid5_attrs[] = { &raid5_stripecache_size.attr, &raid5_stripecache_active.attr, NULL, }; - -static ssize_t -raid5_attr_show(struct kobject *kobj, struct attribute *attr, char *page) -{ - struct raid5_sysfs_entry *entry = container_of(attr, struct raid5_sysfs_entry, attr); - raid5_conf_t *conf = container_of(kobj, raid5_conf_t, kobj); - - if (!entry->show) - return -EIO; - return entry->show(conf, page); -} - -static ssize_t -raid5_attr_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - struct raid5_sysfs_entry *entry = container_of(attr, struct raid5_sysfs_entry, attr); - raid5_conf_t *conf = container_of(kobj, raid5_conf_t, kobj); - - if (!entry->store) - return -EIO; - return entry->store(conf, page, length); -} - -static void raid5_free(struct kobject *ko) -{ - raid5_conf_t *conf = container_of(ko, raid5_conf_t, kobj); - kfree(conf); -} - - -static struct sysfs_ops raid5_sysfs_ops = { - .show = raid5_attr_show, - .store = raid5_attr_store, -}; - -static struct kobj_type raid5_ktype = { - .release = raid5_free, - .sysfs_ops = &raid5_sysfs_ops, - .default_attrs = raid5_default_attrs, +static struct attribute_group raid5_attrs_group = { + .name = NULL, + .attrs = raid5_attrs, }; static int run(mddev_t *mddev) @@ -1975,10 +1936,7 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + } /* Ok, everything is just fine now */ - conf->kobj.parent = &mddev->kobj; - strcpy(conf->kobj.name, "raid5"); - conf->kobj.ktype = &raid5_ktype; - kobject_register(&conf->kobj); + sysfs_create_group(&mddev->kobj, &raid5_attrs_group); if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; @@ -2012,7 +1970,7 @@ static int stop(mddev_t *mddev) shrink_stripes(conf); free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - kobject_unregister(&conf->kobj); + sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); mddev->private = NULL; return 0; } diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index cb8b44d1588..4169c11e545 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -282,6 +282,13 @@ struct mdk_personality_s }; +struct md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(mddev_t *, char *); + ssize_t (*store)(mddev_t *, const char *, size_t); +}; + + static inline char * mdname (mddev_t * mddev) { return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 5f4e945c808..f025ba6fb14 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -228,7 +228,6 @@ struct raid5_private_data { * Cleared when a sync completes. */ - struct kobject kobj; /* * Free stripes pool */ -- cgit v1.2.3-70-g09d2 From ba22dcbf106338a5c46d6979f9b19564faae3d49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:31 -0800 Subject: [PATCH] md: improvements to raid5 handling of read errors Two refinements to the 'attempt-overwrite-on-read-error' mechanism. 1/ If the array is read-only, don't attempt an over-write. 2/ If there are more than max_nr_stripes read errors on a device with no success, fail the drive. This will make sure a dead drive will be eventually kicked even when we aren't trying to rewrite (which would normally kick a dead drive more quickly. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 1 + drivers/md/raid5.c | 25 +++++++++++++++++-------- include/linux/raid/md_k.h | 3 +++ 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 3db5c351307..3fb80397f8a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1582,6 +1582,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi rdev->in_sync = 0; rdev->data_offset = 0; atomic_set(&rdev->nr_pending, 0); + atomic_set(&rdev->read_errors, 0); size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; if (!size) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 08a1620b9f8..77610b98d4e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -420,21 +420,29 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } + if (atomic_read(&conf->disks[i].rdev->read_errors)) + atomic_set(&conf->disks[i].rdev->read_errors, 0); } else { + int retry = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); - if (conf->mddev->degraded) { + atomic_inc(&conf->disks[i].rdev->read_errors); + if (conf->mddev->degraded) printk("R5: read error not correctable.\n"); - clear_bit(R5_ReadError, &sh->dev[i].flags); - clear_bit(R5_ReWrite, &sh->dev[i].flags); - md_error(conf->mddev, conf->disks[i].rdev); - } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { + else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ printk("R5: read error NOT corrected!!\n"); + else if (atomic_read(&conf->disks[i].rdev->read_errors) + > conf->max_nr_stripes) + printk("raid5: Too many read errors, failing device.\n"); + else + retry = 1; + if (retry) + set_bit(R5_ReadError, &sh->dev[i].flags); + else { clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); md_error(conf->mddev, conf->disks[i].rdev); - } else - set_bit(R5_ReadError, &sh->dev[i].flags); + } } rdev_dec_pending(conf->disks[i].rdev, conf->mddev); #if 0 @@ -1328,7 +1336,8 @@ static void handle_stripe(struct stripe_head *sh) /* If the failed drive is just a ReadError, then we might need to progress * the repair/check process */ - if (failed == 1 && test_bit(R5_ReadError, &sh->dev[failed_num].flags) + if (failed == 1 && ! conf->mddev->ro && + test_bit(R5_ReadError, &sh->dev[failed_num].flags) && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags) && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags) ) { diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4169c11e545..200c69e34fc 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -134,6 +134,9 @@ struct mdk_rdev_s * only maintained for arrays that * support hot removal */ + atomic_t read_errors; /* number of consecutive read errors that + * we have tried to ignore. + */ }; typedef struct mdk_personality_s mdk_personality_t; -- cgit v1.2.3-70-g09d2 From b2d444d7ad975d555bb919601bcdc0e58975a40e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:31 -0800 Subject: [PATCH] md: convert 'faulty' and 'in_sync' fields to bits in 'flags' field This has the advantage of removing the confusion caused by 'rdev_t' and 'mddev_t' both having 'in_sync' fields. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/bitmap.c | 6 ++-- drivers/md/md.c | 92 +++++++++++++++++++++++------------------------ drivers/md/multipath.c | 23 ++++++------ drivers/md/raid1.c | 52 +++++++++++++-------------- drivers/md/raid10.c | 41 ++++++++++----------- drivers/md/raid5.c | 36 +++++++++---------- drivers/md/raid6main.c | 32 ++++++++--------- include/linux/raid/md_k.h | 8 ++--- 8 files changed, 146 insertions(+), 144 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index e59694bc575..c5fa4c2a5af 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -271,7 +271,8 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde return ERR_PTR(-ENOMEM); ITERATE_RDEV(mddev, rdev, tmp) { - if (! rdev->in_sync || rdev->faulty) + if (! test_bit(In_sync, &rdev->flags) + || test_bit(Faulty, &rdev->flags)) continue; target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512); @@ -291,7 +292,8 @@ static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wai struct list_head *tmp; ITERATE_RDEV(mddev, rdev, tmp) - if (rdev->in_sync && !rdev->faulty) + if (test_bit(In_sync, &rdev->flags) + && !test_bit(Faulty, &rdev->flags)) md_super_write(mddev, rdev, (rdev->sb_offset<<1) + offset + page->index * (PAGE_SIZE/512), diff --git a/drivers/md/md.c b/drivers/md/md.c index 3fb80397f8a..9dfa063d1c6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -610,7 +610,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); rdev->raid_disk = -1; - rdev->in_sync = 0; + rdev->flags = 0; if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; @@ -671,21 +671,19 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) return 0; if (mddev->level != LEVEL_MULTIPATH) { - rdev->faulty = 0; - rdev->flags = 0; desc = sb->disks + rdev->desc_nr; if (desc->state & (1<faulty = 1; + set_bit(Faulty, &rdev->flags); else if (desc->state & (1<raid_disk < mddev->raid_disks) { - rdev->in_sync = 1; + set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; } if (desc->state & (1<flags); } else /* MULTIPATH are always insync */ - rdev->in_sync = 1; + set_bit(In_sync, &rdev->flags); return 0; } @@ -761,7 +759,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) ITERATE_RDEV(mddev,rdev2,tmp) { mdp_disk_t *d; int desc_nr; - if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) + if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) + && !test_bit(Faulty, &rdev2->flags)) desc_nr = rdev2->raid_disk; else desc_nr = next_spare++; @@ -780,14 +779,15 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) d->number = rdev2->desc_nr; d->major = MAJOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev); - if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) + if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) + && !test_bit(Faulty, &rdev2->flags)) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ - if (rdev2->faulty) { + if (test_bit(Faulty, &rdev2->flags)) { d->state = (1<in_sync) { + } else if (test_bit(In_sync, &rdev2->flags)) { d->state = (1<state |= (1<sb_page); rdev->raid_disk = -1; - rdev->in_sync = 0; + rdev->flags = 0; if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; @@ -1027,22 +1027,19 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case 0xffff: /* spare */ - rdev->faulty = 0; break; case 0xfffe: /* faulty */ - rdev->faulty = 1; + set_bit(Faulty, &rdev->flags); break; default: - rdev->in_sync = 1; - rdev->faulty = 0; + set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; break; } - rdev->flags = 0; if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); } else /* MULTIPATH are always insync */ - rdev->in_sync = 1; + set_bit(In_sync, &rdev->flags); return 0; } @@ -1086,9 +1083,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) ITERATE_RDEV(mddev,rdev2,tmp) { i = rdev2->desc_nr; - if (rdev2->faulty) + if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); - else if (rdev2->in_sync) + else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(0xffff); @@ -1327,7 +1324,8 @@ static void print_rdev(mdk_rdev_t *rdev) char b[BDEVNAME_SIZE]; printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", bdevname(rdev->bdev,b), (unsigned long long)rdev->size, - rdev->faulty, rdev->in_sync, rdev->desc_nr); + test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), + rdev->desc_nr); if (rdev->sb_loaded) { printk(KERN_INFO "md: rdev superblock:\n"); print_sb((mdp_super_t*)page_address(rdev->sb_page)); @@ -1421,11 +1419,11 @@ repeat: ITERATE_RDEV(mddev,rdev,tmp) { char b[BDEVNAME_SIZE]; dprintk(KERN_INFO "md: "); - if (rdev->faulty) + if (test_bit(Faulty, &rdev->flags)) dprintk("(skipping faulty "); dprintk("%s ", bdevname(rdev->bdev,b)); - if (!rdev->faulty) { + if (!test_bit(Faulty, &rdev->flags)) { md_super_write(mddev,rdev, rdev->sb_offset<<1, rdev->sb_size, rdev->sb_page); @@ -1466,15 +1464,16 @@ rdev_show_state(mdk_rdev_t *rdev, char *page) char *sep = ""; int len=0; - if (rdev->faulty) { + if (test_bit(Faulty, &rdev->flags)) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } - if (rdev->in_sync) { + if (test_bit(In_sync, &rdev->flags)) { len += sprintf(page+len, "%sin_sync",sep); sep = ","; } - if (!rdev->faulty && !rdev->in_sync) { + if (!test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) { len += sprintf(page+len, "%sspare", sep); sep = ","; } @@ -1578,8 +1577,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj); rdev->desc_nr = -1; - rdev->faulty = 0; - rdev->in_sync = 0; + rdev->flags = 0; rdev->data_offset = 0; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); @@ -1670,7 +1668,7 @@ static void analyze_sbs(mddev_t * mddev) if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; - rdev->in_sync = 1; + set_bit(In_sync, &rdev->flags); } } @@ -1939,7 +1937,7 @@ static int do_md_run(mddev_t * mddev) /* devices must have minimum size of one chunk */ ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) + if (test_bit(Faulty, &rdev->flags)) continue; if (rdev->size < chunk_size / 1024) { printk(KERN_WARNING @@ -1967,7 +1965,7 @@ static int do_md_run(mddev_t * mddev) * Also find largest hardsector size */ ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) + if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev, 0); @@ -2304,7 +2302,7 @@ static int autostart_array(dev_t startdev) return err; } - if (start_rdev->faulty) { + if (test_bit(Faulty, &start_rdev->flags)) { printk(KERN_WARNING "md: can not autostart based on faulty %s!\n", bdevname(start_rdev->bdev,b)); @@ -2363,11 +2361,11 @@ static int get_array_info(mddev_t * mddev, void __user * arg) nr=working=active=failed=spare=0; ITERATE_RDEV(mddev,rdev,tmp) { nr++; - if (rdev->faulty) + if (test_bit(Faulty, &rdev->flags)) failed++; else { working++; - if (rdev->in_sync) + if (test_bit(In_sync, &rdev->flags)) active++; else spare++; @@ -2458,9 +2456,9 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) info.minor = MINOR(rdev->bdev->bd_dev); info.raid_disk = rdev->raid_disk; info.state = 0; - if (rdev->faulty) + if (test_bit(Faulty, &rdev->flags)) info.state |= (1<in_sync) { + else if (test_bit(In_sync, &rdev->flags)) { info.state |= (1<saved_raid_disk = rdev->raid_disk; - rdev->in_sync = 0; /* just to be sure */ + clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<flags); @@ -2591,11 +2589,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->raid_disk = -1; - rdev->faulty = 0; + rdev->flags = 0; + if (rdev->raid_disk < mddev->raid_disks) - rdev->in_sync = (info->state & (1<in_sync = 0; + if (info->state & (1<flags); if (info->state & (1<flags); @@ -2694,14 +2692,14 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) goto abort_export; } - if (rdev->faulty) { + if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING "md: can not hot-add faulty %s disk to %s!\n", bdevname(rdev->bdev,b), mdname(mddev)); err = -EINVAL; goto abort_export; } - rdev->in_sync = 0; + clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; bind_rdev_to_array(rdev, mddev); @@ -3428,7 +3426,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) return; } - if (!rdev || rdev->faulty) + if (!rdev || test_bit(Faulty, &rdev->flags)) return; /* dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", @@ -3626,7 +3624,7 @@ static int md_seq_show(struct seq_file *seq, void *v) bdevname(rdev->bdev,b), rdev->desc_nr); if (test_bit(WriteMostly, &rdev->flags)) seq_printf(seq, "(W)"); - if (rdev->faulty) { + if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; } else if (rdev->raid_disk < 0) @@ -4174,7 +4172,7 @@ void md_check_recovery(mddev_t *mddev) */ ITERATE_RDEV(mddev,rdev,rtmp) if (rdev->raid_disk >= 0 && - (rdev->faulty || ! rdev->in_sync) && + (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) { char nm[20]; @@ -4187,7 +4185,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->degraded) { ITERATE_RDEV(mddev,rdev,rtmp) if (rdev->raid_disk < 0 - && !rdev->faulty) { + && !test_bit(Faulty, &rdev->flags)) { if (mddev->pers->hot_add_disk(mddev,rdev)) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); @@ -4347,7 +4345,7 @@ static void autostart_arrays(int part) if (IS_ERR(rdev)) continue; - if (rdev->faulty) { + if (test_bit(Faulty, &rdev->flags)) { MD_BUG(); continue; } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index ae2c5fd6105..145cdc5ad00 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -64,7 +64,7 @@ static int multipath_map (multipath_conf_t *conf) rcu_read_lock(); for (i = 0; i < disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && rdev->in_sync) { + if (rdev && test_bit(In_sync, &rdev->flags)) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); return i; @@ -140,7 +140,8 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_lock(); for (i=0; iraid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { + if (rdev && !test_bit(Faulty, &rdev->flags) + && atomic_read(&rdev->nr_pending)) { request_queue_t *r_queue = bdev_get_queue(rdev->bdev); atomic_inc(&rdev->nr_pending); @@ -211,7 +212,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", conf->multipaths[i].rdev && - conf->multipaths[i].rdev->in_sync ? "U" : "_"); + test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_"); seq_printf (seq, "]"); } @@ -225,7 +226,7 @@ static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_lock(); for (i=0; iraid_disks && ret == 0; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && !rdev->faulty) { + if (rdev && !test_bit(Faulty, &rdev->flags)) { struct block_device *bdev = rdev->bdev; request_queue_t *r_queue = bdev_get_queue(bdev); @@ -265,10 +266,10 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) /* * Mark disk as unusable */ - if (!rdev->faulty) { + if (!test_bit(Faulty, &rdev->flags)) { char b[BDEVNAME_SIZE]; - rdev->in_sync = 0; - rdev->faulty = 1; + clear_bit(In_sync, &rdev->flags); + set_bit(Faulty, &rdev->flags); mddev->sb_dirty = 1; conf->working_disks--; printk(KERN_ALERT "multipath: IO failure on %s," @@ -298,7 +299,7 @@ static void print_multipath_conf (multipath_conf_t *conf) tmp = conf->multipaths + i; if (tmp->rdev) printk(" disk%d, o:%d, dev:%s\n", - i,!tmp->rdev->faulty, + i,!test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev,b)); } } @@ -330,7 +331,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) conf->working_disks++; rdev->raid_disk = path; - rdev->in_sync = 1; + set_bit(In_sync, &rdev->flags); rcu_assign_pointer(p->rdev, rdev); found = 1; } @@ -350,7 +351,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number) rdev = p->rdev; if (rdev) { - if (rdev->in_sync || + if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); err = -EBUSY; @@ -482,7 +483,7 @@ static int multipath_run (mddev_t *mddev) mddev->queue->max_sectors > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - if (!rdev->faulty) + if (!test_bit(Faulty, &rdev->flags)) conf->working_disks++; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f12fc288f25..fb6b866c28f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -417,11 +417,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) new_disk = 0; for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); - !rdev || !rdev->in_sync + !rdev || !test_bit(In_sync, &rdev->flags) || test_bit(WriteMostly, &rdev->flags); rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { - if (rdev && rdev->in_sync) + if (rdev && test_bit(In_sync, &rdev->flags)) wonly_disk = new_disk; if (new_disk == conf->raid_disks - 1) { @@ -435,11 +435,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* make sure the disk is operational */ for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); - !rdev || !rdev->in_sync || + !rdev || !test_bit(In_sync, &rdev->flags) || test_bit(WriteMostly, &rdev->flags); rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { - if (rdev && rdev->in_sync) + if (rdev && test_bit(In_sync, &rdev->flags)) wonly_disk = new_disk; if (new_disk <= 0) @@ -477,7 +477,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) rdev = rcu_dereference(conf->mirrors[disk].rdev); if (!rdev || - !rdev->in_sync || + !test_bit(In_sync, &rdev->flags) || test_bit(WriteMostly, &rdev->flags)) continue; @@ -500,7 +500,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); - if (!rdev->in_sync) { + if (!test_bit(In_sync, &rdev->flags)) { /* cannot risk returning a device that failed * before we inc'ed nr_pending */ @@ -523,7 +523,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_lock(); for (i=0; iraid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { + if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { request_queue_t *r_queue = bdev_get_queue(rdev->bdev); atomic_inc(&rdev->nr_pending); @@ -557,7 +557,7 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_lock(); for (i=0; iraid_disks && ret == 0; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !rdev->faulty) { + if (rdev && !test_bit(Faulty, &rdev->flags)) { struct block_device *bdev = rdev->bdev; request_queue_t *r_queue = bdev_get_queue(bdev); @@ -729,9 +729,9 @@ static int make_request(request_queue_t *q, struct bio * bio) rcu_read_lock(); for (i = 0; i < disks; i++) { if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && - !rdev->faulty) { + !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); - if (rdev->faulty) { + if (test_bit(Faulty, &rdev->flags)) { atomic_dec(&rdev->nr_pending); r1_bio->bios[i] = NULL; } else @@ -824,7 +824,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) seq_printf(seq, "%s", conf->mirrors[i].rdev && - conf->mirrors[i].rdev->in_sync ? "U" : "_"); + test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); seq_printf(seq, "]"); } @@ -840,14 +840,14 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * next level up know. * else mark the drive as failed */ - if (rdev->in_sync + if (test_bit(In_sync, &rdev->flags) && conf->working_disks == 1) /* * Don't fail the drive, act as though we were just a * normal single drive */ return; - if (rdev->in_sync) { + if (test_bit(In_sync, &rdev->flags)) { mddev->degraded++; conf->working_disks--; /* @@ -855,8 +855,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - rdev->in_sync = 0; - rdev->faulty = 1; + clear_bit(In_sync, &rdev->flags); + set_bit(Faulty, &rdev->flags); mddev->sb_dirty = 1; printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", @@ -881,7 +881,7 @@ static void print_conf(conf_t *conf) tmp = conf->mirrors + i; if (tmp->rdev) printk(" disk %d, wo:%d, o:%d, dev:%s\n", - i, !tmp->rdev->in_sync, !tmp->rdev->faulty, + i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev,b)); } } @@ -913,11 +913,11 @@ static int raid1_spare_active(mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) { tmp = conf->mirrors + i; if (tmp->rdev - && !tmp->rdev->faulty - && !tmp->rdev->in_sync) { + && !test_bit(Faulty, &tmp->rdev->flags) + && !test_bit(In_sync, &tmp->rdev->flags)) { conf->working_disks++; mddev->degraded--; - tmp->rdev->in_sync = 1; + set_bit(In_sync, &tmp->rdev->flags); } } @@ -972,7 +972,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) print_conf(conf); rdev = p->rdev; if (rdev) { - if (rdev->in_sync || + if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; goto abort; @@ -1282,11 +1282,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* make sure disk is operational */ wonly = disk; while (conf->mirrors[disk].rdev == NULL || - !conf->mirrors[disk].rdev->in_sync || + !test_bit(In_sync, &conf->mirrors[disk].rdev->flags) || test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) ) { if (conf->mirrors[disk].rdev && - conf->mirrors[disk].rdev->in_sync) + test_bit(In_sync, &conf->mirrors[disk].rdev->flags)) wonly = disk; if (disk <= 0) disk = conf->raid_disks; @@ -1333,10 +1333,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio->bi_rw = READ; bio->bi_end_io = end_sync_read; } else if (conf->mirrors[i].rdev == NULL || - conf->mirrors[i].rdev->faulty) { + test_bit(Faulty, &conf->mirrors[i].rdev->flags)) { still_degraded = 1; continue; - } else if (!conf->mirrors[i].rdev->in_sync || + } else if (!test_bit(In_sync, &conf->mirrors[i].rdev->flags) || sector_nr + RESYNC_SECTORS > mddev->recovery_cp) { bio->bi_rw = WRITE; bio->bi_end_io = end_sync_write; @@ -1478,7 +1478,7 @@ static int run(mddev_t *mddev) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; - if (!rdev->faulty && rdev->in_sync) + if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) conf->working_disks++; } conf->raid_disks = mddev->raid_disks; @@ -1518,7 +1518,7 @@ static int run(mddev_t *mddev) */ for (j = 0; j < conf->raid_disks && (!conf->mirrors[j].rdev || - !conf->mirrors[j].rdev->in_sync) ; j++) + !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++) /* nothing */; conf->last_used = j; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 26114f40bde..867f06ae33d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -512,7 +512,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) disk = r10_bio->devs[slot].devnum; while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || - !rdev->in_sync) { + !test_bit(In_sync, &rdev->flags)) { slot++; if (slot == conf->copies) { slot = 0; @@ -529,7 +529,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) slot = 0; disk = r10_bio->devs[slot].devnum; while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || - !rdev->in_sync) { + !test_bit(In_sync, &rdev->flags)) { slot ++; if (slot == conf->copies) { disk = -1; @@ -549,7 +549,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || - !rdev->in_sync) + !test_bit(In_sync, &rdev->flags)) continue; if (!atomic_read(&rdev->nr_pending)) { @@ -585,7 +585,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_lock(); for (i=0; iraid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { + if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { request_queue_t *r_queue = bdev_get_queue(rdev->bdev); atomic_inc(&rdev->nr_pending); @@ -616,7 +616,7 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_lock(); for (i=0; iraid_disks && ret == 0; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !rdev->faulty) { + if (rdev && !test_bit(Faulty, &rdev->flags)) { struct block_device *bdev = rdev->bdev; request_queue_t *r_queue = bdev_get_queue(bdev); @@ -771,7 +771,7 @@ static int make_request(request_queue_t *q, struct bio * bio) int d = r10_bio->devs[i].devnum; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && - !rdev->faulty) { + !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); r10_bio->devs[i].bio = bio; } else @@ -826,7 +826,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) seq_printf(seq, "%s", conf->mirrors[i].rdev && - conf->mirrors[i].rdev->in_sync ? "U" : "_"); + test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); seq_printf(seq, "]"); } @@ -841,7 +841,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * next level up know. * else mark the drive as failed */ - if (rdev->in_sync + if (test_bit(In_sync, &rdev->flags) && conf->working_disks == 1) /* * Don't fail the drive, just return an IO error. @@ -851,7 +851,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * really dead" tests... */ return; - if (rdev->in_sync) { + if (test_bit(In_sync, &rdev->flags)) { mddev->degraded++; conf->working_disks--; /* @@ -859,8 +859,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - rdev->in_sync = 0; - rdev->faulty = 1; + clear_bit(In_sync, &rdev->flags); + set_bit(Faulty, &rdev->flags); mddev->sb_dirty = 1; printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", @@ -885,7 +885,8 @@ static void print_conf(conf_t *conf) tmp = conf->mirrors + i; if (tmp->rdev) printk(" disk %d, wo:%d, o:%d, dev:%s\n", - i, !tmp->rdev->in_sync, !tmp->rdev->faulty, + i, !test_bit(In_sync, &tmp->rdev->flags), + !test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev,b)); } } @@ -938,11 +939,11 @@ static int raid10_spare_active(mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) { tmp = conf->mirrors + i; if (tmp->rdev - && !tmp->rdev->faulty - && !tmp->rdev->in_sync) { + && !test_bit(Faulty, &tmp->rdev->flags) + && !test_bit(In_sync, &tmp->rdev->flags)) { conf->working_disks++; mddev->degraded--; - tmp->rdev->in_sync = 1; + set_bit(In_sync, &tmp->rdev->flags); } } @@ -1000,7 +1001,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number) print_conf(conf); rdev = p->rdev; if (rdev) { - if (rdev->in_sync || + if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; goto abort; @@ -1416,7 +1417,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (i=0 ; iraid_disks; i++) if (conf->mirrors[i].rdev && - !conf->mirrors[i].rdev->in_sync) { + !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { /* want to reconstruct this device */ r10bio_t *rb2 = r10_bio; @@ -1437,7 +1438,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (j=0; jcopies;j++) { int d = r10_bio->devs[j].devnum; if (conf->mirrors[d].rdev && - conf->mirrors[d].rdev->in_sync) { + test_bit(In_sync, &conf->mirrors[d].rdev->flags)) { /* This is where we read from */ bio = r10_bio->devs[0].bio; bio->bi_next = biolist; @@ -1513,7 +1514,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio = r10_bio->devs[i].bio; bio->bi_end_io = NULL; if (conf->mirrors[d].rdev == NULL || - conf->mirrors[d].rdev->faulty) + test_bit(Faulty, &conf->mirrors[d].rdev->flags)) continue; atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); @@ -1699,7 +1700,7 @@ static int run(mddev_t *mddev) mddev->queue->max_sectors = (PAGE_SIZE>>9); disk->head_position = 0; - if (!rdev->faulty && rdev->in_sync) + if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) conf->working_disks++; } conf->raid_disks = mddev->raid_disks; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77610b98d4e..d1c488b008a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -525,19 +525,19 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) raid5_conf_t *conf = (raid5_conf_t *) mddev->private; PRINTK("raid5: error called\n"); - if (!rdev->faulty) { + if (!test_bit(Faulty, &rdev->flags)) { mddev->sb_dirty = 1; - if (rdev->in_sync) { + if (test_bit(In_sync, &rdev->flags)) { conf->working_disks--; mddev->degraded++; conf->failed_disks++; - rdev->in_sync = 0; + clear_bit(In_sync, &rdev->flags); /* * if recovery was running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - rdev->faulty = 1; + set_bit(Faulty, &rdev->flags); printk (KERN_ALERT "raid5: Disk failure on %s, disabling device." " Operation continuing on %d devices\n", @@ -1003,12 +1003,12 @@ static void handle_stripe(struct stripe_head *sh) } if (dev->written) written++; rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ - if (!rdev || !rdev->in_sync) { + if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag wil just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } - if (!rdev || !rdev->in_sync + if (!rdev || !test_bit(In_sync, &rdev->flags) || test_bit(R5_ReadError, &dev->flags)) { failed++; failed_num = i; @@ -1027,7 +1027,7 @@ static void handle_stripe(struct stripe_head *sh) if (test_bit(R5_ReadError, &sh->dev[i].flags)) { mdk_rdev_t *rdev = conf->disks[i].rdev; - if (rdev && rdev->in_sync) + if (rdev && test_bit(In_sync, &rdev->flags)) /* multiple read failures in one stripe */ md_error(conf->mddev, rdev); } @@ -1384,7 +1384,7 @@ static void handle_stripe(struct stripe_head *sh) rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && rdev->faulty) + if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) atomic_inc(&rdev->nr_pending); @@ -1458,7 +1458,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_lock(); for (i=0; iraid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { + if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { request_queue_t *r_queue = bdev_get_queue(rdev->bdev); atomic_inc(&rdev->nr_pending); @@ -1503,7 +1503,7 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_lock(); for (i=0; iraid_disks && ret == 0; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && !rdev->faulty) { + if (rdev && !test_bit(Faulty, &rdev->flags)) { struct block_device *bdev = rdev->bdev; request_queue_t *r_queue = bdev_get_queue(bdev); @@ -1850,7 +1850,7 @@ static int run(mddev_t *mddev) disk->rdev = rdev; - if (rdev->in_sync) { + if (test_bit(In_sync, &rdev->flags)) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "raid5: device %s operational as raid" " disk %d\n", bdevname(rdev->bdev,b), @@ -2029,7 +2029,7 @@ static void status (struct seq_file *seq, mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", conf->disks[i].rdev && - conf->disks[i].rdev->in_sync ? "U" : "_"); + test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); seq_printf (seq, "]"); #if RAID5_DEBUG #define D(x) \ @@ -2056,7 +2056,7 @@ static void print_raid5_conf (raid5_conf_t *conf) tmp = conf->disks + i; if (tmp->rdev) printk(" disk %d, o:%d, dev:%s\n", - i, !tmp->rdev->faulty, + i, !test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev,b)); } } @@ -2070,12 +2070,12 @@ static int raid5_spare_active(mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) { tmp = conf->disks + i; if (tmp->rdev - && !tmp->rdev->faulty - && !tmp->rdev->in_sync) { + && !test_bit(Faulty, &tmp->rdev->flags) + && !test_bit(In_sync, &tmp->rdev->flags)) { mddev->degraded--; conf->failed_disks--; conf->working_disks++; - tmp->rdev->in_sync = 1; + set_bit(In_sync, &tmp->rdev->flags); } } print_raid5_conf(conf); @@ -2092,7 +2092,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) print_raid5_conf(conf); rdev = p->rdev; if (rdev) { - if (rdev->in_sync || + if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; goto abort; @@ -2127,7 +2127,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) */ for (disk=0; disk < mddev->raid_disks; disk++) if ((p=conf->disks + disk)->rdev == NULL) { - rdev->in_sync = 0; + clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; found = 1; if (rdev->saved_raid_disk != disk) diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 84f3ee01e4c..eae5a35629c 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -507,19 +507,19 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) raid6_conf_t *conf = (raid6_conf_t *) mddev->private; PRINTK("raid6: error called\n"); - if (!rdev->faulty) { + if (!test_bit(Faulty, &rdev->flags)) { mddev->sb_dirty = 1; - if (rdev->in_sync) { + if (test_bit(In_sync, &rdev->flags)) { conf->working_disks--; mddev->degraded++; conf->failed_disks++; - rdev->in_sync = 0; + clear_bit(In_sync, &rdev->flags); /* * if recovery was running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - rdev->faulty = 1; + set_bit(Faulty, &rdev->flags); printk (KERN_ALERT "raid6: Disk failure on %s, disabling device." " Operation continuing on %d devices\n", @@ -1071,7 +1071,7 @@ static void handle_stripe(struct stripe_head *sh) } if (dev->written) written++; rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ - if (!rdev || !rdev->in_sync) { + if (!rdev || !test_bit(In_sync, &rdev->flags)) { if ( failed < 2 ) failed_num[failed] = i; failed++; @@ -1465,7 +1465,7 @@ static void handle_stripe(struct stripe_head *sh) rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && rdev->faulty) + if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) atomic_inc(&rdev->nr_pending); @@ -1539,7 +1539,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_lock(); for (i=0; iraid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { + if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { request_queue_t *r_queue = bdev_get_queue(rdev->bdev); atomic_inc(&rdev->nr_pending); @@ -1584,7 +1584,7 @@ static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_lock(); for (i=0; iraid_disks && ret == 0; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && !rdev->faulty) { + if (rdev && !test_bit(Faulty, &rdev->flags)) { struct block_device *bdev = rdev->bdev; request_queue_t *r_queue = bdev_get_queue(bdev); @@ -1868,7 +1868,7 @@ static int run(mddev_t *mddev) disk->rdev = rdev; - if (rdev->in_sync) { + if (test_bit(In_sync, &rdev->flags)) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "raid6: device %s operational as raid" " disk %d\n", bdevname(rdev->bdev,b), @@ -2052,7 +2052,7 @@ static void status (struct seq_file *seq, mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", conf->disks[i].rdev && - conf->disks[i].rdev->in_sync ? "U" : "_"); + test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); seq_printf (seq, "]"); #if RAID6_DUMPSTATE seq_printf (seq, "\n"); @@ -2078,7 +2078,7 @@ static void print_raid6_conf (raid6_conf_t *conf) tmp = conf->disks + i; if (tmp->rdev) printk(" disk %d, o:%d, dev:%s\n", - i, !tmp->rdev->faulty, + i, !test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev,b)); } } @@ -2092,12 +2092,12 @@ static int raid6_spare_active(mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) { tmp = conf->disks + i; if (tmp->rdev - && !tmp->rdev->faulty - && !tmp->rdev->in_sync) { + && !test_bit(Faulty, &tmp->rdev->flags) + && !test_bit(In_sync, &tmp->rdev->flags)) { mddev->degraded--; conf->failed_disks--; conf->working_disks++; - tmp->rdev->in_sync = 1; + set_bit(In_sync, &tmp->rdev->flags); } } print_raid6_conf(conf); @@ -2114,7 +2114,7 @@ static int raid6_remove_disk(mddev_t *mddev, int number) print_raid6_conf(conf); rdev = p->rdev; if (rdev) { - if (rdev->in_sync || + if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; goto abort; @@ -2149,7 +2149,7 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) */ for (disk=0; disk < mddev->raid_disks; disk++) if ((p=conf->disks + disk)->rdev == NULL) { - rdev->in_sync = 0; + clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; found = 1; if (rdev->saved_raid_disk != disk) diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 200c69e34fc..11629f92180 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -117,10 +117,10 @@ struct mdk_rdev_s * It can never have faulty==1, in_sync==1 * This reduces the burden of testing multiple flags in many cases */ - int faulty; /* if faulty do not issue IO requests */ - int in_sync; /* device is a full member of the array */ - unsigned long flags; /* Should include faulty and in_sync here. */ + unsigned long flags; +#define Faulty 1 /* device is known to have a fault */ +#define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ int desc_nr; /* descriptor index in the superblock */ @@ -247,7 +247,7 @@ struct mddev_s static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) { - int faulty = rdev->faulty; + int faulty = test_bit(Faulty, &rdev->flags); if (atomic_dec_and_test(&rdev->nr_pending) && faulty) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } -- cgit v1.2.3-70-g09d2 From 3855ad9f398de88a3290f7dd799633c4b73903ea Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:38 -0800 Subject: [PATCH] md: make sure a user-request sync of raid5 ignores intent bitmap A sync of raid5 usually ignore blocks which the bitmap says are in-sync. But a user-request check or repair should not ignore these. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d1c488b008a..51003b008de 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1645,6 +1645,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return rv; } if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { /* we can skip this block, and probably more */ sync_blocks /= STRIPE_SECTORS; -- cgit v1.2.3-70-g09d2 From 96de1e663cda65ba9275afb1bc007f34e5068ba1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:39 -0800 Subject: [PATCH] md: fix some locking and module refcounting issues with md's use of sysfs 1/ I really should be using the __ATTR macros for defining attributes, so that the .owner field get set properly, otherwise modules can be removed while sysfs files are open. This also involves some name changes of _show routines. 2/ Always lock the mddev (against reconfiguration) for all sysfs attribute access. This easily avoid certain races and is completely consistant with other interfaces (ioctl and /proc/mdstat both always lock against reconfiguration). 3/ raid5 attributes must check that the 'conf' structure actually exists (the array could have been stopped while an attribute file was open). 4/ A missing 'kfree' from when the raid5_conf_t was converted to have a kobject embedded, and then converted back again. Signed-off-by: Neil Brown Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 62 ++++++++++++++++++++++++------------------------------ drivers/md/raid5.c | 30 +++++++++++++++----------- 2 files changed, 46 insertions(+), 46 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 2b592897609..b2d8813ed71 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1504,7 +1504,7 @@ struct rdev_sysfs_entry { }; static ssize_t -rdev_show_state(mdk_rdev_t *rdev, char *page) +state_show(mdk_rdev_t *rdev, char *page) { char *sep = ""; int len=0; @@ -1525,13 +1525,11 @@ rdev_show_state(mdk_rdev_t *rdev, char *page) return len+sprintf(page+len, "\n"); } -static struct rdev_sysfs_entry rdev_state = { - .attr = {.name = "state", .mode = S_IRUGO }, - .show = rdev_show_state, -}; +static struct rdev_sysfs_entry +rdev_state = __ATTR_RO(state); static ssize_t -rdev_show_super(mdk_rdev_t *rdev, char *page) +super_show(mdk_rdev_t *rdev, char *page) { if (rdev->sb_loaded && rdev->sb_size) { memcpy(page, page_address(rdev->sb_page), rdev->sb_size); @@ -1539,10 +1537,8 @@ rdev_show_super(mdk_rdev_t *rdev, char *page) } else return 0; } -static struct rdev_sysfs_entry rdev_super = { - .attr = {.name = "super", .mode = S_IRUGO }, - .show = rdev_show_super, -}; +static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_super.attr, @@ -1728,7 +1724,7 @@ static void analyze_sbs(mddev_t * mddev) } static ssize_t -md_show_level(mddev_t *mddev, char *page) +level_show(mddev_t *mddev, char *page) { mdk_personality_t *p = mddev->pers; if (p == NULL) @@ -1739,21 +1735,15 @@ md_show_level(mddev_t *mddev, char *page) return sprintf(page, "%s\n", p->name); } -static struct md_sysfs_entry md_level = { - .attr = {.name = "level", .mode = S_IRUGO }, - .show = md_show_level, -}; +static struct md_sysfs_entry md_level = __ATTR_RO(level); static ssize_t -md_show_rdisks(mddev_t *mddev, char *page) +raid_disks_show(mddev_t *mddev, char *page) { return sprintf(page, "%d\n", mddev->raid_disks); } -static struct md_sysfs_entry md_raid_disks = { - .attr = {.name = "raid_disks", .mode = S_IRUGO }, - .show = md_show_rdisks, -}; +static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks); static ssize_t md_show_scan(mddev_t *mddev, char *page) @@ -1782,10 +1772,10 @@ md_store_scan(mddev_t *mddev, const char *page, size_t len) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return -EBUSY; - down(&mddev->reconfig_sem); + if (mddev->pers && mddev->pers->sync_request) canscan=1; - up(&mddev->reconfig_sem); + if (!canscan) return -EINVAL; @@ -1801,22 +1791,18 @@ md_store_scan(mddev_t *mddev, const char *page, size_t len) } static ssize_t -md_show_mismatch(mddev_t *mddev, char *page) +mismatch_cnt_show(mddev_t *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long) mddev->resync_mismatches); } -static struct md_sysfs_entry md_scan_mode = { - .attr = {.name = "scan_mode", .mode = S_IRUGO|S_IWUSR }, - .show = md_show_scan, - .store = md_store_scan, -}; +static struct md_sysfs_entry +md_scan_mode = __ATTR(scan_mode, S_IRUGO|S_IWUSR, md_show_scan, md_store_scan); -static struct md_sysfs_entry md_mismatches = { - .attr = {.name = "mismatch_cnt", .mode = S_IRUGO }, - .show = md_show_mismatch, -}; + +static struct md_sysfs_entry +md_mismatches = __ATTR_RO(mismatch_cnt); static struct attribute *md_default_attrs[] = { &md_level.attr, @@ -1831,10 +1817,14 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + ssize_t rv; if (!entry->show) return -EIO; - return entry->show(mddev, page); + mddev_lock(mddev); + rv = entry->show(mddev, page); + mddev_unlock(mddev); + return rv; } static ssize_t @@ -1843,10 +1833,14 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + ssize_t rv; if (!entry->store) return -EIO; - return entry->store(mddev, page, length); + mddev_lock(mddev); + rv = entry->store(mddev, page, length); + mddev_unlock(mddev); + return rv; } static void md_free(struct kobject *ko) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 51003b008de..e2a40283e32 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1746,7 +1746,10 @@ static ssize_t raid5_show_stripe_cache_size(mddev_t *mddev, char *page) { raid5_conf_t *conf = mddev_to_conf(mddev); - return sprintf(page, "%d\n", conf->max_nr_stripes); + if (conf) + return sprintf(page, "%d\n", conf->max_nr_stripes); + else + return 0; } static ssize_t @@ -1757,6 +1760,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) int new; if (len >= PAGE_SIZE) return -EINVAL; + if (!conf) + return -ENODEV; new = simple_strtoul(page, &end, 10); if (!*page || (*end && *end != '\n') ) @@ -1777,23 +1782,23 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) return len; } -static struct md_sysfs_entry raid5_stripecache_size = { - .attr = {.name = "stripe_cache_size", .mode = S_IRUGO | S_IWUSR }, - .show = raid5_show_stripe_cache_size, - .store = raid5_store_stripe_cache_size, -}; +static struct md_sysfs_entry +raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, + raid5_show_stripe_cache_size, + raid5_store_stripe_cache_size); static ssize_t -raid5_show_stripe_cache_active(mddev_t *mddev, char *page) +stripe_cache_active_show(mddev_t *mddev, char *page) { raid5_conf_t *conf = mddev_to_conf(mddev); - return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); + if (conf) + return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); + else + return 0; } -static struct md_sysfs_entry raid5_stripecache_active = { - .attr = {.name = "stripe_cache_active", .mode = S_IRUGO}, - .show = raid5_show_stripe_cache_active, -}; +static struct md_sysfs_entry +raid5_stripecache_active = __ATTR_RO(stripe_cache_active); static struct attribute *raid5_attrs[] = { &raid5_stripecache_size.attr, @@ -1981,6 +1986,7 @@ static int stop(mddev_t *mddev) free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); + kfree(conf); mddev->private = NULL; return 0; } -- cgit v1.2.3-70-g09d2 From 700e432d8364ce59c521abbe03a522051610ebc2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 28 Nov 2005 13:44:10 -0800 Subject: [PATCH] md: fix locking problem in r5/r6 bitmap_unplug actually writes data (bits) to storage, so we shouldn't be holding a spinlock... Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 2 ++ drivers/md/raid6main.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e2a40283e32..36d5f8ac826 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1704,7 +1704,9 @@ static void raid5d (mddev_t *mddev) if (conf->seq_flush - conf->seq_write > 0) { int seq = conf->seq_flush; + spin_unlock_irq(&conf->device_lock); bitmap_unplug(mddev->bitmap); + spin_lock_irq(&conf->device_lock); conf->seq_write = seq; activate_bit_delay(conf); } diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index eae5a35629c..17d88d90d1e 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -1784,7 +1784,9 @@ static void raid6d (mddev_t *mddev) if (conf->seq_flush - conf->seq_write > 0) { int seq = conf->seq_flush; + spin_unlock_irq(&conf->device_lock); bitmap_unplug(mddev->bitmap); + spin_lock_irq(&conf->device_lock); conf->seq_write = seq; activate_bit_delay(conf); } -- cgit v1.2.3-70-g09d2