summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig23
-rw-r--r--drivers/md/Makefile7
-rw-r--r--drivers/md/bitmap.c71
-rw-r--r--drivers/md/dm-crypt.c19
-rw-r--r--drivers/md/dm-emc.c345
-rw-r--r--drivers/md/dm-hw-handler.c213
-rw-r--r--drivers/md/dm-hw-handler.h63
-rw-r--r--drivers/md/dm-linear.c38
-rw-r--r--drivers/md/dm-log.c4
-rw-r--r--drivers/md/dm-mpath-hp-sw.c247
-rw-r--r--drivers/md/dm-mpath-rdac.c700
-rw-r--r--drivers/md/dm-mpath.c186
-rw-r--r--drivers/md/dm-mpath.h1
-rw-r--r--drivers/md/dm-snap.c163
-rw-r--r--drivers/md/dm-snap.h11
-rw-r--r--drivers/md/dm-table.c13
-rw-r--r--drivers/md/dm.c46
-rw-r--r--drivers/md/dm.h6
-rw-r--r--drivers/md/faulty.c2
-rw-r--r--drivers/md/linear.c31
-rw-r--r--drivers/md/md.c690
-rw-r--r--drivers/md/multipath.c21
-rw-r--r--drivers/md/raid0.c19
-rw-r--r--drivers/md/raid1.c63
-rw-r--r--drivers/md/raid10.c59
-rw-r--r--drivers/md/raid5.c819
26 files changed, 1311 insertions, 2549 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 610af916891..07d92c11b5d 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -252,27 +252,14 @@ config DM_ZERO
config DM_MULTIPATH
tristate "Multipath target"
depends on BLK_DEV_DM
+ # nasty syntax but means make DM_MULTIPATH independent
+ # of SCSI_DH if the latter isn't defined but if
+ # it is, DM_MULTIPATH must depend on it. We get a build
+ # error if SCSI_DH=m and DM_MULTIPATH=y
+ depends on SCSI_DH || !SCSI_DH
---help---
Allow volume managers to support multipath hardware.
-config DM_MULTIPATH_EMC
- tristate "EMC CX/AX multipath support"
- depends on DM_MULTIPATH && BLK_DEV_DM
- ---help---
- Multipath support for EMC CX/AX series hardware.
-
-config DM_MULTIPATH_RDAC
- tristate "LSI/Engenio RDAC multipath support (EXPERIMENTAL)"
- depends on DM_MULTIPATH && BLK_DEV_DM && SCSI && EXPERIMENTAL
- ---help---
- Multipath support for LSI/Engenio RDAC.
-
-config DM_MULTIPATH_HP
- tristate "HP MSA multipath support (EXPERIMENTAL)"
- depends on DM_MULTIPATH && BLK_DEV_DM && SCSI && EXPERIMENTAL
- ---help---
- Multipath support for HP MSA (Active/Passive) series hardware.
-
config DM_DELAY
tristate "I/O delaying target (EXPERIMENTAL)"
depends on BLK_DEV_DM && EXPERIMENTAL
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 7be09eeea29..f1ef33dfd8c 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -4,11 +4,9 @@
dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
dm-ioctl.o dm-io.o dm-kcopyd.o
-dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
+dm-multipath-objs := dm-path-selector.o dm-mpath.o
dm-snapshot-objs := dm-snap.o dm-exception-store.o
dm-mirror-objs := dm-raid1.o
-dm-rdac-objs := dm-mpath-rdac.o
-dm-hp-sw-objs := dm-mpath-hp-sw.o
md-mod-objs := md.o bitmap.o
raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \
raid6int1.o raid6int2.o raid6int4.o \
@@ -35,9 +33,6 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
-obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o
-obj-$(CONFIG_DM_MULTIPATH_HP) += dm-hp-sw.o
-obj-$(CONFIG_DM_MULTIPATH_RDAC) += dm-rdac.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index c14dacdacfa..621a272a2c7 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -203,17 +203,6 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
* bitmap file handling - read and write the bitmap file and its superblock
*/
-/* copy the pathname of a file to a buffer */
-char *file_path(struct file *file, char *buf, int count)
-{
- if (!buf)
- return NULL;
-
- buf = d_path(&file->f_path, buf, count);
-
- return IS_ERR(buf) ? NULL : buf;
-}
-
/*
* basic page I/O operations
*/
@@ -236,7 +225,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
|| test_bit(Faulty, &rdev->flags))
continue;
- target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
+ target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
page->index = index;
@@ -252,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{
mdk_rdev_t *rdev;
- struct list_head *tmp;
mddev_t *mddev = bitmap->mddev;
- rdev_for_each(rdev, tmp, mddev)
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
if (test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags)) {
int size = PAGE_SIZE;
@@ -271,32 +260,37 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+ (long)(page->index * (PAGE_SIZE/512))
+ size/512 > 0)
/* bitmap runs in to metadata */
- return -EINVAL;
+ goto bad_alignment;
if (rdev->data_offset + mddev->size*2
- > rdev->sb_offset*2 + bitmap->offset)
+ > rdev->sb_start + bitmap->offset)
/* data runs in to bitmap */
- return -EINVAL;
- } else if (rdev->sb_offset*2 < rdev->data_offset) {
+ goto bad_alignment;
+ } else if (rdev->sb_start < rdev->data_offset) {
/* METADATA BITMAP DATA */
- if (rdev->sb_offset*2
+ if (rdev->sb_start
+ bitmap->offset
+ page->index*(PAGE_SIZE/512) + size/512
> rdev->data_offset)
/* bitmap runs in to data */
- return -EINVAL;
+ goto bad_alignment;
} else {
/* DATA METADATA BITMAP - no problems */
}
md_super_write(mddev, rdev,
- (rdev->sb_offset<<1) + bitmap->offset
+ rdev->sb_start + bitmap->offset
+ page->index * (PAGE_SIZE/512),
size,
page);
}
+ rcu_read_unlock();
if (wait)
md_super_wait(mddev);
return 0;
+
+ bad_alignment:
+ rcu_read_unlock();
+ return -EINVAL;
}
static void bitmap_file_kick(struct bitmap *bitmap);
@@ -465,8 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
spin_unlock_irqrestore(&bitmap->lock, flags);
sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
sb->events = cpu_to_le64(bitmap->mddev->events);
- if (!bitmap->mddev->degraded)
- sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
+ if (bitmap->mddev->events < bitmap->events_cleared) {
+ /* rocking back to read-only */
+ bitmap->events_cleared = bitmap->mddev->events;
+ sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+ }
kunmap_atomic(sb, KM_USER0);
write_page(bitmap, bitmap->sb_page, 1);
}
@@ -721,11 +718,13 @@ static void bitmap_file_kick(struct bitmap *bitmap)
if (bitmap->file) {
path = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (path)
- ptr = file_path(bitmap->file, path, PAGE_SIZE);
+ ptr = d_path(&bitmap->file->f_path, path,
+ PAGE_SIZE);
+
printk(KERN_ALERT
"%s: kicking failed bitmap file %s from array!\n",
- bmname(bitmap), ptr ? ptr : "");
+ bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
kfree(path);
} else
@@ -1094,9 +1093,19 @@ void bitmap_daemon_work(struct bitmap *bitmap)
} else
spin_unlock_irqrestore(&bitmap->lock, flags);
lastpage = page;
-/*
- printk("bitmap clean at page %lu\n", j);
-*/
+
+ /* We are possibly going to clear some bits, so make
+ * sure that events_cleared is up-to-date.
+ */
+ if (bitmap->need_sync) {
+ bitmap_super_t *sb;
+ bitmap->need_sync = 0;
+ sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb->events_cleared =
+ cpu_to_le64(bitmap->events_cleared);
+ kunmap_atomic(sb, KM_USER0);
+ write_page(bitmap, bitmap->sb_page, 1);
+ }
spin_lock_irqsave(&bitmap->lock, flags);
clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
}
@@ -1266,6 +1275,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
return;
}
+ if (success &&
+ bitmap->events_cleared < bitmap->mddev->events) {
+ bitmap->events_cleared = bitmap->mddev->events;
+ bitmap->need_sync = 1;
+ }
+
if (!success && ! (*bmc & NEEDED_MASK))
*bmc |= NEEDED_MASK;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 835def11419..13956437bc8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -432,6 +432,7 @@ static int crypt_convert(struct crypt_config *cc,
case 0:
atomic_dec(&ctx->pending);
ctx->sector++;
+ cond_resched();
continue;
/* error */
@@ -1215,9 +1216,24 @@ error:
return -EINVAL;
}
+static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct crypt_config *cc = ti->private;
+ struct request_queue *q = bdev_get_queue(cc->dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = cc->dev->bdev;
+ bvm->bi_sector = cc->start + bvm->bi_sector - ti->begin;
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
static struct target_type crypt_target = {
.name = "crypt",
- .version= {1, 5, 0},
+ .version= {1, 6, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
@@ -1227,6 +1243,7 @@ static struct target_type crypt_target = {
.preresume = crypt_preresume,
.resume = crypt_resume,
.message = crypt_message,
+ .merge = crypt_merge,
};
static int __init dm_crypt_init(void)
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
deleted file mode 100644
index 3ea5ad4b780..00000000000
--- a/drivers/md/dm-emc.c
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved.
- * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
- *
- * This file is released under the GPL.
- *
- * Multipath support for EMC CLARiiON AX/CX-series hardware.
- */
-
-#include "dm.h"
-#include "dm-hw-handler.h"
-#include <scsi/scsi.h>
-#include <scsi/scsi_cmnd.h>
-
-#define DM_MSG_PREFIX "multipath emc"
-
-struct emc_handler {
- spinlock_t lock;
-
- /* Whether we should send the short trespass command (FC-series)
- * or the long version (default for AX/CX CLARiiON arrays). */
- unsigned short_trespass;
- /* Whether or not to honor SCSI reservations when initiating a
- * switch-over. Default: Don't. */
- unsigned hr;
-
- unsigned char sense[SCSI_SENSE_BUFFERSIZE];
-};
-
-#define TRESPASS_PAGE 0x22
-#define EMC_FAILOVER_TIMEOUT (60 * HZ)
-
-/* Code borrowed from dm-lsi-rdac by Mike Christie */
-
-static inline void free_bio(struct bio *bio)
-{
- __free_page(bio->bi_io_vec[0].bv_page);
- bio_put(bio);
-}
-
-static void emc_endio(struct bio *bio, int error)
-{
- struct dm_path *path = bio->bi_private;
-
- /* We also need to look at the sense keys here whether or not to
- * switch to the next PG etc.
- *
- * For now simple logic: either it works or it doesn't.
- */
- if (error)
- dm_pg_init_complete(path, MP_FAIL_PATH);
- else
- dm_pg_init_complete(path, 0);
-
- /* request is freed in block layer */
- free_bio(bio);
-}
-
-static struct bio *get_failover_bio(struct dm_path *path, unsigned data_size)
-{
- struct bio *bio;
- struct page *page;
-
- bio = bio_alloc(GFP_ATOMIC, 1);
- if (!bio) {
- DMERR("get_failover_bio: bio_alloc() failed.");
- return NULL;
- }
-
- bio->bi_rw |= (1 << BIO_RW);
- bio->bi_bdev = path->dev->bdev;
- bio->bi_sector = 0;
- bio->bi_private = path;
- bio->bi_end_io = emc_endio;
-
- page = alloc_page(GFP_ATOMIC);
- if (!page) {
- DMERR("get_failover_bio: alloc_page() failed.");
- bio_put(bio);
- return NULL;
- }
-
- if (bio_add_page(bio, page, data_size, 0) != data_size) {
- DMERR("get_failover_bio: bio_add_page() failed.");
- __free_page(page);
- bio_put(bio);
- return NULL;
- }
-
- return bio;
-}
-
-static struct request *get_failover_req(struct emc_handler *h,
- struct bio *bio, struct dm_path *path)
-{
- struct request *rq;
- struct block_device *bdev = bio->bi_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
-
- /* FIXME: Figure out why it fails with GFP_ATOMIC. */
- rq = blk_get_request(q, WRITE, __GFP_WAIT);
- if (!rq) {
- DMERR("get_failover_req: blk_get_request failed");
- return NULL;
- }
-
- blk_rq_append_bio(q, rq, bio);
-
- rq->sense = h->sense;
- memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
- rq->sense_len = 0;
-
- rq->timeout = EMC_FAILOVER_TIMEOUT;
- rq->cmd_type = REQ_TYPE_BLOCK_PC;
- rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE;
-
- return rq;
-}
-
-static struct request *emc_trespass_get(struct emc_handler *h,
- struct dm_path *path)
-{
- struct bio *bio;
- struct request *rq;
- unsigned char *page22;
- unsigned char long_trespass_pg[] = {
- 0, 0, 0, 0,
- TRESPASS_PAGE, /* Page code */
- 0x09, /* Page length - 2 */
- h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */
- 0xff, 0xff, /* Trespass target */
- 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */
- };
- unsigned char short_trespass_pg[] = {
- 0, 0, 0, 0,
- TRESPASS_PAGE, /* Page code */
- 0x02, /* Page length - 2 */
- h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */
- 0xff, /* Trespass target */
- };
- unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) :
- sizeof(long_trespass_pg);
-
- /* get bio backing */
- if (data_size > PAGE_SIZE)
- /* this should never happen */
- return NULL;
-
- bio = get_failover_bio(path, data_size);
- if (!bio) {
- DMERR("emc_trespass_get: no bio");
- return NULL;
- }
-
- page22 = (unsigned char *)bio_data(bio);
- memset(page22, 0, data_size);
-
- memcpy(page22, h->short_trespass ?
- short_trespass_pg : long_trespass_pg, data_size);
-
- /* get request for block layer packet command */
- rq = get_failover_req(h, bio, path);
- if (!rq) {
- DMERR("emc_trespass_get: no rq");
- free_bio(bio);
- return NULL;
- }
-
- /* Prepare the command. */
- rq->cmd[0] = MODE_SELECT;
- rq->cmd[1] = 0x10;
- rq->cmd[4] = data_size;
- rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);
-
- return rq;
-}
-
-static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
- struct dm_path *path)
-{
- struct request *rq;
- struct request_queue *q = bdev_get_queue(path->dev->bdev);
-
- /*
- * We can either blindly init the pg (then look at the sense),
- * or we can send some commands to get the state here (then
- * possibly send the fo cmnd), or we can also have the
- * initial state passed into us and then get an update here.
- */
- if (!q) {
- DMINFO("emc_pg_init: no queue");
- goto fail_path;
- }
-
- /* FIXME: The request should be pre-allocated. */
- rq = emc_trespass_get(hwh->context, path);
- if (!rq) {
- DMERR("emc_pg_init: no rq");
- goto fail_path;
- }
-
- DMINFO("emc_pg_init: sending switch-over command");
- elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1);
- return;
-
-fail_path:
- dm_pg_init_complete(path, MP_FAIL_PATH);
-}
-
-static struct emc_handler *alloc_emc_handler(void)
-{
- struct emc_handler *h = kzalloc(sizeof(*h), GFP_KERNEL);
-
- if (h)
- spin_lock_init(&h->lock);
-
- return h;
-}
-
-static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
-{
- struct emc_handler *h;
- unsigned hr, short_trespass;
-
- if (argc == 0) {
- /* No arguments: use defaults */
- hr = 0;
- short_trespass = 0;
- } else if (argc != 2) {
- DMWARN("incorrect number of arguments");
- return -EINVAL;
- } else {
- if ((sscanf(argv[0], "%u", &short_trespass) != 1)
- || (short_trespass > 1)) {
- DMWARN("invalid trespass mode selected");
- return -EINVAL;
- }
-
- if ((sscanf(argv[1], "%u", &hr) != 1)
- || (hr > 1)) {
- DMWARN("invalid honor reservation flag selected");
- return -EINVAL;
- }
- }
-
- h = alloc_emc_handler();
- if (!h)
- return -ENOMEM;
-
- hwh->context = h;
-
- if ((h->short_trespass = short_trespass))
- DMWARN("short trespass command will be send");
- else
- DMWARN("long trespass command will be send");
-
- if ((h->hr = hr))
- DMWARN("honor reservation bit will be set");
- else
- DMWARN("honor reservation bit will not be set (default)");
-
- return 0;
-}
-
-static void emc_destroy(struct hw_handler *hwh)
-{
- struct emc_handler *h = (struct emc_handler *) hwh->context;
-
- kfree(h);
- hwh->context = NULL;
-}
-
-static unsigned emc_error(struct hw_handler *hwh, struct bio *bio)
-{
- /* FIXME: Patch from axboe still missing */
-#if 0
- int sense;
-
- if (bio->bi_error & BIO_SENSE) {
- sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */
-
- if (sense == 0x020403) {
- /* LUN Not Ready - Manual Intervention Required
- * indicates this is a passive path.
- *
- * FIXME: However, if this is seen and EVPD C0
- * indicates that this is due to a NDU in
- * progress, we should set FAIL_PATH too.
- * This indicates we might have to do a SCSI
- * inquiry in the end_io path. Ugh. */
- return MP_BYPASS_PG | MP_RETRY_IO;
- } else if (sense == 0x052501) {
- /* An array based copy is in progress. Do not
- * fail the path, do not bypass to another PG,
- * do not retry. Fail the IO immediately.
- * (Actually this is the same conclusion as in
- * the default handler, but lets make sure.) */
- return 0;
- } else if (sense == 0x062900) {
- /* Unit Attention Code. This is the first IO
- * to the new path, so just retry. */
- return MP_RETRY_IO;
- }
- }
-#endif
-
- /* Try default handler */
- return dm_scsi_err_handler(hwh, bio);
-}
-
-static struct hw_handler_type emc_hwh = {
- .name = "emc",
- .module = THIS_MODULE,
- .create = emc_create,
- .destroy = emc_destroy,
- .pg_init = emc_pg_init,
- .error = emc_error,
-};
-
-static int __init dm_emc_init(void)
-{
- int r = dm_register_hw_handler(&emc_hwh);
-
- if (r < 0)
- DMERR("register failed %d", r);
-
- DMINFO("version 0.0.3 loaded");
-
- return r;
-}
-
-static void __exit dm_emc_exit(void)
-{
- int r = dm_unregister_hw_handler(&emc_hwh);
-
- if (r < 0)
- DMERR("unregister failed %d", r);
-}
-
-module_init(dm_emc_init);
-module_exit(dm_emc_exit);
-
-MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath");
-MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-hw-handler.c b/drivers/md/dm-hw-handler.c
deleted file mode 100644
index 2ee84d8aa0b..00000000000
--- a/drivers/md/dm-hw-handler.c
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
- *
- * This file is released under the GPL.
- *
- * Multipath hardware handler registration.
- */
-
-#include "dm.h"
-#include "dm-hw-handler.h"
-
-#include <linux/slab.h>
-
-struct hwh_internal {
- struct hw_handler_type hwht;
-
- struct list_head list;
- long use;
-};
-
-#define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht)
-
-static LIST_HEAD(_hw_handlers);
-static DECLARE_RWSEM(_hwh_lock);
-
-static struct hwh_internal *__find_hw_handler_type(const char *name)
-{
- struct hwh_internal *hwhi;
-
- list_for_each_entry(hwhi, &_hw_handlers, list) {
- if (!strcmp(name, hwhi->hwht.name))
- return hwhi;
- }
-
- return NULL;
-}
-
-static struct hwh_internal *get_hw_handler(const char *name)
-{
- struct hwh_internal *hwhi;
-
- down_read(&_hwh_lock);
- hwhi = __find_hw_handler_type(name);
- if (hwhi) {
- if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module))
- hwhi = NULL;
- else
- hwhi->use++;
- }
- up_read(&_hwh_lock);
-
- return hwhi;
-}
-
-struct hw_handler_type *dm_get_hw_handler(const char *name)
-{
- struct hwh_internal *hwhi;
-
- if (!name)
- return NULL;
-
- hwhi = get_hw_handler(name);
- if (!hwhi) {
- request_module("dm-%s", name);
- hwhi = get_hw_handler(name);
- }
-
- return hwhi ? &hwhi->hwht : NULL;
-}
-
-void dm_put_hw_handler(struct hw_handler_type *hwht)
-{
- struct hwh_internal *hwhi;
-
- if (!hwht)
- return;
-
- down_read(&_hwh_lock);
- hwhi = __find_hw_handler_type(hwht->name);
- if (!hwhi)
- goto out;
-
- if (--hwhi->use == 0)
- module_put(hwhi->hwht.module);
-
- BUG_ON(hwhi->use < 0);
-
- out:
- up_read(&_hwh_lock);
-}
-
-static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht)
-{
- struct hwh_internal *hwhi = kzalloc(sizeof(*hwhi), GFP_KERNEL);
-
- if (hwhi)
- hwhi->hwht = *hwht;
-
- return hwhi;
-}
-
-int dm_register_hw_handler(struct hw_handler_type *hwht)
-{
- int r = 0;
- struct hwh_internal *hwhi = _alloc_hw_handler(hwht);
-
- if (!hwhi)
- return -ENOMEM;
-
- down_write(&_hwh_lock);
-
- if (__find_hw_handler_type(hwht->name)) {
- kfree(hwhi);
- r = -EEXIST;
- } else
- list_add(&hwhi->list, &_hw_handlers);
-
- up_write(&_hwh_lock);
-
- return r;
-}
-
-int dm_unregister_hw_handler(struct hw_handler_type *hwht)
-{
- struct hwh_internal *hwhi;
-
- down_write(&_hwh_lock);
-
- hwhi = __find_hw_handler_type(hwht->name);
- if (!hwhi) {
- up_write(&_hwh_lock);
- return -EINVAL;
- }
-
- if (hwhi->use) {
- up_write(&_hwh_lock);
- return -ETXTBSY;
- }
-
- list_del(&hwhi->list);
-
- up_write(&_hwh_lock);
-
- kfree(hwhi);
-
- return 0;
-}
-
-unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio)
-{
-#if 0
- int sense_key, asc, ascq;
-
- if (bio->bi_error & BIO_SENSE) {
- /* FIXME: This is just an initial guess. */
- /* key / asc / ascq */
- sense_key = (bio->bi_error >> 16) & 0xff;
- asc = (bio->bi_error >> 8) & 0xff;
- ascq = bio->bi_error & 0xff;
-
- switch (sense_key) {
- /* This block as a whole comes from the device.
- * So no point retrying on another path. */
- case 0x03: /* Medium error */
- case 0x05: /* Illegal request */
- case 0x07: /* Data protect */
- case 0x08: /* Blank check */
- case 0x0a: /* copy aborted */
- case 0x0c: /* obsolete - no clue ;-) */
- case 0x0d: /* volume overflow */
- case 0x0e: /* data miscompare */
- case 0x0f: /* reserved - no idea either. */
- return MP_ERROR_IO;
-
- /* For these errors it's unclear whether they
- * come from the device or the controller.
- * So just lets try a different path, and if
- * it eventually succeeds, user-space will clear
- * the paths again... */
- case 0x02: /* Not ready */
- case 0x04: /* Hardware error */
- case 0x09: /* vendor specific */
- case 0x0b: /* Aborted command */
- return MP_FAIL_PATH;
-
- case 0x06: /* Unit attention - might want to decode */
- if (asc == 0x04 && ascq == 0x01)
- /* "Unit in the process of
- * becoming ready" */
- return 0;
- return MP_FAIL_PATH;
-
- /* FIXME: For Unit Not Ready we may want
- * to have a generic pg activation
- * feature (START_UNIT). */
-
- /* Should these two ever end up in the
- * error path? I don't think so. */
- case 0x00: /* No sense */
- case 0x01: /* Recovered error */
- return 0;
- }
- }
-#endif
-
- /* We got no idea how to decode the other kinds of errors ->
- * assume generic error condition. */
- return MP_FAIL_PATH;
-}
-
-EXPORT_SYMBOL_GPL(dm_register_hw_handler);
-EXPORT_SYMBOL_GPL(dm_unregister_hw_handler);
-EXPORT_SYMBOL_GPL(dm_scsi_err_handler);
diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h
deleted file mode 100644
index 46809dcb121..00000000000
--- a/drivers/md/dm-hw-handler.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
- *
- * This file is released under the GPL.
- *
- * Multipath hardware handler registration.
- */
-
-#ifndef DM_HW_HANDLER_H
-#define DM_HW_HANDLER_H
-
-#include <linux/device-mapper.h>
-
-#include "dm-mpath.h"
-
-struct hw_handler_type;
-struct hw_handler {
- struct hw_handler_type *type;
- struct mapped_device *md;
- void *context;
-};
-
-/*
- * Constructs a hardware handler object, takes custom arguments
- */
-/* Information about a hardware handler type */
-struct hw_handler_type {
- char *name;
- struct module *module;
-
- int (*create) (struct hw_handler *handler, unsigned int argc,
- char **argv);
- void (*destroy) (struct hw_handler *hwh);
-
- void (*pg_init) (struct hw_handler *hwh, unsigned bypassed,
- struct dm_path *path);
- unsigned (*error) (struct hw_handler *hwh, struct bio *bio);
- int (*status) (struct hw_handler *hwh, status_type_t type,
- char *result, unsigned int maxlen);
-};
-
-/* Register a hardware handler */
-int dm_register_hw_handler(struct hw_handler_type *type);
-
-/* Unregister a hardware handler */
-int dm_unregister_hw_handler(struct hw_handler_type *type);
-
-/* Returns a registered hardware handler type */
-struct hw_handler_type *dm_get_hw_handler(const char *name);
-
-/* Releases a hardware handler */
-void dm_put_hw_handler(struct hw_handler_type *hwht);
-
-/* Default err function */
-unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio);
-
-/* Error flags for err and dm_pg_init_complete */
-#define MP_FAIL_PATH 1
-#define MP_BYPASS_PG 2
-#define MP_ERROR_IO 4 /* Don't retry this I/O */
-#define MP_RETRY 8
-
-#endif
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 17753d80ad2..6449bcdf84c 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -69,13 +69,25 @@ static void linear_dtr(struct dm_target *ti)
kfree(lc);
}
-static int linear_map(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
+static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector)
{
- struct linear_c *lc = (struct linear_c *) ti->private;
+ struct linear_c *lc = ti->private;
+
+ return lc->start + (bi_sector - ti->begin);
+}
+
+static void linear_map_bio(struct dm_target *ti, struct bio *bio)
+{
+ struct linear_c *lc = ti->private;
bio->bi_bdev = lc->dev->bdev;
- bio->bi_sector = lc->start + (bio->bi_sector - ti->begin);
+ bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
+}
+
+static int linear_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ linear_map_bio(ti, bio);
return DM_MAPIO_REMAPPED;
}
@@ -114,15 +126,31 @@ static int linear_ioctl(struct dm_target *ti, struct inode *inode,
return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg);
}
+static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct linear_c *lc = ti->private;
+ struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = lc->dev->bdev;
+ bvm->bi_sector = linear_map_sector(ti, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
static struct target_type linear_target = {
.name = "linear",
- .version= {1, 0, 2},
+ .version= {1, 0, 3},
.module = THIS_MODULE,
.ctr = linear_ctr,
.dtr = linear_dtr,
.map = linear_map,
.status = linear_status,
.ioctl = linear_ioctl,
+ .merge = linear_merge,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 67a6f31b7fc..5b48478c79f 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -831,7 +831,7 @@ static struct dm_dirty_log_type _disk_type = {
.status = disk_status,
};
-int __init dm_dirty_log_init(void)
+static int __init dm_dirty_log_init(void)
{
int r;
@@ -848,7 +848,7 @@ int __init dm_dirty_log_init(void)
return r;
}
-void __exit dm_dirty_log_exit(void)
+static void __exit dm_dirty_log_exit(void)
{
dm_dirty_log_type_unregister(&_disk_type);
dm_dirty_log_type_unregister(&_core_type);
diff --git a/drivers/md/dm-mpath-hp-sw.c b/drivers/md/dm-mpath-hp-sw.c
deleted file mode 100644
index b63a0ab37c5..00000000000
--- a/drivers/md/dm-mpath-hp-sw.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (C) 2005 Mike Christie, All rights reserved.
- * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
- * Authors: Mike Christie
- * Dave Wysochanski
- *
- * This file is released under the GPL.
- *
- * This module implements the specific path activation code for
- * HP StorageWorks and FSC FibreCat Asymmetric (Active/Passive)
- * storage arrays.
- * These storage arrays have controller-based failover, not
- * LUN-based failover. However, LUN-based failover is the design
- * of dm-multipath. Thus, this module is written for LUN-based failover.
- */
-#include <linux/blkdev.h>
-#include <linux/list.h>
-#include <linux/types.h>
-#include <scsi/scsi.h>
-#include <scsi/scsi_cmnd.h>
-#include <scsi/scsi_dbg.h>
-
-#include "dm.h"
-#include "dm-hw-handler.h"
-
-#define DM_MSG_PREFIX "multipath hp-sw"
-#define DM_HP_HWH_NAME "hp-sw"
-#define DM_HP_HWH_VER "1.0.0"
-
-struct hp_sw_context {
- unsigned char sense[SCSI_SENSE_BUFFERSIZE];
-};
-
-/*
- * hp_sw_error_is_retryable - Is an HP-specific check condition retryable?
- * @req: path activation request
- *
- * Examine error codes of request and determine whether the error is retryable.
- * Some error codes are already retried by scsi-ml (see
- * scsi_decide_disposition), but some HP specific codes are not.
- * The intent of this routine is to supply the logic for the HP specific
- * check conditions.
- *
- * Returns:
- * 1 - command completed with retryable error
- * 0 - command completed with non-retryable error
- *
- * Possible optimizations
- * 1. More hardware-specific error codes
- */
-static int hp_sw_error_is_retryable(struct request *req)
-{
- /*
- * NOT_READY is known to be retryable
- * For now we just dump out the sense data and call it retryable
- */
- if (status_byte(req->errors) == CHECK_CONDITION)
- __scsi_print_sense(DM_HP_HWH_NAME, req->sense, req->sense_len);
-
- /*
- * At this point we don't have complete information about all the error
- * codes from this hardware, so we are just conservative and retry
- * when in doubt.
- */
- return 1;
-}
-
-/*
- * hp_sw_end_io - Completion handler for HP path activation.
- * @req: path activation request
- * @error: scsi-ml error
- *
- * Check sense data, free request structure, and notify dm that
- * pg initialization has completed.
- *
- * Context: scsi-ml softirq
- *
- */
-static void hp_sw_end_io(struct request *req, int error)
-{
- struct dm_path *path = req->end_io_data;
- unsigned err_flags = 0;
-
- if (!error) {
- DMDEBUG("%s path activation command - success",
- path->dev->name);
- goto out;
- }
-
- if (hp_sw_error_is_retryable(req)) {
- DMDEBUG("%s path activation command - retry",
- path->dev->name);
- err_flags = MP_RETRY;
- goto out;
- }
-
- DMWARN("%s path activation fail - error=0x%x",
- path->dev->name, error);
- err_flags = MP_FAIL_PATH;
-
-out:
- req->end_io_data = NULL;
- __blk_put_request(req->q, req);
- dm_pg_init_complete(path, err_flags);
-}
-
-/*
- * hp_sw_get_request - Allocate an HP specific path activation request
- * @path: path on which request will be sent (needed for request queue)
- *
- * The START command is used for path activation request.
- * These arrays are controller-based failover, not LUN based.
- * One START command issued to a single path will fail over all
- * LUNs for the same controller.
- *
- * Possible optimizations
- * 1. Make timeout configurable
- * 2. Preallocate request
- */
-static struct request *hp_sw_get_request(struct dm_path *path)
-{
- struct request *req;
- struct block_device *bdev = path->dev->bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- struct hp_sw_context *h = path->hwhcontext;
-
- req = blk_get_request(q, WRITE, GFP_NOIO);
- if (!req)
- goto out;
-
- req->timeout = 60 * HZ;
-
- req->errors = 0;
- req->cmd_type = REQ_TYPE_BLOCK_PC;
- req->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE;
- req->end_io_data = path;
- req->sense = h->sense;
- memset(req->sense, 0, SCSI_SENSE_BUFFERSIZE);
-
- req->cmd[0] = START_STOP;
- req->cmd[4] = 1;
- req->cmd_len = COMMAND_SIZE(req->cmd[0]);
-
-out:
- return req;
-}
-
-/*
- * hp_sw_pg_init - HP path activation implementation.
- * @hwh: hardware handler specific data
- * @bypassed: unused; is the path group bypassed? (see dm-mpath.c)
- * @path: path to send initialization command
- *
- * Send an HP-specific path activation command on 'path'.
- * Do not try to optimize in any way, just send the activation command.
- * More than one path activation command may be sent to the same controller.
- * This seems to work fine for basic failover support.
- *
- * Possible optimizations
- * 1. Detect an in-progress activation request and avoid submitting another one
- * 2. Model the controller and only send a single activation request at a time
- * 3. Determine the state of a path before sending an activation request
- *
- * Context: kmpathd (see process_queued_ios() in dm-mpath.c)
- */
-static void hp_sw_pg_init(struct hw_handler *hwh, unsigned bypassed,
- struct dm_path *path)
-{
- struct request *req;
- struct hp_sw_context *h;
-
- path->hwhcontext = hwh->context;
- h = hwh->context;
-
- req = hp_sw_get_request(path);
- if (!req) {
- DMERR("%s path activation command - allocation fail",
- path->dev->name);
- goto retry;
- }
-
- DMDEBUG("%s path activation command - sent", path->dev->name);
-
- blk_execute_rq_nowait(req->q, NULL, req, 1, hp_sw_end_io);
- return;
-
-retry:
- dm_pg_init_complete(path, MP_RETRY);
-}
-
-static int hp_sw_create(struct hw_handler *hwh, unsigned argc, char **argv)
-{
- struct hp_sw_context *h;
-
- h = kmalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return -ENOMEM;
-
- hwh->context = h;
-
- return 0;
-}
-
-static void hp_sw_destroy(struct hw_handler *hwh)
-{
- struct hp_sw_context *h = hwh->context;
-
- kfree(h);
-}
-
-static struct hw_handler_type hp_sw_hwh = {
- .name = DM_HP_HWH_NAME,
- .module = THIS_MODULE,
- .create = hp_sw_create,
- .destroy = hp_sw_destroy,
- .pg_init = hp_sw_pg_init,
-};
-
-static int __init hp_sw_init(void)
-{
- int r;
-
- r = dm_register_hw_handler(&hp_sw_hwh);
- if (r < 0)
- DMERR("register failed %d", r);
- else
- DMINFO("version " DM_HP_HWH_VER " loaded");
-
- return r;
-}
-
-static void __exit hp_sw_exit(void)
-{
- int r;
-
- r = dm_unregister_hw_handler(&hp_sw_hwh);
- if (r < 0)
- DMERR("unregister failed %d", r);
-}
-
-module_init(hp_sw_init);
-module_exit(hp_sw_exit);
-
-MODULE_DESCRIPTION("DM Multipath HP StorageWorks / FSC FibreCat (A/P) support");
-MODULE_AUTHOR("Mike Christie, Dave Wysochanski <dm-devel@redhat.com>");
-MODULE_LICENSE("GPL");
-MODULE_VERSION(DM_HP_HWH_VER);
diff --git a/drivers/md/dm-mpath-rdac.c b/drivers/md/dm-mpath-rdac.c
deleted file mode 100644
index 95e77734880..00000000000
--- a/drivers/md/dm-mpath-rdac.c
+++ /dev/null
@@ -1,700 +0,0 @@
-/*
- * Engenio/LSI RDAC DM HW handler
- *
- * Copyright (C) 2005 Mike Christie. All rights reserved.
- * Copyright (C) Chandra Seetharaman, IBM Corp. 2007
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- */
-#include <scsi/scsi.h>
-#include <scsi/scsi_cmnd.h>
-#include <scsi/scsi_eh.h>
-
-#define DM_MSG_PREFIX "multipath rdac"
-
-#include "dm.h"
-#include "dm-hw-handler.h"
-
-#define RDAC_DM_HWH_NAME "rdac"
-#define RDAC_DM_HWH_VER "0.4"
-
-/*
- * LSI mode page stuff
- *
- * These struct definitions and the forming of the
- * mode page were taken from the LSI RDAC 2.4 GPL'd
- * driver, and then converted to Linux conventions.
- */
-#define RDAC_QUIESCENCE_TIME 20;
-/*
- * Page Codes
- */
-#define RDAC_PAGE_CODE_REDUNDANT_CONTROLLER 0x2c
-
-/*
- * Controller modes definitions
- */
-#define RDAC_MODE_TRANSFER_ALL_LUNS 0x01
-#define RDAC_MODE_TRANSFER_SPECIFIED_LUNS 0x02
-
-/*
- * RDAC Options field
- */
-#define RDAC_FORCED_QUIESENCE 0x02
-
-#define RDAC_FAILOVER_TIMEOUT (60 * HZ)
-
-struct rdac_mode_6_hdr {
- u8 data_len;
- u8 medium_type;
- u8 device_params;
- u8 block_desc_len;
-};
-
-struct rdac_mode_10_hdr {
- u16 data_len;
- u8 medium_type;
- u8 device_params;
- u16 reserved;
- u16 block_desc_len;
-};
-
-struct rdac_mode_common {
- u8 controller_serial[16];
- u8 alt_controller_serial[16];
- u8 rdac_mode[2];
- u8 alt_rdac_mode[2];
- u8 quiescence_timeout;
- u8 rdac_options;
-};
-
-struct rdac_pg_legacy {
- struct rdac_mode_6_hdr hdr;
- u8 page_code;
- u8 page_len;
- struct rdac_mode_common common;
-#define MODE6_MAX_LUN 32
- u8 lun_table[MODE6_MAX_LUN];
- u8 reserved2[32];
- u8 reserved3;
- u8 reserved4;
-};
-
-struct rdac_pg_expanded {
- struct rdac_mode_10_hdr hdr;
- u8 page_code;
- u8 subpage_code;
- u8 page_len[2];
- struct rdac_mode_common common;
- u8 lun_table[256];
- u8 reserved3;
- u8 reserved4;
-};
-
-struct c9_inquiry {
- u8 peripheral_info;
- u8 page_code; /* 0xC9 */
- u8 reserved1;
- u8 page_len;
- u8 page_id[4]; /* "vace" */
- u8 avte_cvp;
- u8 path_prio;
- u8 reserved2[38];
-};
-
-#define SUBSYS_ID_LEN 16
-#define SLOT_ID_LEN 2
-
-struct c4_inquiry {
- u8 peripheral_info;
- u8 page_code; /* 0xC4 */
- u8 reserved1;
- u8 page_len;
- u8 page_id[4]; /* "subs" */
- u8 subsys_id[SUBSYS_ID_LEN];
- u8 revision[4];
- u8 slot_id[SLOT_ID_LEN];
- u8 reserved[2];
-};
-
-struct rdac_controller {
- u8 subsys_id[SUBSYS_ID_LEN];
- u8 slot_id[SLOT_ID_LEN];
- int use_10_ms;
- struct kref kref;
- struct list_head node; /* list of all controllers */
- spinlock_t lock;
- int submitted;
- struct list_head cmd_list; /* list of commands to be submitted */
- union {
- struct rdac_pg_legacy legacy;
- struct rdac_pg_expanded expanded;
- } mode_select;
-};
-struct c8_inquiry {
- u8 peripheral_info;
- u8 page_code; /* 0xC8 */
- u8 reserved1;
- u8 page_len;
- u8 page_id[4]; /* "edid" */
- u8 reserved2[3];
- u8 vol_uniq_id_len;
- u8 vol_uniq_id[16];
- u8 vol_user_label_len;
- u8 vol_user_label[60];
- u8 array_uniq_id_len;
- u8 array_unique_id[16];
- u8 array_user_label_len;
- u8 array_user_label[60];
- u8 lun[8];
-};
-
-struct c2_inquiry {
- u8 peripheral_info;
- u8 page_code; /* 0xC2 */
- u8 reserved1;
- u8 page_len;
- u8 page_id[4]; /* "swr4" */
- u8 sw_version[3];
- u8 sw_date[3];
- u8 features_enabled;
- u8 max_lun_supported;
- u8 partitions[239]; /* Total allocation length should be 0xFF */
-};
-
-struct rdac_handler {
- struct list_head entry; /* list waiting to submit MODE SELECT */
- unsigned timeout;
- struct rdac_controller *ctlr;
-#define UNINITIALIZED_LUN (1 << 8)
- unsigned lun;
- unsigned char sense[SCSI_SENSE_BUFFERSIZE];
- struct dm_path *path;
- struct work_struct work;
-#define SEND_C2_INQUIRY 1
-#define SEND_C4_INQUIRY 2
-#define SEND_C8_INQUIRY 3
-#define SEND_C9_INQUIRY 4
-#define SEND_MODE_SELECT 5
- int cmd_to_send;
- union {
- struct c2_inquiry c2;
- struct c4_inquiry c4;
- struct c8_inquiry c8;
- struct c9_inquiry c9;
- } inq;
-};
-
-static LIST_HEAD(ctlr_list);
-static DEFINE_SPINLOCK(list_lock);
-static struct workqueue_struct *rdac_wkqd;
-
-static inline int had_failures(struct request *req, int error)
-{
- return (error || host_byte(req->errors) != DID_OK ||
- msg_byte(req->errors) != COMMAND_COMPLETE);
-}
-
-static void rdac_resubmit_all(struct rdac_handler *h)
-{
- struct rdac_controller *ctlr = h->ctlr;
- struct rdac_handler *tmp, *h1;
-
- spin_lock(&ctlr->lock);
- list_for_each_entry_safe(h1, tmp, &ctlr->cmd_list, entry) {
- h1->cmd_to_send = SEND_C9_INQUIRY;
- queue_work(rdac_wkqd, &h1->work);
- list_del(&h1->entry);
- }
- ctlr->submitted = 0;
- spin_unlock(&ctlr->lock);
-}
-
-static void mode_select_endio(struct request *req, int error)
-{
- struct rdac_handler *h = req->end_io_data;
- struct scsi_sense_hdr sense_hdr;
- int sense = 0, fail = 0;
-
- if (had_failures(req, error)) {
- fail = 1;
- goto failed;
- }
-
- if (status_byte(req->errors) == CHECK_CONDITION) {
- scsi_normalize_sense(req->sense, SCSI_SENSE_BUFFERSIZE,
- &sense_hdr);
- sense = (sense_hdr.sense_key << 16) | (sense_hdr.asc << 8) |
- sense_hdr.ascq;
- /* If it is retryable failure, submit the c9 inquiry again */
- if (sense == 0x59136 || sense == 0x68b02 || sense == 0xb8b02 ||
- sense == 0x62900) {
- /* 0x59136 - Command lock contention
- * 0x[6b]8b02 - Quiesense in progress or achieved
- * 0x62900 - Power On, Reset, or Bus Device Reset
- */
- h->cmd_to_send = SEND_C9_INQUIRY;
- queue_work(rdac_wkqd, &h->work);
- goto done;
- }
- if (sense)
- DMINFO("MODE_SELECT failed on %s with sense 0x%x",
- h->path->dev->name, sense);
- }
-failed:
- if (fail || sense)
- dm_pg_init_complete(h->path, MP_FAIL_PATH);
- else
- dm_pg_init_complete(h->path, 0);
-
-done:
- rdac_resubmit_all(h);
- __blk_put_request(req->q, req);
-}
-
-static struct request *get_rdac_req(struct rdac_handler *h,
- void *buffer, unsigned buflen, int rw)
-{
- struct request *rq;
- struct request_queue *q = bdev_get_queue(h->path->dev->bdev);
-
- rq = blk_get_request(q, rw, GFP_KERNEL);
-
- if (!rq) {
- DMINFO("get_rdac_req: blk_get_request failed");
- return NULL;
- }
-
- if (buflen && blk_rq_map_kern(q, rq, buffer, buflen, GFP_KERNEL)) {
- blk_put_request(rq);
- DMINFO("get_rdac_req: blk_rq_map_kern failed");
- return NULL;
- }
-
- rq->sense = h->sense;
- memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
- rq->sense_len = 0;
-
- rq->end_io_data = h;
- rq->timeout = h->timeout;
- rq->cmd_type = REQ_TYPE_BLOCK_PC;
- rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE;
- return rq;
-}
-
-static struct request *rdac_failover_get(struct rdac_handler *h)
-{
- struct request *rq;
- struct rdac_mode_common *common;
- unsigned data_size;
-
- if (h->ctlr->use_10_ms) {
- struct rdac_pg_expanded *rdac_pg;
-
- data_size = sizeof(struct rdac_pg_expanded);
- rdac_pg = &h->ctlr->mode_select.expanded;
- memset(rdac_pg, 0, data_size);
- common = &rdac_pg->common;
- rdac_pg->page_code = RDAC_PAGE_CODE_REDUNDANT_CONTROLLER + 0x40;
- rdac_pg->subpage_code = 0x1;
- rdac_pg->page_len[0] = 0x01;
- rdac_pg->page_len[1] = 0x28;
- rdac_pg->lun_table[h->lun] = 0x81;
- } else {
- struct rdac_pg_legacy *rdac_pg;
-
- data_size = sizeof(struct rdac_pg_legacy);
- rdac_pg = &h->ctlr->mode_select.legacy;
- memset(rdac_pg, 0, data_size);
- common = &rdac_pg->common;
- rdac_pg->page_code = RDAC_PAGE_CODE_REDUNDANT_CONTROLLER;
- rdac_pg->page_len = 0x68;
- rdac_pg->lun_table[h->lun] = 0x81;
- }
- common->rdac_mode[1] = RDAC_MODE_TRANSFER_SPECIFIED_LUNS;
- common->quiescence_timeout = RDAC_QUIESCENCE_TIME;
- common->rdac_options = RDAC_FORCED_QUIESENCE;
-
- /* get request for block layer packet command */
- rq = get_rdac_req(h, &h->ctlr->mode_select, data_size, WRITE);
- if (!rq) {
- DMERR("rdac_failover_get: no rq");
- return NULL;
- }
-
- /* Prepare the command. */
- if (h->ctlr->use_10_ms) {
- rq->cmd[0] = MODE_SELECT_10;
- rq->cmd[7] = data_size >> 8;
- rq->cmd[8] = data_size & 0xff;
- } else {
- rq->cmd[0] = MODE_SELECT;
- rq->cmd[4] = data_size;
- }
- rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);
-
- return rq;
-}
-
-/* Acquires h->ctlr->lock */
-static void submit_mode_select(struct rdac_handler *h)
-{
- struct request *rq;
- struct request_queue *q = bdev_get_queue(h->path->dev->bdev);
-
- spin_lock(&h->ctlr->lock);
- if (h->ctlr->submitted) {
- list_add(&h->entry, &h->ctlr->cmd_list);
- goto drop_lock;
- }
-
- if (!q) {
- DMINFO("submit_mode_select: no queue");
- goto fail_path;
- }
-
- rq = rdac_failover_get(h);
- if (!rq) {
- DMERR("submit_mode_select: no rq");
- goto fail_path;
- }
-
- DMINFO("queueing MODE_SELECT command on %s", h->path->dev->name);
-
- blk_execute_rq_nowait(q, NULL, rq, 1, mode_select_endio);
- h->ctlr->submitted = 1;
- goto drop_lock;
-fail_path:
- dm_pg_init_complete(h->path, MP_FAIL_PATH);
-drop_lock:
- spin_unlock(&h->ctlr->lock);
-}
-
-static void release_ctlr(struct kref *kref)
-{
- struct rdac_controller *ctlr;
- ctlr = container_of(kref, struct rdac_controller, kref);
-
- spin_lock(&list_lock);
- list_del(&ctlr->node);
- spin_unlock(&list_lock);
- kfree(ctlr);
-}
-
-static struct rdac_controller *get_controller(u8 *subsys_id, u8 *slot_id)
-{
- struct rdac_controller *ctlr, *tmp;
-
- spin_lock(&list_lock);
-
- list_for_each_entry(tmp, &ctlr_list, node) {
- if ((memcmp(tmp->subsys_id, subsys_id, SUBSYS_ID_LEN) == 0) &&
- (memcmp(tmp->slot_id, slot_id, SLOT_ID_LEN) == 0)) {
- kref_get(&tmp->kref);
- spin_unlock(&list_lock);
- return tmp;
- }
- }
- ctlr = kmalloc(sizeof(*ctlr), GFP_ATOMIC);
- if (!ctlr)
- goto done;
-
- /* initialize fields of controller */
- memcpy(ctlr->subsys_id, subsys_id, SUBSYS_ID_LEN);
- memcpy(ctlr->slot_id, slot_id, SLOT_ID_LEN);
- kref_init(&ctlr->kref);
- spin_lock_init(&ctlr->lock);
- ctlr->submitted = 0;
- ctlr->use_10_ms = -1;
- INIT_LIST_HEAD(&ctlr->cmd_list);
- list_add(&ctlr->node, &ctlr_list);
-done:
- spin_unlock(&list_lock);
- return ctlr;
-}
-
-static void c4_endio(struct request *req, int error)
-{
- struct rdac_handler *h = req->end_io_data;
- struct c4_inquiry *sp;
-
- if (had_failures(req, error)) {
- dm_pg_init_complete(h->path, MP_FAIL_PATH);
- goto done;
- }
-
- sp = &h->inq.c4;
-
- h->ctlr = get_controller(sp->subsys_id, sp->slot_id);
-
- if (h->ctlr) {
- h->cmd_to_send = SEND_C9_INQUIRY;
- queue_work(rdac_wkqd, &h->work);
- } else
- dm_pg_init_complete(h->path, MP_FAIL_PATH);
-done:
- __blk_put_request(req->q, req);
-}
-
-static void c2_endio(struct request *req, int error)
-{
- struct rdac_handler *h = req->end_io_data;
- struct c2_inquiry *sp;
-
- if (had_failures(req, error)) {
- dm_pg_init_complete(h->path, MP_FAIL_PATH);
- goto done;
- }
-
- sp = &h->inq.c2;
-
- /* If more than MODE6_MAX_LUN luns are supported, use mode select 10 */
- if (sp->max_lun_supported >= MODE6_MAX_LUN)
- h->ctlr->use_10_ms = 1;
- else
- h->ctlr->use_10_ms = 0;
-
- h->cmd_to_send = SEND_MODE_SELECT;
- queue_work(rdac_wkqd, &h->work);
-done:
- __blk_put_request(req->q, req);
-}
-
-static void c9_endio(struct request *req, int error)
-{
- struct rdac_handler *h = req->end_io_data;
- struct c9_inquiry *sp;
-
- if (had_failures(req, error)) {
- dm_pg_init_complete(h->path, MP_FAIL_PATH);
- goto done;
- }
-
- /* We need to look at the sense keys here to take clear action.
- * For now simple logic: If the host is in AVT mode or if controller
- * owns the lun, return dm_pg_init_complete(), otherwise submit
- * MODE SELECT.
- */
- sp = &h->inq.c9;
-
- /* If in AVT mode, return success */
- if ((sp->avte_cvp >> 7) == 0x1) {
- dm_pg_init_complete(h->path, 0);
- goto done;
- }
-
- /* If the controller on this path owns the LUN, return success */
- if (sp->avte_cvp & 0x1) {
- dm_pg_init_complete(h->path, 0);
- goto done;
- }
-
- if (h->ctlr) {
- if (h->ctlr->use_10_ms == -1)
- h->cmd_to_send = SEND_C2_INQUIRY;
- else
- h->cmd_to_send = SEND_MODE_SELECT;
- } else
- h->cmd_to_send = SEND_C4_INQUIRY;
- queue_work(rdac_wkqd, &h->work);
-done:
- __blk_put_request(req->q, req);
-}
-
-static void c8_endio(struct request *req, int error)
-{
- struct rdac_handler *h = req->end_io_data;
- struct c8_inquiry *sp;
-
- if (had_failures(req, error)) {
- dm_pg_init_complete(h->path, MP_FAIL_PATH);
- goto done;
- }
-
- /* We need to look at the sense keys here to take clear action.
- * For now simple logic: Get the lun from the inquiry page.
- */
- sp = &h->inq.c8;
- h->lun = sp->lun[7]; /* currently it uses only one byte */
- h->cmd_to_send = SEND_C9_INQUIRY;
- queue_work(rdac_wkqd, &h->work);
-done:
- __blk_put_request(req->q, req);
-}
-
-static void submit_inquiry(struct rdac_handler *h, int page_code,
- unsigned int len, rq_end_io_fn endio)
-{
- struct request *rq;
- struct request_queue *q = bdev_get_queue(h->path->dev->bdev);
-
- if (!q)
- goto fail_path;
-
- rq = get_rdac_req(h, &h->inq, len, READ);
- if (!rq)
- goto fail_path;
-
- /* Prepare the command. */
- rq->cmd[0] = INQUIRY;
- rq->cmd[1] = 1;
- rq->cmd[2] = page_code;
- rq->cmd[4] = len;
- rq->cmd_len = COMMAND_SIZE(INQUIRY);
- blk_execute_rq_nowait(q, NULL, rq, 1, endio);
- return;
-
-fail_path:
- dm_pg_init_complete(h->path, MP_FAIL_PATH);
-}
-
-static void service_wkq(struct work_struct *work)
-{
- struct rdac_handler *h = container_of(work, struct rdac_handler, work);
-
- switch (h->cmd_to_send) {
- case SEND_C2_INQUIRY:
- submit_inquiry(h, 0xC2, sizeof(struct c2_inquiry), c2_endio);
- break;
- case SEND_C4_INQUIRY:
- submit_inquiry(h, 0xC4, sizeof(struct c4_inquiry), c4_endio);
- break;
- case SEND_C8_INQUIRY:
- submit_inquiry(h, 0xC8, sizeof(struct c8_inquiry), c8_endio);
- break;
- case SEND_C9_INQUIRY:
- submit_inquiry(h, 0xC9, sizeof(struct c9_inquiry), c9_endio);
- break;
- case SEND_MODE_SELECT:
- submit_mode_select(h);
- break;
- default:
- BUG();
- }
-}
-/*
- * only support subpage2c until we confirm that this is just a matter of
- * of updating firmware or not, and RDAC (basic AVT works already) for now
- * but we can add these in in when we get time and testers
- */
-static int rdac_create(struct hw_handler *hwh, unsigned argc, char **argv)
-{
- struct rdac_handler *h;
- unsigned timeout;
-
- if (argc == 0) {
- /* No arguments: use defaults */
- timeout = RDAC_FAILOVER_TIMEOUT;
- } else if (argc != 1) {
- DMWARN("incorrect number of arguments");
- return -EINVAL;
- } else {
- if (sscanf(argv[1], "%u", &timeout) != 1) {
- DMWARN("invalid timeout value");
- return -EINVAL;
- }
- }
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return -ENOMEM;
-
- hwh->context = h;
- h->timeout = timeout;
- h->lun = UNINITIALIZED_LUN;
- INIT_WORK(&h->work, service_wkq);
- DMWARN("using RDAC command with timeout %u", h->timeout);
-
- return 0;
-}
-
-static void rdac_destroy(struct hw_handler *hwh)
-{
- struct rdac_handler *h = hwh->context;
-
- if (h->ctlr)
- kref_put(&h->ctlr->kref, release_ctlr);
- kfree(h);
- hwh->context = NULL;
-}
-
-static unsigned rdac_error(struct hw_handler *hwh, struct bio *bio)
-{
- /* Try default handler */
- return dm_scsi_err_handler(hwh, bio);
-}
-
-static void rdac_pg_init(struct hw_handler *hwh, unsigned bypassed,
- struct dm_path *path)
-{
- struct rdac_handler *h = hwh->context;
-
- h->path = path;
- switch (h->lun) {
- case UNINITIALIZED_LUN:
- submit_inquiry(h, 0xC8, sizeof(struct c8_inquiry), c8_endio);
- break;
- default:
- submit_inquiry(h, 0xC9, sizeof(struct c9_inquiry), c9_endio);
- }
-}
-
-static struct hw_handler_type rdac_handler = {
- .name = RDAC_DM_HWH_NAME,
- .module = THIS_MODULE,
- .create = rdac_create,
- .destroy = rdac_destroy,
- .pg_init = rdac_pg_init,
- .error = rdac_error,
-};
-
-static int __init rdac_init(void)
-{
- int r;
-
- rdac_wkqd = create_singlethread_workqueue("rdac_wkqd");
- if (!rdac_wkqd) {
- DMERR("Failed to create workqueue rdac_wkqd.");
- return -ENOMEM;
- }
-
- r = dm_register_hw_handler(&rdac_handler);
- if (r < 0) {
- DMERR("%s: register failed %d", RDAC_DM_HWH_NAME, r);
- destroy_workqueue(rdac_wkqd);
- return r;
- }
-
- DMINFO("%s: version %s loaded", RDAC_DM_HWH_NAME, RDAC_DM_HWH_VER);
- return 0;
-}
-
-static void __exit rdac_exit(void)
-{
- int r = dm_unregister_hw_handler(&rdac_handler);
-
- destroy_workqueue(rdac_wkqd);
- if (r < 0)
- DMERR("%s: unregister failed %d", RDAC_DM_HWH_NAME, r);
-}
-
-module_init(rdac_init);
-module_exit(rdac_exit);
-
-MODULE_DESCRIPTION("DM Multipath LSI/Engenio RDAC support");
-MODULE_AUTHOR("Mike Christie, Chandra Seetharaman");
-MODULE_LICENSE("GPL");
-MODULE_VERSION(RDAC_DM_HWH_VER);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index e7ee59e655d..71dd65aa31b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -7,7 +7,6 @@
#include "dm.h"
#include "dm-path-selector.h"
-#include "dm-hw-handler.h"
#include "dm-bio-list.h"
#include "dm-bio-record.h"
#include "dm-uevent.h"
@@ -20,6 +19,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/workqueue.h>
+#include <scsi/scsi_dh.h>
#include <asm/atomic.h>
#define DM_MSG_PREFIX "multipath"
@@ -61,7 +61,8 @@ struct multipath {
spinlock_t lock;
- struct hw_handler hw_handler;
+ const char *hw_handler_name;
+ struct work_struct activate_path;
unsigned nr_priority_groups;
struct list_head priority_groups;
unsigned pg_init_required; /* pg_init needs calling? */
@@ -106,9 +107,10 @@ typedef int (*action_fn) (struct pgpath *pgpath);
static struct kmem_cache *_mpio_cache;
-static struct workqueue_struct *kmultipathd;
+static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void process_queued_ios(struct work_struct *work);
static void trigger_event(struct work_struct *work);
+static void activate_path(struct work_struct *work);
/*-----------------------------------------------
@@ -145,9 +147,12 @@ static struct priority_group *alloc_priority_group(void)
static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
{
struct pgpath *pgpath, *tmp;
+ struct multipath *m = ti->private;
list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
list_del(&pgpath->list);
+ if (m->hw_handler_name)
+ scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
dm_put_device(ti, pgpath->path.dev);
free_pgpath(pgpath);
}
@@ -178,6 +183,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m->queue_io = 1;
INIT_WORK(&m->process_queued_ios, process_queued_ios);
INIT_WORK(&m->trigger_event, trigger_event);
+ INIT_WORK(&m->activate_path, activate_path);
m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
if (!m->mpio_pool) {
kfree(m);
@@ -193,18 +199,13 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
static void free_multipath(struct multipath *m)
{
struct priority_group *pg, *tmp;
- struct hw_handler *hwh = &m->hw_handler;
list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
list_del(&pg->list);
free_priority_group(pg, m->ti);
}
- if (hwh->type) {
- hwh->type->destroy(hwh);
- dm_put_hw_handler(hwh->type);
- }
-
+ kfree(m->hw_handler_name);
mempool_destroy(m->mpio_pool);
kfree(m);
}
@@ -216,12 +217,10 @@ static void free_multipath(struct multipath *m)
static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
{
- struct hw_handler *hwh = &m->hw_handler;
-
m->current_pg = pgpath->pg;
/* Must we initialise the PG first, and queue I/O till it's ready? */
- if (hwh->type && hwh->type->pg_init) {
+ if (m->hw_handler_name) {
m->pg_init_required = 1;
m->queue_io = 1;
} else {
@@ -409,7 +408,6 @@ static void process_queued_ios(struct work_struct *work)
{
struct multipath *m =
container_of(work, struct multipath, process_queued_ios);
- struct hw_handler *hwh = &m->hw_handler;
struct pgpath *pgpath = NULL;
unsigned init_required = 0, must_queue = 1;
unsigned long flags;
@@ -439,7 +437,7 @@ out:
spin_unlock_irqrestore(&m->lock, flags);
if (init_required)
- hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path);
+ queue_work(kmpath_handlerd, &m->activate_path);
if (!must_queue)
dispatch_queued_ios(m);
@@ -530,8 +528,10 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
}
r = read_param(_params, shift(as), &ps_argc, &ti->error);
- if (r)
+ if (r) {
+ dm_put_path_selector(pst);
return -EINVAL;
+ }
r = pst->create(&pg->ps, ps_argc, as->argv);
if (r) {
@@ -551,6 +551,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
{
int r;
struct pgpath *p;
+ struct multipath *m = ti->private;
/* we need at least a path arg */
if (as->argc < 1) {
@@ -569,6 +570,15 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
goto bad;
}
+ if (m->hw_handler_name) {
+ r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev),
+ m->hw_handler_name);
+ if (r < 0) {
+ dm_put_device(ti, p->path.dev);
+ goto bad;
+ }
+ }
+
r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
if (r) {
dm_put_device(ti, p->path.dev);
@@ -628,8 +638,10 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
struct pgpath *pgpath;
struct arg_set path_args;
- if (as->argc < nr_params)
+ if (as->argc < nr_params) {
+ ti->error = "not enough path parameters";
goto bad;
+ }
path_args.argc = nr_params;
path_args.argv = as->argv;
@@ -652,8 +664,6 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
static int parse_hw_handler(struct arg_set *as, struct multipath *m)
{
- int r;
- struct hw_handler_type *hwht;
unsigned hw_argc;
struct dm_target *ti = m->ti;
@@ -661,30 +671,20 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
{0, 1024, "invalid number of hardware handler args"},
};
- r = read_param(_params, shift(as), &hw_argc, &ti->error);
- if (r)
+ if (read_param(_params, shift(as), &hw_argc, &ti->error))
return -EINVAL;
if (!hw_argc)
return 0;
- hwht = dm_get_hw_handler(shift(as));
- if (!hwht) {
+ m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
+ request_module("scsi_dh_%s", m->hw_handler_name);
+ if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
ti->error = "unknown hardware handler type";
+ kfree(m->hw_handler_name);
+ m->hw_handler_name = NULL;
return -EINVAL;
}
-
- m->hw_handler.md = dm_table_get_md(ti->table);
- dm_put(m->hw_handler.md);
-
- r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
- if (r) {
- dm_put_hw_handler(hwht);
- ti->error = "hardware handler constructor failed";
- return r;
- }
-
- m->hw_handler.type = hwht;
consume(as, hw_argc - 1);
return 0;
@@ -808,6 +808,7 @@ static void multipath_dtr(struct dm_target *ti)
{
struct multipath *m = (struct multipath *) ti->private;
+ flush_workqueue(kmpath_handlerd);
flush_workqueue(kmultipathd);
free_multipath(m);
}
@@ -883,7 +884,7 @@ static int reinstate_path(struct pgpath *pgpath)
if (pgpath->path.is_active)
goto out;
- if (!pgpath->pg->ps.type) {
+ if (!pgpath->pg->ps.type->reinstate_path) {
DMWARN("Reinstate path not supported by path selector %s",
pgpath->pg->ps.type->name);
r = -EINVAL;
@@ -1025,52 +1026,85 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
return limit_reached;
}
-/*
- * pg_init must call this when it has completed its initialisation
- */
-void dm_pg_init_complete(struct dm_path *path, unsigned err_flags)
+static void pg_init_done(struct dm_path *path, int errors)
{
struct pgpath *pgpath = path_to_pgpath(path);
struct priority_group *pg = pgpath->pg;
struct multipath *m = pg->m;
unsigned long flags;
- /*
- * If requested, retry pg_init until maximum number of retries exceeded.
- * If retry not requested and PG already bypassed, always fail the path.
- */
- if (err_flags & MP_RETRY) {
- if (pg_init_limit_reached(m, pgpath))
- err_flags |= MP_FAIL_PATH;
- } else if (err_flags && pg->bypassed)
- err_flags |= MP_FAIL_PATH;
-
- if (err_flags & MP_FAIL_PATH)
+ /* device or driver problems */
+ switch (errors) {
+ case SCSI_DH_OK:
+ break;
+ case SCSI_DH_NOSYS:
+ if (!m->hw_handler_name) {
+ errors = 0;
+ break;
+ }
+ DMERR("Cannot failover device because scsi_dh_%s was not "
+ "loaded.", m->hw_handler_name);
+ /*
+ * Fail path for now, so we do not ping pong
+ */
fail_path(pgpath);
-
- if (err_flags & MP_BYPASS_PG)
+ break;
+ case SCSI_DH_DEV_TEMP_BUSY:
+ /*
+ * Probably doing something like FW upgrade on the
+ * controller so try the other pg.
+ */
bypass_pg(m, pg, 1);
+ break;
+ /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
+ case SCSI_DH_RETRY:
+ case SCSI_DH_IMM_RETRY:
+ case SCSI_DH_RES_TEMP_UNAVAIL:
+ if (pg_init_limit_reached(m, pgpath))
+ fail_path(pgpath);
+ errors = 0;
+ break;
+ default:
+ /*
+ * We probably do not want to fail the path for a device
+ * error, but this is what the old dm did. In future
+ * patches we can do more advanced handling.
+ */
+ fail_path(pgpath);
+ }
spin_lock_irqsave(&m->lock, flags);
- if (err_flags & ~MP_RETRY) {
+ if (errors) {
+ DMERR("Could not failover device. Error %d.", errors);
m->current_pgpath = NULL;
m->current_pg = NULL;
- } else if (!m->pg_init_required)
+ } else if (!m->pg_init_required) {
m->queue_io = 0;
+ pg->bypassed = 0;
+ }
m->pg_init_in_progress = 0;
queue_work(kmultipathd, &m->process_queued_ios);
spin_unlock_irqrestore(&m->lock, flags);
}
+static void activate_path(struct work_struct *work)
+{
+ int ret;
+ struct multipath *m =
+ container_of(work, struct multipath, activate_path);
+ struct dm_path *path = &m->current_pgpath->path;
+
+ ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev));
+ pg_init_done(path, ret);
+}
+
/*
* end_io handling
*/
static int do_end_io(struct multipath *m, struct bio *bio,
int error, struct dm_mpath_io *mpio)
{
- struct hw_handler *hwh = &m->hw_handler;
- unsigned err_flags = MP_FAIL_PATH; /* Default behavior */
unsigned long flags;
if (!error)
@@ -1097,19 +1131,8 @@ static int do_end_io(struct multipath *m, struct bio *bio,
}
spin_unlock_irqrestore(&m->lock, flags);
- if (hwh->type && hwh->type->error)
- err_flags = hwh->type->error(hwh, bio);
-
- if (mpio->pgpath) {
- if (err_flags & MP_FAIL_PATH)
- fail_path(mpio->pgpath);
-
- if (err_flags & MP_BYPASS_PG)
- bypass_pg(m, mpio->pgpath->pg, 1);
- }
-
- if (err_flags & MP_ERROR_IO)
- return -EIO;
+ if (mpio->pgpath)
+ fail_path(mpio->pgpath);
requeue:
dm_bio_restore(&mpio->details, bio);
@@ -1194,7 +1217,6 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
int sz = 0;
unsigned long flags;
struct multipath *m = (struct multipath *) ti->private;
- struct hw_handler *hwh = &m->hw_handler;
struct priority_group *pg;
struct pgpath *p;
unsigned pg_num;
@@ -1214,12 +1236,10 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
DMEMIT("pg_init_retries %u ", m->pg_init_retries);
}
- if (hwh->type && hwh->type->status)
- sz += hwh->type->status(hwh, type, result + sz, maxlen - sz);
- else if (!hwh->type || type == STATUSTYPE_INFO)
+ if (!m->hw_handler_name || type == STATUSTYPE_INFO)
DMEMIT("0 ");
else
- DMEMIT("1 %s ", hwh->type->name);
+ DMEMIT("1 %s ", m->hw_handler_name);
DMEMIT("%u ", m->nr_priority_groups);
@@ -1422,6 +1442,21 @@ static int __init dm_multipath_init(void)
return -ENOMEM;
}
+ /*
+ * A separate workqueue is used to handle the device handlers
+ * to avoid overloading existing workqueue. Overloading the
+ * old workqueue would also create a bottleneck in the
+ * path of the storage hardware device activation.
+ */
+ kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd");
+ if (!kmpath_handlerd) {
+ DMERR("failed to create workqueue kmpath_handlerd");
+ destroy_workqueue(kmultipathd);
+ dm_unregister_target(&multipath_target);
+ kmem_cache_destroy(_mpio_cache);
+ return -ENOMEM;
+ }
+
DMINFO("version %u.%u.%u loaded",
multipath_target.version[0], multipath_target.version[1],
multipath_target.version[2]);
@@ -1433,6 +1468,7 @@ static void __exit dm_multipath_exit(void)
{
int r;
+ destroy_workqueue(kmpath_handlerd);
destroy_workqueue(kmultipathd);
r = dm_unregister_target(&multipath_target);
@@ -1441,8 +1477,6 @@ static void __exit dm_multipath_exit(void)
kmem_cache_destroy(_mpio_cache);
}
-EXPORT_SYMBOL_GPL(dm_pg_init_complete);
-
module_init(dm_multipath_init);
module_exit(dm_multipath_exit);
diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h
index b9cdcbb3ed5..c198b856a45 100644
--- a/drivers/md/dm-mpath.h
+++ b/drivers/md/dm-mpath.h
@@ -16,7 +16,6 @@ struct dm_path {
unsigned is_active; /* Read-only */
void *pscontext; /* For path-selector use */
- void *hwhcontext; /* For hw-handler use */
};
/* Callback for hwh_pg_init_fn to use when complete */
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 1ba8a47d61b..6e5528aecc9 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -40,6 +40,11 @@
*/
#define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1)
+/*
+ * The size of the mempool used to track chunks in use.
+ */
+#define MIN_IOS 256
+
static struct workqueue_struct *ksnapd;
static void flush_queued_bios(struct work_struct *work);
@@ -91,7 +96,63 @@ struct dm_snap_pending_exception {
*/
static struct kmem_cache *exception_cache;
static struct kmem_cache *pending_cache;
-static mempool_t *pending_pool;
+
+struct dm_snap_tracked_chunk {
+ struct hlist_node node;
+ chunk_t chunk;
+};
+
+static struct kmem_cache *tracked_chunk_cache;
+
+static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s,
+ chunk_t chunk)
+{
+ struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool,
+ GFP_NOIO);
+ unsigned long flags;
+
+ c->chunk = chunk;
+
+ spin_lock_irqsave(&s->tracked_chunk_lock, flags);
+ hlist_add_head(&c->node,
+ &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
+ spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
+
+ return c;
+}
+
+static void stop_tracking_chunk(struct dm_snapshot *s,
+ struct dm_snap_tracked_chunk *c)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&s->tracked_chunk_lock, flags);
+ hlist_del(&c->node);
+ spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
+
+ mempool_free(c, s->tracked_chunk_pool);
+}
+
+static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
+{
+ struct dm_snap_tracked_chunk *c;
+ struct hlist_node *hn;
+ int found = 0;
+
+ spin_lock_irq(&s->tracked_chunk_lock);
+
+ hlist_for_each_entry(c, hn,
+ &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
+ if (c->chunk == chunk) {
+ found = 1;
+ break;
+ }
+ }
+
+ spin_unlock_irq(&s->tracked_chunk_lock);
+
+ return found;
+}
/*
* One of these per registered origin, held in the snapshot_origins hash
@@ -302,14 +363,19 @@ static void free_exception(struct dm_snap_exception *e)
kmem_cache_free(exception_cache, e);
}
-static struct dm_snap_pending_exception *alloc_pending_exception(void)
+static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
{
- return mempool_alloc(pending_pool, GFP_NOIO);
+ struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool,
+ GFP_NOIO);
+
+ pe->snap = s;
+
+ return pe;
}
static void free_pending_exception(struct dm_snap_pending_exception *pe)
{
- mempool_free(pe, pending_pool);
+ mempool_free(pe, pe->snap->pending_pool);
}
static void insert_completed_exception(struct dm_snapshot *s,
@@ -482,6 +548,7 @@ static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg,
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct dm_snapshot *s;
+ int i;
int r = -EINVAL;
char persistent;
char *origin_path;
@@ -564,11 +631,30 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad5;
}
+ s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
+ if (!s->pending_pool) {
+ ti->error = "Could not allocate mempool for pending exceptions";
+ goto bad6;
+ }
+
+ s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
+ tracked_chunk_cache);
+ if (!s->tracked_chunk_pool) {
+ ti->error = "Could not allocate tracked_chunk mempool for "
+ "tracking reads";
+ goto bad_tracked_chunk_pool;
+ }
+
+ for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
+
+ spin_lock_init(&s->tracked_chunk_lock);
+
/* Metadata must only be loaded into one table at once */
r = s->store.read_metadata(&s->store);
if (r < 0) {
ti->error = "Failed to read snapshot metadata";
- goto bad6;
+ goto bad_load_and_register;
} else if (r > 0) {
s->valid = 0;
DMWARN("Snapshot is marked invalid.");
@@ -582,7 +668,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (register_snapshot(s)) {
r = -EINVAL;
ti->error = "Cannot register snapshot origin";
- goto bad6;
+ goto bad_load_and_register;
}
ti->private = s;
@@ -590,6 +676,12 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return 0;
+ bad_load_and_register:
+ mempool_destroy(s->tracked_chunk_pool);
+
+ bad_tracked_chunk_pool:
+ mempool_destroy(s->pending_pool);
+
bad6:
dm_kcopyd_client_destroy(s->kcopyd_client);
@@ -624,6 +716,9 @@ static void __free_exceptions(struct dm_snapshot *s)
static void snapshot_dtr(struct dm_target *ti)
{
+#ifdef CONFIG_DM_DEBUG
+ int i;
+#endif
struct dm_snapshot *s = ti->private;
flush_workqueue(ksnapd);
@@ -632,8 +727,17 @@ static void snapshot_dtr(struct dm_target *ti)
/* After this returns there can be no new kcopyd jobs. */
unregister_snapshot(s);
+#ifdef CONFIG_DM_DEBUG
+ for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
+ BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
+#endif
+
+ mempool_destroy(s->tracked_chunk_pool);
+
__free_exceptions(s);
+ mempool_destroy(s->pending_pool);
+
dm_put_device(ti, s->origin);
dm_put_device(ti, s->cow);
@@ -772,6 +876,13 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
}
/*
+ * Check for conflicting reads. This is extremely improbable,
+ * so yield() is sufficient and there is no need for a wait queue.
+ */
+ while (__chunk_is_tracked(s, pe->e.old_chunk))
+ yield();
+
+ /*
* Add a proper exception, and remove the
* in-flight exception from the list.
*/
@@ -873,7 +984,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
* to hold the lock while we do this.
*/
up_write(&s->lock);
- pe = alloc_pending_exception();
+ pe = alloc_pending_exception(s);
down_write(&s->lock);
if (!s->valid) {
@@ -893,7 +1004,6 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
bio_list_init(&pe->snapshot_bios);
pe->primary_pe = NULL;
atomic_set(&pe->ref_count, 0);
- pe->snap = s;
pe->started = 0;
if (s->store.prepare_exception(&s->store, &pe->e)) {
@@ -974,14 +1084,10 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
start_copy(pe);
goto out;
}
- } else
- /*
- * FIXME: this read path scares me because we
- * always use the origin when we have a pending
- * exception. However I can't think of a
- * situation where this is wrong - ejt.
- */
+ } else {
bio->bi_bdev = s->origin->bdev;
+ map_context->ptr = track_chunk(s, chunk);
+ }
out_unlock:
up_write(&s->lock);
@@ -989,6 +1095,18 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
return r;
}
+static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
+ int error, union map_info *map_context)
+{
+ struct dm_snapshot *s = ti->private;
+ struct dm_snap_tracked_chunk *c = map_context->ptr;
+
+ if (c)
+ stop_tracking_chunk(s, c);
+
+ return 0;
+}
+
static void snapshot_resume(struct dm_target *ti)
{
struct dm_snapshot *s = ti->private;
@@ -1266,6 +1384,7 @@ static struct target_type snapshot_target = {
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
.map = snapshot_map,
+ .end_io = snapshot_end_io,
.resume = snapshot_resume,
.status = snapshot_status,
};
@@ -1306,9 +1425,9 @@ static int __init dm_snapshot_init(void)
goto bad4;
}
- pending_pool = mempool_create_slab_pool(128, pending_cache);
- if (!pending_pool) {
- DMERR("Couldn't create pending pool.");
+ tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
+ if (!tracked_chunk_cache) {
+ DMERR("Couldn't create cache to track chunks in use.");
r = -ENOMEM;
goto bad5;
}
@@ -1317,13 +1436,13 @@ static int __init dm_snapshot_init(void)
if (!ksnapd) {
DMERR("Failed to create ksnapd workqueue.");
r = -ENOMEM;
- goto bad6;
+ goto bad_pending_pool;
}
return 0;
- bad6:
- mempool_destroy(pending_pool);
+ bad_pending_pool:
+ kmem_cache_destroy(tracked_chunk_cache);
bad5:
kmem_cache_destroy(pending_cache);
bad4:
@@ -1352,9 +1471,9 @@ static void __exit dm_snapshot_exit(void)
DMERR("origin unregister failed %d", r);
exit_origin_hash();
- mempool_destroy(pending_pool);
kmem_cache_destroy(pending_cache);
kmem_cache_destroy(exception_cache);
+ kmem_cache_destroy(tracked_chunk_cache);
}
/* Module hooks */
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 24f9fb73b98..292c15609ae 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -130,6 +130,10 @@ struct exception_store {
void *context;
};
+#define DM_TRACKED_CHUNK_HASH_SIZE 16
+#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
+ (DM_TRACKED_CHUNK_HASH_SIZE - 1))
+
struct dm_snapshot {
struct rw_semaphore lock;
struct dm_target *ti;
@@ -157,6 +161,8 @@ struct dm_snapshot {
/* The last percentage we notified */
int last_percent;
+ mempool_t *pending_pool;
+
struct exception_table pending;
struct exception_table complete;
@@ -174,6 +180,11 @@ struct dm_snapshot {
/* Queue of snapshot writes for ksnapd to flush */
struct bio_list queued_bios;
struct work_struct queued_bios_work;
+
+ /* Chunks with outstanding reads */
+ mempool_t *tracked_chunk_pool;
+ spinlock_t tracked_chunk_lock;
+ struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
};
/*
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 94116eaf470..798e468103b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -506,14 +506,13 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
rs->max_sectors =
min_not_zero(rs->max_sectors, q->max_sectors);
- /* FIXME: Device-Mapper on top of RAID-0 breaks because DM
- * currently doesn't honor MD's merge_bvec_fn routine.
- * In this case, we'll force DM to use PAGE_SIZE or
- * smaller I/O, just to be safe. A better fix is in the
- * works, but add this for the time being so it will at
- * least operate correctly.
+ /*
+ * Check if merge fn is supported.
+ * If not we'll force DM to use PAGE_SIZE or
+ * smaller I/O, just to be safe.
*/
- if (q->merge_bvec_fn)
+
+ if (q->merge_bvec_fn && !ti->type->merge)
rs->max_sectors =
min_not_zero(rs->max_sectors,
(unsigned int) (PAGE_SIZE >> 9));
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 372369b1cc2..bca448e1187 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -37,8 +37,8 @@ static DEFINE_SPINLOCK(_minor_lock);
struct dm_io {
struct mapped_device *md;
int error;
- struct bio *bio;
atomic_t io_count;
+ struct bio *bio;
unsigned long start_time;
};
@@ -829,6 +829,49 @@ static int __split_bio(struct mapped_device *md, struct bio *bio)
* CRUD END
*---------------------------------------------------------------*/
+static int dm_merge_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct mapped_device *md = q->queuedata;
+ struct dm_table *map = dm_get_table(md);
+ struct dm_target *ti;
+ sector_t max_sectors;
+ int max_size;
+
+ if (unlikely(!map))
+ return 0;
+
+ ti = dm_table_find_target(map, bvm->bi_sector);
+
+ /*
+ * Find maximum amount of I/O that won't need splitting
+ */
+ max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
+ (sector_t) BIO_MAX_SECTORS);
+ max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
+ if (max_size < 0)
+ max_size = 0;
+
+ /*
+ * merge_bvec_fn() returns number of bytes
+ * it can accept at this offset
+ * max is precomputed maximal io size
+ */
+ if (max_size && ti->type->merge)
+ max_size = ti->type->merge(ti, bvm, biovec, max_size);
+
+ /*
+ * Always allow an entire first page
+ */
+ if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
+ max_size = biovec->bv_len;
+
+ dm_table_put(map);
+
+ return max_size;
+}
+
/*
* The request function that just remaps the bio built up by
* dm_merge_bvec.
@@ -1032,6 +1075,7 @@ static struct mapped_device *alloc_dev(int minor)
blk_queue_make_request(md->queue, dm_request);
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
md->queue->unplug_fn = dm_unplug_all;
+ blk_queue_merge_bvec(md->queue, dm_merge_bvec);
md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
if (!md->io_pool)
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 8c03b634e62..1e59a0b0a78 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -100,12 +100,6 @@ int dm_lock_for_deletion(struct mapped_device *md);
void dm_kobject_uevent(struct mapped_device *md);
-/*
- * Dirty log
- */
-int dm_dirty_log_init(void);
-void dm_dirty_log_exit(void);
-
int dm_kcopyd_init(void);
void dm_kcopyd_exit(void);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index d107ddceefc..268547dbfbd 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -297,7 +297,7 @@ static int run(mddev_t *mddev)
rdev_for_each(rdev, tmp, mddev)
conf->rdev = rdev;
- mddev->array_size = mddev->size;
+ mddev->array_sectors = mddev->size * 2;
mddev->private = conf;
reconfig(mddev, mddev->layout, -1);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 0b8511776b3..b1eebf88c20 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -50,17 +50,19 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
/**
* linear_mergeable_bvec -- tell bio layer if two requests can be merged
* @q: request queue
- * @bio: the buffer head that's been built up so far
+ * @bvm: properties of new bio
* @biovec: the request that could be merged to it.
*
* Return amount of bytes we can take at this offset
*/
-static int linear_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec)
+static int linear_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
{
mddev_t *mddev = q->queuedata;
dev_info_t *dev0;
- unsigned long maxsectors, bio_sectors = bio->bi_size >> 9;
- sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
dev0 = which_dev(mddev, sector);
maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1));
@@ -120,13 +122,13 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
return NULL;
cnt = 0;
- conf->array_size = 0;
+ conf->array_sectors = 0;
rdev_for_each(rdev, tmp, mddev) {
int j = rdev->raid_disk;
dev_info_t *disk = conf->disks + j;
- if (j < 0 || j > raid_disks || disk->rdev) {
+ if (j < 0 || j >= raid_disks || disk->rdev) {
printk("linear: disk numbering problem. Aborting!\n");
goto out;
}
@@ -144,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
disk->size = rdev->size;
- conf->array_size += rdev->size;
+ conf->array_sectors += rdev->size * 2;
cnt++;
}
@@ -153,7 +155,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
goto out;
}
- min_spacing = conf->array_size;
+ min_spacing = conf->array_sectors / 2;
sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
/* min_spacing is the minimum spacing that will fit the hash
@@ -162,7 +164,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
* that is larger than min_spacing as use the size of that as
* the actual spacing
*/
- conf->hash_spacing = conf->array_size;
+ conf->hash_spacing = conf->array_sectors / 2;
for (i=0; i < cnt-1 ; i++) {
sector_t sz = 0;
int j;
@@ -192,7 +194,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
unsigned round;
unsigned long base;
- sz = conf->array_size >> conf->preshift;
+ sz = conf->array_sectors >> (conf->preshift + 1);
sz += 1; /* force round-up */
base = conf->hash_spacing >> conf->preshift;
round = sector_div(sz, base);
@@ -219,7 +221,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
curr_offset = 0;
i = 0;
for (curr_offset = 0;
- curr_offset < conf->array_size;
+ curr_offset < conf->array_sectors / 2;
curr_offset += conf->hash_spacing) {
while (i < raid_disks-1 &&
@@ -250,12 +252,13 @@ static int linear_run (mddev_t *mddev)
{
linear_conf_t *conf;
+ mddev->queue->queue_lock = &mddev->queue->__queue_lock;
conf = linear_conf(mddev, mddev->raid_disks);
if (!conf)
return 1;
mddev->private = conf;
- mddev->array_size = conf->array_size;
+ mddev->array_sectors = conf->array_sectors;
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
mddev->queue->unplug_fn = linear_unplug;
@@ -289,8 +292,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
newconf->prev = mddev_to_conf(mddev);
mddev->private = newconf;
mddev->raid_disks++;
- mddev->array_size = newconf->array_size;
- set_capacity(mddev->gendisk, mddev->array_size << 1);
+ mddev->array_sectors = newconf->array_sectors;
+ set_capacity(mddev->gendisk, mddev->array_sectors);
return 0;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 83eb78b0013..c2ff77ccec5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -74,6 +74,8 @@ static DEFINE_SPINLOCK(pers_lock);
static void md_print_devices(void);
+static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
/*
@@ -167,7 +169,6 @@ void md_new_event(mddev_t *mddev)
{
atomic_inc(&md_event_count);
wake_up(&md_event_waiters);
- sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);
@@ -272,9 +273,12 @@ static mddev_t * mddev_find(dev_t unit)
INIT_LIST_HEAD(&new->all_mddevs);
init_timer(&new->safemode_timer);
atomic_set(&new->active, 1);
+ atomic_set(&new->openers, 0);
spin_lock_init(&new->write_lock);
init_waitqueue_head(&new->sb_wait);
+ init_waitqueue_head(&new->recovery_wait);
new->reshape_position = MaxSector;
+ new->resync_min = 0;
new->resync_max = MaxSector;
new->level = LEVEL_NONE;
@@ -344,21 +348,20 @@ static struct mdk_personality *find_pers(int level, char *clevel)
return NULL;
}
+/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
- sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
- return MD_NEW_SIZE_BLOCKS(size);
+ sector_t num_sectors = bdev->bd_inode->i_size / 512;
+ return MD_NEW_SIZE_SECTORS(num_sectors);
}
-static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
{
- sector_t size;
-
- size = rdev->sb_offset;
+ sector_t num_sectors = rdev->sb_start;
if (chunk_size)
- size &= ~((sector_t)chunk_size/1024 - 1);
- return size;
+ num_sectors &= ~((sector_t)chunk_size/512 - 1);
+ return num_sectors;
}
static int alloc_disk_sb(mdk_rdev_t * rdev)
@@ -369,7 +372,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
rdev->sb_page = alloc_page(GFP_KERNEL);
if (!rdev->sb_page) {
printk(KERN_ALERT "md: out of memory.\n");
- return -EINVAL;
+ return -ENOMEM;
}
return 0;
@@ -381,7 +384,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
put_page(rdev->sb_page);
rdev->sb_loaded = 0;
rdev->sb_page = NULL;
- rdev->sb_offset = 0;
+ rdev->sb_start = 0;
rdev->size = 0;
}
}
@@ -527,7 +530,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
return 0;
- if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
+ if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
goto fail;
rdev->sb_loaded = 1;
return 0;
@@ -540,17 +543,12 @@ fail:
static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
- if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
- (sb1->set_uuid1 == sb2->set_uuid1) &&
- (sb1->set_uuid2 == sb2->set_uuid2) &&
- (sb1->set_uuid3 == sb2->set_uuid3))
-
- return 1;
-
- return 0;
+ return sb1->set_uuid0 == sb2->set_uuid0 &&
+ sb1->set_uuid1 == sb2->set_uuid1 &&
+ sb1->set_uuid2 == sb2->set_uuid2 &&
+ sb1->set_uuid3 == sb2->set_uuid3;
}
-
static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
int ret;
@@ -561,7 +559,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
if (!tmp1 || !tmp2) {
ret = 0;
- printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
goto abort;
}
@@ -574,11 +572,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
tmp1->nr_disks = 0;
tmp2->nr_disks = 0;
- if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
- ret = 0;
- else
- ret = 1;
-
+ ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
kfree(tmp1);
kfree(tmp2);
@@ -655,11 +649,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
*/
struct super_type {
- char *name;
- struct module *owner;
- int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
- int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
- void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+ char *name;
+ struct module *owner;
+ int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
+ int minor_version);
+ int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+ void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+ unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev,
+ sector_t num_sectors);
};
/*
@@ -670,16 +667,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
mdp_super_t *sb;
int ret;
- sector_t sb_offset;
/*
- * Calculate the position of the superblock,
+ * Calculate the position of the superblock (512byte sectors),
* it's at the end of the disk.
*
* It also happens to be a multiple of 4Kb.
*/
- sb_offset = calc_dev_sboffset(rdev->bdev);
- rdev->sb_offset = sb_offset;
+ rdev->sb_start = calc_dev_sboffset(rdev->bdev);
ret = read_disk_sb(rdev, MD_SB_BYTES);
if (ret) return ret;
@@ -756,7 +751,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
else
ret = 0;
}
- rdev->size = calc_dev_size(rdev, sb->chunk_size);
+ rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
if (rdev->size < sb->size && sb->level > 1)
/* "this cannot possibly happen" ... */
@@ -1001,6 +996,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
}
/*
+ * rdev_size_change for 0.90.0
+ */
+static unsigned long long
+super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+{
+ if (num_sectors && num_sectors < rdev->mddev->size * 2)
+ return 0; /* component must fit device */
+ if (rdev->mddev->bitmap_offset)
+ return 0; /* can't move bitmap */
+ rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ if (!num_sectors || num_sectors > rdev->sb_start)
+ num_sectors = rdev->sb_start;
+ md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+ rdev->sb_page);
+ md_super_wait(rdev->mddev);
+ return num_sectors / 2; /* kB for sysfs */
+}
+
+
+/*
* version 1 superblock
*/
@@ -1031,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
struct mdp_superblock_1 *sb;
int ret;
- sector_t sb_offset;
+ sector_t sb_start;
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
int bmask;
/*
- * Calculate the position of the superblock.
+ * Calculate the position of the superblock in 512byte sectors.
* It is always aligned to a 4K boundary and
* depeding on minor_version, it can be:
* 0: At least 8K, but less than 12K, from end of device
@@ -1045,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
*/
switch(minor_version) {
case 0:
- sb_offset = rdev->bdev->bd_inode->i_size >> 9;
- sb_offset -= 8*2;
- sb_offset &= ~(sector_t)(4*2-1);
- /* convert from sectors to K */
- sb_offset /= 2;
+ sb_start = rdev->bdev->bd_inode->i_size >> 9;
+ sb_start -= 8*2;
+ sb_start &= ~(sector_t)(4*2-1);
break;
case 1:
- sb_offset = 0;
+ sb_start = 0;
break;
case 2:
- sb_offset = 4;
+ sb_start = 8;
break;
default:
return -EINVAL;
}
- rdev->sb_offset = sb_offset;
+ rdev->sb_start = sb_start;
/* superblock is rarely larger than 1K, but it can be larger,
* and it is safe to read 4k, so we do that
@@ -1074,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
sb->major_version != cpu_to_le32(1) ||
le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
- le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+ le64_to_cpu(sb->super_offset) != rdev->sb_start ||
(le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
return -EINVAL;
@@ -1110,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
rdev->sb_size = (rdev->sb_size | bmask) + 1;
if (minor_version
- && rdev->data_offset < sb_offset + (rdev->sb_size/512))
+ && rdev->data_offset < sb_start + (rdev->sb_size/512))
return -EINVAL;
if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
@@ -1146,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
if (minor_version)
rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
else
- rdev->size = rdev->sb_offset;
+ rdev->size = rdev->sb_start / 2;
if (rdev->size < le64_to_cpu(sb->data_size)/2)
return -EINVAL;
rdev->size = le64_to_cpu(sb->data_size)/2;
@@ -1325,35 +1338,74 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->sb_csum = calc_sb_1_csum(sb);
}
+static unsigned long long
+super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+{
+ struct mdp_superblock_1 *sb;
+ sector_t max_sectors;
+ if (num_sectors && num_sectors < rdev->mddev->size * 2)
+ return 0; /* component must fit device */
+ if (rdev->sb_start < rdev->data_offset) {
+ /* minor versions 1 and 2; superblock before data */
+ max_sectors = rdev->bdev->bd_inode->i_size >> 9;
+ max_sectors -= rdev->data_offset;
+ if (!num_sectors || num_sectors > max_sectors)
+ num_sectors = max_sectors;
+ } else if (rdev->mddev->bitmap_offset) {
+ /* minor version 0 with bitmap we can't move */
+ return 0;
+ } else {
+ /* minor version 0; superblock after data */
+ sector_t sb_start;
+ sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+ sb_start &= ~(sector_t)(4*2 - 1);
+ max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+ if (!num_sectors || num_sectors > max_sectors)
+ num_sectors = max_sectors;
+ rdev->sb_start = sb_start;
+ }
+ sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
+ sb->data_size = cpu_to_le64(num_sectors);
+ sb->super_offset = rdev->sb_start;
+ sb->sb_csum = calc_sb_1_csum(sb);
+ md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+ rdev->sb_page);
+ md_super_wait(rdev->mddev);
+ return num_sectors / 2; /* kB for sysfs */
+}
static struct super_type super_types[] = {
[0] = {
.name = "0.90.0",
.owner = THIS_MODULE,
- .load_super = super_90_load,
- .validate_super = super_90_validate,
- .sync_super = super_90_sync,
+ .load_super = super_90_load,
+ .validate_super = super_90_validate,
+ .sync_super = super_90_sync,
+ .rdev_size_change = super_90_rdev_size_change,
},
[1] = {
.name = "md-1",
.owner = THIS_MODULE,
- .load_super = super_1_load,
- .validate_super = super_1_validate,
- .sync_super = super_1_sync,
+ .load_super = super_1_load,
+ .validate_super = super_1_validate,
+ .sync_super = super_1_sync,
+ .rdev_size_change = super_1_rdev_size_change,
},
};
static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
- struct list_head *tmp, *tmp2;
mdk_rdev_t *rdev, *rdev2;
- rdev_for_each(rdev, tmp, mddev1)
- rdev_for_each(rdev2, tmp2, mddev2)
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev1)
+ rdev_for_each_rcu(rdev2, mddev2)
if (rdev->bdev->bd_contains ==
- rdev2->bdev->bd_contains)
+ rdev2->bdev->bd_contains) {
+ rcu_read_unlock();
return 1;
-
+ }
+ rcu_read_unlock();
return 0;
}
@@ -1420,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
kobject_del(&rdev->kobj);
goto fail;
}
- list_add(&rdev->same_set, &mddev->disks);
+ list_add_rcu(&rdev->same_set, &mddev->disks);
bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
return 0;
@@ -1445,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
return;
}
bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
- list_del_init(&rdev->same_set);
+ list_del_rcu(&rdev->same_set);
printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
/* We need to delay this, otherwise we can deadlock when
- * writing to 'remove' to "dev/state"
+ * writing to 'remove' to "dev/state". We also need
+ * to delay it due to rcu usage.
*/
+ synchronize_rcu();
INIT_WORK(&rdev->del_work, md_delayed_delete);
kobject_get(&rdev->kobj);
schedule_work(&rdev->del_work);
@@ -1508,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
if (rdev->mddev)
MD_BUG();
free_disk_sb(rdev);
- list_del_init(&rdev->same_set);
#ifndef MODULE
if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev);
@@ -1755,11 +1808,11 @@ repeat:
dprintk("%s ", bdevname(rdev->bdev,b));
if (!test_bit(Faulty, &rdev->flags)) {
md_super_write(mddev,rdev,
- rdev->sb_offset<<1, rdev->sb_size,
+ rdev->sb_start, rdev->sb_size,
rdev->sb_page);
dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
bdevname(rdev->bdev,b),
- (unsigned long long)rdev->sb_offset);
+ (unsigned long long)rdev->sb_start);
rdev->sb_events = mddev->events;
} else
@@ -1784,7 +1837,7 @@ repeat:
}
-/* words written to sysfs files may, or my not, be \n terminated.
+/* words written to sysfs files may, or may not, be \n terminated.
* We want to accept with case. For this we use cmd_match.
*/
static int cmd_match(const char *cmd, const char *str)
@@ -1883,6 +1936,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
err = 0;
}
+ if (!err)
+ sysfs_notify(&rdev->kobj, NULL, "state");
return err ? err : len;
}
static struct rdev_sysfs_entry rdev_state =
@@ -1928,7 +1983,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
slot = -1;
else if (e==buf || (*e && *e!= '\n'))
return -EINVAL;
- if (rdev->mddev->pers) {
+ if (rdev->mddev->pers && slot == -1) {
/* Setting 'slot' on an active array requires also
* updating the 'rd%d' link, and communicating
* with the personality with ->hot_*_disk.
@@ -1936,8 +1991,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
* failed/spare devices. This normally happens automatically,
* but not when the metadata is externally managed.
*/
- if (slot != -1)
- return -EBUSY;
if (rdev->raid_disk == -1)
return -EEXIST;
/* personality does all needed checks */
@@ -1951,6 +2004,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
sysfs_remove_link(&rdev->mddev->kobj, nm);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
+ } else if (rdev->mddev->pers) {
+ mdk_rdev_t *rdev2;
+ struct list_head *tmp;
+ /* Activating a spare .. or possibly reactivating
+ * if we every get bitmaps working here.
+ */
+
+ if (rdev->raid_disk != -1)
+ return -EBUSY;
+
+ if (rdev->mddev->pers->hot_add_disk == NULL)
+ return -EINVAL;
+
+ rdev_for_each(rdev2, tmp, rdev->mddev)
+ if (rdev2->raid_disk == slot)
+ return -EEXIST;
+
+ rdev->raid_disk = slot;
+ if (test_bit(In_sync, &rdev->flags))
+ rdev->saved_raid_disk = slot;
+ else
+ rdev->saved_raid_disk = -1;
+ err = rdev->mddev->pers->
+ hot_add_disk(rdev->mddev, rdev);
+ if (err) {
+ rdev->raid_disk = -1;
+ return err;
+ } else
+ sysfs_notify(&rdev->kobj, NULL, "state");
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+ printk(KERN_WARNING
+ "md: cannot register "
+ "%s for %s\n",
+ nm, mdname(rdev->mddev));
+
+ /* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks)
return -ENOSPC;
@@ -1959,6 +2049,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
clear_bit(Faulty, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
set_bit(In_sync, &rdev->flags);
+ sysfs_notify(&rdev->kobj, NULL, "state");
}
return len;
}
@@ -1980,7 +2071,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
unsigned long long offset = simple_strtoull(buf, &e, 10);
if (e==buf || (*e && *e != '\n'))
return -EINVAL;
- if (rdev->mddev->pers)
+ if (rdev->mddev->pers && rdev->raid_disk >= 0)
return -EBUSY;
if (rdev->size && rdev->mddev->external)
/* Must set offset before size, so overlap checks
@@ -2012,17 +2103,30 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
static ssize_t
rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
- char *e;
- unsigned long long size = simple_strtoull(buf, &e, 10);
+ unsigned long long size;
unsigned long long oldsize = rdev->size;
mddev_t *my_mddev = rdev->mddev;
- if (e==buf || (*e && *e != '\n'))
+ if (strict_strtoull(buf, 10, &size) < 0)
return -EINVAL;
- if (my_mddev->pers)
- return -EBUSY;
+ if (size < my_mddev->size)
+ return -EINVAL;
+ if (my_mddev->pers && rdev->raid_disk >= 0) {
+ if (my_mddev->persistent) {
+ size = super_types[my_mddev->major_version].
+ rdev_size_change(rdev, size * 2);
+ if (!size)
+ return -EBUSY;
+ } else if (!size) {
+ size = (rdev->bdev->bd_inode->i_size >> 10);
+ size -= rdev->data_offset/2;
+ }
+ if (size < my_mddev->size)
+ return -EINVAL; /* component must fit device */
+ }
+
rdev->size = size;
- if (size > oldsize && rdev->mddev->external) {
+ if (size > oldsize && my_mddev->external) {
/* need to check that all other rdevs with the same ->bdev
* do not overlap. We need to unlock the mddev to avoid
* a deadlock. We have already changed rdev->size, and if
@@ -2041,8 +2145,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
if (test_bit(AllReserved, &rdev2->flags) ||
(rdev->bdev == rdev2->bdev &&
rdev != rdev2 &&
- overlaps(rdev->data_offset, rdev->size,
- rdev2->data_offset, rdev2->size))) {
+ overlaps(rdev->data_offset, rdev->size * 2,
+ rdev2->data_offset,
+ rdev2->size * 2))) {
overlap = 1;
break;
}
@@ -2064,8 +2169,6 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
return -EBUSY;
}
}
- if (size < my_mddev->size || my_mddev->size == 0)
- my_mddev->size = size;
return len;
}
@@ -2509,7 +2612,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
* When written, doesn't tear down array, but just stops it
* suspended (not supported yet)
* All IO requests will block. The array can be reconfigured.
- * Writing this, if accepted, will block until array is quiessent
+ * Writing this, if accepted, will block until array is quiescent
* readonly
* no resync can happen. no superblocks get written.
* write requests fail
@@ -2582,7 +2685,7 @@ array_state_show(mddev_t *mddev, char *page)
return sprintf(page, "%s\n", array_states[st]);
}
-static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_stop(mddev_t * mddev, int ro, int is_open);
static int do_md_run(mddev_t * mddev);
static int restart_array(mddev_t *mddev);
@@ -2596,16 +2699,16 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
break;
case clear:
/* stopping an active array */
- if (atomic_read(&mddev->active) > 1)
+ if (atomic_read(&mddev->openers) > 0)
return -EBUSY;
- err = do_md_stop(mddev, 0);
+ err = do_md_stop(mddev, 0, 0);
break;
case inactive:
/* stopping an active array */
if (mddev->pers) {
- if (atomic_read(&mddev->active) > 1)
+ if (atomic_read(&mddev->openers) > 0)
return -EBUSY;
- err = do_md_stop(mddev, 2);
+ err = do_md_stop(mddev, 2, 0);
} else
err = 0; /* already inactive */
break;
@@ -2613,7 +2716,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
break; /* not supported yet */
case readonly:
if (mddev->pers)
- err = do_md_stop(mddev, 1);
+ err = do_md_stop(mddev, 1, 0);
else {
mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1);
@@ -2623,7 +2726,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
case read_auto:
if (mddev->pers) {
if (mddev->ro != 1)
- err = do_md_stop(mddev, 1);
+ err = do_md_stop(mddev, 1, 0);
else
err = restart_array(mddev);
if (err == 0) {
@@ -2678,8 +2781,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
}
if (err)
return err;
- else
+ else {
+ sysfs_notify(&mddev->kobj, NULL, "array_state");
return len;
+ }
}
static struct md_sysfs_entry md_array_state =
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -2782,7 +2887,7 @@ size_show(mddev_t *mddev, char *page)
return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
}
-static int update_size(mddev_t *mddev, unsigned long size);
+static int update_size(mddev_t *mddev, sector_t num_sectors);
static ssize_t
size_store(mddev_t *mddev, const char *buf, size_t len)
@@ -2799,7 +2904,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
return -EINVAL;
if (mddev->pers) {
- err = update_size(mddev, size);
+ err = update_size(mddev, size * 2);
md_update_sb(mddev, 1);
} else {
if (mddev->size == 0 ||
@@ -2896,7 +3001,7 @@ action_show(mddev_t *mddev, char *page)
type = "check";
else
type = "repair";
- } else
+ } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
type = "recover";
}
return sprintf(page, "%s\n", type);
@@ -2918,15 +3023,19 @@ action_store(mddev_t *mddev, const char *page, size_t len)
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
return -EBUSY;
- else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
+ else if (cmd_match(page, "resync"))
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ else if (cmd_match(page, "recover")) {
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- else if (cmd_match(page, "reshape")) {
+ } else if (cmd_match(page, "reshape")) {
int err;
if (mddev->pers->start_reshape == NULL)
return -EINVAL;
err = mddev->pers->start_reshape(mddev);
if (err)
return err;
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
} else {
if (cmd_match(page, "check"))
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -2937,6 +3046,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
}
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
+ sysfs_notify(&mddev->kobj, NULL, "sync_action");
return len;
}
@@ -3013,14 +3123,44 @@ degraded_show(mddev_t *mddev, char *page)
static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
static ssize_t
+sync_force_parallel_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->parallel_resync);
+}
+
+static ssize_t
+sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ long n;
+
+ if (strict_strtol(buf, 10, &n))
+ return -EINVAL;
+
+ if (n != 0 && n != 1)
+ return -EINVAL;
+
+ mddev->parallel_resync = n;
+
+ if (mddev->sync_thread)
+ wake_up(&resync_wait);
+
+ return len;
+}
+
+/* force parallel resync, even with shared block devices */
+static struct md_sysfs_entry md_sync_force_parallel =
+__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
+ sync_force_parallel_show, sync_force_parallel_store);
+
+static ssize_t
sync_speed_show(mddev_t *mddev, char *page)
{
unsigned long resync, dt, db;
- resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
- dt = ((jiffies - mddev->resync_mark) / HZ);
+ resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
+ dt = (jiffies - mddev->resync_mark) / HZ;
if (!dt) dt++;
- db = resync - (mddev->resync_mark_cnt);
- return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+ db = resync - mddev->resync_mark_cnt;
+ return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
}
static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
@@ -3042,6 +3182,36 @@ sync_completed_show(mddev_t *mddev, char *page)
static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
static ssize_t
+min_sync_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->resync_min);
+}
+static ssize_t
+min_sync_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ unsigned long long min;
+ if (strict_strtoull(buf, 10, &min))
+ return -EINVAL;
+ if (min > mddev->resync_max)
+ return -EINVAL;
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return -EBUSY;
+
+ /* Must be a multiple of chunk_size */
+ if (mddev->chunk_size) {
+ if (min & (sector_t)((mddev->chunk_size>>9)-1))
+ return -EINVAL;
+ }
+ mddev->resync_min = min;
+
+ return len;
+}
+
+static struct md_sysfs_entry md_min_sync =
+__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
+
+static ssize_t
max_sync_show(mddev_t *mddev, char *page)
{
if (mddev->resync_max == MaxSector)
@@ -3056,9 +3226,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
if (strncmp(buf, "max", 3) == 0)
mddev->resync_max = MaxSector;
else {
- char *ep;
- unsigned long long max = simple_strtoull(buf, &ep, 10);
- if (ep == buf || (*ep != 0 && *ep != '\n'))
+ unsigned long long max;
+ if (strict_strtoull(buf, 10, &max))
+ return -EINVAL;
+ if (max < mddev->resync_min)
return -EINVAL;
if (max < mddev->resync_max &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -3187,7 +3358,9 @@ static struct attribute *md_redundancy_attrs[] = {
&md_sync_min.attr,
&md_sync_max.attr,
&md_sync_speed.attr,
+ &md_sync_force_parallel.attr,
&md_sync_completed.attr,
+ &md_min_sync.attr,
&md_max_sync.attr,
&md_suspend_lo.attr,
&md_suspend_hi.attr,
@@ -3292,9 +3465,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
disk->queue = mddev->queue;
add_disk(disk);
mddev->gendisk = disk;
- mutex_unlock(&disks_mutex);
error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
"%s", "md");
+ mutex_unlock(&disks_mutex);
if (error)
printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
disk->disk_name);
@@ -3307,7 +3480,11 @@ static void md_safemode_timeout(unsigned long data)
{
mddev_t *mddev = (mddev_t *) data;
- mddev->safemode = 1;
+ if (!atomic_read(&mddev->writes_pending)) {
+ mddev->safemode = 1;
+ if (mddev->external)
+ sysfs_notify(&mddev->kobj, NULL, "array_state");
+ }
md_wakeup_thread(mddev->thread);
}
@@ -3398,22 +3575,23 @@ static int do_md_run(mddev_t * mddev)
* We don't want the data to overlap the metadata,
* Internal Bitmap issues has handled elsewhere.
*/
- if (rdev->data_offset < rdev->sb_offset) {
+ if (rdev->data_offset < rdev->sb_start) {
if (mddev->size &&
rdev->data_offset + mddev->size*2
- > rdev->sb_offset*2) {
+ > rdev->sb_start) {
printk("md: %s: data overlaps metadata\n",
mdname(mddev));
return -EINVAL;
}
} else {
- if (rdev->sb_offset*2 + rdev->sb_size/512
+ if (rdev->sb_start + rdev->sb_size/512
> rdev->data_offset) {
printk("md: %s: metadata overlaps data\n",
mdname(mddev));
return -EINVAL;
}
}
+ sysfs_notify(&rdev->kobj, NULL, "state");
}
md_probe(mddev->unit, NULL, NULL);
@@ -3485,7 +3663,9 @@ static int do_md_run(mddev_t * mddev)
mddev->ro = 2; /* read-only, but switch on first write */
err = mddev->pers->run(mddev);
- if (!err && mddev->pers->sync_request) {
+ if (err)
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ else if (mddev->pers->sync_request) {
err = bitmap_create(mddev);
if (err) {
printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3494,7 +3674,6 @@ static int do_md_run(mddev_t * mddev)
}
}
if (err) {
- printk(KERN_ERR "md: pers->run() failed ...\n");
module_put(mddev->pers->owner);
mddev->pers = NULL;
bitmap_destroy(mddev);
@@ -3529,7 +3708,7 @@ static int do_md_run(mddev_t * mddev)
if (mddev->flags)
md_update_sb(mddev, 0);
- set_capacity(disk, mddev->array_size<<1);
+ set_capacity(disk, mddev->array_sectors);
/* If we call blk_queue_make_request here, it will
* re-initialise max_sectors etc which may have been
@@ -3574,6 +3753,9 @@ static int do_md_run(mddev_t * mddev)
mddev->changed = 1;
md_new_event(mddev);
+ sysfs_notify(&mddev->kobj, NULL, "array_state");
+ sysfs_notify(&mddev->kobj, NULL, "sync_action");
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
return 0;
}
@@ -3581,38 +3763,25 @@ static int do_md_run(mddev_t * mddev)
static int restart_array(mddev_t *mddev)
{
struct gendisk *disk = mddev->gendisk;
- int err;
- /*
- * Complain if it has no devices
- */
- err = -ENXIO;
+ /* Complain if it has no devices */
if (list_empty(&mddev->disks))
- goto out;
-
- if (mddev->pers) {
- err = -EBUSY;
- if (!mddev->ro)
- goto out;
-
- mddev->safemode = 0;
- mddev->ro = 0;
- set_disk_ro(disk, 0);
-
- printk(KERN_INFO "md: %s switched to read-write mode.\n",
- mdname(mddev));
- /*
- * Kick recovery or resync if necessary
- */
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
- md_wakeup_thread(mddev->sync_thread);
- err = 0;
- } else
- err = -EINVAL;
-
-out:
- return err;
+ return -ENXIO;
+ if (!mddev->pers)
+ return -EINVAL;
+ if (!mddev->ro)
+ return -EBUSY;
+ mddev->safemode = 0;
+ mddev->ro = 0;
+ set_disk_ro(disk, 0);
+ printk(KERN_INFO "md: %s switched to read-write mode.\n",
+ mdname(mddev));
+ /* Kick recovery or resync if necessary */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ md_wakeup_thread(mddev->sync_thread);
+ sysfs_notify(&mddev->kobj, NULL, "array_state");
+ return 0;
}
/* similar to deny_write_access, but accounts for our holding a reference
@@ -3646,16 +3815,17 @@ static void restore_bitmap_write_access(struct file *file)
* 1 - switch to readonly
* 2 - stop but do not disassemble array
*/
-static int do_md_stop(mddev_t * mddev, int mode)
+static int do_md_stop(mddev_t * mddev, int mode, int is_open)
{
int err = 0;
struct gendisk *disk = mddev->gendisk;
+ if (atomic_read(&mddev->openers) > is_open) {
+ printk("md: %s still in use.\n",mdname(mddev));
+ return -EBUSY;
+ }
+
if (mddev->pers) {
- if (atomic_read(&mddev->active)>2) {
- printk("md: %s still in use.\n",mdname(mddev));
- return -EBUSY;
- }
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -3691,6 +3861,8 @@ static int do_md_stop(mddev_t * mddev, int mode)
module_put(mddev->pers->owner);
mddev->pers = NULL;
+ /* tell userspace to handle 'inactive' */
+ sysfs_notify(&mddev->kobj, NULL, "array_state");
set_capacity(disk, 0);
mddev->changed = 1;
@@ -3737,10 +3909,11 @@ static int do_md_stop(mddev_t * mddev, int mode)
export_array(mddev);
- mddev->array_size = 0;
+ mddev->array_sectors = 0;
mddev->size = 0;
mddev->raid_disks = 0;
mddev->recovery_cp = 0;
+ mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->reshape_position = MaxSector;
mddev->external = 0;
@@ -3775,6 +3948,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
mdname(mddev));
err = 0;
md_new_event(mddev);
+ sysfs_notify(&mddev->kobj, NULL, "array_state");
out:
return err;
}
@@ -3800,7 +3974,7 @@ static void autorun_array(mddev_t *mddev)
err = do_md_run (mddev);
if (err) {
printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
- do_md_stop (mddev, 0);
+ do_md_stop (mddev, 0, 0);
}
}
@@ -3861,8 +4035,10 @@ static void autorun_devices(int part)
md_probe(dev, NULL, NULL);
mddev = mddev_find(dev);
- if (!mddev) {
- printk(KERN_ERR
+ if (!mddev || !mddev->gendisk) {
+ if (mddev)
+ mddev_put(mddev);
+ printk(KERN_ERR
"md: cannot allocate memory for md drive.\n");
break;
}
@@ -3889,8 +4065,10 @@ static void autorun_devices(int part)
/* on success, candidates will be empty, on error
* it won't...
*/
- rdev_for_each_list(rdev, tmp, candidates)
+ rdev_for_each_list(rdev, tmp, candidates) {
+ list_del_init(&rdev->same_set);
export_rdev(rdev);
+ }
mddev_put(mddev);
}
printk(KERN_INFO "md: ... autorun DONE.\n");
@@ -3971,9 +4149,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
char *ptr, *buf = NULL;
int err = -ENOMEM;
- md_allow_write(mddev);
+ if (md_allow_write(mddev))
+ file = kmalloc(sizeof(*file), GFP_NOIO);
+ else
+ file = kmalloc(sizeof(*file), GFP_KERNEL);
- file = kmalloc(sizeof(*file), GFP_KERNEL);
if (!file)
goto out;
@@ -3987,8 +4167,8 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
if (!buf)
goto out;
- ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
- if (!ptr)
+ ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
+ if (IS_ERR(ptr))
goto out;
strcpy(file->pathname, ptr);
@@ -4006,15 +4186,12 @@ out:
static int get_disk_info(mddev_t * mddev, void __user * arg)
{
mdu_disk_info_t info;
- unsigned int nr;
mdk_rdev_t *rdev;
if (copy_from_user(&info, arg, sizeof(info)))
return -EFAULT;
- nr = info.number;
-
- rdev = find_rdev_nr(mddev, nr);
+ rdev = find_rdev_nr(mddev, info.number);
if (rdev) {
info.major = MAJOR(rdev->bdev->bd_dev);
info.minor = MINOR(rdev->bdev->bd_dev);
@@ -4134,8 +4311,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
}
if (err)
export_rdev(rdev);
+ else
+ sysfs_notify(&rdev->kobj, NULL, "state");
md_update_sb(mddev, 1);
+ if (mddev->degraded)
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return err;
@@ -4174,10 +4355,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
if (!mddev->persistent) {
printk(KERN_INFO "md: nonpersistent superblock ...\n");
- rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
} else
- rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
- rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+ rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
err = bind_rdev_to_array(rdev, mddev);
if (err) {
@@ -4194,9 +4375,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
char b[BDEVNAME_SIZE];
mdk_rdev_t *rdev;
- if (!mddev->pers)
- return -ENODEV;
-
rdev = find_rdev(mddev, dev);
if (!rdev)
return -ENXIO;
@@ -4219,7 +4397,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
{
char b[BDEVNAME_SIZE];
int err;
- unsigned int size;
mdk_rdev_t *rdev;
if (!mddev->pers)
@@ -4247,13 +4424,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
}
if (mddev->persistent)
- rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev->bdev);
else
- rdev->sb_offset =
- rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
- size = calc_dev_size(rdev, mddev->chunk_size);
- rdev->size = size;
+ rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
if (test_bit(Faulty, &rdev->flags)) {
printk(KERN_WARNING
@@ -4438,24 +4613,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
return 0;
}
-static int update_size(mddev_t *mddev, unsigned long size)
+static int update_size(mddev_t *mddev, sector_t num_sectors)
{
mdk_rdev_t * rdev;
int rv;
struct list_head *tmp;
- int fit = (size == 0);
+ int fit = (num_sectors == 0);
if (mddev->pers->resize == NULL)
return -EINVAL;
- /* The "size" is the amount of each device that is used.
- * This can only make sense for arrays with redundancy.
- * linear and raid0 always use whatever space is available
- * We can only consider changing the size if no resync
- * or reconstruction is happening, and if the new size
- * is acceptable. It must fit before the sb_offset or,
- * if that is <data_offset, it must fit before the
- * size of each device.
- * If size is zero, we find the largest size that fits.
+ /* The "num_sectors" is the number of sectors of each device that
+ * is used. This can only make sense for arrays with redundancy.
+ * linear and raid0 always use whatever space is available. We can only
+ * consider changing this number if no resync or reconstruction is
+ * happening, and if the new size is acceptable. It must fit before the
+ * sb_start or, if that is <data_offset, it must fit before the size
+ * of each device. If num_sectors is zero, we find the largest size
+ * that fits.
+
*/
if (mddev->sync_thread)
return -EBUSY;
@@ -4463,19 +4638,20 @@ static int update_size(mddev_t *mddev, unsigned long size)
sector_t avail;
avail = rdev->size * 2;
- if (fit && (size == 0 || size > avail/2))
- size = avail/2;
- if (avail < ((sector_t)size << 1))
+ if (fit && (num_sectors == 0 || num_sectors > avail))
+ num_sectors = avail;
+ if (avail < num_sectors)
return -ENOSPC;
}
- rv = mddev->pers->resize(mddev, (sector_t)size *2);
+ rv = mddev->pers->resize(mddev, num_sectors);
if (!rv) {
struct block_device *bdev;
bdev = bdget_disk(mddev->gendisk, 0);
if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
+ i_size_write(bdev->bd_inode,
+ (loff_t)mddev->array_sectors << 9);
mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev);
}
@@ -4550,7 +4726,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
return mddev->pers->reconfig(mddev, info->layout, -1);
}
if (info->size >= 0 && mddev->size != info->size)
- rv = update_size(mddev, info->size);
+ rv = update_size(mddev, (sector_t)info->size * 2);
if (mddev->raid_disks != info->raid_disks)
rv = update_raid_disks(mddev, info->raid_disks);
@@ -4603,6 +4779,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
return 0;
}
+/*
+ * We have a problem here : there is no easy way to give a CHS
+ * virtual geometry. We currently pretend that we have a 2 heads
+ * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
mddev_t *mddev = bdev->bd_disk->private_data;
@@ -4747,19 +4929,13 @@ static int md_ioctl(struct inode *inode, struct file *file,
goto done_unlock;
case STOP_ARRAY:
- err = do_md_stop (mddev, 0);
+ err = do_md_stop (mddev, 0, 1);
goto done_unlock;
case STOP_ARRAY_RO:
- err = do_md_stop (mddev, 1);
+ err = do_md_stop (mddev, 1, 1);
goto done_unlock;
- /*
- * We have a problem here : there is no easy way to give a CHS
- * virtual geometry. We currently pretend that we have a 2 heads
- * 4 sectors (with a BIG number of cylinders...). This drives
- * dosfs just mad... ;-)
- */
}
/*
@@ -4769,13 +4945,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
* here and hit the 'default' below, so only disallow
* 'md' ioctls, and switch to rw mode if started auto-readonly.
*/
- if (_IOC_TYPE(cmd) == MD_MAJOR &&
- mddev->ro && mddev->pers) {
+ if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
if (mddev->ro == 2) {
mddev->ro = 0;
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
-
+ sysfs_notify(&mddev->kobj, NULL, "array_state");
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
} else {
err = -EROFS;
goto abort_unlock;
@@ -4845,6 +5020,7 @@ static int md_open(struct inode *inode, struct file *file)
err = 0;
mddev_get(mddev);
+ atomic_inc(&mddev->openers);
mddev_unlock(mddev);
check_disk_change(inode->i_bdev);
@@ -4857,6 +5033,7 @@ static int md_release(struct inode *inode, struct file * file)
mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
BUG_ON(!mddev);
+ atomic_dec(&mddev->openers);
mddev_put(mddev);
return 0;
@@ -4991,6 +5168,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
if (!mddev->pers->error_handler)
return;
mddev->pers->error_handler(mddev,rdev);
+ if (mddev->degraded)
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ set_bit(StateChanged, &rdev->flags);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -5220,10 +5400,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (!list_empty(&mddev->disks)) {
if (mddev->pers)
seq_printf(seq, "\n %llu blocks",
- (unsigned long long)mddev->array_size);
+ (unsigned long long)
+ mddev->array_sectors / 2);
else
seq_printf(seq, "\n %llu blocks",
- (unsigned long long)size);
+ (unsigned long long)size);
}
if (mddev->persistent) {
if (mddev->major_version != 0 ||
@@ -5353,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
static int is_mddev_idle(mddev_t *mddev)
{
mdk_rdev_t * rdev;
- struct list_head *tmp;
int idle;
long curr_events;
idle = 1;
- rdev_for_each(rdev, tmp, mddev) {
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
curr_events = disk_stat_read(disk, sectors[0]) +
disk_stat_read(disk, sectors[1]) -
@@ -5390,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
idle = 0;
}
}
+ rcu_read_unlock();
return idle;
}
@@ -5399,7 +5581,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
atomic_sub(blocks, &mddev->recovery_active);
wake_up(&mddev->recovery_wait);
if (!ok) {
- set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_wakeup_thread(mddev->thread);
// stop recovery, signal do_sync ....
}
@@ -5413,6 +5595,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
*/
void md_write_start(mddev_t *mddev, struct bio *bi)
{
+ int did_change = 0;
if (bio_data_dir(bi) != WRITE)
return;
@@ -5423,6 +5606,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread);
+ did_change = 1;
}
atomic_inc(&mddev->writes_pending);
if (mddev->safemode == 1)
@@ -5433,10 +5617,15 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
md_wakeup_thread(mddev->thread);
+ did_change = 1;
}
spin_unlock_irq(&mddev->write_lock);
}
- wait_event(mddev->sb_wait, mddev->flags==0);
+ if (did_change)
+ sysfs_notify(&mddev->kobj, NULL, "array_state");
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags));
}
void md_write_end(mddev_t *mddev)
@@ -5454,13 +5643,18 @@ void md_write_end(mddev_t *mddev)
* may proceed without blocking. It is important to call this before
* attempting a GFP_KERNEL allocation while holding the mddev lock.
* Must be called with mddev_lock held.
+ *
+ * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
+ * is dropped, so return -EAGAIN after notifying userspace.
*/
-void md_allow_write(mddev_t *mddev)
+int md_allow_write(mddev_t *mddev)
{
if (!mddev->pers)
- return;
+ return 0;
if (mddev->ro)
- return;
+ return 0;
+ if (!mddev->pers->sync_request)
+ return 0;
spin_lock_irq(&mddev->write_lock);
if (mddev->in_sync) {
@@ -5471,13 +5665,17 @@ void md_allow_write(mddev_t *mddev)
mddev->safemode = 1;
spin_unlock_irq(&mddev->write_lock);
md_update_sb(mddev, 0);
+ sysfs_notify(&mddev->kobj, NULL, "array_state");
} else
spin_unlock_irq(&mddev->write_lock);
+
+ if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+ return -EAGAIN;
+ else
+ return 0;
}
EXPORT_SYMBOL_GPL(md_allow_write);
-static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
-
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
void md_do_sync(mddev_t *mddev)
@@ -5541,8 +5739,9 @@ void md_do_sync(mddev_t *mddev)
for_each_mddev(mddev2, tmp) {
if (mddev2 == mddev)
continue;
- if (mddev2->curr_resync &&
- match_mddev_units(mddev,mddev2)) {
+ if (!mddev->parallel_resync
+ && mddev2->curr_resync
+ && match_mddev_units(mddev, mddev2)) {
DEFINE_WAIT(wq);
if (mddev < mddev2 && mddev->curr_resync == 2) {
/* arbitrarily yield */
@@ -5579,9 +5778,11 @@ void md_do_sync(mddev_t *mddev)
max_sectors = mddev->resync_max_sectors;
mddev->resync_mismatches = 0;
/* we don't use the checkpoint if there's a bitmap */
- if (!mddev->bitmap &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ j = mddev->resync_min;
+ else if (!mddev->bitmap)
j = mddev->recovery_cp;
+
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->size << 1;
else {
@@ -5622,7 +5823,6 @@ void md_do_sync(mddev_t *mddev)
window/2,(unsigned long long) max_sectors/2);
atomic_set(&mddev->recovery_active, 0);
- init_waitqueue_head(&mddev->recovery_wait);
last_check = 0;
if (j>2) {
@@ -5647,7 +5847,7 @@ void md_do_sync(mddev_t *mddev)
sectors = mddev->pers->sync_request(mddev, j, &skipped,
currspeed < speed_min(mddev));
if (sectors == 0) {
- set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
goto out;
}
@@ -5670,8 +5870,7 @@ void md_do_sync(mddev_t *mddev)
last_check = io_sectors;
- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
- test_bit(MD_RECOVERY_ERR, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
repeat:
@@ -5725,8 +5924,7 @@ void md_do_sync(mddev_t *mddev)
/* tell personality that we are finished */
mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
- if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
+ if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -5753,6 +5951,7 @@ void md_do_sync(mddev_t *mddev)
skip:
mddev->curr_resync = 0;
+ mddev->resync_min = 0;
mddev->resync_max = MaxSector;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
wake_up(&resync_wait);
@@ -5795,11 +5994,15 @@ static int remove_and_add_spares(mddev_t *mddev)
}
if (mddev->degraded) {
- rdev_for_each(rdev, rtmp, mddev)
+ rdev_for_each(rdev, rtmp, mddev) {
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(In_sync, &rdev->flags))
+ spares++;
if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0;
- if (mddev->pers->hot_add_disk(mddev,rdev)) {
+ if (mddev->pers->
+ hot_add_disk(mddev, rdev) == 0) {
char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk);
if (sysfs_create_link(&mddev->kobj,
@@ -5813,6 +6016,7 @@ static int remove_and_add_spares(mddev_t *mddev)
} else
break;
}
+ }
}
return spares;
}
@@ -5826,7 +6030,7 @@ static int remove_and_add_spares(mddev_t *mddev)
* to do that as needed.
* When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
* "->recovery" and create a thread at ->sync_thread.
- * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
+ * When the thread finishes it sets MD_RECOVERY_DONE
* and wakeups up this thread which will reap the thread and finish up.
* This thread also removes any faulty devices (with nr_pending == 0).
*
@@ -5873,23 +6077,31 @@ void md_check_recovery(mddev_t *mddev)
int spares = 0;
if (!mddev->external) {
+ int did_change = 0;
spin_lock_irq(&mddev->write_lock);
if (mddev->safemode &&
!atomic_read(&mddev->writes_pending) &&
!mddev->in_sync &&
mddev->recovery_cp == MaxSector) {
mddev->in_sync = 1;
+ did_change = 1;
if (mddev->persistent)
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
}
if (mddev->safemode == 1)
mddev->safemode = 0;
spin_unlock_irq(&mddev->write_lock);
+ if (did_change)
+ sysfs_notify(&mddev->kobj, NULL, "array_state");
}
if (mddev->flags)
md_update_sb(mddev, 0);
+ rdev_for_each(rdev, rtmp, mddev)
+ if (test_and_clear_bit(StateChanged, &rdev->flags))
+ sysfs_notify(&rdev->kobj, NULL, "state");
+
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -5901,11 +6113,12 @@ void md_check_recovery(mddev_t *mddev)
/* resync has finished, collect result */
md_unregister_thread(mddev->sync_thread);
mddev->sync_thread = NULL;
- if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* success...*/
/* activate any spares */
- mddev->pers->spare_active(mddev);
+ if (mddev->pers->spare_active(mddev))
+ sysfs_notify(&mddev->kobj, NULL,
+ "degraded");
}
md_update_sb(mddev, 1);
@@ -5919,14 +6132,18 @@ void md_check_recovery(mddev_t *mddev)
mddev->recovery = 0;
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ sysfs_notify(&mddev->kobj, NULL, "sync_action");
md_new_event(mddev);
goto unlock;
}
+ /* Set RUNNING before clearing NEEDED to avoid
+ * any transients in the value of "sync_action".
+ */
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
/* Clear some bits that don't mean anything, but
* might be left set
*/
- clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
@@ -5944,17 +6161,19 @@ void md_check_recovery(mddev_t *mddev)
/* Cannot proceed */
goto unlock;
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if ((spares = remove_and_add_spares(mddev))) {
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if (mddev->recovery_cp < MaxSector) {
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
/* nothing to be done ... */
goto unlock;
if (mddev->pers->sync_request) {
- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
if (spares && mddev->bitmap && ! mddev->bitmap->file) {
/* We are adding a device or devices to an array
* which has the bitmap stored on all devices.
@@ -5973,9 +6192,16 @@ void md_check_recovery(mddev_t *mddev)
mddev->recovery = 0;
} else
md_wakeup_thread(mddev->sync_thread);
+ sysfs_notify(&mddev->kobj, NULL, "sync_action");
md_new_event(mddev);
}
unlock:
+ if (!mddev->sync_thread) {
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ if (test_and_clear_bit(MD_RECOVERY_RECOVER,
+ &mddev->recovery))
+ sysfs_notify(&mddev->kobj, NULL, "sync_action");
+ }
mddev_unlock(mddev);
}
}
@@ -6002,7 +6228,7 @@ static int md_notify_reboot(struct notifier_block *this,
for_each_mddev(mddev, tmp)
if (mddev_trylock(mddev)) {
- do_md_stop (mddev, 1);
+ do_md_stop (mddev, 1, 0);
mddev_unlock(mddev);
}
/*
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 42ee1a2dc14..c4779ccba1c 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
multipath_conf_t *conf = mddev->private;
struct request_queue *q;
- int found = 0;
+ int err = -EEXIST;
int path;
struct multipath_info *p;
+ int first = 0;
+ int last = mddev->raid_disks - 1;
+
+ if (rdev->raid_disk >= 0)
+ first = last = rdev->raid_disk;
print_multipath_conf(conf);
- for (path=0; path<mddev->raid_disks; path++)
+ for (path = first; path <= last; path++)
if ((p=conf->multipaths+path)->rdev == NULL) {
q = rdev->bdev->bd_disk->queue;
blk_queue_stack_limits(mddev->queue, q);
@@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->raid_disk = path;
set_bit(In_sync, &rdev->flags);
rcu_assign_pointer(p->rdev, rdev);
- found = 1;
+ err = 0;
+ break;
}
print_multipath_conf(conf);
- return found;
+
+ return err;
}
static int multipath_remove_disk(mddev_t *mddev, int number)
@@ -327,7 +334,8 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
if (rdev) {
if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
- printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number);
+ printk(KERN_ERR "hot-remove-disk, slot %d is identified"
+ " but is still operational!\n", number);
err = -EBUSY;
goto abort;
}
@@ -417,6 +425,7 @@ static int multipath_run (mddev_t *mddev)
* bookkeeping area. [whatever we allocate in multipath_run(),
* should be freed in multipath_stop()]
*/
+ mddev->queue->queue_lock = &mddev->queue->__queue_lock;
conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
mddev->private = conf;
@@ -495,7 +504,7 @@ static int multipath_run (mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
- mddev->array_size = mddev->size;
+ mddev->array_sectors = mddev->size * 2;
mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 818b4828409..18361063566 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -241,18 +241,20 @@ static int create_strip_zones (mddev_t *mddev)
/**
* raid0_mergeable_bvec -- tell bio layer if a two requests can be merged
* @q: request queue
- * @bio: the buffer head that's been built up so far
+ * @bvm: properties of new bio
* @biovec: the request that could be merged to it.
*
* Return amount of bytes we can accept at this offset
*/
-static int raid0_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec)
+static int raid0_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
{
mddev_t *mddev = q->queuedata;
- sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_size >> 9;
- unsigned int bio_sectors = bio->bi_size >> 9;
+ unsigned int bio_sectors = bvm->bi_size >> 9;
max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
if (max < 0) max = 0; /* bio_add cannot handle a negative return */
@@ -280,6 +282,7 @@ static int raid0_run (mddev_t *mddev)
(mddev->chunk_size>>1)-1);
blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
+ mddev->queue->queue_lock = &mddev->queue->__queue_lock;
conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL);
if (!conf)
@@ -292,16 +295,16 @@ static int raid0_run (mddev_t *mddev)
goto out_free_conf;
/* calculate array device size */
- mddev->array_size = 0;
+ mddev->array_sectors = 0;
rdev_for_each(rdev, tmp, mddev)
- mddev->array_size += rdev->size;
+ mddev->array_sectors += rdev->size * 2;
printk("raid0 : md_size is %llu blocks.\n",
- (unsigned long long)mddev->array_size);
+ (unsigned long long)mddev->array_sectors / 2);
printk("raid0 : conf->hash_spacing is %llu blocks.\n",
(unsigned long long)conf->hash_spacing);
{
- sector_t s = mddev->array_size;
+ sector_t s = mddev->array_sectors / 2;
sector_t space = conf->hash_spacing;
int round;
conf->preshift = 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6778b7cb39b..03a5ab705c2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -773,7 +773,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
r1bio_t *r1_bio;
struct bio *read_bio;
int i, targets = 0, disks;
- struct bitmap *bitmap = mddev->bitmap;
+ struct bitmap *bitmap;
unsigned long flags;
struct bio_list bl;
struct page **behind_pages = NULL;
@@ -802,6 +802,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
wait_barrier(conf);
+ bitmap = mddev->bitmap;
+
disk_stat_inc(mddev->gendisk, ios[rw]);
disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -1025,7 +1027,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
/*
* if recovery is running, make sure it aborts.
*/
- set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
} else
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1098,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev)
static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
conf_t *conf = mddev->private;
- int found = 0;
+ int err = -EEXIST;
int mirror = 0;
mirror_info_t *p;
+ int first = 0;
+ int last = mddev->raid_disks - 1;
+
+ if (rdev->raid_disk >= 0)
+ first = last = rdev->raid_disk;
- for (mirror=0; mirror < mddev->raid_disks; mirror++)
+ for (mirror = first; mirror <= last; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
blk_queue_stack_limits(mddev->queue,
@@ -1117,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
p->head_position = 0;
rdev->raid_disk = mirror;
- found = 1;
+ err = 0;
/* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array
*/
@@ -1128,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
}
print_conf(conf);
- return found;
+ return err;
}
static int raid1_remove_disk(mddev_t *mddev, int number)
@@ -1146,6 +1153,14 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
err = -EBUSY;
goto abort;
}
+ /* Only remove non-faulty devices is recovery
+ * is not possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ mddev->degraded < conf->raid_disks) {
+ err = -EBUSY;
+ goto abort;
+ }
p->rdev = NULL;
synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) {
@@ -1282,6 +1297,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
rdev_dec_pending(conf->mirrors[i].rdev, mddev);
} else {
/* fixup the bio for reuse */
+ int size;
sbio->bi_vcnt = vcnt;
sbio->bi_size = r1_bio->sectors << 9;
sbio->bi_idx = 0;
@@ -1295,10 +1311,20 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
sbio->bi_sector = r1_bio->sector +
conf->mirrors[i].rdev->data_offset;
sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
- for (j = 0; j < vcnt ; j++)
- memcpy(page_address(sbio->bi_io_vec[j].bv_page),
+ size = sbio->bi_size;
+ for (j = 0; j < vcnt ; j++) {
+ struct bio_vec *bi;
+ bi = &sbio->bi_io_vec[j];
+ bi->bv_offset = 0;
+ if (size > PAGE_SIZE)
+ bi->bv_len = PAGE_SIZE;
+ else
+ bi->bv_len = size;
+ size -= PAGE_SIZE;
+ memcpy(page_address(bi->bv_page),
page_address(pbio->bi_io_vec[j].bv_page),
PAGE_SIZE);
+ }
}
}
@@ -1935,6 +1961,9 @@ static int run(mddev_t *mddev)
if (!conf->r1bio_pool)
goto out_no_mem;
+ spin_lock_init(&conf->device_lock);
+ mddev->queue->queue_lock = &conf->device_lock;
+
rdev_for_each(rdev, tmp, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
@@ -1958,7 +1987,6 @@ static int run(mddev_t *mddev)
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
- spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
spin_lock_init(&conf->resync_lock);
@@ -2015,7 +2043,7 @@ static int run(mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
- mddev->array_size = mddev->size;
+ mddev->array_sectors = mddev->size * 2;
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2077,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems
* worth it.
*/
- mddev->array_size = sectors>>1;
- set_capacity(mddev->gendisk, mddev->array_size << 1);
+ mddev->array_sectors = sectors;
+ set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
- if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
+ if (mddev->array_sectors / 2 > mddev->size &&
+ mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
- mddev->size = mddev->array_size;
+ mddev->size = mddev->array_sectors / 2;
mddev->resync_max_sectors = sectors;
return 0;
}
@@ -2108,7 +2137,7 @@ static int raid1_reshape(mddev_t *mddev)
conf_t *conf = mddev_to_conf(mddev);
int cnt, raid_disks;
unsigned long flags;
- int d, d2;
+ int d, d2, err;
/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_size != mddev->new_chunk ||
@@ -2120,7 +2149,9 @@ static int raid1_reshape(mddev_t *mddev)
return -EINVAL;
}
- md_allow_write(mddev);
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
raid_disks = mddev->raid_disks + mddev->delta_disks;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5938fa96292..159535d7356 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -439,26 +439,27 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
/**
* raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
* @q: request queue
- * @bio: the buffer head that's been built up so far
+ * @bvm: properties of new bio
* @biovec: the request that could be merged to it.
*
* Return amount of bytes we can accept at this offset
* If near_copies == raid_disk, there are no striping issues,
* but in that case, the function isn't called at all.
*/
-static int raid10_mergeable_bvec(struct request_queue *q, struct bio *bio,
- struct bio_vec *bio_vec)
+static int raid10_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
{
mddev_t *mddev = q->queuedata;
- sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_size >> 9;
- unsigned int bio_sectors = bio->bi_size >> 9;
+ unsigned int bio_sectors = bvm->bi_size >> 9;
max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
if (max < 0) max = 0; /* bio_add cannot handle a negative return */
- if (max <= bio_vec->bv_len && bio_sectors == 0)
- return bio_vec->bv_len;
+ if (max <= biovec->bv_len && bio_sectors == 0)
+ return biovec->bv_len;
else
return max;
}
@@ -886,7 +887,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
*/
raid10_find_phys(conf, r10_bio);
retry_write:
- blocked_rdev = 0;
+ blocked_rdev = NULL;
rcu_read_lock();
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
@@ -1020,7 +1021,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
/*
* if recovery is running, make sure it aborts.
*/
- set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
}
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1113,24 +1114,30 @@ static int raid10_spare_active(mddev_t *mddev)
static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
conf_t *conf = mddev->private;
- int found = 0;
+ int err = -EEXIST;
int mirror;
mirror_info_t *p;
+ int first = 0;
+ int last = mddev->raid_disks - 1;
if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is
* very different from resync
*/
- return 0;
+ return -EBUSY;
if (!enough(conf))
- return 0;
+ return -EINVAL;
+
+ if (rdev->raid_disk)
+ first = last = rdev->raid_disk;
if (rdev->saved_raid_disk >= 0 &&
+ rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk;
else
- mirror = 0;
- for ( ; mirror < mddev->raid_disks; mirror++)
+ mirror = first;
+ for ( ; mirror <= last ; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
blk_queue_stack_limits(mddev->queue,
@@ -1145,7 +1152,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
p->head_position = 0;
rdev->raid_disk = mirror;
- found = 1;
+ err = 0;
if (rdev->saved_raid_disk != mirror)
conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
@@ -1153,7 +1160,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
}
print_conf(conf);
- return found;
+ return err;
}
static int raid10_remove_disk(mddev_t *mddev, int number)
@@ -1171,6 +1178,14 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
err = -EBUSY;
goto abort;
}
+ /* Only remove faulty devices in recovery
+ * is not possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ enough(conf)) {
+ err = -EBUSY;
+ goto abort;
+ }
p->rdev = NULL;
synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) {
@@ -1237,6 +1252,7 @@ static void end_sync_write(struct bio *bio, int error)
if (!uptodate)
md_error(mddev, conf->mirrors[d].rdev);
+
update_head_pos(i, r10_bio);
while (atomic_dec_and_test(&r10_bio->remaining)) {
@@ -1844,7 +1860,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (rb2)
atomic_dec(&rb2->remaining);
r10_bio = rb2;
- if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery))
+ if (!test_and_set_bit(MD_RECOVERY_INTR,
+ &mddev->recovery))
printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
mdname(mddev));
break;
@@ -2082,6 +2099,9 @@ static int run(mddev_t *mddev)
goto out_free_conf;
}
+ spin_lock_init(&conf->device_lock);
+ mddev->queue->queue_lock = &conf->device_lock;
+
rdev_for_each(rdev, tmp, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
@@ -2103,7 +2123,6 @@ static int run(mddev_t *mddev)
disk->head_position = 0;
}
- spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
spin_lock_init(&conf->resync_lock);
@@ -2125,6 +2144,8 @@ static int run(mddev_t *mddev)
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
mddev->degraded++;
+ if (disk->rdev)
+ conf->fullsync = 1;
}
}
@@ -2144,7 +2165,7 @@ static int run(mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
- mddev->array_size = size << (conf->chunk_shift-1);
+ mddev->array_sectors = size << conf->chunk_shift;
mddev->resync_max_sectors = size << conf->chunk_shift;
mddev->queue->unplug_fn = raid10_unplug;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 087eee0cb80..55e7c56045a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -94,6 +94,8 @@
#define __inline__
#endif
+#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
+
#if !RAID6_USE_EMPTY_ZERO_PAGE
/* In .bss so it's zeroed */
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
@@ -113,15 +115,20 @@ static void return_io(struct bio *return_bi)
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
- bi->bi_end_io(bi,
- test_bit(BIO_UPTODATE, &bi->bi_flags)
- ? 0 : -EIO);
+ bio_endio(bi, 0);
bi = return_bi;
}
}
static void print_raid5_conf (raid5_conf_t *conf);
+static int stripe_operations_active(struct stripe_head *sh)
+{
+ return sh->check_state || sh->reconstruct_state ||
+ test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
+ test_bit(STRIPE_COMPUTE_RUN, &sh->state);
+}
+
static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
if (atomic_dec_and_test(&sh->count)) {
@@ -141,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
}
md_wakeup_thread(conf->mddev->thread);
} else {
- BUG_ON(sh->ops.pending);
+ BUG_ON(stripe_operations_active(sh));
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
atomic_dec(&conf->preread_active_stripes);
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -243,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
- BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
+ BUG_ON(stripe_operations_active(sh));
CHECK_DEVLOCK();
pr_debug("init_stripe called, stripe %llu\n",
@@ -344,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
return sh;
}
-/* test_and_ack_op() ensures that we only dequeue an operation once */
-#define test_and_ack_op(op, pend) \
-do { \
- if (test_bit(op, &sh->ops.pending) && \
- !test_bit(op, &sh->ops.complete)) { \
- if (test_and_set_bit(op, &sh->ops.ack)) \
- clear_bit(op, &pend); \
- else \
- ack++; \
- } else \
- clear_bit(op, &pend); \
-} while (0)
-
-/* find new work to run, do not resubmit work that is already
- * in flight
- */
-static unsigned long get_stripe_work(struct stripe_head *sh)
-{
- unsigned long pending;
- int ack = 0;
-
- pending = sh->ops.pending;
-
- test_and_ack_op(STRIPE_OP_BIOFILL, pending);
- test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
- test_and_ack_op(STRIPE_OP_PREXOR, pending);
- test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
- test_and_ack_op(STRIPE_OP_POSTXOR, pending);
- test_and_ack_op(STRIPE_OP_CHECK, pending);
- if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
- ack++;
-
- sh->ops.count -= ack;
- if (unlikely(sh->ops.count < 0)) {
- printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
- "ops.complete: %#lx\n", pending, sh->ops.pending,
- sh->ops.ack, sh->ops.complete);
- BUG();
- }
-
- return pending;
-}
-
static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);
-static void ops_run_io(struct stripe_head *sh)
+static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
raid5_conf_t *conf = sh->raid_conf;
int i, disks = sh->disks;
might_sleep();
- set_bit(STRIPE_IO_STARTED, &sh->state);
for (i = disks; i--; ) {
int rw;
struct bio *bi;
@@ -428,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh)
rcu_read_unlock();
if (rdev) {
- if (test_bit(STRIPE_SYNCING, &sh->state) ||
- test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
- test_bit(STRIPE_EXPAND_READY, &sh->state))
+ if (s->syncing || s->expanding || s->expanded)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+ set_bit(STRIPE_IO_STARTED, &sh->state);
+
bi->bi_bdev = rdev->bdev;
pr_debug("%s: for %llu schedule op %ld on disc %d\n",
__func__, (unsigned long long)sh->sector,
@@ -526,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref)
(unsigned long long)sh->sector);
/* clear completed biofills */
+ spin_lock_irq(&conf->device_lock);
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
/* acknowledge completion of a biofill operation */
/* and check if we need to reply to a read request,
* new R5_Wantfill requests are held off until
- * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)
+ * !STRIPE_BIOFILL_RUN
*/
if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
struct bio *rbi, *rbi2;
- /* The access to dev->read is outside of the
- * spin_lock_irq(&conf->device_lock), but is protected
- * by the STRIPE_OP_BIOFILL pending bit
- */
BUG_ON(!dev->read);
rbi = dev->read;
dev->read = NULL;
while (rbi && rbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- spin_lock_irq(&conf->device_lock);
if (--rbi->bi_phys_segments == 0) {
rbi->bi_next = return_bi;
return_bi = rbi;
}
- spin_unlock_irq(&conf->device_lock);
rbi = rbi2;
}
}
}
- set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
+ spin_unlock_irq(&conf->device_lock);
+ clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
return_io(return_bi);
@@ -608,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref)
set_bit(R5_UPTODATE, &tgt->flags);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
clear_bit(R5_Wantcompute, &tgt->flags);
- set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+ clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ if (sh->check_state == check_state_compute_run)
+ sh->check_state = check_state_compute_result;
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
-static struct dma_async_tx_descriptor *
-ops_run_compute5(struct stripe_head *sh, unsigned long pending)
+static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
{
/* kernel stack size limits the total number of disks */
int disks = sh->disks;
@@ -644,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
ASYNC_TX_XOR_ZERO_DST, NULL,
ops_complete_compute5, sh);
- /* ack now if postxor is not set to be run */
- if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
- async_tx_ack(tx);
-
return tx;
}
@@ -657,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref)
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
-
- set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
}
static struct dma_async_tx_descriptor *
@@ -678,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
/* Only process blocks that are known to be uptodate */
- if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
+ if (test_bit(R5_Wantdrain, &dev->flags))
xor_srcs[count++] = dev->page;
}
@@ -690,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
}
static struct dma_async_tx_descriptor *
-ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
- unsigned long pending)
+ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
- int pd_idx = sh->pd_idx, i;
-
- /* check if prexor is active which means only process blocks
- * that are part of a read-modify-write (Wantprexor)
- */
- int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
+ int i;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
@@ -707,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
struct bio *chosen;
- int towrite;
- towrite = 0;
- if (prexor) { /* rmw */
- if (dev->towrite &&
- test_bit(R5_Wantprexor, &dev->flags))
- towrite = 1;
- } else { /* rcw */
- if (i != pd_idx && dev->towrite &&
- test_bit(R5_LOCKED, &dev->flags))
- towrite = 1;
- }
-
- if (towrite) {
+ if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
struct bio *wbi;
spin_lock(&sh->lock);
@@ -745,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
static void ops_complete_postxor(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
-
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
-
- set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
-}
-
-static void ops_complete_write(void *stripe_head_ref)
-{
- struct stripe_head *sh = stripe_head_ref;
int disks = sh->disks, i, pd_idx = sh->pd_idx;
pr_debug("%s: stripe %llu\n", __func__,
@@ -768,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref)
set_bit(R5_UPTODATE, &dev->flags);
}
- set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
- set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+ if (sh->reconstruct_state == reconstruct_state_drain_run)
+ sh->reconstruct_state = reconstruct_state_drain_result;
+ else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
+ sh->reconstruct_state = reconstruct_state_prexor_drain_result;
+ else {
+ BUG_ON(sh->reconstruct_state != reconstruct_state_run);
+ sh->reconstruct_state = reconstruct_state_result;
+ }
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
static void
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
- unsigned long pending)
+ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
/* kernel stack size limits the total number of disks */
int disks = sh->disks;
@@ -785,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
int count = 0, pd_idx = sh->pd_idx, i;
struct page *xor_dest;
- int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
+ int prexor = 0;
unsigned long flags;
- dma_async_tx_callback callback;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
@@ -795,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
- if (prexor) {
+ if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+ prexor = 1;
xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -811,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
}
}
- /* check whether this postxor is part of a write */
- callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
- ops_complete_write : ops_complete_postxor;
-
/* 1/ if we prexor'd then the dest is reused as a source
* 2/ if we did not prexor then we are redoing the parity
* set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
@@ -828,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
if (unlikely(count == 1)) {
flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
- flags, tx, callback, sh);
+ flags, tx, ops_complete_postxor, sh);
} else
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
- flags, tx, callback, sh);
+ flags, tx, ops_complete_postxor, sh);
}
static void ops_complete_check(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
- int pd_idx = sh->pd_idx;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
- if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
- sh->ops.zero_sum_result == 0)
- set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-
- set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
+ sh->check_state = check_state_check_result;
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
@@ -873,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh)
tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
- if (tx)
- set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
- else
- clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
-
atomic_inc(&sh->count);
tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
ops_complete_check, sh);
}
-static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
+static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
int overlap_clear = 0, i, disks = sh->disks;
struct dma_async_tx_descriptor *tx = NULL;
- if (test_bit(STRIPE_OP_BIOFILL, &pending)) {
+ if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
ops_run_biofill(sh);
overlap_clear++;
}
- if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
- tx = ops_run_compute5(sh, pending);
+ if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
+ tx = ops_run_compute5(sh);
+ /* terminate the chain if postxor is not set to be run */
+ if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
+ async_tx_ack(tx);
+ }
- if (test_bit(STRIPE_OP_PREXOR, &pending))
+ if (test_bit(STRIPE_OP_PREXOR, &ops_request))
tx = ops_run_prexor(sh, tx);
- if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
- tx = ops_run_biodrain(sh, tx, pending);
+ if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
+ tx = ops_run_biodrain(sh, tx);
overlap_clear++;
}
- if (test_bit(STRIPE_OP_POSTXOR, &pending))
- ops_run_postxor(sh, tx, pending);
+ if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
+ ops_run_postxor(sh, tx);
- if (test_bit(STRIPE_OP_CHECK, &pending))
+ if (test_bit(STRIPE_OP_CHECK, &ops_request))
ops_run_check(sh);
- if (test_bit(STRIPE_OP_IO, &pending))
- ops_run_io(sh);
-
if (overlap_clear)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -995,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes);
struct disk_info *ndisks;
- int err = 0;
+ int err;
struct kmem_cache *sc;
int i;
if (newsize <= conf->pool_size)
return 0; /* never bother to shrink */
- md_allow_write(conf->mddev);
+ err = md_allow_write(conf->mddev);
+ if (err)
+ return err;
/* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -1143,10 +1061,12 @@ static void raid5_end_read_request(struct bio * bi, int error)
set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
rdev = conf->disks[i].rdev;
- printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
- mdname(conf->mddev), STRIPE_SECTORS,
- (unsigned long long)(sh->sector + rdev->data_offset),
- bdevname(rdev->bdev, b));
+ printk_rl(KERN_INFO "raid5:%s: read error corrected"
+ " (%lu sectors at %llu on %s)\n",
+ mdname(conf->mddev), STRIPE_SECTORS,
+ (unsigned long long)(sh->sector
+ + rdev->data_offset),
+ bdevname(rdev->bdev, b));
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
}
@@ -1160,16 +1080,22 @@ static void raid5_end_read_request(struct bio * bi, int error)
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
atomic_inc(&rdev->read_errors);
if (conf->mddev->degraded)
- printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
- mdname(conf->mddev),
- (unsigned long long)(sh->sector + rdev->data_offset),
- bdn);
+ printk_rl(KERN_WARNING
+ "raid5:%s: read error not correctable "
+ "(sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)(sh->sector
+ + rdev->data_offset),
+ bdn);
else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
/* Oh, no!!! */
- printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
- mdname(conf->mddev),
- (unsigned long long)(sh->sector + rdev->data_offset),
- bdn);
+ printk_rl(KERN_WARNING
+ "raid5:%s: read error NOT corrected!! "
+ "(sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)(sh->sector
+ + rdev->data_offset),
+ bdn);
else if (atomic_read(&rdev->read_errors)
> conf->max_nr_stripes)
printk(KERN_WARNING
@@ -1258,7 +1184,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
/*
* if recovery was running, make sure it aborts.
*/
- set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
}
set_bit(Faulty, &rdev->flags);
printk (KERN_ALERT
@@ -1693,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
}
}
-static int
-handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
+static void
+schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
+ int rcw, int expand)
{
int i, pd_idx = sh->pd_idx, disks = sh->disks;
- int locked = 0;
if (rcw) {
/* if we are not expanding this is a proper write request, and
@@ -1705,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
* stripe cache
*/
if (!expand) {
- set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
- sh->ops.count++;
- }
+ sh->reconstruct_state = reconstruct_state_drain_run;
+ set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+ } else
+ sh->reconstruct_state = reconstruct_state_run;
- set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
- sh->ops.count++;
+ set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (dev->towrite) {
set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantdrain, &dev->flags);
if (!expand)
clear_bit(R5_UPTODATE, &dev->flags);
- locked++;
+ s->locked++;
}
}
- if (locked + 1 == disks)
+ if (s->locked + 1 == disks)
if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
atomic_inc(&sh->raid_conf->pending_full_writes);
} else {
BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
- set_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
- set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
- set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-
- sh->ops.count += 3;
+ sh->reconstruct_state = reconstruct_state_prexor_drain_run;
+ set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+ set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+ set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (i == pd_idx)
continue;
- /* For a read-modify write there may be blocks that are
- * locked for reading while others are ready to be
- * written so we distinguish these blocks by the
- * R5_Wantprexor bit
- */
if (dev->towrite &&
(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_Wantcompute, &dev->flags))) {
- set_bit(R5_Wantprexor, &dev->flags);
+ test_bit(R5_Wantcompute, &dev->flags))) {
+ set_bit(R5_Wantdrain, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
clear_bit(R5_UPTODATE, &dev->flags);
- locked++;
+ s->locked++;
}
}
}
@@ -1761,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
*/
set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
- locked++;
+ s->locked++;
- pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
+ pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector,
- locked, sh->ops.pending);
-
- return locked;
+ s->locked, s->ops_request);
}
/*
@@ -1866,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
}
static void
-handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
+handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks,
struct bio **return_bi)
{
@@ -1957,47 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
md_wakeup_thread(conf->mddev->thread);
}
-/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
- * to process
+/* fetch_block5 - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill5 to continue
*/
-static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
- struct stripe_head_state *s, int disk_idx, int disks)
+static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
{
struct r5dev *dev = &sh->dev[disk_idx];
struct r5dev *failed_dev = &sh->dev[s->failed_num];
- /* don't schedule compute operations or reads on the parity block while
- * a check is in flight
- */
- if ((disk_idx == sh->pd_idx) &&
- test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
- return ~0;
-
/* is the data in this block needed, and can we get it? */
if (!test_bit(R5_LOCKED, &dev->flags) &&
- !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
- (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
- s->syncing || s->expanding || (s->failed &&
- (failed_dev->toread || (failed_dev->towrite &&
- !test_bit(R5_OVERWRITE, &failed_dev->flags)
- ))))) {
- /* 1/ We would like to get this block, possibly by computing it,
- * but we might not be able to.
- *
- * 2/ Since parity check operations potentially make the parity
- * block !uptodate it will need to be refreshed before any
- * compute operations on data disks are scheduled.
- *
- * 3/ We hold off parity block re-reads until check operations
- * have quiesced.
+ !test_bit(R5_UPTODATE, &dev->flags) &&
+ (dev->toread ||
+ (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+ s->syncing || s->expanding ||
+ (s->failed &&
+ (failed_dev->toread ||
+ (failed_dev->towrite &&
+ !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
+ /* We would like to get this block, possibly by computing it,
+ * otherwise read it if the backing disk is insync
*/
if ((s->uptodate == disks - 1) &&
- !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
- set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+ (s->failed && disk_idx == s->failed_num)) {
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
set_bit(R5_Wantcompute, &dev->flags);
sh->ops.target = disk_idx;
s->req_compute = 1;
- sh->ops.count++;
/* Careful: from this point on 'uptodate' is in the eye
* of raid5_run_ops which services 'compute' operations
* before writes. R5_Wantcompute flags a block that will
@@ -2005,58 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
* subsequent operation.
*/
s->uptodate++;
- return 0; /* uptodate + compute == disks */
- } else if ((s->uptodate < disks - 1) &&
- test_bit(R5_Insync, &dev->flags)) {
- /* Note: we hold off compute operations while checks are
- * in flight, but we still prefer 'compute' over 'read'
- * hence we only read if (uptodate < * disks-1)
- */
+ return 1; /* uptodate + compute == disks */
+ } else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
- if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
- sh->ops.count++;
s->locked++;
pr_debug("Reading block %d (sync=%d)\n", disk_idx,
s->syncing);
}
}
- return ~0;
+ return 0;
}
-static void handle_issuing_new_read_requests5(struct stripe_head *sh,
+/**
+ * handle_stripe_fill5 - read or compute data to satisfy pending requests.
+ */
+static void handle_stripe_fill5(struct stripe_head *sh,
struct stripe_head_state *s, int disks)
{
int i;
- /* Clear completed compute operations. Parity recovery
- * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
- * later on in this routine
- */
- if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
- !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
- clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
- clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
- clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
- }
-
/* look for blocks to read/compute, skip this if a compute
* is already in flight, or if the stripe contents are in the
* midst of changing due to a write
*/
- if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
- !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
- !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
+ !sh->reconstruct_state)
for (i = disks; i--; )
- if (__handle_issuing_new_read_requests5(
- sh, s, i, disks) == 0)
+ if (fetch_block5(sh, s, i, disks))
break;
- }
set_bit(STRIPE_HANDLE, &sh->state);
}
-static void handle_issuing_new_read_requests6(struct stripe_head *sh,
+static void handle_stripe_fill6(struct stripe_head *sh,
struct stripe_head_state *s, struct r6_state *r6s,
int disks)
{
@@ -2077,7 +1969,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
/* we would like to get this block, possibly
* by computing it, but we might not be able to
*/
- if (s->uptodate == disks-1) {
+ if ((s->uptodate == disks - 1) &&
+ (s->failed && (i == r6s->failed_num[0] ||
+ i == r6s->failed_num[1]))) {
pr_debug("Computing stripe %llu block %d\n",
(unsigned long long)sh->sector, i);
compute_block_1(sh, i, 0);
@@ -2113,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
}
-/* handle_completed_write_requests
+/* handle_stripe_clean_event
* any written block on an uptodate or failed drive can be returned.
* Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
* never LOCKED, so we don't need to test 'failed' directly.
*/
-static void handle_completed_write_requests(raid5_conf_t *conf,
+static void handle_stripe_clean_event(raid5_conf_t *conf,
struct stripe_head *sh, int disks, struct bio **return_bi)
{
int i;
@@ -2163,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
md_wakeup_thread(conf->mddev->thread);
}
-static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
+static void handle_stripe_dirtying5(raid5_conf_t *conf,
struct stripe_head *sh, struct stripe_head_state *s, int disks)
{
int rmw = 0, rcw = 0, i;
@@ -2207,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
"%d for r-m-w\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
- if (!test_and_set_bit(
- STRIPE_OP_IO, &sh->ops.pending))
- sh->ops.count++;
s->locked++;
} else {
set_bit(STRIPE_DELAYED, &sh->state);
@@ -2233,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
"%d for Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
- if (!test_and_set_bit(
- STRIPE_OP_IO, &sh->ops.pending))
- sh->ops.count++;
s->locked++;
} else {
set_bit(STRIPE_DELAYED, &sh->state);
@@ -2253,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
* simultaneously. If this is not the case then new writes need to be
* held off until the compute completes.
*/
- if ((s->req_compute ||
- !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
- (s->locked == 0 && (rcw == 0 || rmw == 0) &&
- !test_bit(STRIPE_BIT_DELAY, &sh->state)))
- s->locked += handle_write_operations5(sh, rcw == 0, 0);
+ if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
+ (s->locked == 0 && (rcw == 0 || rmw == 0) &&
+ !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+ schedule_reconstruction5(sh, s, rcw == 0, 0);
}
-static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
+static void handle_stripe_dirtying6(raid5_conf_t *conf,
struct stripe_head *sh, struct stripe_head_state *s,
struct r6_state *r6s, int disks)
{
@@ -2363,91 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks)
{
- int canceled_check = 0;
+ struct r5dev *dev = NULL;
set_bit(STRIPE_HANDLE, &sh->state);
- /* complete a check operation */
- if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
- clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
- clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+ switch (sh->check_state) {
+ case check_state_idle:
+ /* start a new check operation if there are no failures */
if (s->failed == 0) {
- if (sh->ops.zero_sum_result == 0)
- /* parity is correct (on disc,
- * not in buffer any more)
- */
- set_bit(STRIPE_INSYNC, &sh->state);
- else {
- conf->mddev->resync_mismatches +=
- STRIPE_SECTORS;
- if (test_bit(
- MD_RECOVERY_CHECK, &conf->mddev->recovery))
- /* don't try to repair!! */
- set_bit(STRIPE_INSYNC, &sh->state);
- else {
- set_bit(STRIPE_OP_COMPUTE_BLK,
- &sh->ops.pending);
- set_bit(STRIPE_OP_MOD_REPAIR_PD,
- &sh->ops.pending);
- set_bit(R5_Wantcompute,
- &sh->dev[sh->pd_idx].flags);
- sh->ops.target = sh->pd_idx;
- sh->ops.count++;
- s->uptodate++;
- }
- }
- } else
- canceled_check = 1; /* STRIPE_INSYNC is not set */
- }
-
- /* check if we can clear a parity disk reconstruct */
- if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
- test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
-
- clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
- clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
- clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
- clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
- }
-
- /* start a new check operation if there are no failures, the stripe is
- * not insync, and a repair is not in flight
- */
- if (s->failed == 0 &&
- !test_bit(STRIPE_INSYNC, &sh->state) &&
- !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
- if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
BUG_ON(s->uptodate != disks);
+ sh->check_state = check_state_run;
+ set_bit(STRIPE_OP_CHECK, &s->ops_request);
clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
- sh->ops.count++;
s->uptodate--;
+ break;
}
- }
+ dev = &sh->dev[s->failed_num];
+ /* fall through */
+ case check_state_compute_result:
+ sh->check_state = check_state_idle;
+ if (!dev)
+ dev = &sh->dev[sh->pd_idx];
+
+ /* check that a write has not made the stripe insync */
+ if (test_bit(STRIPE_INSYNC, &sh->state))
+ break;
- /* Wait for check parity and compute block operations to complete
- * before write-back. If a failure occurred while the check operation
- * was in flight we need to cycle this stripe through handle_stripe
- * since the parity block may not be uptodate
- */
- if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
- !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
- !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
- struct r5dev *dev;
/* either failed parity check, or recovery is happening */
- if (s->failed == 0)
- s->failed_num = sh->pd_idx;
- dev = &sh->dev[s->failed_num];
BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
BUG_ON(s->uptodate != disks);
set_bit(R5_LOCKED, &dev->flags);
+ s->locked++;
set_bit(R5_Wantwrite, &dev->flags);
- if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
- sh->ops.count++;
clear_bit(STRIPE_DEGRADED, &sh->state);
- s->locked++;
set_bit(STRIPE_INSYNC, &sh->state);
+ break;
+ case check_state_run:
+ break; /* we will be called again upon completion */
+ case check_state_check_result:
+ sh->check_state = check_state_idle;
+
+ /* if a failure occurred during the check operation, leave
+ * STRIPE_INSYNC not set and let the stripe be handled again
+ */
+ if (s->failed)
+ break;
+
+ /* handle a successful check operation, if parity is correct
+ * we are done. Otherwise update the mismatch count and repair
+ * parity if !MD_RECOVERY_CHECK
+ */
+ if (sh->ops.zero_sum_result == 0)
+ /* parity is correct (on disc,
+ * not in buffer any more)
+ */
+ set_bit(STRIPE_INSYNC, &sh->state);
+ else {
+ conf->mddev->resync_mismatches += STRIPE_SECTORS;
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+ /* don't try to repair!! */
+ set_bit(STRIPE_INSYNC, &sh->state);
+ else {
+ sh->check_state = check_state_compute_run;
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+ set_bit(R5_Wantcompute,
+ &sh->dev[sh->pd_idx].flags);
+ sh->ops.target = sh->pd_idx;
+ s->uptodate++;
+ }
+ }
+ break;
+ case check_state_compute_run:
+ break;
+ default:
+ printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+ __func__, sh->check_state,
+ (unsigned long long) sh->sector);
+ BUG();
}
}
@@ -2632,14 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh)
struct bio *return_bi = NULL;
struct stripe_head_state s;
struct r5dev *dev;
- unsigned long pending = 0;
mdk_rdev_t *blocked_rdev = NULL;
+ int prexor;
memset(&s, 0, sizeof(s));
- pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
- "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state,
- atomic_read(&sh->count), sh->pd_idx,
- sh->ops.pending, sh->ops.ack, sh->ops.complete);
+ pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
+ "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
+ atomic_read(&sh->count), sh->pd_idx, sh->check_state,
+ sh->reconstruct_state);
spin_lock(&sh->lock);
clear_bit(STRIPE_HANDLE, &sh->state);
@@ -2648,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh)
s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
- /* Now to look around and see what can be done */
-
- /* clean-up completed biofill operations */
- if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
- clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
- clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
- clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
- }
+ /* Now to look around and see what can be done */
rcu_read_lock();
for (i=disks; i--; ) {
mdk_rdev_t *rdev;
@@ -2670,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh)
/* maybe we can request a biofill operation
*
* new wantfill requests are only permitted while
- * STRIPE_OP_BIOFILL is clear
+ * ops_complete_biofill is guaranteed to be inactive
*/
if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
- !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+ !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
set_bit(R5_Wantfill, &dev->flags);
/* now count some things */
@@ -2717,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh)
goto unlock;
}
- if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
- sh->ops.count++;
+ if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
+ set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
+ set_bit(STRIPE_BIOFILL_RUN, &sh->state);
+ }
pr_debug("locked=%d uptodate=%d to_read=%d"
" to_write=%d failed=%d failed_num=%d\n",
@@ -2728,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh)
* need to be failed
*/
if (s.failed > 1 && s.to_read+s.to_write+s.written)
- handle_requests_to_failed_array(conf, sh, &s, disks,
- &return_bi);
+ handle_failed_stripe(conf, sh, &s, disks, &return_bi);
if (s.failed > 1 && s.syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0);
clear_bit(STRIPE_SYNCING, &sh->state);
@@ -2745,46 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh)
!test_bit(R5_LOCKED, &dev->flags) &&
test_bit(R5_UPTODATE, &dev->flags)) ||
(s.failed == 1 && s.failed_num == sh->pd_idx)))
- handle_completed_write_requests(conf, sh, disks, &return_bi);
+ handle_stripe_clean_event(conf, sh, disks, &return_bi);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
* or to load a block that is being partially written.
*/
if (s.to_read || s.non_overwrite ||
- (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
- test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
- handle_issuing_new_read_requests5(sh, &s, disks);
+ (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
+ handle_stripe_fill5(sh, &s, disks);
/* Now we check to see if any write operations have recently
* completed
*/
-
- /* leave prexor set until postxor is done, allows us to distinguish
- * a rmw from a rcw during biodrain
- */
- if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
- test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
- clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
- clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
- clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
-
- for (i = disks; i--; )
- clear_bit(R5_Wantprexor, &sh->dev[i].flags);
- }
-
- /* if only POSTXOR is set then this is an 'expand' postxor */
- if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
- test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
- clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
- clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
- clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
-
- clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
- clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
- clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
+ prexor = 0;
+ if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
+ prexor = 1;
+ if (sh->reconstruct_state == reconstruct_state_drain_result ||
+ sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
+ sh->reconstruct_state = reconstruct_state_idle;
/* All the 'written' buffers and the parity block are ready to
* be written back to disk
@@ -2796,9 +2651,8 @@ static void handle_stripe5(struct stripe_head *sh)
(i == sh->pd_idx || dev->written)) {
pr_debug("Writing block %d\n", i);
set_bit(R5_Wantwrite, &dev->flags);
- if (!test_and_set_bit(
- STRIPE_OP_IO, &sh->ops.pending))
- sh->ops.count++;
+ if (prexor)
+ continue;
if (!test_bit(R5_Insync, &dev->flags) ||
(i == sh->pd_idx && s.failed == 0))
set_bit(STRIPE_INSYNC, &sh->state);
@@ -2818,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh)
* 2/ A 'check' operation is in flight, as it may clobber the parity
* block.
*/
- if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
- !test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
- handle_issuing_new_write_requests5(conf, sh, &s, disks);
+ if (s.to_write && !sh->reconstruct_state && !sh->check_state)
+ handle_stripe_dirtying5(conf, sh, &s, disks);
/* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough
* data is available. The parity check is held off while parity
* dependent operations are in flight.
*/
- if ((s.syncing && s.locked == 0 &&
- !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
- !test_bit(STRIPE_INSYNC, &sh->state)) ||
- test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
- test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
+ if (sh->check_state ||
+ (s.syncing && s.locked == 0 &&
+ !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
+ !test_bit(STRIPE_INSYNC, &sh->state)))
handle_parity_checks5(conf, sh, &s, disks);
if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -2850,49 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh)
dev = &sh->dev[s.failed_num];
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
- if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
- sh->ops.count++;
set_bit(R5_ReWrite, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
s.locked++;
} else {
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
- if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
- sh->ops.count++;
set_bit(R5_LOCKED, &dev->flags);
s.locked++;
}
}
- /* Finish postxor operations initiated by the expansion
- * process
- */
- if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
- !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
-
+ /* Finish reconstruct operations initiated by the expansion process */
+ if (sh->reconstruct_state == reconstruct_state_result) {
+ sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
-
- clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
- clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
- clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-
- for (i = conf->raid_disks; i--; ) {
+ for (i = conf->raid_disks; i--; )
set_bit(R5_Wantwrite, &sh->dev[i].flags);
- if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
- sh->ops.count++;
- }
+ set_bit(R5_LOCKED, &dev->flags);
+ s.locked++;
}
if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
- !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+ !sh->reconstruct_state) {
/* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks;
sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
conf->raid_disks);
- s.locked += handle_write_operations5(sh, 1, 1);
- } else if (s.expanded &&
- !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+ schedule_reconstruction5(sh, &s, 1, 1);
+ } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
@@ -2900,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh)
}
if (s.expanding && s.locked == 0 &&
- !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
+ !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
handle_stripe_expansion(conf, sh, NULL);
- if (sh->ops.count)
- pending = get_stripe_work(sh);
-
unlock:
spin_unlock(&sh->lock);
@@ -2913,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh)
if (unlikely(blocked_rdev))
md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
- if (pending)
- raid5_run_ops(sh, pending);
+ if (s.ops_request)
+ raid5_run_ops(sh, s.ops_request);
- return_io(return_bi);
+ ops_run_io(sh, &s);
+ return_io(return_bi);
}
static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
@@ -3025,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
* might need to be failed
*/
if (s.failed > 2 && s.to_read+s.to_write+s.written)
- handle_requests_to_failed_array(conf, sh, &s, disks,
- &return_bi);
+ handle_failed_stripe(conf, sh, &s, disks, &return_bi);
if (s.failed > 2 && s.syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0);
clear_bit(STRIPE_SYNCING, &sh->state);
@@ -3051,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
&& !test_bit(R5_LOCKED, &qdev->flags)
&& test_bit(R5_UPTODATE, &qdev->flags)))))
- handle_completed_write_requests(conf, sh, disks, &return_bi);
+ handle_stripe_clean_event(conf, sh, disks, &return_bi);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
@@ -3059,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
*/
if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
(s.syncing && (s.uptodate < disks)) || s.expanding)
- handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
+ handle_stripe_fill6(sh, &s, &r6s, disks);
/* now to consider writing and what else, if anything should be read */
if (s.to_write)
- handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
+ handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
/* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough
@@ -3119,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
}
if (s.expanding && s.locked == 0 &&
- !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
+ !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
handle_stripe_expansion(conf, sh, &r6s);
unlock:
@@ -3129,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
if (unlikely(blocked_rdev))
md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
- return_io(return_bi);
-
- for (i=disks; i-- ;) {
- int rw;
- struct bio *bi;
- mdk_rdev_t *rdev;
- if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
- rw = WRITE;
- else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
- rw = READ;
- else
- continue;
-
- set_bit(STRIPE_IO_STARTED, &sh->state);
-
- bi = &sh->dev[i].req;
+ ops_run_io(sh, &s);
- bi->bi_rw = rw;
- if (rw == WRITE)
- bi->bi_end_io = raid5_end_write_request;
- else
- bi->bi_end_io = raid5_end_read_request;
-
- rcu_read_lock();
- rdev = rcu_dereference(conf->disks[i].rdev);
- if (rdev && test_bit(Faulty, &rdev->flags))
- rdev = NULL;
- if (rdev)
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
-
- if (rdev) {
- if (s.syncing || s.expanding || s.expanded)
- md_sync_acct(rdev->bdev, STRIPE_SECTORS);
-
- bi->bi_bdev = rdev->bdev;
- pr_debug("for %llu schedule op %ld on disc %d\n",
- (unsigned long long)sh->sector, bi->bi_rw, i);
- atomic_inc(&sh->count);
- bi->bi_sector = sh->sector + rdev->data_offset;
- bi->bi_flags = 1 << BIO_UPTODATE;
- bi->bi_vcnt = 1;
- bi->bi_max_vecs = 1;
- bi->bi_idx = 0;
- bi->bi_io_vec = &sh->dev[i].vec;
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
- bi->bi_io_vec[0].bv_offset = 0;
- bi->bi_size = STRIPE_SIZE;
- bi->bi_next = NULL;
- if (rw == WRITE &&
- test_bit(R5_ReWrite, &sh->dev[i].flags))
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
- generic_make_request(bi);
- } else {
- if (rw == WRITE)
- set_bit(STRIPE_DEGRADED, &sh->state);
- pr_debug("skip op %ld on disc %d for sector %llu\n",
- bi->bi_rw, i, (unsigned long long)sh->sector);
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
- }
+ return_io(return_bi);
}
static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
@@ -3297,15 +3073,17 @@ static int raid5_congested(void *data, int bits)
/* We want read requests to align with chunks where possible,
* but write requests don't need to.
*/
-static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec)
+static int raid5_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
{
mddev_t *mddev = q->queuedata;
- sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_size >> 9;
- unsigned int bio_sectors = bio->bi_size >> 9;
+ unsigned int bio_sectors = bvm->bi_size >> 9;
- if (bio_data_dir(bio) == WRITE)
+ if ((bvm->bi_rw & 1) == WRITE)
return biovec->bv_len; /* always allow writes to be mergeable */
max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
@@ -3678,9 +3456,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
if ( rw == WRITE )
md_write_end(mddev);
- bi->bi_end_io(bi,
- test_bit(BIO_UPTODATE, &bi->bi_flags)
- ? 0 : -EIO);
+ bio_endio(bi, 0);
}
return 0;
}
@@ -3766,7 +3542,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
j == raid6_next_disk(sh->pd_idx, sh->disks))
continue;
s = compute_blocknr(sh, j);
- if (s < (mddev->array_size<<1)) {
+ if (s < mddev->array_sectors) {
skipped = 1;
continue;
}
@@ -3983,12 +3759,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
spin_lock_irq(&conf->device_lock);
remaining = --raid_bio->bi_phys_segments;
spin_unlock_irq(&conf->device_lock);
- if (remaining == 0) {
-
- raid_bio->bi_end_io(raid_bio,
- test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
- ? 0 : -EIO);
- }
+ if (remaining == 0)
+ bio_endio(raid_bio, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe);
return handled;
@@ -4075,6 +3847,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
{
raid5_conf_t *conf = mddev_to_conf(mddev);
unsigned long new;
+ int err;
+
if (len >= PAGE_SIZE)
return -EINVAL;
if (!conf)
@@ -4090,7 +3864,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
else
break;
}
- md_allow_write(mddev);
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
while (new > conf->max_nr_stripes) {
if (grow_one_stripe(conf))
conf->max_nr_stripes++;
@@ -4256,6 +4032,7 @@ static int run(mddev_t *mddev)
goto abort;
}
spin_lock_init(&conf->device_lock);
+ mddev->queue->queue_lock = &conf->device_lock;
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
@@ -4285,7 +4062,9 @@ static int run(mddev_t *mddev)
" disk %d\n", bdevname(rdev->bdev,b),
raid_disk);
working_disks++;
- }
+ } else
+ /* Cannot rely on bitmap to complete recovery */
+ conf->fullsync = 1;
}
/*
@@ -4412,7 +4191,7 @@ static int run(mddev_t *mddev)
mddev->queue->backing_dev_info.congested_data = mddev;
mddev->queue->backing_dev_info.congested_fn = raid5_congested;
- mddev->array_size = mddev->size * (conf->previous_raid_disks -
+ mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
conf->max_degraded);
blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
@@ -4562,6 +4341,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
err = -EBUSY;
goto abort;
}
+ /* Only remove non-faulty devices if recovery
+ * isn't possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ mddev->degraded <= conf->max_degraded) {
+ err = -EBUSY;
+ goto abort;
+ }
p->rdev = NULL;
synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) {
@@ -4579,35 +4366,41 @@ abort:
static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
raid5_conf_t *conf = mddev->private;
- int found = 0;
+ int err = -EEXIST;
int disk;
struct disk_info *p;
+ int first = 0;
+ int last = conf->raid_disks - 1;
if (mddev->degraded > conf->max_degraded)
/* no point adding a device */
- return 0;
+ return -EINVAL;
+
+ if (rdev->raid_disk >= 0)
+ first = last = rdev->raid_disk;
/*
* find the disk ... but prefer rdev->saved_raid_disk
* if possible.
*/
if (rdev->saved_raid_disk >= 0 &&
+ rdev->saved_raid_disk >= first &&
conf->disks[rdev->saved_raid_disk].rdev == NULL)
disk = rdev->saved_raid_disk;
else
- disk = 0;
- for ( ; disk < conf->raid_disks; disk++)
+ disk = first;
+ for ( ; disk <= last ; disk++)
if ((p=conf->disks + disk)->rdev == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk;
- found = 1;
+ err = 0;
if (rdev->saved_raid_disk != disk)
conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
break;
}
print_raid5_conf(conf);
- return found;
+ return err;
}
static int raid5_resize(mddev_t *mddev, sector_t sectors)
@@ -4622,8 +4415,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
raid5_conf_t *conf = mddev_to_conf(mddev);
sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
- mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
- set_capacity(mddev->gendisk, mddev->array_size << 1);
+ mddev->array_sectors = sectors * (mddev->raid_disks
+ - conf->max_degraded);
+ set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1;
@@ -4708,7 +4502,7 @@ static int raid5_start_reshape(mddev_t *mddev)
rdev_for_each(rdev, rtmp, mddev)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) {
- if (raid5_add_disk(mddev, rdev)) {
+ if (raid5_add_disk(mddev, rdev) == 0) {
char nm[20];
set_bit(In_sync, &rdev->flags);
added_devices++;
@@ -4756,15 +4550,16 @@ static void end_reshape(raid5_conf_t *conf)
struct block_device *bdev;
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
- conf->mddev->array_size = conf->mddev->size *
+ conf->mddev->array_sectors = 2 * conf->mddev->size *
(conf->raid_disks - conf->max_degraded);
- set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+ set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
conf->mddev->changed = 1;
bdev = bdget_disk(conf->mddev->gendisk, 0);
if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
+ i_size_write(bdev->bd_inode,
+ (loff_t)conf->mddev->array_sectors << 9);
mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev);
}