summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig24
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/bitmap.c12
-rw-r--r--drivers/md/dm-crypt.c618
-rw-r--r--drivers/md/dm-delay.c2
-rw-r--r--drivers/md/dm-ioctl.c111
-rw-r--r--drivers/md/dm-kcopyd.c57
-rw-r--r--drivers/md/dm-log-userspace-base.c139
-rw-r--r--drivers/md/dm-log-userspace-transfer.c1
-rw-r--r--drivers/md/dm-log.c2
-rw-r--r--drivers/md/dm-mpath.c67
-rw-r--r--drivers/md/dm-raid.c697
-rw-r--r--drivers/md/dm-raid1.c19
-rw-r--r--drivers/md/dm-snap-persistent.c4
-rw-r--r--drivers/md/dm-snap.c62
-rw-r--r--drivers/md/dm-stripe.c27
-rw-r--r--drivers/md/dm-table.c20
-rw-r--r--drivers/md/dm.c23
-rw-r--r--drivers/md/md.c245
-rw-r--r--drivers/md/md.h15
-rw-r--r--drivers/md/raid0.c40
-rw-r--r--drivers/md/raid1.c33
-rw-r--r--drivers/md/raid10.c23
-rw-r--r--drivers/md/raid5.c62
24 files changed, 1854 insertions, 450 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e3155..98d9ec85e0e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR
Allow volume managers to mirror logical volumes, also
needed for live data migration tools such as 'pvmove'.
+config DM_RAID
+ tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ select MD_RAID456
+ select BLK_DEV_MD
+ ---help---
+ A dm target that supports RAID4, RAID5 and RAID6 mappings
+
+ A RAID-5 set of N drives with a capacity of C MB per drive provides
+ the capacity of C * (N - 1) MB, and protects against a failure
+ of a single drive. For a given sector (row) number, (N - 1) drives
+ contain data sectors, and one drive contains the parity protection.
+ For a RAID-4 set, the parity blocks are present on a single drive,
+ while a RAID-5 set distributes the parity across the drives in one
+ of the available parity distribution methods.
+
+ A RAID-6 set of N drives with a capacity of C MB per drive
+ provides the capacity of C * (N - 2) MB, and protects
+ against a failure of any two drives. For a given sector
+ (row) number, (N - 2) drives contain data sectors, and two
+ drives contains two independent redundancy syndromes. Like
+ RAID-5, RAID-6 distributes the syndromes across the drives
+ in one of the available parity distribution methods.
+
config DM_LOG_USERSPACE
tristate "Mirror userspace logging (EXPERIMENTAL)"
depends on DM_MIRROR && EXPERIMENTAL && NET
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919..d0138606c2e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
+obj-$(CONFIG_DM_RAID) += dm-raid.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5a1ffe3527a..9a35320fb59 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
|| test_bit(Faulty, &rdev->flags))
continue;
- target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
+ target = offset + index * (PAGE_SIZE/512);
if (sync_page_io(rdev, target,
roundup(size, bdev_logical_block_size(rdev->bdev)),
- page, READ)) {
+ page, READ, true)) {
page->index = index;
attach_page_buffers(page, NULL); /* so that free_buffer will
* quietly no-op */
@@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{
mdk_rdev_t *rdev = NULL;
+ struct block_device *bdev;
mddev_t *mddev = bitmap->mddev;
while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
int size = PAGE_SIZE;
loff_t offset = mddev->bitmap_info.offset;
+
+ bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
+
if (page->index == bitmap->file_pages-1)
size = roundup(bitmap->last_page_size,
- bdev_logical_block_size(rdev->bdev));
+ bdev_logical_block_size(bdev));
/* Just make sure we aren't corrupting data or
* metadata
*/
@@ -1542,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
wait_event(bitmap->mddev->recovery_wait,
atomic_read(&bitmap->mddev->recovery_active) == 0);
- bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
+ bitmap->mddev->curr_resync_completed = sector;
set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
s = 0;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d5b0e4c0e70..4e054bd9166 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,10 +18,14 @@
#include <linux/crypto.h>
#include <linux/workqueue.h>
#include <linux/backing-dev.h>
+#include <linux/percpu.h>
#include <asm/atomic.h>
#include <linux/scatterlist.h>
#include <asm/page.h>
#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/md5.h>
+#include <crypto/algapi.h>
#include <linux/device-mapper.h>
@@ -63,6 +67,7 @@ struct dm_crypt_request {
struct convert_context *ctx;
struct scatterlist sg_in;
struct scatterlist sg_out;
+ sector_t iv_sector;
};
struct crypt_config;
@@ -73,11 +78,13 @@ struct crypt_iv_operations {
void (*dtr)(struct crypt_config *cc);
int (*init)(struct crypt_config *cc);
int (*wipe)(struct crypt_config *cc);
- int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
+ int (*generator)(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq);
+ int (*post)(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq);
};
struct iv_essiv_private {
- struct crypto_cipher *tfm;
struct crypto_hash *hash_tfm;
u8 *salt;
};
@@ -86,11 +93,32 @@ struct iv_benbi_private {
int shift;
};
+#define LMK_SEED_SIZE 64 /* hash + 0 */
+struct iv_lmk_private {
+ struct crypto_shash *hash_tfm;
+ u8 *seed;
+};
+
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
*/
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
+
+/*
+ * Duplicated per-CPU state for cipher.
+ */
+struct crypt_cpu {
+ struct ablkcipher_request *req;
+ /* ESSIV: struct crypto_cipher *essiv_tfm */
+ void *iv_private;
+ struct crypto_ablkcipher *tfms[0];
+};
+
+/*
+ * The fields in here must be read only after initialization,
+ * changing state should be in crypt_cpu.
+ */
struct crypt_config {
struct dm_dev *dev;
sector_t start;
@@ -108,17 +136,25 @@ struct crypt_config {
struct workqueue_struct *crypt_queue;
char *cipher;
- char *cipher_mode;
+ char *cipher_string;
struct crypt_iv_operations *iv_gen_ops;
union {
struct iv_essiv_private essiv;
struct iv_benbi_private benbi;
+ struct iv_lmk_private lmk;
} iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
/*
+ * Duplicated per cpu state. Access through
+ * per_cpu_ptr() only.
+ */
+ struct crypt_cpu __percpu *cpu;
+ unsigned tfms_count;
+
+ /*
* Layout of each crypto request:
*
* struct ablkcipher_request
@@ -132,11 +168,10 @@ struct crypt_config {
* correctly aligned.
*/
unsigned int dmreq_start;
- struct ablkcipher_request *req;
- struct crypto_ablkcipher *tfm;
unsigned long flags;
unsigned int key_size;
+ unsigned int key_parts;
u8 key[0];
};
@@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool;
static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
+static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
+
+static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
+{
+ return this_cpu_ptr(cc->cpu);
+}
+
+/*
+ * Use this to access cipher attributes that are the same for each CPU.
+ */
+static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
+{
+ return __this_cpu_ptr(cc->cpu)->tfms[0];
+}
/*
* Different IV generation algorithms:
@@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
* null: the initial vector is always zero. Provides compatibility with
* obsolete loop_fish2 devices. Do not use for new devices.
*
+ * lmk: Compatible implementation of the block chaining mode used
+ * by the Loop-AES block device encryption system
+ * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
+ * It operates on full 512 byte sectors and uses CBC
+ * with an IV derived from the sector number, the data and
+ * optionally extra IV seed.
+ * This means that after decryption the first block
+ * of sector must be tweaked according to decrypted data.
+ * Loop-AES can use three encryption schemes:
+ * version 1: is plain aes-cbc mode
+ * version 2: uses 64 multikey scheme with lmk IV generator
+ * version 3: the same as version 2 with additional IV seed
+ * (it uses 65 keys, last key is used as IV seed)
+ *
* plumb: unimplemented, see:
* http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
*/
-static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
{
memset(iv, 0, cc->iv_size);
- *(u32 *)iv = cpu_to_le32(sector & 0xffffffff);
+ *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
return 0;
}
static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
- sector_t sector)
+ struct dm_crypt_request *dmreq)
{
memset(iv, 0, cc->iv_size);
- *(u64 *)iv = cpu_to_le64(sector);
+ *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
return 0;
}
@@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
struct hash_desc desc;
struct scatterlist sg;
- int err;
+ struct crypto_cipher *essiv_tfm;
+ int err, cpu;
sg_init_one(&sg, cc->key, cc->key_size);
desc.tfm = essiv->hash_tfm;
@@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
if (err)
return err;
- return crypto_cipher_setkey(essiv->tfm, essiv->salt,
+ for_each_possible_cpu(cpu) {
+ essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
+
+ err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
crypto_hash_digestsize(essiv->hash_tfm));
+ if (err)
+ return err;
+ }
+
+ return 0;
}
/* Wipe salt and reset key derived from volume key */
@@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
{
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
+ struct crypto_cipher *essiv_tfm;
+ int cpu, r, err = 0;
memset(essiv->salt, 0, salt_size);
- return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
+ for_each_possible_cpu(cpu) {
+ essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
+ r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
+ if (r)
+ err = r;
+ }
+
+ return err;
+}
+
+/* Set up per cpu cipher state */
+static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
+ struct dm_target *ti,
+ u8 *salt, unsigned saltsize)
+{
+ struct crypto_cipher *essiv_tfm;
+ int err;
+
+ /* Setup the essiv_tfm with the given salt */
+ essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(essiv_tfm)) {
+ ti->error = "Error allocating crypto tfm for ESSIV";
+ return essiv_tfm;
+ }
+
+ if (crypto_cipher_blocksize(essiv_tfm) !=
+ crypto_ablkcipher_ivsize(any_tfm(cc))) {
+ ti->error = "Block size of ESSIV cipher does "
+ "not match IV size of block cipher";
+ crypto_free_cipher(essiv_tfm);
+ return ERR_PTR(-EINVAL);
+ }
+
+ err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
+ if (err) {
+ ti->error = "Failed to set key for ESSIV cipher";
+ crypto_free_cipher(essiv_tfm);
+ return ERR_PTR(err);
+ }
+
+ return essiv_tfm;
}
static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
+ int cpu;
+ struct crypt_cpu *cpu_cc;
+ struct crypto_cipher *essiv_tfm;
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
- crypto_free_cipher(essiv->tfm);
- essiv->tfm = NULL;
-
crypto_free_hash(essiv->hash_tfm);
essiv->hash_tfm = NULL;
kzfree(essiv->salt);
essiv->salt = NULL;
+
+ for_each_possible_cpu(cpu) {
+ cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+ essiv_tfm = cpu_cc->iv_private;
+
+ if (essiv_tfm)
+ crypto_free_cipher(essiv_tfm);
+
+ cpu_cc->iv_private = NULL;
+ }
}
static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
struct crypto_cipher *essiv_tfm = NULL;
struct crypto_hash *hash_tfm = NULL;
u8 *salt = NULL;
- int err;
+ int err, cpu;
if (!opts) {
ti->error = "Digest algorithm missing for ESSIV mode";
@@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
goto bad;
}
- /* Allocate essiv_tfm */
- essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(essiv_tfm)) {
- ti->error = "Error allocating crypto tfm for ESSIV";
- err = PTR_ERR(essiv_tfm);
- goto bad;
- }
- if (crypto_cipher_blocksize(essiv_tfm) !=
- crypto_ablkcipher_ivsize(cc->tfm)) {
- ti->error = "Block size of ESSIV cipher does "
- "not match IV size of block cipher";
- err = -EINVAL;
- goto bad;
- }
-
cc->iv_gen_private.essiv.salt = salt;
- cc->iv_gen_private.essiv.tfm = essiv_tfm;
cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
+ for_each_possible_cpu(cpu) {
+ essiv_tfm = setup_essiv_cpu(cc, ti, salt,
+ crypto_hash_digestsize(hash_tfm));
+ if (IS_ERR(essiv_tfm)) {
+ crypt_iv_essiv_dtr(cc);
+ return PTR_ERR(essiv_tfm);
+ }
+ per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
+ }
+
return 0;
bad:
- if (essiv_tfm && !IS_ERR(essiv_tfm))
- crypto_free_cipher(essiv_tfm);
if (hash_tfm && !IS_ERR(hash_tfm))
crypto_free_hash(hash_tfm);
kfree(salt);
return err;
}
-static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
{
+ struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
+
memset(iv, 0, cc->iv_size);
- *(u64 *)iv = cpu_to_le64(sector);
- crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
+ *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+ crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
+
return 0;
}
static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
- unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
+ unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
int log = ilog2(bs);
/* we need to calculate how far we must shift the sector count
@@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
{
}
-static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
{
__be64 val;
memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
- val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
+ val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
return 0;
}
-static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
{
memset(iv, 0, cc->iv_size);
return 0;
}
+static void crypt_iv_lmk_dtr(struct crypt_config *cc)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+ if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
+ crypto_free_shash(lmk->hash_tfm);
+ lmk->hash_tfm = NULL;
+
+ kzfree(lmk->seed);
+ lmk->seed = NULL;
+}
+
+static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+ lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
+ if (IS_ERR(lmk->hash_tfm)) {
+ ti->error = "Error initializing LMK hash";
+ return PTR_ERR(lmk->hash_tfm);
+ }
+
+ /* No seed in LMK version 2 */
+ if (cc->key_parts == cc->tfms_count) {
+ lmk->seed = NULL;
+ return 0;
+ }
+
+ lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
+ if (!lmk->seed) {
+ crypt_iv_lmk_dtr(cc);
+ ti->error = "Error kmallocing seed storage in LMK";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int crypt_iv_lmk_init(struct crypt_config *cc)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+ int subkey_size = cc->key_size / cc->key_parts;
+
+ /* LMK seed is on the position of LMK_KEYS + 1 key */
+ if (lmk->seed)
+ memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
+ crypto_shash_digestsize(lmk->hash_tfm));
+
+ return 0;
+}
+
+static int crypt_iv_lmk_wipe(struct crypt_config *cc)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+ if (lmk->seed)
+ memset(lmk->seed, 0, LMK_SEED_SIZE);
+
+ return 0;
+}
+
+static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq,
+ u8 *data)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+ struct {
+ struct shash_desc desc;
+ char ctx[crypto_shash_descsize(lmk->hash_tfm)];
+ } sdesc;
+ struct md5_state md5state;
+ u32 buf[4];
+ int i, r;
+
+ sdesc.desc.tfm = lmk->hash_tfm;
+ sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ r = crypto_shash_init(&sdesc.desc);
+ if (r)
+ return r;
+
+ if (lmk->seed) {
+ r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
+ if (r)
+ return r;
+ }
+
+ /* Sector is always 512B, block size 16, add data of blocks 1-31 */
+ r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
+ if (r)
+ return r;
+
+ /* Sector is cropped to 56 bits here */
+ buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
+ buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
+ buf[2] = cpu_to_le32(4024);
+ buf[3] = 0;
+ r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
+ if (r)
+ return r;
+
+ /* No MD5 padding here */
+ r = crypto_shash_export(&sdesc.desc, &md5state);
+ if (r)
+ return r;
+
+ for (i = 0; i < MD5_HASH_WORDS; i++)
+ __cpu_to_le32s(&md5state.hash[i]);
+ memcpy(iv, &md5state.hash, cc->iv_size);
+
+ return 0;
+}
+
+static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *src;
+ int r = 0;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
+ r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
+ kunmap_atomic(src, KM_USER0);
+ } else
+ memset(iv, 0, cc->iv_size);
+
+ return r;
+}
+
+static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *dst;
+ int r;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
+ return 0;
+
+ dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
+ r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
+
+ /* Tweak the first block of plaintext sector */
+ if (!r)
+ crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
+
+ kunmap_atomic(dst, KM_USER0);
+ return r;
+}
+
static struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
.generator = crypt_iv_null_gen
};
+static struct crypt_iv_operations crypt_iv_lmk_ops = {
+ .ctr = crypt_iv_lmk_ctr,
+ .dtr = crypt_iv_lmk_dtr,
+ .init = crypt_iv_lmk_init,
+ .wipe = crypt_iv_lmk_wipe,
+ .generator = crypt_iv_lmk_gen,
+ .post = crypt_iv_lmk_post
+};
+
static void crypt_convert_init(struct crypt_config *cc,
struct convert_context *ctx,
struct bio *bio_out, struct bio *bio_in,
@@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
}
+static u8 *iv_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+{
+ return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+ crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
+}
+
static int crypt_convert_block(struct crypt_config *cc,
struct convert_context *ctx,
struct ablkcipher_request *req)
@@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc,
int r = 0;
dmreq = dmreq_of_req(cc, req);
- iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
- crypto_ablkcipher_alignmask(cc->tfm) + 1);
+ iv = iv_of_dmreq(cc, dmreq);
+ dmreq->iv_sector = ctx->sector;
dmreq->ctx = ctx;
sg_init_table(&dmreq->sg_in, 1);
sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc,
}
if (cc->iv_gen_ops) {
- r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
+ r = cc->iv_gen_ops->generator(cc, iv, dmreq);
if (r < 0)
return r;
}
@@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc,
else
r = crypto_ablkcipher_decrypt(req);
+ if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
+ r = cc->iv_gen_ops->post(cc, iv, dmreq);
+
return r;
}
static void kcryptd_async_done(struct crypto_async_request *async_req,
int error);
+
static void crypt_alloc_req(struct crypt_config *cc,
struct convert_context *ctx)
{
- if (!cc->req)
- cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
- ablkcipher_request_set_tfm(cc->req, cc->tfm);
- ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
- CRYPTO_TFM_REQ_MAY_SLEEP,
- kcryptd_async_done,
- dmreq_of_req(cc, cc->req));
+ struct crypt_cpu *this_cc = this_crypt_config(cc);
+ unsigned key_index = ctx->sector & (cc->tfms_count - 1);
+
+ if (!this_cc->req)
+ this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+
+ ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
+ ablkcipher_request_set_callback(this_cc->req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
}
/*
@@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
static int crypt_convert(struct crypt_config *cc,
struct convert_context *ctx)
{
+ struct crypt_cpu *this_cc = this_crypt_config(cc);
int r;
atomic_set(&ctx->pending, 1);
@@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc,
atomic_inc(&ctx->pending);
- r = crypt_convert_block(cc, ctx, cc->req);
+ r = crypt_convert_block(cc, ctx, this_cc->req);
switch (r) {
/* async */
@@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc,
INIT_COMPLETION(ctx->restart);
/* fall through*/
case -EINPROGRESS:
- cc->req = NULL;
+ this_cc->req = NULL;
ctx->sector++;
continue;
@@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
* They must be separated as otherwise the final stages could be
* starved by new requests which can block in the first stages due
* to memory allocation.
+ *
+ * The work is done per CPU global for all dm-crypt instances.
+ * They should not depend on each other and do not block.
*/
static void crypt_endio(struct bio *clone, int error)
{
@@ -691,26 +991,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
clone->bi_destructor = dm_crypt_bio_destructor;
}
-static void kcryptd_io_read(struct dm_crypt_io *io)
+static void kcryptd_unplug(struct crypt_config *cc)
+{
+ blk_unplug(bdev_get_queue(cc->dev->bdev));
+}
+
+static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
{
struct crypt_config *cc = io->target->private;
struct bio *base_bio = io->base_bio;
struct bio *clone;
- crypt_inc_pending(io);
-
/*
* The block layer might modify the bvec array, so always
* copy the required bvecs because we need the original
* one in order to decrypt the whole bio data *afterwards*.
*/
- clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
- if (unlikely(!clone)) {
- io->error = -ENOMEM;
- crypt_dec_pending(io);
- return;
+ clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
+ if (!clone) {
+ kcryptd_unplug(cc);
+ return 1;
}
+ crypt_inc_pending(io);
+
clone_init(io, clone);
clone->bi_idx = 0;
clone->bi_vcnt = bio_segments(base_bio);
@@ -720,6 +1024,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
sizeof(struct bio_vec) * clone->bi_vcnt);
generic_make_request(clone);
+ return 0;
}
static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -732,9 +1037,12 @@ static void kcryptd_io(struct work_struct *work)
{
struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
- if (bio_data_dir(io->base_bio) == READ)
- kcryptd_io_read(io);
- else
+ if (bio_data_dir(io->base_bio) == READ) {
+ crypt_inc_pending(io);
+ if (kcryptd_io_read(io, GFP_NOIO))
+ io->error = -ENOMEM;
+ crypt_dec_pending(io);
+ } else
kcryptd_io_write(io);
}
@@ -901,6 +1209,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
return;
}
+ if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
+ error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+
mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
if (!atomic_dec_and_test(&ctx->pending))
@@ -971,34 +1282,84 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
}
}
-static int crypt_set_key(struct crypt_config *cc, char *key)
+static void crypt_free_tfms(struct crypt_config *cc, int cpu)
{
- unsigned key_size = strlen(key) >> 1;
+ struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+ unsigned i;
- if (cc->key_size && cc->key_size != key_size)
+ for (i = 0; i < cc->tfms_count; i++)
+ if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
+ crypto_free_ablkcipher(cpu_cc->tfms[i]);
+ cpu_cc->tfms[i] = NULL;
+ }
+}
+
+static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
+{
+ struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+ unsigned i;
+ int err;
+
+ for (i = 0; i < cc->tfms_count; i++) {
+ cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
+ if (IS_ERR(cpu_cc->tfms[i])) {
+ err = PTR_ERR(cpu_cc->tfms[i]);
+ crypt_free_tfms(cc, cpu);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int crypt_setkey_allcpus(struct crypt_config *cc)
+{
+ unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
+ int cpu, err = 0, i, r;
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < cc->tfms_count; i++) {
+ r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
+ cc->key + (i * subkey_size), subkey_size);
+ if (r)
+ err = r;
+ }
+ }
+
+ return err;
+}
+
+static int crypt_set_key(struct crypt_config *cc, char *key)
+{
+ /* The key size may not be changed. */
+ if (cc->key_size != (strlen(key) >> 1))
return -EINVAL;
- cc->key_size = key_size; /* initial settings */
+ /* Hyphen (which gives a key_size of zero) means there is no key. */
+ if (!cc->key_size && strcmp(key, "-"))
+ return -EINVAL;
- if ((!key_size && strcmp(key, "-")) ||
- (key_size && crypt_decode_key(cc->key, key, key_size) < 0))
+ if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
return -EINVAL;
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
- return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
+ return crypt_setkey_allcpus(cc);
}
static int crypt_wipe_key(struct crypt_config *cc)
{
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
memset(&cc->key, 0, cc->key_size * sizeof(u8));
- return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
+
+ return crypt_setkey_allcpus(cc);
}
static void crypt_dtr(struct dm_target *ti)
{
struct crypt_config *cc = ti->private;
+ struct crypt_cpu *cpu_cc;
+ int cpu;
ti->private = NULL;
@@ -1010,6 +1371,14 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->crypt_queue)
destroy_workqueue(cc->crypt_queue);
+ if (cc->cpu)
+ for_each_possible_cpu(cpu) {
+ cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+ if (cpu_cc->req)
+ mempool_free(cpu_cc->req, cc->req_pool);
+ crypt_free_tfms(cc, cpu);
+ }
+
if (cc->bs)
bioset_free(cc->bs);
@@ -1023,14 +1392,14 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc);
- if (cc->tfm && !IS_ERR(cc->tfm))
- crypto_free_ablkcipher(cc->tfm);
-
if (cc->dev)
dm_put_device(ti, cc->dev);
+ if (cc->cpu)
+ free_percpu(cc->cpu);
+
kzfree(cc->cipher);
- kzfree(cc->cipher_mode);
+ kzfree(cc->cipher_string);
/* Must zero key material before freeing */
kzfree(cc);
@@ -1040,9 +1409,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
char *cipher_in, char *key)
{
struct crypt_config *cc = ti->private;
- char *tmp, *cipher, *chainmode, *ivmode, *ivopts;
+ char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
char *cipher_api = NULL;
- int ret = -EINVAL;
+ int cpu, ret = -EINVAL;
/* Convert to crypto api definition? */
if (strchr(cipher_in, '(')) {
@@ -1050,23 +1419,31 @@ static int crypt_ctr_cipher(struct dm_target *ti,
return -EINVAL;
}
+ cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
+ if (!cc->cipher_string)
+ goto bad_mem;
+
/*
* Legacy dm-crypt cipher specification
- * cipher-mode-iv:ivopts
+ * cipher[:keycount]-mode-iv:ivopts
*/
tmp = cipher_in;
- cipher = strsep(&tmp, "-");
+ keycount = strsep(&tmp, "-");
+ cipher = strsep(&keycount, ":");
+
+ if (!keycount)
+ cc->tfms_count = 1;
+ else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
+ !is_power_of_2(cc->tfms_count)) {
+ ti->error = "Bad cipher key count specification";
+ return -EINVAL;
+ }
+ cc->key_parts = cc->tfms_count;
cc->cipher = kstrdup(cipher, GFP_KERNEL);
if (!cc->cipher)
goto bad_mem;
- if (tmp) {
- cc->cipher_mode = kstrdup(tmp, GFP_KERNEL);
- if (!cc->cipher_mode)
- goto bad_mem;
- }
-
chainmode = strsep(&tmp, "-");
ivopts = strsep(&tmp, "-");
ivmode = strsep(&ivopts, ":");
@@ -1074,10 +1451,19 @@ static int crypt_ctr_cipher(struct dm_target *ti,
if (tmp)
DMWARN("Ignoring unexpected additional cipher options");
- /* Compatibility mode for old dm-crypt mappings */
+ cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
+ cc->tfms_count * sizeof(*(cc->cpu->tfms)),
+ __alignof__(struct crypt_cpu));
+ if (!cc->cpu) {
+ ti->error = "Cannot allocate per cpu state";
+ goto bad_mem;
+ }
+
+ /*
+ * For compatibility with the original dm-crypt mapping format, if
+ * only the cipher name is supplied, use cbc-plain.
+ */
if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
- kfree(cc->cipher_mode);
- cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL);
chainmode = "cbc";
ivmode = "plain";
}
@@ -1099,11 +1485,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
}
/* Allocate cipher */
- cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
- if (IS_ERR(cc->tfm)) {
- ret = PTR_ERR(cc->tfm);
- ti->error = "Error allocating crypto tfm";
- goto bad;
+ for_each_possible_cpu(cpu) {
+ ret = crypt_alloc_tfms(cc, cpu, cipher_api);
+ if (ret < 0) {
+ ti->error = "Error allocating crypto tfm";
+ goto bad;
+ }
}
/* Initialize and set key */
@@ -1114,7 +1501,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
}
/* Initialize IV */
- cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm);
+ cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
if (cc->iv_size)
/* at least a 64 bit sector number should fit in our buffer */
cc->iv_size = max(cc->iv_size,
@@ -1137,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti,
cc->iv_gen_ops = &crypt_iv_benbi_ops;
else if (strcmp(ivmode, "null") == 0)
cc->iv_gen_ops = &crypt_iv_null_ops;
- else {
+ else if (strcmp(ivmode, "lmk") == 0) {
+ cc->iv_gen_ops = &crypt_iv_lmk_ops;
+ /* Version 2 and 3 is recognised according
+ * to length of provided multi-key string.
+ * If present (version 3), last key is used as IV seed.
+ */
+ if (cc->key_size % cc->key_parts)
+ cc->key_parts++;
+ } else {
ret = -EINVAL;
ti->error = "Invalid IV mode";
goto bad;
@@ -1194,6 +1589,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->error = "Cannot allocate encryption context";
return -ENOMEM;
}
+ cc->key_size = key_size;
ti->private = cc;
ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
@@ -1208,9 +1604,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
cc->dmreq_start = sizeof(struct ablkcipher_request);
- cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm);
+ cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
- cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) &
+ cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
~(crypto_tfm_ctx_alignment() - 1);
cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1219,7 +1615,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->error = "Cannot allocate crypt request mempool";
goto bad;
}
- cc->req = NULL;
cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
if (!cc->page_pool) {
@@ -1252,13 +1647,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->start = tmpll;
ret = -ENOMEM;
- cc->io_queue = create_singlethread_workqueue("kcryptd_io");
+ cc->io_queue = alloc_workqueue("kcryptd_io",
+ WQ_NON_REENTRANT|
+ WQ_MEM_RECLAIM,
+ 1);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
- cc->crypt_queue = create_singlethread_workqueue("kcryptd");
+ cc->crypt_queue = alloc_workqueue("kcryptd",
+ WQ_NON_REENTRANT|
+ WQ_CPU_INTENSIVE|
+ WQ_MEM_RECLAIM,
+ 1);
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
goto bad;
@@ -1286,9 +1688,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
- if (bio_data_dir(io->base_bio) == READ)
- kcryptd_queue_io(io);
- else
+ if (bio_data_dir(io->base_bio) == READ) {
+ if (kcryptd_io_read(io, GFP_NOWAIT))
+ kcryptd_queue_io(io);
+ } else
kcryptd_queue_crypt(io);
return DM_MAPIO_SUBMITTED;
@@ -1306,10 +1709,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
- if (cc->cipher_mode)
- DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode);
- else
- DMEMIT("%s ", cc->cipher);
+ DMEMIT("%s ", cc->cipher_string);
if (cc->key_size > 0) {
if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -1421,7 +1821,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 7, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index baa11912cc9..f18375dcedd 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -352,7 +352,7 @@ static int __init dm_delay_init(void)
{
int r = -ENOMEM;
- kdelayd_wq = create_workqueue("kdelayd");
+ kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
if (!kdelayd_wq) {
DMERR("Couldn't start kdelayd");
goto bad_queue;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4b54618b415..6d12775a106 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -295,19 +295,55 @@ retry:
DMWARN("remove_all left %d open device(s)", dev_skipped);
}
+/*
+ * Set the uuid of a hash_cell that isn't already set.
+ */
+static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
+{
+ mutex_lock(&dm_hash_cells_mutex);
+ hc->uuid = new_uuid;
+ mutex_unlock(&dm_hash_cells_mutex);
+
+ list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
+}
+
+/*
+ * Changes the name of a hash_cell and returns the old name for
+ * the caller to free.
+ */
+static char *__change_cell_name(struct hash_cell *hc, char *new_name)
+{
+ char *old_name;
+
+ /*
+ * Rename and move the name cell.
+ */
+ list_del(&hc->name_list);
+ old_name = hc->name;
+
+ mutex_lock(&dm_hash_cells_mutex);
+ hc->name = new_name;
+ mutex_unlock(&dm_hash_cells_mutex);
+
+ list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+
+ return old_name;
+}
+
static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
const char *new)
{
- char *new_name, *old_name;
+ char *new_data, *old_name = NULL;
struct hash_cell *hc;
struct dm_table *table;
struct mapped_device *md;
+ unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
/*
* duplicate new.
*/
- new_name = kstrdup(new, GFP_KERNEL);
- if (!new_name)
+ new_data = kstrdup(new, GFP_KERNEL);
+ if (!new_data)
return ERR_PTR(-ENOMEM);
down_write(&_hash_lock);
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
/*
* Is new free ?
*/
- hc = __get_name_cell(new);
+ if (change_uuid)
+ hc = __get_uuid_cell(new);
+ else
+ hc = __get_name_cell(new);
+
if (hc) {
- DMWARN("asked to rename to an already-existing name %s -> %s",
+ DMWARN("Unable to change %s on mapped device %s to one that "
+ "already exists: %s",
+ change_uuid ? "uuid" : "name",
param->name, new);
dm_put(hc->md);
up_write(&_hash_lock);
- kfree(new_name);
+ kfree(new_data);
return ERR_PTR(-EBUSY);
}
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
*/
hc = __get_name_cell(param->name);
if (!hc) {
- DMWARN("asked to rename a non-existent device %s -> %s",
- param->name, new);
+ DMWARN("Unable to rename non-existent device, %s to %s%s",
+ param->name, change_uuid ? "uuid " : "", new);
up_write(&_hash_lock);
- kfree(new_name);
+ kfree(new_data);
return ERR_PTR(-ENXIO);
}
/*
- * rename and move the name cell.
+ * Does this device already have a uuid?
*/
- list_del(&hc->name_list);
- old_name = hc->name;
- mutex_lock(&dm_hash_cells_mutex);
- hc->name = new_name;
- mutex_unlock(&dm_hash_cells_mutex);
- list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+ if (change_uuid && hc->uuid) {
+ DMWARN("Unable to change uuid of mapped device %s to %s "
+ "because uuid is already set to %s",
+ param->name, new, hc->uuid);
+ dm_put(hc->md);
+ up_write(&_hash_lock);
+ kfree(new_data);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (change_uuid)
+ __set_cell_uuid(hc, new_data);
+ else
+ old_name = __change_cell_name(hc, new_data);
/*
* Wake up any dm event waiters.
@@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
hc = __find_device_hash_cell(param);
if (!hc) {
- DMWARN("device doesn't appear to be in the dev hash table.");
+ DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
up_write(&_hash_lock);
return -ENXIO;
}
@@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
*/
r = dm_lock_for_deletion(md);
if (r) {
- DMWARN("unable to remove open device %s", hc->name);
+ DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
up_write(&_hash_lock);
dm_put(md);
return r;
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end)
static int dev_rename(struct dm_ioctl *param, size_t param_size)
{
int r;
- char *new_name = (char *) param + param->data_start;
+ char *new_data = (char *) param + param->data_start;
struct mapped_device *md;
+ unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
- if (new_name < param->data ||
- invalid_str(new_name, (void *) param + param_size) ||
- strlen(new_name) > DM_NAME_LEN - 1) {
- DMWARN("Invalid new logical volume name supplied.");
+ if (new_data < param->data ||
+ invalid_str(new_data, (void *) param + param_size) ||
+ strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
+ DMWARN("Invalid new mapped device name or uuid string supplied.");
return -EINVAL;
}
- r = check_name(new_name);
- if (r)
- return r;
+ if (!change_uuid) {
+ r = check_name(new_data);
+ if (r)
+ return r;
+ }
- md = dm_hash_rename(param, new_name);
+ md = dm_hash_rename(param, new_data);
if (IS_ERR(md))
return PTR_ERR(md);
@@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param)
hc = __find_device_hash_cell(param);
if (!hc) {
- DMWARN("device doesn't appear to be in the dev hash table.");
+ DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
up_write(&_hash_lock);
return -ENXIO;
}
@@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
hc = __find_device_hash_cell(param);
if (!hc) {
- DMWARN("device doesn't appear to be in the dev hash table.");
+ DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
up_write(&_hash_lock);
return -ENXIO;
}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d8587bac568..924f5f0084c 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,6 +37,13 @@ struct dm_kcopyd_client {
unsigned int nr_pages;
unsigned int nr_free_pages;
+ /*
+ * Block devices to unplug.
+ * Non-NULL pointer means that a block device has some pending requests
+ * and needs to be unplugged.
+ */
+ struct block_device *unplug[2];
+
struct dm_io_client *io_client;
wait_queue_head_t destroyq;
@@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job)
return 0;
}
+/*
+ * Unplug the block device at the specified index.
+ */
+static void unplug(struct dm_kcopyd_client *kc, int rw)
+{
+ if (kc->unplug[rw] != NULL) {
+ blk_unplug(bdev_get_queue(kc->unplug[rw]));
+ kc->unplug[rw] = NULL;
+ }
+}
+
+/*
+ * Prepare block device unplug. If there's another device
+ * to be unplugged at the same array index, we unplug that
+ * device first.
+ */
+static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
+ struct block_device *bdev)
+{
+ if (likely(kc->unplug[rw] == bdev))
+ return;
+ unplug(kc, rw);
+ kc->unplug[rw] = bdev;
+}
+
static void complete_io(unsigned long error, void *context)
{
struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job)
{
int r;
struct dm_io_request io_req = {
- .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG,
+ .bi_rw = job->rw,
.mem.type = DM_IO_PAGE_LIST,
.mem.ptr.pl = job->pages,
.mem.offset = job->offset,
@@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job)
.client = job->kc->io_client,
};
- if (job->rw == READ)
+ if (job->rw == READ) {
r = dm_io(&io_req, 1, &job->source, NULL);
- else
+ prepare_unplug(job->kc, READ, job->source.bdev);
+ } else {
+ if (job->num_dests > 1)
+ io_req.bi_rw |= REQ_UNPLUG;
r = dm_io(&io_req, job->num_dests, job->dests, NULL);
+ if (!(io_req.bi_rw & REQ_UNPLUG))
+ prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
+ }
return r;
}
@@ -435,10 +473,18 @@ static void do_work(struct work_struct *work)
* Pages jobs when successful will jump onto the io jobs
* list. io jobs call wake when they complete and it all
* starts again.
+ *
+ * Note that io_jobs add block devices to the unplug array,
+ * this array is cleared with "unplug" calls. It is thus
+ * forbidden to run complete_jobs after io_jobs and before
+ * unplug because the block device could be destroyed in
+ * job completion callback.
*/
process_jobs(&kc->complete_jobs, kc, run_complete_job);
process_jobs(&kc->pages_jobs, kc, run_pages_job);
process_jobs(&kc->io_jobs, kc, run_io_job);
+ unplug(kc, READ);
+ unplug(kc, WRITE);
}
/*
@@ -619,12 +665,15 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
INIT_LIST_HEAD(&kc->io_jobs);
INIT_LIST_HEAD(&kc->pages_jobs);
+ memset(kc->unplug, 0, sizeof(kc->unplug));
+
kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
if (!kc->job_pool)
goto bad_slab;
INIT_WORK(&kc->kcopyd_work, do_work);
- kc->kcopyd_wq = create_singlethread_workqueue("kcopyd");
+ kc->kcopyd_wq = alloc_workqueue("kcopyd",
+ WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
if (!kc->kcopyd_wq)
goto bad_workqueue;
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 1ed0094f064..aa2e0c374ab 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -12,12 +12,22 @@
#include "dm-log-userspace-transfer.h"
+#define DM_LOG_USERSPACE_VSN "1.1.0"
+
struct flush_entry {
int type;
region_t region;
struct list_head list;
};
+/*
+ * This limit on the number of mark and clear request is, to a degree,
+ * arbitrary. However, there is some basis for the choice in the limits
+ * imposed on the size of data payload by dm-log-userspace-transfer.c:
+ * dm_consult_userspace().
+ */
+#define MAX_FLUSH_GROUP_COUNT 32
+
struct log_c {
struct dm_target *ti;
uint32_t region_size;
@@ -37,8 +47,15 @@ struct log_c {
*/
uint64_t in_sync_hint;
+ /*
+ * Mark and clear requests are held until a flush is issued
+ * so that we can group, and thereby limit, the amount of
+ * network traffic between kernel and userspace. The 'flush_lock'
+ * is used to protect these lists.
+ */
spinlock_t flush_lock;
- struct list_head flush_list; /* only for clear and mark requests */
+ struct list_head mark_list;
+ struct list_head clear_list;
};
static mempool_t *flush_entry_pool;
@@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
strncpy(lc->uuid, argv[0], DM_UUID_LEN);
spin_lock_init(&lc->flush_lock);
- INIT_LIST_HEAD(&lc->flush_list);
+ INIT_LIST_HEAD(&lc->mark_list);
+ INIT_LIST_HEAD(&lc->clear_list);
str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
if (str_size < 0) {
@@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
ctr_str, str_size, NULL, NULL);
- if (r == -ESRCH) {
- DMERR("Userspace log server not found");
+ if (r < 0) {
+ if (r == -ESRCH)
+ DMERR("Userspace log server not found");
+ else
+ DMERR("Userspace log server failed to create log");
goto out;
}
@@ -214,10 +235,9 @@ out:
static void userspace_dtr(struct dm_dirty_log *log)
{
- int r;
struct log_c *lc = log->context;
- r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
+ (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
NULL, 0,
NULL, NULL);
@@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
return (r) ? 0 : (int)in_sync;
}
+static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
+{
+ int r = 0;
+ struct flush_entry *fe;
+
+ list_for_each_entry(fe, flush_list, list) {
+ r = userspace_do_request(lc, lc->uuid, fe->type,
+ (char *)&fe->region,
+ sizeof(fe->region),
+ NULL, NULL);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
+{
+ int r = 0;
+ int count;
+ uint32_t type = 0;
+ struct flush_entry *fe, *tmp_fe;
+ LIST_HEAD(tmp_list);
+ uint64_t group[MAX_FLUSH_GROUP_COUNT];
+
+ /*
+ * Group process the requests
+ */
+ while (!list_empty(flush_list)) {
+ count = 0;
+
+ list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
+ group[count] = fe->region;
+ count++;
+
+ list_del(&fe->list);
+ list_add(&fe->list, &tmp_list);
+
+ type = fe->type;
+ if (count >= MAX_FLUSH_GROUP_COUNT)
+ break;
+ }
+
+ r = userspace_do_request(lc, lc->uuid, type,
+ (char *)(group),
+ count * sizeof(uint64_t),
+ NULL, NULL);
+ if (r) {
+ /* Group send failed. Attempt one-by-one. */
+ list_splice_init(&tmp_list, flush_list);
+ r = flush_one_by_one(lc, flush_list);
+ break;
+ }
+ }
+
+ /*
+ * Must collect flush_entrys that were successfully processed
+ * as a group so that they will be free'd by the caller.
+ */
+ list_splice_init(&tmp_list, flush_list);
+
+ return r;
+}
+
/*
* userspace_flush
*
@@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log)
int r = 0;
unsigned long flags;
struct log_c *lc = log->context;
- LIST_HEAD(flush_list);
+ LIST_HEAD(mark_list);
+ LIST_HEAD(clear_list);
struct flush_entry *fe, *tmp_fe;
spin_lock_irqsave(&lc->flush_lock, flags);
- list_splice_init(&lc->flush_list, &flush_list);
+ list_splice_init(&lc->mark_list, &mark_list);
+ list_splice_init(&lc->clear_list, &clear_list);
spin_unlock_irqrestore(&lc->flush_lock, flags);
- if (list_empty(&flush_list))
+ if (list_empty(&mark_list) && list_empty(&clear_list))
return 0;
- /*
- * FIXME: Count up requests, group request types,
- * allocate memory to stick all requests in and
- * send to server in one go. Failing the allocation,
- * do it one by one.
- */
+ r = flush_by_group(lc, &mark_list);
+ if (r)
+ goto fail;
- list_for_each_entry(fe, &flush_list, list) {
- r = userspace_do_request(lc, lc->uuid, fe->type,
- (char *)&fe->region,
- sizeof(fe->region),
- NULL, NULL);
- if (r)
- goto fail;
- }
+ r = flush_by_group(lc, &clear_list);
+ if (r)
+ goto fail;
r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
NULL, 0, NULL, NULL);
@@ -395,7 +474,11 @@ fail:
* Calling code will receive an error and will know that
* the log facility has failed.
*/
- list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
+ list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
+ list_del(&fe->list);
+ mempool_free(fe, flush_entry_pool);
+ }
+ list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
list_del(&fe->list);
mempool_free(fe, flush_entry_pool);
}
@@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
spin_lock_irqsave(&lc->flush_lock, flags);
fe->type = DM_ULOG_MARK_REGION;
fe->region = region;
- list_add(&fe->list, &lc->flush_list);
+ list_add(&fe->list, &lc->mark_list);
spin_unlock_irqrestore(&lc->flush_lock, flags);
return;
@@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
spin_lock_irqsave(&lc->flush_lock, flags);
fe->type = DM_ULOG_CLEAR_REGION;
fe->region = region;
- list_add(&fe->list, &lc->flush_list);
+ list_add(&fe->list, &lc->clear_list);
spin_unlock_irqrestore(&lc->flush_lock, flags);
return;
@@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void)
return r;
}
- DMINFO("version 1.0.0 loaded");
+ DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
return 0;
}
@@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void)
dm_ulog_tfr_exit();
mempool_destroy(flush_entry_pool);
- DMINFO("version 1.0.0 unloaded");
+ DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
return;
}
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 075cbcf8a9f..049eaf12aaa 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -198,6 +198,7 @@ resend:
memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+ tfr->version = DM_ULOG_REQUEST_VERSION;
tfr->luid = luid;
tfr->seq = dm_ulog_seq++;
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 33420e68d15..6951536ea29 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
r = PTR_ERR(lc->io_req.client);
DMWARN("couldn't allocate disk io client");
kfree(lc);
- return -ENOMEM;
+ return r;
}
lc->disk_header = vmalloc(buf_size);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 487ecda90ad..b82d28819e2 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -23,6 +23,8 @@
#define DM_MSG_PREFIX "multipath"
#define MESG_STR(x) x, sizeof(x)
+#define DM_PG_INIT_DELAY_MSECS 2000
+#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
/* Path properties */
struct pgpath {
@@ -33,8 +35,7 @@ struct pgpath {
unsigned fail_count; /* Cumulative failure count */
struct dm_path path;
- struct work_struct deactivate_path;
- struct work_struct activate_path;
+ struct delayed_work activate_path;
};
#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -65,11 +66,15 @@ struct multipath {
const char *hw_handler_name;
char *hw_handler_params;
+
unsigned nr_priority_groups;
struct list_head priority_groups;
+
+ wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
+
unsigned pg_init_required; /* pg_init needs calling? */
unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
- wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
+ unsigned pg_init_delay_retry; /* Delay pg_init retry? */
unsigned nr_valid_paths; /* Total number of usable paths */
struct pgpath *current_pgpath;
@@ -82,6 +87,7 @@ struct multipath {
unsigned saved_queue_if_no_path;/* Saved state during suspension */
unsigned pg_init_retries; /* Number of times to retry pg_init */
unsigned pg_init_count; /* Number of times pg_init called */
+ unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
struct work_struct process_queued_ios;
struct list_head queued_ios;
@@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void process_queued_ios(struct work_struct *work);
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);
-static void deactivate_path(struct work_struct *work);
/*-----------------------------------------------
@@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void)
if (pgpath) {
pgpath->is_active = 1;
- INIT_WORK(&pgpath->deactivate_path, deactivate_path);
- INIT_WORK(&pgpath->activate_path, activate_path);
+ INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
}
return pgpath;
@@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath)
kfree(pgpath);
}
-static void deactivate_path(struct work_struct *work)
-{
- struct pgpath *pgpath =
- container_of(work, struct pgpath, deactivate_path);
-
- blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue);
-}
-
static struct priority_group *alloc_priority_group(void)
{
struct priority_group *pg;
@@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
INIT_LIST_HEAD(&m->queued_ios);
spin_lock_init(&m->lock);
m->queue_io = 1;
+ m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
INIT_WORK(&m->process_queued_ios, process_queued_ios);
INIT_WORK(&m->trigger_event, trigger_event);
init_waitqueue_head(&m->pg_init_wait);
@@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m)
static void __pg_init_all_paths(struct multipath *m)
{
struct pgpath *pgpath;
+ unsigned long pg_init_delay = 0;
m->pg_init_count++;
m->pg_init_required = 0;
+ if (m->pg_init_delay_retry)
+ pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
+ m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
/* Skip failed paths */
if (!pgpath->is_active)
continue;
- if (queue_work(kmpath_handlerd, &pgpath->activate_path))
+ if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
+ pg_init_delay))
m->pg_init_in_progress++;
}
}
@@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m)
const char *param_name;
static struct param _params[] = {
- {0, 3, "invalid number of feature args"},
+ {0, 5, "invalid number of feature args"},
{1, 50, "pg_init_retries must be between 1 and 50"},
+ {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
};
r = read_param(_params, shift(as), &argc, &ti->error);
@@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m)
continue;
}
+ if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
+ (argc >= 1)) {
+ r = read_param(_params + 2, shift(as),
+ &m->pg_init_delay_msecs, &ti->error);
+ argc--;
+ continue;
+ }
+
ti->error = "Unrecognised multipath feature request";
r = -EINVAL;
} while (argc && !r);
@@ -931,7 +942,7 @@ static void flush_multipath_work(struct multipath *m)
flush_workqueue(kmpath_handlerd);
multipath_wait_for_pg_init_completion(m);
flush_workqueue(kmultipathd);
- flush_scheduled_work();
+ flush_work_sync(&m->trigger_event);
}
static void multipath_dtr(struct dm_target *ti)
@@ -995,7 +1006,6 @@ static int fail_path(struct pgpath *pgpath)
pgpath->path.dev->name, m->nr_valid_paths);
schedule_work(&m->trigger_event);
- queue_work(kmultipathd, &pgpath->deactivate_path);
out:
spin_unlock_irqrestore(&m->lock, flags);
@@ -1034,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath)
m->current_pgpath = NULL;
queue_work(kmultipathd, &m->process_queued_ios);
} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
- if (queue_work(kmpath_handlerd, &pgpath->activate_path))
+ if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
m->pg_init_in_progress++;
}
@@ -1169,6 +1179,7 @@ static void pg_init_done(void *data, int errors)
struct priority_group *pg = pgpath->pg;
struct multipath *m = pg->m;
unsigned long flags;
+ unsigned delay_retry = 0;
/* device or driver problems */
switch (errors) {
@@ -1193,8 +1204,9 @@ static void pg_init_done(void *data, int errors)
*/
bypass_pg(m, pg, 1);
break;
- /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
case SCSI_DH_RETRY:
+ /* Wait before retrying. */
+ delay_retry = 1;
case SCSI_DH_IMM_RETRY:
case SCSI_DH_RES_TEMP_UNAVAIL:
if (pg_init_limit_reached(m, pgpath))
@@ -1227,6 +1239,7 @@ static void pg_init_done(void *data, int errors)
if (!m->pg_init_required)
m->queue_io = 0;
+ m->pg_init_delay_retry = delay_retry;
queue_work(kmultipathd, &m->process_queued_ios);
/*
@@ -1241,7 +1254,7 @@ out:
static void activate_path(struct work_struct *work)
{
struct pgpath *pgpath =
- container_of(work, struct pgpath, activate_path);
+ container_of(work, struct pgpath, activate_path.work);
scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
pg_init_done, pgpath);
@@ -1382,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
else {
DMEMIT("%u ", m->queue_if_no_path +
- (m->pg_init_retries > 0) * 2);
+ (m->pg_init_retries > 0) * 2 +
+ (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
if (m->queue_if_no_path)
DMEMIT("queue_if_no_path ");
if (m->pg_init_retries)
DMEMIT("pg_init_retries %u ", m->pg_init_retries);
+ if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
+ DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
}
if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1655,7 +1671,7 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 1, 1},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
@@ -1687,7 +1703,7 @@ static int __init dm_multipath_init(void)
return -EINVAL;
}
- kmultipathd = create_workqueue("kmpathd");
+ kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
if (!kmultipathd) {
DMERR("failed to create workqueue kmpathd");
dm_unregister_target(&multipath_target);
@@ -1701,7 +1717,8 @@ static int __init dm_multipath_init(void)
* old workqueue would also create a bottleneck in the
* path of the storage hardware device activation.
*/
- kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd");
+ kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
+ WQ_MEM_RECLAIM);
if (!kmpath_handlerd) {
DMERR("failed to create workqueue kmpath_handlerd");
destroy_workqueue(kmultipathd);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 00000000000..b9e1e15ef11
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright (C) 2010-2011 Neil Brown
+ * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/slab.h>
+
+#include "md.h"
+#include "raid5.h"
+#include "dm.h"
+#include "bitmap.h"
+
+#define DM_MSG_PREFIX "raid"
+
+/*
+ * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
+ * make it so the flag doesn't set anything.
+ */
+#ifndef MD_SYNC_STATE_FORCED
+#define MD_SYNC_STATE_FORCED 0
+#endif
+
+struct raid_dev {
+ /*
+ * Two DM devices, one to hold metadata and one to hold the
+ * actual data/parity. The reason for this is to not confuse
+ * ti->len and give more flexibility in altering size and
+ * characteristics.
+ *
+ * While it is possible for this device to be associated
+ * with a different physical device than the data_dev, it
+ * is intended for it to be the same.
+ * |--------- Physical Device ---------|
+ * |- meta_dev -|------ data_dev ------|
+ */
+ struct dm_dev *meta_dev;
+ struct dm_dev *data_dev;
+ struct mdk_rdev_s rdev;
+};
+
+/*
+ * Flags for rs->print_flags field.
+ */
+#define DMPF_DAEMON_SLEEP 0x1
+#define DMPF_MAX_WRITE_BEHIND 0x2
+#define DMPF_SYNC 0x4
+#define DMPF_NOSYNC 0x8
+#define DMPF_STRIPE_CACHE 0x10
+#define DMPF_MIN_RECOVERY_RATE 0x20
+#define DMPF_MAX_RECOVERY_RATE 0x40
+
+struct raid_set {
+ struct dm_target *ti;
+
+ uint64_t print_flags;
+
+ struct mddev_s md;
+ struct raid_type *raid_type;
+ struct dm_target_callbacks callbacks;
+
+ struct raid_dev dev[0];
+};
+
+/* Supported raid types and properties. */
+static struct raid_type {
+ const char *name; /* RAID algorithm. */
+ const char *descr; /* Descriptor text for logging. */
+ const unsigned parity_devs; /* # of parity devices. */
+ const unsigned minimal_devs; /* minimal # of devices in set. */
+ const unsigned level; /* RAID level. */
+ const unsigned algorithm; /* RAID algorithm. */
+} raid_types[] = {
+ {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
+ {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
+ {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
+ {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
+ {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
+ {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
+ {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
+ {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+};
+
+static struct raid_type *get_raid_type(char *name)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(raid_types); i++)
+ if (!strcmp(raid_types[i].name, name))
+ return &raid_types[i];
+
+ return NULL;
+}
+
+static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
+{
+ unsigned i;
+ struct raid_set *rs;
+ sector_t sectors_per_dev;
+
+ if (raid_devs <= raid_type->parity_devs) {
+ ti->error = "Insufficient number of devices";
+ return ERR_PTR(-EINVAL);
+ }
+
+ sectors_per_dev = ti->len;
+ if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
+ ti->error = "Target length not divisible by number of data devices";
+ return ERR_PTR(-EINVAL);
+ }
+
+ rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
+ if (!rs) {
+ ti->error = "Cannot allocate raid context";
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mddev_init(&rs->md);
+
+ rs->ti = ti;
+ rs->raid_type = raid_type;
+ rs->md.raid_disks = raid_devs;
+ rs->md.level = raid_type->level;
+ rs->md.new_level = rs->md.level;
+ rs->md.dev_sectors = sectors_per_dev;
+ rs->md.layout = raid_type->algorithm;
+ rs->md.new_layout = rs->md.layout;
+ rs->md.delta_disks = 0;
+ rs->md.recovery_cp = 0;
+
+ for (i = 0; i < raid_devs; i++)
+ md_rdev_init(&rs->dev[i].rdev);
+
+ /*
+ * Remaining items to be initialized by further RAID params:
+ * rs->md.persistent
+ * rs->md.external
+ * rs->md.chunk_sectors
+ * rs->md.new_chunk_sectors
+ */
+
+ return rs;
+}
+
+static void context_free(struct raid_set *rs)
+{
+ int i;
+
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev)
+ dm_put_device(rs->ti, rs->dev[i].data_dev);
+
+ kfree(rs);
+}
+
+/*
+ * For every device we have two words
+ * <meta_dev>: meta device name or '-' if missing
+ * <data_dev>: data device name or '-' if missing
+ *
+ * This code parses those words.
+ */
+static int dev_parms(struct raid_set *rs, char **argv)
+{
+ int i;
+ int rebuild = 0;
+ int metadata_available = 0;
+ int ret = 0;
+
+ for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
+ rs->dev[i].rdev.raid_disk = i;
+
+ rs->dev[i].meta_dev = NULL;
+ rs->dev[i].data_dev = NULL;
+
+ /*
+ * There are no offsets, since there is a separate device
+ * for data and metadata.
+ */
+ rs->dev[i].rdev.data_offset = 0;
+ rs->dev[i].rdev.mddev = &rs->md;
+
+ if (strcmp(argv[0], "-")) {
+ rs->ti->error = "Metadata devices not supported";
+ return -EINVAL;
+ }
+
+ if (!strcmp(argv[1], "-")) {
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
+ (!rs->dev[i].rdev.recovery_offset)) {
+ rs->ti->error = "Drive designated for rebuild not specified";
+ return -EINVAL;
+ }
+
+ continue;
+ }
+
+ ret = dm_get_device(rs->ti, argv[1],
+ dm_table_get_mode(rs->ti->table),
+ &rs->dev[i].data_dev);
+ if (ret) {
+ rs->ti->error = "RAID device lookup failure";
+ return ret;
+ }
+
+ rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
+ list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+ rebuild++;
+ }
+
+ if (metadata_available) {
+ rs->md.external = 0;
+ rs->md.persistent = 1;
+ rs->md.major_version = 2;
+ } else if (rebuild && !rs->md.recovery_cp) {
+ /*
+ * Without metadata, we will not be able to tell if the array
+ * is in-sync or not - we must assume it is not. Therefore,
+ * it is impossible to rebuild a drive.
+ *
+ * Even if there is metadata, the on-disk information may
+ * indicate that the array is not in-sync and it will then
+ * fail at that time.
+ *
+ * User could specify 'nosync' option if desperate.
+ */
+ DMERR("Unable to rebuild drive while array is not in-sync");
+ rs->ti->error = "RAID device lookup failure";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Possible arguments are...
+ * RAID456:
+ * <chunk_size> [optional_args]
+ *
+ * Optional args:
+ * [[no]sync] Force or prevent recovery of the entire array
+ * [rebuild <idx>] Rebuild the drive indicated by the index
+ * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
+ * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
+ * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
+ * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
+ * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
+ */
+static int parse_raid_params(struct raid_set *rs, char **argv,
+ unsigned num_raid_params)
+{
+ unsigned i, rebuild_cnt = 0;
+ unsigned long value;
+ char *key;
+
+ /*
+ * First, parse the in-order required arguments
+ */
+ if ((strict_strtoul(argv[0], 10, &value) < 0) ||
+ !is_power_of_2(value) || (value < 8)) {
+ rs->ti->error = "Bad chunk size";
+ return -EINVAL;
+ }
+
+ rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
+ argv++;
+ num_raid_params--;
+
+ /*
+ * Second, parse the unordered optional arguments
+ */
+ for (i = 0; i < rs->md.raid_disks; i++)
+ set_bit(In_sync, &rs->dev[i].rdev.flags);
+
+ for (i = 0; i < num_raid_params; i++) {
+ if (!strcmp(argv[i], "nosync")) {
+ rs->md.recovery_cp = MaxSector;
+ rs->print_flags |= DMPF_NOSYNC;
+ rs->md.flags |= MD_SYNC_STATE_FORCED;
+ continue;
+ }
+ if (!strcmp(argv[i], "sync")) {
+ rs->md.recovery_cp = 0;
+ rs->print_flags |= DMPF_SYNC;
+ rs->md.flags |= MD_SYNC_STATE_FORCED;
+ continue;
+ }
+
+ /* The rest of the optional arguments come in key/value pairs */
+ if ((i + 1) >= num_raid_params) {
+ rs->ti->error = "Wrong number of raid parameters given";
+ return -EINVAL;
+ }
+
+ key = argv[i++];
+ if (strict_strtoul(argv[i], 10, &value) < 0) {
+ rs->ti->error = "Bad numerical argument given in raid params";
+ return -EINVAL;
+ }
+
+ if (!strcmp(key, "rebuild")) {
+ if (++rebuild_cnt > rs->raid_type->parity_devs) {
+ rs->ti->error = "Too many rebuild drives given";
+ return -EINVAL;
+ }
+ if (value > rs->md.raid_disks) {
+ rs->ti->error = "Invalid rebuild index given";
+ return -EINVAL;
+ }
+ clear_bit(In_sync, &rs->dev[value].rdev.flags);
+ rs->dev[value].rdev.recovery_offset = 0;
+ } else if (!strcmp(key, "max_write_behind")) {
+ rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+
+ /*
+ * In device-mapper, we specify things in sectors, but
+ * MD records this value in kB
+ */
+ value /= 2;
+ if (value > COUNTER_MAX) {
+ rs->ti->error = "Max write-behind limit out of range";
+ return -EINVAL;
+ }
+ rs->md.bitmap_info.max_write_behind = value;
+ } else if (!strcmp(key, "daemon_sleep")) {
+ rs->print_flags |= DMPF_DAEMON_SLEEP;
+ if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
+ rs->ti->error = "daemon sleep period out of range";
+ return -EINVAL;
+ }
+ rs->md.bitmap_info.daemon_sleep = value;
+ } else if (!strcmp(key, "stripe_cache")) {
+ rs->print_flags |= DMPF_STRIPE_CACHE;
+
+ /*
+ * In device-mapper, we specify things in sectors, but
+ * MD records this value in kB
+ */
+ value /= 2;
+
+ if (rs->raid_type->level < 5) {
+ rs->ti->error = "Inappropriate argument: stripe_cache";
+ return -EINVAL;
+ }
+ if (raid5_set_cache_size(&rs->md, (int)value)) {
+ rs->ti->error = "Bad stripe_cache size";
+ return -EINVAL;
+ }
+ } else if (!strcmp(key, "min_recovery_rate")) {
+ rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+ if (value > INT_MAX) {
+ rs->ti->error = "min_recovery_rate out of range";
+ return -EINVAL;
+ }
+ rs->md.sync_speed_min = (int)value;
+ } else if (!strcmp(key, "max_recovery_rate")) {
+ rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+ if (value > INT_MAX) {
+ rs->ti->error = "max_recovery_rate out of range";
+ return -EINVAL;
+ }
+ rs->md.sync_speed_max = (int)value;
+ } else {
+ DMERR("Unable to parse RAID parameter: %s", key);
+ rs->ti->error = "Unable to parse RAID parameters";
+ return -EINVAL;
+ }
+ }
+
+ /* Assume there are no metadata devices until the drives are parsed */
+ rs->md.persistent = 0;
+ rs->md.external = 1;
+
+ return 0;
+}
+
+static void do_table_event(struct work_struct *ws)
+{
+ struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
+
+ dm_table_event(rs->ti->table);
+}
+
+static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
+{
+ struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
+
+ return md_raid5_congested(&rs->md, bits);
+}
+
+static void raid_unplug(struct dm_target_callbacks *cb)
+{
+ struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
+
+ md_raid5_unplug_device(rs->md.private);
+}
+
+/*
+ * Construct a RAID4/5/6 mapping:
+ * Args:
+ * <raid_type> <#raid_params> <raid_params> \
+ * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
+ *
+ * ** metadata devices are not supported yet, use '-' instead **
+ *
+ * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
+ * details on possible <raid_params>.
+ */
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int ret;
+ struct raid_type *rt;
+ unsigned long num_raid_params, num_raid_devs;
+ struct raid_set *rs = NULL;
+
+ /* Must have at least <raid_type> <#raid_params> */
+ if (argc < 2) {
+ ti->error = "Too few arguments";
+ return -EINVAL;
+ }
+
+ /* raid type */
+ rt = get_raid_type(argv[0]);
+ if (!rt) {
+ ti->error = "Unrecognised raid_type";
+ return -EINVAL;
+ }
+ argc--;
+ argv++;
+
+ /* number of RAID parameters */
+ if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
+ ti->error = "Cannot understand number of RAID parameters";
+ return -EINVAL;
+ }
+ argc--;
+ argv++;
+
+ /* Skip over RAID params for now and find out # of devices */
+ if (num_raid_params + 1 > argc) {
+ ti->error = "Arguments do not agree with counts given";
+ return -EINVAL;
+ }
+
+ if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
+ (num_raid_devs >= INT_MAX)) {
+ ti->error = "Cannot understand number of raid devices";
+ return -EINVAL;
+ }
+
+ rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
+ if (IS_ERR(rs))
+ return PTR_ERR(rs);
+
+ ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
+ if (ret)
+ goto bad;
+
+ ret = -EINVAL;
+
+ argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
+ argv += num_raid_params + 1;
+
+ if (argc != (num_raid_devs * 2)) {
+ ti->error = "Supplied RAID devices does not match the count given";
+ goto bad;
+ }
+
+ ret = dev_parms(rs, argv);
+ if (ret)
+ goto bad;
+
+ INIT_WORK(&rs->md.event_work, do_table_event);
+ ti->split_io = rs->md.chunk_sectors;
+ ti->private = rs;
+
+ mutex_lock(&rs->md.reconfig_mutex);
+ ret = md_run(&rs->md);
+ rs->md.in_sync = 0; /* Assume already marked dirty */
+ mutex_unlock(&rs->md.reconfig_mutex);
+
+ if (ret) {
+ ti->error = "Fail to run raid array";
+ goto bad;
+ }
+
+ rs->callbacks.congested_fn = raid_is_congested;
+ rs->callbacks.unplug_fn = raid_unplug;
+ dm_table_add_target_callbacks(ti->table, &rs->callbacks);
+
+ return 0;
+
+bad:
+ context_free(rs);
+
+ return ret;
+}
+
+static void raid_dtr(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ list_del_init(&rs->callbacks.list);
+ md_stop(&rs->md);
+ context_free(rs);
+}
+
+static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
+{
+ struct raid_set *rs = ti->private;
+ mddev_t *mddev = &rs->md;
+
+ mddev->pers->make_request(mddev, bio);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+static int raid_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned maxlen)
+{
+ struct raid_set *rs = ti->private;
+ unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
+ unsigned sz = 0;
+ int i;
+ sector_t sync;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
+
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if (test_bit(Faulty, &rs->dev[i].rdev.flags))
+ DMEMIT("D");
+ else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
+ DMEMIT("A");
+ else
+ DMEMIT("a");
+ }
+
+ if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+ sync = rs->md.curr_resync_completed;
+ else
+ sync = rs->md.recovery_cp;
+
+ if (sync > rs->md.resync_max_sectors)
+ sync = rs->md.resync_max_sectors;
+
+ DMEMIT(" %llu/%llu",
+ (unsigned long long) sync,
+ (unsigned long long) rs->md.resync_max_sectors);
+
+ break;
+ case STATUSTYPE_TABLE:
+ /* The string you would use to construct this array */
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev &&
+ !test_bit(In_sync, &rs->dev[i].rdev.flags))
+ raid_param_cnt++; /* for rebuilds */
+
+ raid_param_cnt += (hweight64(rs->print_flags) * 2);
+ if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+ raid_param_cnt--;
+
+ DMEMIT("%s %u %u", rs->raid_type->name,
+ raid_param_cnt, rs->md.chunk_sectors);
+
+ if ((rs->print_flags & DMPF_SYNC) &&
+ (rs->md.recovery_cp == MaxSector))
+ DMEMIT(" sync");
+ if (rs->print_flags & DMPF_NOSYNC)
+ DMEMIT(" nosync");
+
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev &&
+ !test_bit(In_sync, &rs->dev[i].rdev.flags))
+ DMEMIT(" rebuild %u", i);
+
+ if (rs->print_flags & DMPF_DAEMON_SLEEP)
+ DMEMIT(" daemon_sleep %lu",
+ rs->md.bitmap_info.daemon_sleep);
+
+ if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+ DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
+
+ if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+ DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
+
+ if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+ DMEMIT(" max_write_behind %lu",
+ rs->md.bitmap_info.max_write_behind);
+
+ if (rs->print_flags & DMPF_STRIPE_CACHE) {
+ raid5_conf_t *conf = rs->md.private;
+
+ /* convert from kiB to sectors */
+ DMEMIT(" stripe_cache %d",
+ conf ? conf->max_nr_stripes * 2 : 0);
+ }
+
+ DMEMIT(" %d", rs->md.raid_disks);
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ DMEMIT(" -"); /* metadata device */
+
+ if (rs->dev[i].data_dev)
+ DMEMIT(" %s", rs->dev[i].data_dev->name);
+ else
+ DMEMIT(" -");
+ }
+ }
+
+ return 0;
+}
+
+static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+{
+ struct raid_set *rs = ti->private;
+ unsigned i;
+ int ret = 0;
+
+ for (i = 0; !ret && i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev)
+ ret = fn(ti,
+ rs->dev[i].data_dev,
+ 0, /* No offset on data devs */
+ rs->md.dev_sectors,
+ data);
+
+ return ret;
+}
+
+static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct raid_set *rs = ti->private;
+ unsigned chunk_size = rs->md.chunk_sectors << 9;
+ raid5_conf_t *conf = rs->md.private;
+
+ blk_limits_io_min(limits, chunk_size);
+ blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
+}
+
+static void raid_presuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ md_stop_writes(&rs->md);
+}
+
+static void raid_postsuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ mddev_suspend(&rs->md);
+}
+
+static void raid_resume(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ mddev_resume(&rs->md);
+}
+
+static struct target_type raid_target = {
+ .name = "raid",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = raid_ctr,
+ .dtr = raid_dtr,
+ .map = raid_map,
+ .status = raid_status,
+ .iterate_devices = raid_iterate_devices,
+ .io_hints = raid_io_hints,
+ .presuspend = raid_presuspend,
+ .postsuspend = raid_postsuspend,
+ .resume = raid_resume,
+};
+
+static int __init dm_raid_init(void)
+{
+ return dm_register_target(&raid_target);
+}
+
+static void __exit dm_raid_exit(void)
+{
+ dm_unregister_target(&raid_target);
+}
+
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_ALIAS("dm-raid4");
+MODULE_ALIAS("dm-raid5");
+MODULE_ALIAS("dm-raid6");
+MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 19a59b041c2..dee326775c6 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
struct dm_io_request io_req = {
.bi_rw = WRITE_FLUSH,
.mem.type = DM_IO_KMEM,
- .mem.ptr.bvec = NULL,
+ .mem.ptr.addr = NULL,
.client = ms->io_client,
};
@@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
.client = ms->io_client,
};
+ if (bio->bi_rw & REQ_DISCARD) {
+ io_req.bi_rw |= REQ_DISCARD;
+ io_req.mem.type = DM_IO_KMEM;
+ io_req.mem.ptr.addr = NULL;
+ }
+
for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
map_region(dest++, m, bio);
@@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
bio_list_init(&requeue);
while ((bio = bio_list_pop(writes))) {
- if (bio->bi_rw & REQ_FLUSH) {
+ if ((bio->bi_rw & REQ_FLUSH) ||
+ (bio->bi_rw & REQ_DISCARD)) {
bio_list_add(&sync, bio);
continue;
}
@@ -1076,8 +1083,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = ms;
ti->split_io = dm_rh_get_region_size(ms->rh);
ti->num_flush_requests = 1;
+ ti->num_discard_requests = 1;
- ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
+ ms->kmirrord_wq = alloc_workqueue("kmirrord",
+ WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
if (!ms->kmirrord_wq) {
DMERR("couldn't start kmirrord");
r = -ENOMEM;
@@ -1130,7 +1139,7 @@ static void mirror_dtr(struct dm_target *ti)
del_timer_sync(&ms->timer);
flush_workqueue(ms->kmirrord_wq);
- flush_scheduled_work();
+ flush_work_sync(&ms->trigger_event);
dm_kcopyd_client_destroy(ms->kcopyd_client);
destroy_workqueue(ms->kmirrord_wq);
free_context(ms, ti, ms->nr_mirrors);
@@ -1406,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
static struct target_type mirror_target = {
.name = "mirror",
- .version = {1, 12, 0},
+ .version = {1, 12, 1},
.module = THIS_MODULE,
.ctr = mirror_ctr,
.dtr = mirror_dtr,
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2129cdb115d..95891dfcbca 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
*/
INIT_WORK_ONSTACK(&req.work, do_metadata);
queue_work(ps->metadata_wq, &req.work);
- flush_workqueue(ps->metadata_wq);
+ flush_work(&req.work);
return req.result;
}
@@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store,
atomic_set(&ps->pending_count, 0);
ps->callbacks = NULL;
- ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
+ ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
if (!ps->metadata_wq) {
kfree(ps);
DMERR("couldn't start header metadata update thread");
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 53cf79d8bcb..fdde53cd12b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,7 +19,6 @@
#include <linux/vmalloc.h>
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
-#include <linux/workqueue.h>
#include "dm-exception-store.h"
@@ -80,9 +79,6 @@ struct dm_snapshot {
/* Origin writes don't trigger exceptions until this is set */
int active;
- /* Whether or not owning mapped_device is suspended */
- int suspended;
-
atomic_t pending_exceptions_count;
mempool_t *pending_pool;
@@ -106,10 +102,6 @@ struct dm_snapshot {
struct dm_kcopyd_client *kcopyd_client;
- /* Queue of snapshot writes for ksnapd to flush */
- struct bio_list queued_bios;
- struct work_struct queued_bios_work;
-
/* Wait for events based on state_bits */
unsigned long state_bits;
@@ -160,9 +152,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
}
EXPORT_SYMBOL(dm_snap_cow);
-static struct workqueue_struct *ksnapd;
-static void flush_queued_bios(struct work_struct *work);
-
static sector_t chunk_to_sector(struct dm_exception_store *store,
chunk_t chunk)
{
@@ -1110,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s->ti = ti;
s->valid = 1;
s->active = 0;
- s->suspended = 0;
atomic_set(&s->pending_exceptions_count, 0);
init_rwsem(&s->lock);
INIT_LIST_HEAD(&s->list);
@@ -1153,9 +1141,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
spin_lock_init(&s->tracked_chunk_lock);
- bio_list_init(&s->queued_bios);
- INIT_WORK(&s->queued_bios_work, flush_queued_bios);
-
ti->private = s;
ti->num_flush_requests = num_flush_requests;
@@ -1279,8 +1264,6 @@ static void snapshot_dtr(struct dm_target *ti)
struct dm_snapshot *s = ti->private;
struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
- flush_workqueue(ksnapd);
-
down_read(&_origins_lock);
/* Check whether exception handover must be cancelled */
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
@@ -1342,20 +1325,6 @@ static void flush_bios(struct bio *bio)
}
}
-static void flush_queued_bios(struct work_struct *work)
-{
- struct dm_snapshot *s =
- container_of(work, struct dm_snapshot, queued_bios_work);
- struct bio *queued_bios;
- unsigned long flags;
-
- spin_lock_irqsave(&s->pe_lock, flags);
- queued_bios = bio_list_get(&s->queued_bios);
- spin_unlock_irqrestore(&s->pe_lock, flags);
-
- flush_bios(queued_bios);
-}
-
static int do_origin(struct dm_dev *origin, struct bio *bio);
/*
@@ -1760,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti)
stop_merge(s);
}
-static void snapshot_postsuspend(struct dm_target *ti)
-{
- struct dm_snapshot *s = ti->private;
-
- down_write(&s->lock);
- s->suspended = 1;
- up_write(&s->lock);
-}
-
static int snapshot_preresume(struct dm_target *ti)
{
int r = 0;
@@ -1783,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti)
DMERR("Unable to resume snapshot source until "
"handover completes.");
r = -EINVAL;
- } else if (!snap_src->suspended) {
+ } else if (!dm_suspended(snap_src->ti)) {
DMERR("Unable to perform snapshot handover until "
"source is suspended.");
r = -EINVAL;
@@ -1816,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti)
down_write(&s->lock);
s->active = 1;
- s->suspended = 0;
up_write(&s->lock);
}
@@ -2194,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti,
static struct target_type origin_target = {
.name = "snapshot-origin",
- .version = {1, 7, 0},
+ .version = {1, 7, 1},
.module = THIS_MODULE,
.ctr = origin_ctr,
.dtr = origin_dtr,
@@ -2207,13 +2166,12 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 9, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
.map = snapshot_map,
.end_io = snapshot_end_io,
- .postsuspend = snapshot_postsuspend,
.preresume = snapshot_preresume,
.resume = snapshot_resume,
.status = snapshot_status,
@@ -2222,14 +2180,13 @@ static struct target_type snapshot_target = {
static struct target_type merge_target = {
.name = dm_snapshot_merge_target_name,
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
.map = snapshot_merge_map,
.end_io = snapshot_end_io,
.presuspend = snapshot_merge_presuspend,
- .postsuspend = snapshot_postsuspend,
.preresume = snapshot_preresume,
.resume = snapshot_merge_resume,
.status = snapshot_status,
@@ -2291,17 +2248,8 @@ static int __init dm_snapshot_init(void)
goto bad_tracked_chunk_cache;
}
- ksnapd = create_singlethread_workqueue("ksnapd");
- if (!ksnapd) {
- DMERR("Failed to create ksnapd workqueue.");
- r = -ENOMEM;
- goto bad_pending_pool;
- }
-
return 0;
-bad_pending_pool:
- kmem_cache_destroy(tracked_chunk_cache);
bad_tracked_chunk_cache:
kmem_cache_destroy(pending_cache);
bad_pending_cache:
@@ -2322,8 +2270,6 @@ bad_register_snapshot_target:
static void __exit dm_snapshot_exit(void)
{
- destroy_workqueue(ksnapd);
-
dm_unregister_target(&snapshot_target);
dm_unregister_target(&origin_target);
dm_unregister_target(&merge_target);
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f0371b4c4fb..dddfa14f298 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -39,23 +39,20 @@ struct stripe_c {
struct dm_target *ti;
/* Work struct used for triggering events*/
- struct work_struct kstriped_ws;
+ struct work_struct trigger_event;
struct stripe stripe[0];
};
-static struct workqueue_struct *kstriped;
-
/*
* An event is triggered whenever a drive
* drops out of a stripe volume.
*/
static void trigger_event(struct work_struct *work)
{
- struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws);
-
+ struct stripe_c *sc = container_of(work, struct stripe_c,
+ trigger_event);
dm_table_event(sc->ti->table);
-
}
static inline struct stripe_c *alloc_context(unsigned int stripes)
@@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM;
}
- INIT_WORK(&sc->kstriped_ws, trigger_event);
+ INIT_WORK(&sc->trigger_event, trigger_event);
/* Set pointer to dm target; used in trigger_event */
sc->ti = ti;
@@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti)
for (i = 0; i < sc->stripes; i++)
dm_put_device(ti, sc->stripe[i].dev);
- flush_workqueue(kstriped);
+ flush_work_sync(&sc->trigger_event);
kfree(sc);
}
@@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
atomic_inc(&(sc->stripe[i].error_count));
if (atomic_read(&(sc->stripe[i].error_count)) <
DM_IO_ERROR_THRESHOLD)
- queue_work(kstriped, &sc->kstriped_ws);
+ schedule_work(&sc->trigger_event);
}
return error;
@@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti,
static struct target_type stripe_target = {
.name = "striped",
- .version = {1, 3, 0},
+ .version = {1, 3, 1},
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
@@ -422,20 +419,10 @@ int __init dm_stripe_init(void)
return r;
}
- kstriped = create_singlethread_workqueue("kstriped");
- if (!kstriped) {
- DMERR("failed to create workqueue kstriped");
- dm_unregister_target(&stripe_target);
- return -ENOMEM;
- }
-
return r;
}
void dm_stripe_exit(void)
{
dm_unregister_target(&stripe_target);
- destroy_workqueue(kstriped);
-
- return;
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 985c20a4f30..38e4eb1bb96 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -71,6 +71,8 @@ struct dm_table {
void *event_context;
struct dm_md_mempools *mempools;
+
+ struct list_head target_callbacks;
};
/*
@@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
return -ENOMEM;
INIT_LIST_HEAD(&t->devices);
+ INIT_LIST_HEAD(&t->target_callbacks);
atomic_set(&t->holders, 0);
t->discards_supported = 1;
@@ -347,6 +350,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
if (!d->dm_dev.bdev)
return;
+ bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md));
blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL);
d->dm_dev.bdev = NULL;
}
@@ -1225,10 +1229,17 @@ int dm_table_resume_targets(struct dm_table *t)
return 0;
}
+void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
+{
+ list_add(&cb->list, &t->target_callbacks);
+}
+EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
+
int dm_table_any_congested(struct dm_table *t, int bdi_bits)
{
struct dm_dev_internal *dd;
struct list_head *devices = dm_table_get_devices(t);
+ struct dm_target_callbacks *cb;
int r = 0;
list_for_each_entry(dd, devices, list) {
@@ -1243,6 +1254,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
bdevname(dd->dm_dev.bdev, b));
}
+ list_for_each_entry(cb, &t->target_callbacks, list)
+ if (cb->congested_fn)
+ r |= cb->congested_fn(cb, bdi_bits);
+
return r;
}
@@ -1264,6 +1279,7 @@ void dm_table_unplug_all(struct dm_table *t)
{
struct dm_dev_internal *dd;
struct list_head *devices = dm_table_get_devices(t);
+ struct dm_target_callbacks *cb;
list_for_each_entry(dd, devices, list) {
struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
@@ -1276,6 +1292,10 @@ void dm_table_unplug_all(struct dm_table *t)
dm_device_name(t->md),
bdevname(dd->dm_dev.bdev, b));
}
+
+ list_for_each_entry(cb, &t->target_callbacks, list)
+ if (cb->unplug_fn)
+ cb->unplug_fn(cb);
}
struct mapped_device *dm_table_get_md(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f48a2f359ac..eaa3af0e063 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -32,7 +32,6 @@
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24
-static DEFINE_MUTEX(dm_mutex);
static const char *_name = DM_NAME;
static unsigned int major = 0;
@@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
struct mapped_device *md;
- mutex_lock(&dm_mutex);
spin_lock(&_minor_lock);
md = bdev->bd_disk->private_data;
@@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
out:
spin_unlock(&_minor_lock);
- mutex_unlock(&dm_mutex);
return md ? 0 : -ENXIO;
}
@@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
struct mapped_device *md = disk->private_data;
- mutex_lock(&dm_mutex);
+ spin_lock(&_minor_lock);
+
atomic_dec(&md->open_count);
dm_put(md);
- mutex_unlock(&dm_mutex);
+
+ spin_unlock(&_minor_lock);
return 0;
}
@@ -1638,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q)
if (map_request(ti, clone, md))
goto requeued;
- spin_lock_irq(q->queue_lock);
+ BUG_ON(!irqs_disabled());
+ spin_lock(q->queue_lock);
}
goto out;
requeued:
- spin_lock_irq(q->queue_lock);
+ BUG_ON(!irqs_disabled());
+ spin_lock(q->queue_lock);
plug_and_out:
if (!elv_queue_empty(q))
@@ -1884,7 +1885,8 @@ static struct mapped_device *alloc_dev(int minor)
add_disk(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
- md->wq = create_singlethread_workqueue("kdmflush");
+ md->wq = alloc_workqueue("kdmflush",
+ WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
if (!md->wq)
goto bad_thread;
@@ -1992,13 +1994,14 @@ static void event_callback(void *context)
wake_up(&md->eventq);
}
+/*
+ * Protected by md->suspend_lock obtained by dm_swap_table().
+ */
static void __set_size(struct mapped_device *md, sector_t size)
{
set_capacity(md->disk, size);
- mutex_lock(&md->bdev->bd_inode->i_mutex);
i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
- mutex_unlock(&md->bdev->bd_inode->i_mutex);
}
/*
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7fc090ac9e2..0cc30ecda4c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -287,11 +287,14 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
mddev_t *mddev = q->queuedata;
int rv;
int cpu;
+ unsigned int sectors;
- if (mddev == NULL || mddev->pers == NULL) {
+ if (mddev == NULL || mddev->pers == NULL
+ || !mddev->ready) {
bio_io_error(bio);
return 0;
}
+ smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
if (mddev->suspended) {
DEFINE_WAIT(__wait);
@@ -309,12 +312,16 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
atomic_inc(&mddev->active_io);
rcu_read_unlock();
+ /*
+ * save the sectors now since our bio can
+ * go away inside make_request
+ */
+ sectors = bio_sectors(bio);
rv = mddev->pers->make_request(mddev, bio);
cpu = part_stat_lock();
part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
- bio_sectors(bio));
+ part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
part_stat_unlock();
if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
@@ -703,9 +710,9 @@ static struct mdk_personality *find_pers(int level, char *clevel)
}
/* return the offset of the super block in 512byte sectors */
-static inline sector_t calc_dev_sboffset(struct block_device *bdev)
+static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
{
- sector_t num_sectors = i_size_read(bdev->bd_inode) / 512;
+ sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
return MD_NEW_SIZE_SECTORS(num_sectors);
}
@@ -763,7 +770,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
*/
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
- bio->bi_bdev = rdev->bdev;
+ bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
bio->bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
@@ -793,7 +800,7 @@ static void bi_complete(struct bio *bio, int error)
}
int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
- struct page *page, int rw)
+ struct page *page, int rw, bool metadata_op)
{
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
struct completion event;
@@ -801,8 +808,12 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
rw |= REQ_SYNC | REQ_UNPLUG;
- bio->bi_bdev = rdev->bdev;
- bio->bi_sector = sector;
+ bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
+ rdev->meta_bdev : rdev->bdev;
+ if (metadata_op)
+ bio->bi_sector = sector + rdev->sb_start;
+ else
+ bio->bi_sector = sector + rdev->data_offset;
bio_add_page(bio, page, size, 0);
init_completion(&event);
bio->bi_private = &event;
@@ -827,7 +838,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
return 0;
- if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ))
+ if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
goto fail;
rdev->sb_loaded = 1;
return 0;
@@ -989,7 +1000,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
*
* It also happens to be a multiple of 4Kb.
*/
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
ret = read_disk_sb(rdev, MD_SB_BYTES);
if (ret) return ret;
@@ -1330,7 +1341,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
return 0; /* component must fit device */
if (rdev->mddev->bitmap_info.offset)
return 0; /* can't move bitmap */
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
if (!num_sectors || num_sectors > rdev->sb_start)
num_sectors = rdev->sb_start;
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -1906,6 +1917,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
MD_BUG();
return;
}
+ bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
rdev->mddev = NULL;
@@ -1940,8 +1952,6 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
__bdevname(dev, b));
return PTR_ERR(bdev);
}
- if (!shared)
- set_bit(AllReserved, &rdev->flags);
rdev->bdev = bdev;
return err;
}
@@ -2458,6 +2468,9 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
if (rdev->raid_disk != -1)
return -EBUSY;
+ if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
+ return -EBUSY;
+
if (rdev->mddev->pers->hot_add_disk == NULL)
return -EINVAL;
@@ -2465,6 +2478,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
if (rdev2->raid_disk == slot)
return -EEXIST;
+ if (slot >= rdev->mddev->raid_disks &&
+ slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+ return -ENOSPC;
+
rdev->raid_disk = slot;
if (test_bit(In_sync, &rdev->flags))
rdev->saved_raid_disk = slot;
@@ -2482,7 +2499,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
/* failure here is OK */;
/* don't wakeup anyone, leave that to userspace. */
} else {
- if (slot >= rdev->mddev->raid_disks)
+ if (slot >= rdev->mddev->raid_disks &&
+ slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
return -ENOSPC;
rdev->raid_disk = slot;
/* assume it is working */
@@ -2598,12 +2616,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
mddev_lock(mddev);
list_for_each_entry(rdev2, &mddev->disks, same_set)
- if (test_bit(AllReserved, &rdev2->flags) ||
- (rdev->bdev == rdev2->bdev &&
- rdev != rdev2 &&
- overlaps(rdev->data_offset, rdev->sectors,
- rdev2->data_offset,
- rdev2->sectors))) {
+ if (rdev->bdev == rdev2->bdev &&
+ rdev != rdev2 &&
+ overlaps(rdev->data_offset, rdev->sectors,
+ rdev2->data_offset,
+ rdev2->sectors)) {
overlap = 1;
break;
}
@@ -3107,7 +3124,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
char nm[20];
if (rdev->raid_disk < 0)
continue;
- if (rdev->new_raid_disk > mddev->raid_disks)
+ if (rdev->new_raid_disk >= mddev->raid_disks)
rdev->new_raid_disk = -1;
if (rdev->new_raid_disk == rdev->raid_disk)
continue;
@@ -3736,6 +3753,8 @@ action_show(mddev_t *mddev, char *page)
return sprintf(page, "%s\n", type);
}
+static void reap_sync_thread(mddev_t *mddev);
+
static ssize_t
action_store(mddev_t *mddev, const char *page, size_t len)
{
@@ -3750,9 +3769,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
- mddev->recovery = 0;
+ reap_sync_thread(mddev);
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -3904,7 +3921,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
static ssize_t
sync_completed_show(mddev_t *mddev, char *page)
{
- unsigned long max_sectors, resync;
+ unsigned long long max_sectors, resync;
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
@@ -3915,7 +3932,7 @@ sync_completed_show(mddev_t *mddev, char *page)
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync_completed;
- return sprintf(page, "%lu / %lu\n", resync, max_sectors);
+ return sprintf(page, "%llu / %llu\n", resync, max_sectors);
}
static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -4002,19 +4019,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
+ unsigned long long old = mddev->suspend_lo;
if (mddev->pers == NULL ||
mddev->pers->quiesce == NULL)
return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
- if (new >= mddev->suspend_hi ||
- (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
- mddev->suspend_lo = new;
+
+ mddev->suspend_lo = new;
+ if (new >= old)
+ /* Shrinking suspended region */
mddev->pers->quiesce(mddev, 2);
- return len;
- } else
- return -EINVAL;
+ else {
+ /* Expanding suspended region - need to wait */
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ return len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4031,20 +4053,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
+ unsigned long long old = mddev->suspend_hi;
if (mddev->pers == NULL ||
mddev->pers->quiesce == NULL)
return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
- if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
- (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
- mddev->suspend_hi = new;
+
+ mddev->suspend_hi = new;
+ if (new <= old)
+ /* Shrinking suspended region */
+ mddev->pers->quiesce(mddev, 2);
+ else {
+ /* Expanding suspended region - need to wait */
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
- return len;
- } else
- return -EINVAL;
+ }
+ return len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4422,7 +4448,9 @@ int md_run(mddev_t *mddev)
* We don't want the data to overlap the metadata,
* Internal Bitmap issues have been handled elsewhere.
*/
- if (rdev->data_offset < rdev->sb_start) {
+ if (rdev->meta_bdev) {
+ /* Nothing to check */;
+ } else if (rdev->data_offset < rdev->sb_start) {
if (mddev->dev_sectors &&
rdev->data_offset + mddev->dev_sectors
> rdev->sb_start) {
@@ -4556,7 +4584,8 @@ int md_run(mddev_t *mddev)
mddev->safemode_timer.data = (unsigned long) mddev;
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
-
+ smp_wmb();
+ mddev->ready = 1;
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0) {
char nm[20];
@@ -4693,13 +4722,12 @@ static void md_clean(mddev_t *mddev)
mddev->plug = NULL;
}
-void md_stop_writes(mddev_t *mddev)
+static void __md_stop_writes(mddev_t *mddev)
{
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
+ reap_sync_thread(mddev);
}
del_timer_sync(&mddev->safemode_timer);
@@ -4713,10 +4741,18 @@ void md_stop_writes(mddev_t *mddev)
md_update_sb(mddev, 1);
}
}
+
+void md_stop_writes(mddev_t *mddev)
+{
+ mddev_lock(mddev);
+ __md_stop_writes(mddev);
+ mddev_unlock(mddev);
+}
EXPORT_SYMBOL_GPL(md_stop_writes);
void md_stop(mddev_t *mddev)
{
+ mddev->ready = 0;
mddev->pers->stop(mddev);
if (mddev->pers->sync_request && mddev->to_remove == NULL)
mddev->to_remove = &md_redundancy_group;
@@ -4736,7 +4772,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open)
goto out;
}
if (mddev->pers) {
- md_stop_writes(mddev);
+ __md_stop_writes(mddev);
err = -ENXIO;
if (mddev->ro==1)
@@ -4773,7 +4809,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
if (mddev->ro)
set_disk_ro(disk, 0);
- md_stop_writes(mddev);
+ __md_stop_writes(mddev);
md_stop(mddev);
mddev->queue->merge_bvec_fn = NULL;
mddev->queue->unplug_fn = NULL;
@@ -5151,9 +5187,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
/* set saved_raid_disk if appropriate */
if (!mddev->persistent) {
if (info->state & (1<<MD_DISK_SYNC) &&
- info->raid_disk < mddev->raid_disks)
+ info->raid_disk < mddev->raid_disks) {
rdev->raid_disk = info->raid_disk;
- else
+ set_bit(In_sync, &rdev->flags);
+ } else
rdev->raid_disk = -1;
} else
super_types[mddev->major_version].
@@ -5230,7 +5267,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
printk(KERN_INFO "md: nonpersistent superblock ...\n");
rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
} else
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
rdev->sectors = rdev->sb_start;
err = bind_rdev_to_array(rdev, mddev);
@@ -5297,7 +5334,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
}
if (mddev->persistent)
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
else
rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
@@ -5510,7 +5547,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
* sb_start or, if that is <data_offset, it must fit before the size
* of each device. If num_sectors is zero, we find the largest size
* that fits.
-
*/
if (mddev->sync_thread)
return -EBUSY;
@@ -5547,6 +5583,8 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
mddev->delta_disks = raid_disks - mddev->raid_disks;
rv = mddev->pers->check_reshape(mddev);
+ if (rv < 0)
+ mddev->delta_disks = 0;
return rv;
}
@@ -6033,7 +6071,8 @@ static int md_thread(void * arg)
|| kthread_should_stop(),
thread->timeout);
- if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags))
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+ if (!kthread_should_stop())
thread->run(thread->mddev);
}
@@ -6799,7 +6838,7 @@ void md_do_sync(mddev_t *mddev)
desc, mdname(mddev));
mddev->curr_resync = j;
}
- mddev->curr_resync_completed = mddev->curr_resync;
+ mddev->curr_resync_completed = j;
while (j < max_sectors) {
sector_t sectors;
@@ -6817,8 +6856,7 @@ void md_do_sync(mddev_t *mddev)
md_unplug(mddev);
wait_event(mddev->recovery_wait,
atomic_read(&mddev->recovery_active) == 0);
- mddev->curr_resync_completed =
- mddev->curr_resync;
+ mddev->curr_resync_completed = j;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
@@ -6954,9 +6992,6 @@ void md_do_sync(mddev_t *mddev)
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
mddev->curr_resync = 0;
- if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
- mddev->curr_resync_completed = 0;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
wake_up(&resync_wait);
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -6997,7 +7032,7 @@ static int remove_and_add_spares(mddev_t *mddev)
}
}
- if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
+ if (mddev->degraded && !mddev->recovery_disabled) {
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
@@ -7023,6 +7058,45 @@ static int remove_and_add_spares(mddev_t *mddev)
}
return spares;
}
+
+static void reap_sync_thread(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ if (mddev->pers->spare_active(mddev))
+ sysfs_notify(&mddev->kobj, NULL,
+ "degraded");
+ }
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ mddev->pers->finish_reshape)
+ mddev->pers->finish_reshape(mddev);
+ md_update_sb(mddev, 1);
+
+ /* if array is no-longer degraded, then any saved_raid_disk
+ * information must be scrapped
+ */
+ if (!mddev->degraded)
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev->saved_raid_disk = -1;
+
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ /* flag recovery needed just to double check */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ md_new_event(mddev);
+}
+
/*
* This routine is regularly called by all per-raid-array threads to
* deal with generic issues like resync and super-block update.
@@ -7047,9 +7121,6 @@ static int remove_and_add_spares(mddev_t *mddev)
*/
void md_check_recovery(mddev_t *mddev)
{
- mdk_rdev_t *rdev;
-
-
if (mddev->bitmap)
bitmap_daemon_work(mddev);
@@ -7084,7 +7155,20 @@ void md_check_recovery(mddev_t *mddev)
/* Only thing we do on a ro array is remove
* failed devices.
*/
- remove_and_add_spares(mddev);
+ mdk_rdev_t *rdev;
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Blocked, &rdev->flags) &&
+ test_bit(Faulty, &rdev->flags) &&
+ atomic_read(&rdev->nr_pending)==0) {
+ if (mddev->pers->hot_remove_disk(
+ mddev, rdev->raid_disk)==0) {
+ char nm[20];
+ sprintf(nm,"rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ rdev->raid_disk = -1;
+ }
+ }
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
@@ -7117,34 +7201,7 @@ void md_check_recovery(mddev_t *mddev)
goto unlock;
}
if (mddev->sync_thread) {
- /* resync has finished, collect result */
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
- if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
- /* success...*/
- /* activate any spares */
- if (mddev->pers->spare_active(mddev))
- sysfs_notify(&mddev->kobj, NULL,
- "degraded");
- }
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- mddev->pers->finish_reshape)
- mddev->pers->finish_reshape(mddev);
- md_update_sb(mddev, 1);
-
- /* if array is no-longer degraded, then any saved_raid_disk
- * information must be scrapped
- */
- if (!mddev->degraded)
- list_for_each_entry(rdev, &mddev->disks, same_set)
- rdev->saved_raid_disk = -1;
-
- mddev->recovery = 0;
- /* flag recovery needed just to double check */
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- md_new_event(mddev);
+ reap_sync_thread(mddev);
goto unlock;
}
/* Set RUNNING before clearing NEEDED to avoid
@@ -7202,7 +7259,11 @@ void md_check_recovery(mddev_t *mddev)
" thread...\n",
mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
- mddev->recovery = 0;
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
} else
md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d05bab55df4..7e90b8593b2 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -60,6 +60,12 @@ struct mdk_rdev_s
mddev_t *mddev; /* RAID array if running */
int last_events; /* IO event timestamp */
+ /*
+ * If meta_bdev is non-NULL, it means that a separate device is
+ * being used to store the metadata (superblock/bitmap) which
+ * would otherwise be contained on the same device as the data (bdev).
+ */
+ struct block_device *meta_bdev;
struct block_device *bdev; /* block device handle */
struct page *sb_page;
@@ -87,8 +93,6 @@ struct mdk_rdev_s
#define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
-#define AllReserved 6 /* If whole device is reserved for
- * one array */
#define AutoDetected 7 /* added by auto-detect */
#define Blocked 8 /* An error occured on an externally
* managed array, don't allow writes
@@ -148,7 +152,8 @@ struct mddev_s
* are happening, so run/
* takeover/stop are not safe
*/
-
+ int ready; /* See when safe to pass
+ * IO requests down */
struct gendisk *gendisk;
struct kobject kobj;
@@ -497,8 +502,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
-extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
- struct page *page, int rw);
+extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
+ struct page *page, int rw, bool metadata_op);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index a39f4c355e5..637a96855ed 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -179,6 +179,14 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
rdev1->new_raid_disk = j;
}
+ if (mddev->level == 1) {
+ /* taiking over a raid1 array-
+ * we have only one active disk
+ */
+ j = 0;
+ rdev1->new_raid_disk = j;
+ }
+
if (j < 0 || j >= mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
"aborting!\n", mdname(mddev), j);
@@ -644,12 +652,38 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
return priv_conf;
}
+static void *raid0_takeover_raid1(mddev_t *mddev)
+{
+ raid0_conf_t *priv_conf;
+
+ /* Check layout:
+ * - (N - 1) mirror drives must be already faulty
+ */
+ if ((mddev->raid_disks - 1) != mddev->degraded) {
+ printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
+ mdname(mddev));
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Set new parameters */
+ mddev->new_level = 0;
+ mddev->new_layout = 0;
+ mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */
+ mddev->delta_disks = 1 - mddev->raid_disks;
+ /* make sure it will be not marked as dirty */
+ mddev->recovery_cp = MaxSector;
+
+ create_strip_zones(mddev, &priv_conf);
+ return priv_conf;
+}
+
static void *raid0_takeover(mddev_t *mddev)
{
/* raid0 can take over:
* raid4 - if all data disks are active.
* raid5 - providing it is Raid4 layout and one disk is faulty
* raid10 - assuming we have all necessary active disks
+ * raid1 - with (N -1) mirror drives faulty
*/
if (mddev->level == 4)
return raid0_takeover_raid45(mddev);
@@ -665,6 +699,12 @@ static void *raid0_takeover(mddev_t *mddev)
if (mddev->level == 10)
return raid0_takeover_raid10(mddev);
+ if (mddev->level == 1)
+ return raid0_takeover_raid1(mddev);
+
+ printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n",
+ mddev->level);
+
return ERR_PTR(-EINVAL);
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 845cf95b612..a23ffa397ba 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
} else
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n"
- KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n",
+ printk(KERN_ALERT
+ "md/raid1:%s: Disk failure on %s, disabling device.\n"
+ "md/raid1:%s: Operation continuing on %d devices.\n",
mdname(mddev), bdevname(rdev->bdev, b),
mdname(mddev), conf->raid_disks - mddev->degraded);
}
@@ -1364,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
*/
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev,
- sect + rdev->data_offset,
+ sect,
s<<9,
bio->bi_io_vec[idx].bv_page,
- READ)) {
+ READ, false)) {
success = 1;
break;
}
@@ -1390,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
rdev = conf->mirrors[d].rdev;
atomic_add(s, &rdev->corrected_errors);
if (sync_page_io(rdev,
- sect + rdev->data_offset,
+ sect,
s<<9,
bio->bi_io_vec[idx].bv_page,
- WRITE) == 0)
+ WRITE, false) == 0)
md_error(mddev, rdev);
}
d = start;
@@ -1405,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
continue;
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev,
- sect + rdev->data_offset,
+ sect,
s<<9,
bio->bi_io_vec[idx].bv_page,
- READ) == 0)
+ READ, false) == 0)
md_error(mddev, rdev);
}
} else {
@@ -1488,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
- sync_page_io(rdev,
- sect + rdev->data_offset,
- s<<9,
- conf->tmppage, READ))
+ sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false))
success = 1;
else {
d++;
@@ -1514,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE)
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, WRITE, false)
== 0)
/* Well, this device is dead */
md_error(mddev, rdev);
@@ -1531,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, READ)
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false)
== 0)
/* Well, this device is dead */
md_error(mddev, rdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0641674827f..3b607b28741 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
}
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n"
- KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n",
+ printk(KERN_ALERT
+ "md/raid10:%s: Disk failure on %s, disabling device.\n"
+ "md/raid10:%s: Operation continuing on %d devices.\n",
mdname(mddev), bdevname(rdev->bdev, b),
mdname(mddev), conf->raid_disks - mddev->degraded);
}
@@ -1559,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_unlock();
success = sync_page_io(rdev,
r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
+ sect,
s<<9,
- conf->tmppage, READ);
+ conf->tmppage, READ, false);
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
if (success)
@@ -1598,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
atomic_add(s, &rdev->corrected_errors);
if (sync_page_io(rdev,
r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE)
+ sect,
+ s<<9, conf->tmppage, WRITE, false)
== 0) {
/* Well, this device is dead */
printk(KERN_NOTICE
@@ -1635,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_unlock();
if (sync_page_io(rdev,
r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
+ sect,
s<<9, conf->tmppage,
- READ) == 0) {
+ READ, false) == 0) {
/* Well, this device is dead */
printk(KERN_NOTICE
"md/raid10:%s: unable to read back "
@@ -2462,11 +2463,13 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
mddev->recovery_cp = MaxSector;
conf = setup_conf(mddev);
- if (!IS_ERR(conf))
+ if (!IS_ERR(conf)) {
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0)
rdev->new_raid_disk = rdev->raid_disk * 2;
-
+ conf->barrier = 1;
+ }
+
return conf;
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dc574f303f8..70281282419 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
set_bit(Faulty, &rdev->flags);
printk(KERN_ALERT
"md/raid:%s: Disk failure on %s, disabling device.\n"
- KERN_ALERT
"md/raid:%s: Operation continuing on %d devices.\n",
mdname(mddev),
bdevname(rdev->bdev, b),
@@ -4237,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
mddev->reshape_position = conf->reshape_progress;
- mddev->curr_resync_completed = mddev->curr_resync;
+ mddev->curr_resync_completed = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
@@ -4338,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes) == 0);
mddev->reshape_position = conf->reshape_progress;
- mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
+ mddev->curr_resync_completed = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
@@ -5339,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev)
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
count++;
- sysfs_notify_dirent(tmp->rdev->sysfs_state);
+ sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
@@ -5518,7 +5517,6 @@ static int raid5_start_reshape(mddev_t *mddev)
raid5_conf_t *conf = mddev->private;
mdk_rdev_t *rdev;
int spares = 0;
- int added_devices = 0;
unsigned long flags;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -5528,8 +5526,8 @@ static int raid5_start_reshape(mddev_t *mddev)
return -ENOSPC;
list_for_each_entry(rdev, &mddev->disks, same_set)
- if (rdev->raid_disk < 0 &&
- !test_bit(Faulty, &rdev->flags))
+ if (!test_bit(In_sync, &rdev->flags)
+ && !test_bit(Faulty, &rdev->flags))
spares++;
if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
@@ -5572,29 +5570,35 @@ static int raid5_start_reshape(mddev_t *mddev)
* to correctly record the "partially reconstructed" state of
* such devices during the reshape and confusion could result.
*/
- if (mddev->delta_disks >= 0)
- list_for_each_entry(rdev, &mddev->disks, same_set)
- if (rdev->raid_disk < 0 &&
- !test_bit(Faulty, &rdev->flags)) {
- if (raid5_add_disk(mddev, rdev) == 0) {
- char nm[20];
- if (rdev->raid_disk >= conf->previous_raid_disks) {
- set_bit(In_sync, &rdev->flags);
- added_devices++;
- } else
- rdev->recovery_offset = 0;
- sprintf(nm, "rd%d", rdev->raid_disk);
- if (sysfs_create_link(&mddev->kobj,
- &rdev->kobj, nm))
- /* Failure here is OK */;
- } else
- break;
- }
+ if (mddev->delta_disks >= 0) {
+ int added_devices = 0;
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk < 0 &&
+ !test_bit(Faulty, &rdev->flags)) {
+ if (raid5_add_disk(mddev, rdev) == 0) {
+ char nm[20];
+ if (rdev->raid_disk
+ >= conf->previous_raid_disks) {
+ set_bit(In_sync, &rdev->flags);
+ added_devices++;
+ } else
+ rdev->recovery_offset = 0;
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ if (sysfs_create_link(&mddev->kobj,
+ &rdev->kobj, nm))
+ /* Failure here is OK */;
+ }
+ } else if (rdev->raid_disk >= conf->previous_raid_disks
+ && !test_bit(Faulty, &rdev->flags)) {
+ /* This is a spare that was manually added */
+ set_bit(In_sync, &rdev->flags);
+ added_devices++;
+ }
- /* When a reshape changes the number of devices, ->degraded
- * is measured against the larger of the pre and post number of
- * devices.*/
- if (mddev->delta_disks > 0) {
+ /* When a reshape changes the number of devices,
+ * ->degraded is measured against the larger of the
+ * pre and post number of devices.
+ */
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
- added_devices;