diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 24 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 12 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 618 | ||||
-rw-r--r-- | drivers/md/dm-delay.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 111 | ||||
-rw-r--r-- | drivers/md/dm-kcopyd.c | 57 | ||||
-rw-r--r-- | drivers/md/dm-log-userspace-base.c | 139 | ||||
-rw-r--r-- | drivers/md/dm-log-userspace-transfer.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-log.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 67 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 697 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 19 | ||||
-rw-r--r-- | drivers/md/dm-snap-persistent.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 62 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 27 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 20 | ||||
-rw-r--r-- | drivers/md/dm.c | 23 | ||||
-rw-r--r-- | drivers/md/md.c | 245 | ||||
-rw-r--r-- | drivers/md/md.h | 15 | ||||
-rw-r--r-- | drivers/md/raid0.c | 40 | ||||
-rw-r--r-- | drivers/md/raid1.c | 33 | ||||
-rw-r--r-- | drivers/md/raid10.c | 23 | ||||
-rw-r--r-- | drivers/md/raid5.c | 62 |
24 files changed, 1854 insertions, 450 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index bf1a95e3155..98d9ec85e0e 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -240,6 +240,30 @@ config DM_MIRROR Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. +config DM_RAID + tristate "RAID 4/5/6 target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + select MD_RAID456 + select BLK_DEV_MD + ---help--- + A dm target that supports RAID4, RAID5 and RAID6 mappings + + A RAID-5 set of N drives with a capacity of C MB per drive provides + the capacity of C * (N - 1) MB, and protects against a failure + of a single drive. For a given sector (row) number, (N - 1) drives + contain data sectors, and one drive contains the parity protection. + For a RAID-4 set, the parity blocks are present on a single drive, + while a RAID-5 set distributes the parity across the drives in one + of the available parity distribution methods. + + A RAID-6 set of N drives with a capacity of C MB per drive + provides the capacity of C * (N - 2) MB, and protects + against a failure of any two drives. For a given sector + (row) number, (N - 2) drives contain data sectors, and two + drives contains two independent redundancy syndromes. Like + RAID-5, RAID-6 distributes the syndromes across the drives + in one of the available parity distribution methods. + config DM_LOG_USERSPACE tristate "Mirror userspace logging (EXPERIMENTAL)" depends on DM_MIRROR && EXPERIMENTAL && NET diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 5e3aac41919..d0138606c2e 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o +obj-$(CONFIG_DM_RAID) += dm-raid.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5a1ffe3527a..9a35320fb59 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, || test_bit(Faulty, &rdev->flags)) continue; - target = rdev->sb_start + offset + index * (PAGE_SIZE/512); + target = offset + index * (PAGE_SIZE/512); if (sync_page_io(rdev, target, roundup(size, bdev_logical_block_size(rdev->bdev)), - page, READ)) { + page, READ, true)) { page->index = index; attach_page_buffers(page, NULL); /* so that free_buffer will * quietly no-op */ @@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { mdk_rdev_t *rdev = NULL; + struct block_device *bdev; mddev_t *mddev = bitmap->mddev; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; loff_t offset = mddev->bitmap_info.offset; + + bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; + if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, - bdev_logical_block_size(rdev->bdev)); + bdev_logical_block_size(bdev)); /* Just make sure we aren't corrupting data or * metadata */ @@ -1542,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); - bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; + bitmap->mddev->curr_resync_completed = sector; set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d5b0e4c0e70..4e054bd9166 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -18,10 +18,14 @@ #include <linux/crypto.h> #include <linux/workqueue.h> #include <linux/backing-dev.h> +#include <linux/percpu.h> #include <asm/atomic.h> #include <linux/scatterlist.h> #include <asm/page.h> #include <asm/unaligned.h> +#include <crypto/hash.h> +#include <crypto/md5.h> +#include <crypto/algapi.h> #include <linux/device-mapper.h> @@ -63,6 +67,7 @@ struct dm_crypt_request { struct convert_context *ctx; struct scatterlist sg_in; struct scatterlist sg_out; + sector_t iv_sector; }; struct crypt_config; @@ -73,11 +78,13 @@ struct crypt_iv_operations { void (*dtr)(struct crypt_config *cc); int (*init)(struct crypt_config *cc); int (*wipe)(struct crypt_config *cc); - int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); + int (*generator)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); + int (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); }; struct iv_essiv_private { - struct crypto_cipher *tfm; struct crypto_hash *hash_tfm; u8 *salt; }; @@ -86,11 +93,32 @@ struct iv_benbi_private { int shift; }; +#define LMK_SEED_SIZE 64 /* hash + 0 */ +struct iv_lmk_private { + struct crypto_shash *hash_tfm; + u8 *seed; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; + +/* + * Duplicated per-CPU state for cipher. + */ +struct crypt_cpu { + struct ablkcipher_request *req; + /* ESSIV: struct crypto_cipher *essiv_tfm */ + void *iv_private; + struct crypto_ablkcipher *tfms[0]; +}; + +/* + * The fields in here must be read only after initialization, + * changing state should be in crypt_cpu. + */ struct crypt_config { struct dm_dev *dev; sector_t start; @@ -108,17 +136,25 @@ struct crypt_config { struct workqueue_struct *crypt_queue; char *cipher; - char *cipher_mode; + char *cipher_string; struct crypt_iv_operations *iv_gen_ops; union { struct iv_essiv_private essiv; struct iv_benbi_private benbi; + struct iv_lmk_private lmk; } iv_gen_private; sector_t iv_offset; unsigned int iv_size; /* + * Duplicated per cpu state. Access through + * per_cpu_ptr() only. + */ + struct crypt_cpu __percpu *cpu; + unsigned tfms_count; + + /* * Layout of each crypto request: * * struct ablkcipher_request @@ -132,11 +168,10 @@ struct crypt_config { * correctly aligned. */ unsigned int dmreq_start; - struct ablkcipher_request *req; - struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; + unsigned int key_parts; u8 key[0]; }; @@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); +static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); + +static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) +{ + return this_cpu_ptr(cc->cpu); +} + +/* + * Use this to access cipher attributes that are the same for each CPU. + */ +static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) +{ + return __this_cpu_ptr(cc->cpu)->tfms[0]; +} /* * Different IV generation algorithms: @@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); * null: the initial vector is always zero. Provides compatibility with * obsolete loop_fish2 devices. Do not use for new devices. * + * lmk: Compatible implementation of the block chaining mode used + * by the Loop-AES block device encryption system + * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from the sector number, the data and + * optionally extra IV seed. + * This means that after decryption the first block + * of sector must be tweaked according to decrypted data. + * Loop-AES can use three encryption schemes: + * version 1: is plain aes-cbc mode + * version 2: uses 64 multikey scheme with lmk IV generator + * version 3: the same as version 2 with additional IV seed + * (it uses 65 keys, last key is used as IV seed) + * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ -static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); + *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); return 0; } static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, - sector_t sector) + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); + *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); return 0; } @@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; struct hash_desc desc; struct scatterlist sg; - int err; + struct crypto_cipher *essiv_tfm; + int err, cpu; sg_init_one(&sg, cc->key, cc->key_size); desc.tfm = essiv->hash_tfm; @@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) if (err) return err; - return crypto_cipher_setkey(essiv->tfm, essiv->salt, + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, + + err = crypto_cipher_setkey(essiv_tfm, essiv->salt, crypto_hash_digestsize(essiv->hash_tfm)); + if (err) + return err; + } + + return 0; } /* Wipe salt and reset key derived from volume key */ @@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + struct crypto_cipher *essiv_tfm; + int cpu, r, err = 0; memset(essiv->salt, 0, salt_size); - return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; + r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); + if (r) + err = r; + } + + return err; +} + +/* Set up per cpu cipher state */ +static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, + struct dm_target *ti, + u8 *salt, unsigned saltsize) +{ + struct crypto_cipher *essiv_tfm; + int err; + + /* Setup the essiv_tfm with the given salt */ + essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(essiv_tfm)) { + ti->error = "Error allocating crypto tfm for ESSIV"; + return essiv_tfm; + } + + if (crypto_cipher_blocksize(essiv_tfm) != + crypto_ablkcipher_ivsize(any_tfm(cc))) { + ti->error = "Block size of ESSIV cipher does " + "not match IV size of block cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(-EINVAL); + } + + err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); + if (err) { + ti->error = "Failed to set key for ESSIV cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(err); + } + + return essiv_tfm; } static void crypt_iv_essiv_dtr(struct crypt_config *cc) { + int cpu; + struct crypt_cpu *cpu_cc; + struct crypto_cipher *essiv_tfm; struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - crypto_free_cipher(essiv->tfm); - essiv->tfm = NULL; - crypto_free_hash(essiv->hash_tfm); essiv->hash_tfm = NULL; kzfree(essiv->salt); essiv->salt = NULL; + + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + essiv_tfm = cpu_cc->iv_private; + + if (essiv_tfm) + crypto_free_cipher(essiv_tfm); + + cpu_cc->iv_private = NULL; + } } static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, struct crypto_cipher *essiv_tfm = NULL; struct crypto_hash *hash_tfm = NULL; u8 *salt = NULL; - int err; + int err, cpu; if (!opts) { ti->error = "Digest algorithm missing for ESSIV mode"; @@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, goto bad; } - /* Allocate essiv_tfm */ - essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(essiv_tfm)) { - ti->error = "Error allocating crypto tfm for ESSIV"; - err = PTR_ERR(essiv_tfm); - goto bad; - } - if (crypto_cipher_blocksize(essiv_tfm) != - crypto_ablkcipher_ivsize(cc->tfm)) { - ti->error = "Block size of ESSIV cipher does " - "not match IV size of block cipher"; - err = -EINVAL; - goto bad; - } - cc->iv_gen_private.essiv.salt = salt; - cc->iv_gen_private.essiv.tfm = essiv_tfm; cc->iv_gen_private.essiv.hash_tfm = hash_tfm; + for_each_possible_cpu(cpu) { + essiv_tfm = setup_essiv_cpu(cc, ti, salt, + crypto_hash_digestsize(hash_tfm)); + if (IS_ERR(essiv_tfm)) { + crypt_iv_essiv_dtr(cc); + return PTR_ERR(essiv_tfm); + } + per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; + } + return 0; bad: - if (essiv_tfm && !IS_ERR(essiv_tfm)) - crypto_free_cipher(essiv_tfm); if (hash_tfm && !IS_ERR(hash_tfm)) crypto_free_hash(hash_tfm); kfree(salt); return err; } -static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { + struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; + memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); - crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); + *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); + crypto_cipher_encrypt_one(essiv_tfm, iv, iv); + return 0; } static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); + unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); int log = ilog2(bs); /* we need to calculate how far we must shift the sector count @@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc) { } -static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { __be64 val; memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ - val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); + val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); return 0; } -static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); return 0; } +static void crypt_iv_lmk_dtr(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) + crypto_free_shash(lmk->hash_tfm); + lmk->hash_tfm = NULL; + + kzfree(lmk->seed); + lmk->seed = NULL; +} + +static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(lmk->hash_tfm)) { + ti->error = "Error initializing LMK hash"; + return PTR_ERR(lmk->hash_tfm); + } + + /* No seed in LMK version 2 */ + if (cc->key_parts == cc->tfms_count) { + lmk->seed = NULL; + return 0; + } + + lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); + if (!lmk->seed) { + crypt_iv_lmk_dtr(cc); + ti->error = "Error kmallocing seed storage in LMK"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_lmk_init(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + int subkey_size = cc->key_size / cc->key_parts; + + /* LMK seed is on the position of LMK_KEYS + 1 key */ + if (lmk->seed) + memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), + crypto_shash_digestsize(lmk->hash_tfm)); + + return 0; +} + +static int crypt_iv_lmk_wipe(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->seed) + memset(lmk->seed, 0, LMK_SEED_SIZE); + + return 0; +} + +static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + struct { + struct shash_desc desc; + char ctx[crypto_shash_descsize(lmk->hash_tfm)]; + } sdesc; + struct md5_state md5state; + u32 buf[4]; + int i, r; + + sdesc.desc.tfm = lmk->hash_tfm; + sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(&sdesc.desc); + if (r) + return r; + + if (lmk->seed) { + r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); + if (r) + return r; + } + + /* Sector is always 512B, block size 16, add data of blocks 1-31 */ + r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); + if (r) + return r; + + /* Sector is cropped to 56 bits here */ + buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); + buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); + buf[2] = cpu_to_le32(4024); + buf[3] = 0; + r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); + if (r) + return r; + + /* No MD5 padding here */ + r = crypto_shash_export(&sdesc.desc, &md5state); + if (r) + return r; + + for (i = 0; i < MD5_HASH_WORDS; i++) + __cpu_to_le32s(&md5state.hash[i]); + memcpy(iv, &md5state.hash, cc->iv_size); + + return 0; +} + +static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *src; + int r = 0; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); + r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); + kunmap_atomic(src, KM_USER0); + } else + memset(iv, 0, cc->iv_size); + + return r; +} + +static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + return 0; + + dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); + r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); + + /* Tweak the first block of plaintext sector */ + if (!r) + crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); + + kunmap_atomic(dst, KM_USER0); + return r; +} + static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = { .generator = crypt_iv_null_gen }; +static struct crypt_iv_operations crypt_iv_lmk_ops = { + .ctr = crypt_iv_lmk_ctr, + .dtr = crypt_iv_lmk_dtr, + .init = crypt_iv_lmk_init, + .wipe = crypt_iv_lmk_wipe, + .generator = crypt_iv_lmk_gen, + .post = crypt_iv_lmk_post +}; + static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, @@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); } +static u8 *iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); +} + static int crypt_convert_block(struct crypt_config *cc, struct convert_context *ctx, struct ablkcipher_request *req) @@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc, int r = 0; dmreq = dmreq_of_req(cc, req); - iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_ablkcipher_alignmask(cc->tfm) + 1); + iv = iv_of_dmreq(cc, dmreq); + dmreq->iv_sector = ctx->sector; dmreq->ctx = ctx; sg_init_table(&dmreq->sg_in, 1); sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, @@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc, } if (cc->iv_gen_ops) { - r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); + r = cc->iv_gen_ops->generator(cc, iv, dmreq); if (r < 0) return r; } @@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc, else r = crypto_ablkcipher_decrypt(req); + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) + r = cc->iv_gen_ops->post(cc, iv, dmreq); + return r; } static void kcryptd_async_done(struct crypto_async_request *async_req, int error); + static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { - if (!cc->req) - cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(cc->req, cc->tfm); - ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | - CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, - dmreq_of_req(cc, cc->req)); + struct crypt_cpu *this_cc = this_crypt_config(cc); + unsigned key_index = ctx->sector & (cc->tfms_count - 1); + + if (!this_cc->req) + this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + + ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); + ablkcipher_request_set_callback(this_cc->req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); } /* @@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc, static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { + struct crypt_cpu *this_cc = this_crypt_config(cc); int r; atomic_set(&ctx->pending, 1); @@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->pending); - r = crypt_convert_block(cc, ctx, cc->req); + r = crypt_convert_block(cc, ctx, this_cc->req); switch (r) { /* async */ @@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc, INIT_COMPLETION(ctx->restart); /* fall through*/ case -EINPROGRESS: - cc->req = NULL; + this_cc->req = NULL; ctx->sector++; continue; @@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io) * They must be separated as otherwise the final stages could be * starved by new requests which can block in the first stages due * to memory allocation. + * + * The work is done per CPU global for all dm-crypt instances. + * They should not depend on each other and do not block. */ static void crypt_endio(struct bio *clone, int error) { @@ -691,26 +991,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_destructor = dm_crypt_bio_destructor; } -static void kcryptd_io_read(struct dm_crypt_io *io) +static void kcryptd_unplug(struct crypt_config *cc) +{ + blk_unplug(bdev_get_queue(cc->dev->bdev)); +} + +static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) { struct crypt_config *cc = io->target->private; struct bio *base_bio = io->base_bio; struct bio *clone; - crypt_inc_pending(io); - /* * The block layer might modify the bvec array, so always * copy the required bvecs because we need the original * one in order to decrypt the whole bio data *afterwards*. */ - clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); - if (unlikely(!clone)) { - io->error = -ENOMEM; - crypt_dec_pending(io); - return; + clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); + if (!clone) { + kcryptd_unplug(cc); + return 1; } + crypt_inc_pending(io); + clone_init(io, clone); clone->bi_idx = 0; clone->bi_vcnt = bio_segments(base_bio); @@ -720,6 +1024,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io) sizeof(struct bio_vec) * clone->bi_vcnt); generic_make_request(clone); + return 0; } static void kcryptd_io_write(struct dm_crypt_io *io) @@ -732,9 +1037,12 @@ static void kcryptd_io(struct work_struct *work) { struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_io_read(io); - else + if (bio_data_dir(io->base_bio) == READ) { + crypt_inc_pending(io); + if (kcryptd_io_read(io, GFP_NOIO)) + io->error = -ENOMEM; + crypt_dec_pending(io); + } else kcryptd_io_write(io); } @@ -901,6 +1209,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, return; } + if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) + error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); + mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); if (!atomic_dec_and_test(&ctx->pending)) @@ -971,34 +1282,84 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) } } -static int crypt_set_key(struct crypt_config *cc, char *key) +static void crypt_free_tfms(struct crypt_config *cc, int cpu) { - unsigned key_size = strlen(key) >> 1; + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; - if (cc->key_size && cc->key_size != key_size) + for (i = 0; i < cc->tfms_count; i++) + if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { + crypto_free_ablkcipher(cpu_cc->tfms[i]); + cpu_cc->tfms[i] = NULL; + } +} + +static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) +{ + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; + int err; + + for (i = 0; i < cc->tfms_count; i++) { + cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); + if (IS_ERR(cpu_cc->tfms[i])) { + err = PTR_ERR(cpu_cc->tfms[i]); + crypt_free_tfms(cc, cpu); + return err; + } + } + + return 0; +} + +static int crypt_setkey_allcpus(struct crypt_config *cc) +{ + unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); + int cpu, err = 0, i, r; + + for_each_possible_cpu(cpu) { + for (i = 0; i < cc->tfms_count; i++) { + r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], + cc->key + (i * subkey_size), subkey_size); + if (r) + err = r; + } + } + + return err; +} + +static int crypt_set_key(struct crypt_config *cc, char *key) +{ + /* The key size may not be changed. */ + if (cc->key_size != (strlen(key) >> 1)) return -EINVAL; - cc->key_size = key_size; /* initial settings */ + /* Hyphen (which gives a key_size of zero) means there is no key. */ + if (!cc->key_size && strcmp(key, "-")) + return -EINVAL; - if ((!key_size && strcmp(key, "-")) || - (key_size && crypt_decode_key(cc->key, key, key_size) < 0)) + if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) return -EINVAL; set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); + return crypt_setkey_allcpus(cc); } static int crypt_wipe_key(struct crypt_config *cc) { clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); + + return crypt_setkey_allcpus(cc); } static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = ti->private; + struct crypt_cpu *cpu_cc; + int cpu; ti->private = NULL; @@ -1010,6 +1371,14 @@ static void crypt_dtr(struct dm_target *ti) if (cc->crypt_queue) destroy_workqueue(cc->crypt_queue); + if (cc->cpu) + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + if (cpu_cc->req) + mempool_free(cpu_cc->req, cc->req_pool); + crypt_free_tfms(cc, cpu); + } + if (cc->bs) bioset_free(cc->bs); @@ -1023,14 +1392,14 @@ static void crypt_dtr(struct dm_target *ti) if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); - if (cc->tfm && !IS_ERR(cc->tfm)) - crypto_free_ablkcipher(cc->tfm); - if (cc->dev) dm_put_device(ti, cc->dev); + if (cc->cpu) + free_percpu(cc->cpu); + kzfree(cc->cipher); - kzfree(cc->cipher_mode); + kzfree(cc->cipher_string); /* Must zero key material before freeing */ kzfree(cc); @@ -1040,9 +1409,9 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) { struct crypt_config *cc = ti->private; - char *tmp, *cipher, *chainmode, *ivmode, *ivopts; + char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; char *cipher_api = NULL; - int ret = -EINVAL; + int cpu, ret = -EINVAL; /* Convert to crypto api definition? */ if (strchr(cipher_in, '(')) { @@ -1050,23 +1419,31 @@ static int crypt_ctr_cipher(struct dm_target *ti, return -EINVAL; } + cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); + if (!cc->cipher_string) + goto bad_mem; + /* * Legacy dm-crypt cipher specification - * cipher-mode-iv:ivopts + * cipher[:keycount]-mode-iv:ivopts */ tmp = cipher_in; - cipher = strsep(&tmp, "-"); + keycount = strsep(&tmp, "-"); + cipher = strsep(&keycount, ":"); + + if (!keycount) + cc->tfms_count = 1; + else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || + !is_power_of_2(cc->tfms_count)) { + ti->error = "Bad cipher key count specification"; + return -EINVAL; + } + cc->key_parts = cc->tfms_count; cc->cipher = kstrdup(cipher, GFP_KERNEL); if (!cc->cipher) goto bad_mem; - if (tmp) { - cc->cipher_mode = kstrdup(tmp, GFP_KERNEL); - if (!cc->cipher_mode) - goto bad_mem; - } - chainmode = strsep(&tmp, "-"); ivopts = strsep(&tmp, "-"); ivmode = strsep(&ivopts, ":"); @@ -1074,10 +1451,19 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); - /* Compatibility mode for old dm-crypt mappings */ + cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + + cc->tfms_count * sizeof(*(cc->cpu->tfms)), + __alignof__(struct crypt_cpu)); + if (!cc->cpu) { + ti->error = "Cannot allocate per cpu state"; + goto bad_mem; + } + + /* + * For compatibility with the original dm-crypt mapping format, if + * only the cipher name is supplied, use cbc-plain. + */ if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { - kfree(cc->cipher_mode); - cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL); chainmode = "cbc"; ivmode = "plain"; } @@ -1099,11 +1485,12 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Allocate cipher */ - cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); - if (IS_ERR(cc->tfm)) { - ret = PTR_ERR(cc->tfm); - ti->error = "Error allocating crypto tfm"; - goto bad; + for_each_possible_cpu(cpu) { + ret = crypt_alloc_tfms(cc, cpu, cipher_api); + if (ret < 0) { + ti->error = "Error allocating crypto tfm"; + goto bad; + } } /* Initialize and set key */ @@ -1114,7 +1501,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Initialize IV */ - cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); + cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -1137,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti, cc->iv_gen_ops = &crypt_iv_benbi_ops; else if (strcmp(ivmode, "null") == 0) cc->iv_gen_ops = &crypt_iv_null_ops; - else { + else if (strcmp(ivmode, "lmk") == 0) { + cc->iv_gen_ops = &crypt_iv_lmk_ops; + /* Version 2 and 3 is recognised according + * to length of provided multi-key string. + * If present (version 3), last key is used as IV seed. + */ + if (cc->key_size % cc->key_parts) + cc->key_parts++; + } else { ret = -EINVAL; ti->error = "Invalid IV mode"; goto bad; @@ -1194,6 +1589,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate encryption context"; return -ENOMEM; } + cc->key_size = key_size; ti->private = cc; ret = crypt_ctr_cipher(ti, argv[0], argv[1]); @@ -1208,9 +1604,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); + cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & + cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & ~(crypto_tfm_ctx_alignment() - 1); cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + @@ -1219,7 +1615,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate crypt request mempool"; goto bad; } - cc->req = NULL; cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { @@ -1252,13 +1647,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->start = tmpll; ret = -ENOMEM; - cc->io_queue = create_singlethread_workqueue("kcryptd_io"); + cc->io_queue = alloc_workqueue("kcryptd_io", + WQ_NON_REENTRANT| + WQ_MEM_RECLAIM, + 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } - cc->crypt_queue = create_singlethread_workqueue("kcryptd"); + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_NON_REENTRANT| + WQ_CPU_INTENSIVE| + WQ_MEM_RECLAIM, + 1); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; goto bad; @@ -1286,9 +1688,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_queue_io(io); - else + if (bio_data_dir(io->base_bio) == READ) { + if (kcryptd_io_read(io, GFP_NOWAIT)) + kcryptd_queue_io(io); + } else kcryptd_queue_crypt(io); return DM_MAPIO_SUBMITTED; @@ -1306,10 +1709,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - if (cc->cipher_mode) - DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode); - else - DMEMIT("%s ", cc->cipher); + DMEMIT("%s ", cc->cipher_string); if (cc->key_size > 0) { if ((maxlen - sz) < ((cc->key_size << 1) + 1)) @@ -1421,7 +1821,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 7, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index baa11912cc9..f18375dcedd 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -352,7 +352,7 @@ static int __init dm_delay_init(void) { int r = -ENOMEM; - kdelayd_wq = create_workqueue("kdelayd"); + kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!kdelayd_wq) { DMERR("Couldn't start kdelayd"); goto bad_queue; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4b54618b415..6d12775a106 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -295,19 +295,55 @@ retry: DMWARN("remove_all left %d open device(s)", dev_skipped); } +/* + * Set the uuid of a hash_cell that isn't already set. + */ +static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) +{ + mutex_lock(&dm_hash_cells_mutex); + hc->uuid = new_uuid; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); +} + +/* + * Changes the name of a hash_cell and returns the old name for + * the caller to free. + */ +static char *__change_cell_name(struct hash_cell *hc, char *new_name) +{ + char *old_name; + + /* + * Rename and move the name cell. + */ + list_del(&hc->name_list); + old_name = hc->name; + + mutex_lock(&dm_hash_cells_mutex); + hc->name = new_name; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + return old_name; +} + static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, const char *new) { - char *new_name, *old_name; + char *new_data, *old_name = NULL; struct hash_cell *hc; struct dm_table *table; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; /* * duplicate new. */ - new_name = kstrdup(new, GFP_KERNEL); - if (!new_name) + new_data = kstrdup(new, GFP_KERNEL); + if (!new_data) return ERR_PTR(-ENOMEM); down_write(&_hash_lock); @@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, /* * Is new free ? */ - hc = __get_name_cell(new); + if (change_uuid) + hc = __get_uuid_cell(new); + else + hc = __get_name_cell(new); + if (hc) { - DMWARN("asked to rename to an already-existing name %s -> %s", + DMWARN("Unable to change %s on mapped device %s to one that " + "already exists: %s", + change_uuid ? "uuid" : "name", param->name, new); dm_put(hc->md); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-EBUSY); } @@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, */ hc = __get_name_cell(param->name); if (!hc) { - DMWARN("asked to rename a non-existent device %s -> %s", - param->name, new); + DMWARN("Unable to rename non-existent device, %s to %s%s", + param->name, change_uuid ? "uuid " : "", new); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-ENXIO); } /* - * rename and move the name cell. + * Does this device already have a uuid? */ - list_del(&hc->name_list); - old_name = hc->name; - mutex_lock(&dm_hash_cells_mutex); - hc->name = new_name; - mutex_unlock(&dm_hash_cells_mutex); - list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + if (change_uuid && hc->uuid) { + DMWARN("Unable to change uuid of mapped device %s to %s " + "because uuid is already set to %s", + param->name, new, hc->uuid); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EINVAL); + } + + if (change_uuid) + __set_cell_uuid(hc, new_data); + else + old_name = __change_cell_name(hc, new_data); /* * Wake up any dm event waiters. @@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) */ r = dm_lock_for_deletion(md); if (r) { - DMWARN("unable to remove open device %s", hc->name); + DMDEBUG_LIMIT("unable to remove open device %s", hc->name); up_write(&_hash_lock); dm_put(md); return r; @@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end) static int dev_rename(struct dm_ioctl *param, size_t param_size) { int r; - char *new_name = (char *) param + param->data_start; + char *new_data = (char *) param + param->data_start; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; - if (new_name < param->data || - invalid_str(new_name, (void *) param + param_size) || - strlen(new_name) > DM_NAME_LEN - 1) { - DMWARN("Invalid new logical volume name supplied."); + if (new_data < param->data || + invalid_str(new_data, (void *) param + param_size) || + strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { + DMWARN("Invalid new mapped device name or uuid string supplied."); return -EINVAL; } - r = check_name(new_name); - if (r) - return r; + if (!change_uuid) { + r = check_name(new_data); + if (r) + return r; + } - md = dm_hash_rename(param, new_name); + md = dm_hash_rename(param, new_data); if (IS_ERR(md)) return PTR_ERR(md); @@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index d8587bac568..924f5f0084c 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -37,6 +37,13 @@ struct dm_kcopyd_client { unsigned int nr_pages; unsigned int nr_free_pages; + /* + * Block devices to unplug. + * Non-NULL pointer means that a block device has some pending requests + * and needs to be unplugged. + */ + struct block_device *unplug[2]; + struct dm_io_client *io_client; wait_queue_head_t destroyq; @@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job) return 0; } +/* + * Unplug the block device at the specified index. + */ +static void unplug(struct dm_kcopyd_client *kc, int rw) +{ + if (kc->unplug[rw] != NULL) { + blk_unplug(bdev_get_queue(kc->unplug[rw])); + kc->unplug[rw] = NULL; + } +} + +/* + * Prepare block device unplug. If there's another device + * to be unplugged at the same array index, we unplug that + * device first. + */ +static void prepare_unplug(struct dm_kcopyd_client *kc, int rw, + struct block_device *bdev) +{ + if (likely(kc->unplug[rw] == bdev)) + return; + unplug(kc, rw); + kc->unplug[rw] = bdev; +} + static void complete_io(unsigned long error, void *context) { struct kcopyd_job *job = (struct kcopyd_job *) context; @@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, + .bi_rw = job->rw, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, @@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job) .client = job->kc->io_client, }; - if (job->rw == READ) + if (job->rw == READ) { r = dm_io(&io_req, 1, &job->source, NULL); - else + prepare_unplug(job->kc, READ, job->source.bdev); + } else { + if (job->num_dests > 1) + io_req.bi_rw |= REQ_UNPLUG; r = dm_io(&io_req, job->num_dests, job->dests, NULL); + if (!(io_req.bi_rw & REQ_UNPLUG)) + prepare_unplug(job->kc, WRITE, job->dests[0].bdev); + } return r; } @@ -435,10 +473,18 @@ static void do_work(struct work_struct *work) * Pages jobs when successful will jump onto the io jobs * list. io jobs call wake when they complete and it all * starts again. + * + * Note that io_jobs add block devices to the unplug array, + * this array is cleared with "unplug" calls. It is thus + * forbidden to run complete_jobs after io_jobs and before + * unplug because the block device could be destroyed in + * job completion callback. */ process_jobs(&kc->complete_jobs, kc, run_complete_job); process_jobs(&kc->pages_jobs, kc, run_pages_job); process_jobs(&kc->io_jobs, kc, run_io_job); + unplug(kc, READ); + unplug(kc, WRITE); } /* @@ -619,12 +665,15 @@ int dm_kcopyd_client_create(unsigned int nr_pages, INIT_LIST_HEAD(&kc->io_jobs); INIT_LIST_HEAD(&kc->pages_jobs); + memset(kc->unplug, 0, sizeof(kc->unplug)); + kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); if (!kc->job_pool) goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); + kc->kcopyd_wq = alloc_workqueue("kcopyd", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!kc->kcopyd_wq) goto bad_workqueue; diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 1ed0094f064..aa2e0c374ab 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -12,12 +12,22 @@ #include "dm-log-userspace-transfer.h" +#define DM_LOG_USERSPACE_VSN "1.1.0" + struct flush_entry { int type; region_t region; struct list_head list; }; +/* + * This limit on the number of mark and clear request is, to a degree, + * arbitrary. However, there is some basis for the choice in the limits + * imposed on the size of data payload by dm-log-userspace-transfer.c: + * dm_consult_userspace(). + */ +#define MAX_FLUSH_GROUP_COUNT 32 + struct log_c { struct dm_target *ti; uint32_t region_size; @@ -37,8 +47,15 @@ struct log_c { */ uint64_t in_sync_hint; + /* + * Mark and clear requests are held until a flush is issued + * so that we can group, and thereby limit, the amount of + * network traffic between kernel and userspace. The 'flush_lock' + * is used to protect these lists. + */ spinlock_t flush_lock; - struct list_head flush_list; /* only for clear and mark requests */ + struct list_head mark_list; + struct list_head clear_list; }; static mempool_t *flush_entry_pool; @@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, strncpy(lc->uuid, argv[0], DM_UUID_LEN); spin_lock_init(&lc->flush_lock); - INIT_LIST_HEAD(&lc->flush_list); + INIT_LIST_HEAD(&lc->mark_list); + INIT_LIST_HEAD(&lc->clear_list); str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); if (str_size < 0) { @@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, ctr_str, str_size, NULL, NULL); - if (r == -ESRCH) { - DMERR("Userspace log server not found"); + if (r < 0) { + if (r == -ESRCH) + DMERR("Userspace log server not found"); + else + DMERR("Userspace log server failed to create log"); goto out; } @@ -214,10 +235,9 @@ out: static void userspace_dtr(struct dm_dirty_log *log) { - int r; struct log_c *lc = log->context; - r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, + (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, NULL, 0, NULL, NULL); @@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region, return (r) ? 0 : (int)in_sync; } +static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + struct flush_entry *fe; + + list_for_each_entry(fe, flush_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + break; + } + + return r; +} + +static int flush_by_group(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + int count; + uint32_t type = 0; + struct flush_entry *fe, *tmp_fe; + LIST_HEAD(tmp_list); + uint64_t group[MAX_FLUSH_GROUP_COUNT]; + + /* + * Group process the requests + */ + while (!list_empty(flush_list)) { + count = 0; + + list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { + group[count] = fe->region; + count++; + + list_del(&fe->list); + list_add(&fe->list, &tmp_list); + + type = fe->type; + if (count >= MAX_FLUSH_GROUP_COUNT) + break; + } + + r = userspace_do_request(lc, lc->uuid, type, + (char *)(group), + count * sizeof(uint64_t), + NULL, NULL); + if (r) { + /* Group send failed. Attempt one-by-one. */ + list_splice_init(&tmp_list, flush_list); + r = flush_one_by_one(lc, flush_list); + break; + } + } + + /* + * Must collect flush_entrys that were successfully processed + * as a group so that they will be free'd by the caller. + */ + list_splice_init(&tmp_list, flush_list); + + return r; +} + /* * userspace_flush * @@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log) int r = 0; unsigned long flags; struct log_c *lc = log->context; - LIST_HEAD(flush_list); + LIST_HEAD(mark_list); + LIST_HEAD(clear_list); struct flush_entry *fe, *tmp_fe; spin_lock_irqsave(&lc->flush_lock, flags); - list_splice_init(&lc->flush_list, &flush_list); + list_splice_init(&lc->mark_list, &mark_list); + list_splice_init(&lc->clear_list, &clear_list); spin_unlock_irqrestore(&lc->flush_lock, flags); - if (list_empty(&flush_list)) + if (list_empty(&mark_list) && list_empty(&clear_list)) return 0; - /* - * FIXME: Count up requests, group request types, - * allocate memory to stick all requests in and - * send to server in one go. Failing the allocation, - * do it one by one. - */ + r = flush_by_group(lc, &mark_list); + if (r) + goto fail; - list_for_each_entry(fe, &flush_list, list) { - r = userspace_do_request(lc, lc->uuid, fe->type, - (char *)&fe->region, - sizeof(fe->region), - NULL, NULL); - if (r) - goto fail; - } + r = flush_by_group(lc, &clear_list); + if (r) + goto fail; r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); @@ -395,7 +474,11 @@ fail: * Calling code will receive an error and will know that * the log facility has failed. */ - list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { + list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { list_del(&fe->list); mempool_free(fe, flush_entry_pool); } @@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region) spin_lock_irqsave(&lc->flush_lock, flags); fe->type = DM_ULOG_MARK_REGION; fe->region = region; - list_add(&fe->list, &lc->flush_list); + list_add(&fe->list, &lc->mark_list); spin_unlock_irqrestore(&lc->flush_lock, flags); return; @@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region) spin_lock_irqsave(&lc->flush_lock, flags); fe->type = DM_ULOG_CLEAR_REGION; fe->region = region; - list_add(&fe->list, &lc->flush_list); + list_add(&fe->list, &lc->clear_list); spin_unlock_irqrestore(&lc->flush_lock, flags); return; @@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void) return r; } - DMINFO("version 1.0.0 loaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); return 0; } @@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void) dm_ulog_tfr_exit(); mempool_destroy(flush_entry_pool); - DMINFO("version 1.0.0 unloaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); return; } diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 075cbcf8a9f..049eaf12aaa 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -198,6 +198,7 @@ resend: memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->version = DM_ULOG_REQUEST_VERSION; tfr->luid = luid; tfr->seq = dm_ulog_seq++; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 33420e68d15..6951536ea29 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, r = PTR_ERR(lc->io_req.client); DMWARN("couldn't allocate disk io client"); kfree(lc); - return -ENOMEM; + return r; } lc->disk_header = vmalloc(buf_size); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 487ecda90ad..b82d28819e2 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -23,6 +23,8 @@ #define DM_MSG_PREFIX "multipath" #define MESG_STR(x) x, sizeof(x) +#define DM_PG_INIT_DELAY_MSECS 2000 +#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) /* Path properties */ struct pgpath { @@ -33,8 +35,7 @@ struct pgpath { unsigned fail_count; /* Cumulative failure count */ struct dm_path path; - struct work_struct deactivate_path; - struct work_struct activate_path; + struct delayed_work activate_path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -65,11 +66,15 @@ struct multipath { const char *hw_handler_name; char *hw_handler_params; + unsigned nr_priority_groups; struct list_head priority_groups; + + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned pg_init_required; /* pg_init needs calling? */ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned pg_init_delay_retry; /* Delay pg_init retry? */ unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; @@ -82,6 +87,7 @@ struct multipath { unsigned saved_queue_if_no_path;/* Saved state during suspension */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ + unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ struct work_struct process_queued_ios; struct list_head queued_ios; @@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void process_queued_ios(struct work_struct *work); static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); -static void deactivate_path(struct work_struct *work); /*----------------------------------------------- @@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = 1; - INIT_WORK(&pgpath->deactivate_path, deactivate_path); - INIT_WORK(&pgpath->activate_path, activate_path); + INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); } return pgpath; @@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath) kfree(pgpath); } -static void deactivate_path(struct work_struct *work) -{ - struct pgpath *pgpath = - container_of(work, struct pgpath, deactivate_path); - - blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue); -} - static struct priority_group *alloc_priority_group(void) { struct priority_group *pg; @@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) INIT_LIST_HEAD(&m->queued_ios); spin_lock_init(&m->lock); m->queue_io = 1; + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); @@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m) static void __pg_init_all_paths(struct multipath *m) { struct pgpath *pgpath; + unsigned long pg_init_delay = 0; m->pg_init_count++; m->pg_init_required = 0; + if (m->pg_init_delay_retry) + pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? + m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { /* Skip failed paths */ if (!pgpath->is_active) continue; - if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, + pg_init_delay)) m->pg_init_in_progress++; } } @@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m) const char *param_name; static struct param _params[] = { - {0, 3, "invalid number of feature args"}, + {0, 5, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, + {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; r = read_param(_params, shift(as), &argc, &ti->error); @@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m) continue; } + if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && + (argc >= 1)) { + r = read_param(_params + 2, shift(as), + &m->pg_init_delay_msecs, &ti->error); + argc--; + continue; + } + ti->error = "Unrecognised multipath feature request"; r = -EINVAL; } while (argc && !r); @@ -931,7 +942,7 @@ static void flush_multipath_work(struct multipath *m) flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); - flush_scheduled_work(); + flush_work_sync(&m->trigger_event); } static void multipath_dtr(struct dm_target *ti) @@ -995,7 +1006,6 @@ static int fail_path(struct pgpath *pgpath) pgpath->path.dev->name, m->nr_valid_paths); schedule_work(&m->trigger_event); - queue_work(kmultipathd, &pgpath->deactivate_path); out: spin_unlock_irqrestore(&m->lock, flags); @@ -1034,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath) m->current_pgpath = NULL; queue_work(kmultipathd, &m->process_queued_ios); } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { - if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) m->pg_init_in_progress++; } @@ -1169,6 +1179,7 @@ static void pg_init_done(void *data, int errors) struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; + unsigned delay_retry = 0; /* device or driver problems */ switch (errors) { @@ -1193,8 +1204,9 @@ static void pg_init_done(void *data, int errors) */ bypass_pg(m, pg, 1); break; - /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */ case SCSI_DH_RETRY: + /* Wait before retrying. */ + delay_retry = 1; case SCSI_DH_IMM_RETRY: case SCSI_DH_RES_TEMP_UNAVAIL: if (pg_init_limit_reached(m, pgpath)) @@ -1227,6 +1239,7 @@ static void pg_init_done(void *data, int errors) if (!m->pg_init_required) m->queue_io = 0; + m->pg_init_delay_retry = delay_retry; queue_work(kmultipathd, &m->process_queued_ios); /* @@ -1241,7 +1254,7 @@ out: static void activate_path(struct work_struct *work) { struct pgpath *pgpath = - container_of(work, struct pgpath, activate_path); + container_of(work, struct pgpath, activate_path.work); scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), pg_init_done, pgpath); @@ -1382,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); else { DMEMIT("%u ", m->queue_if_no_path + - (m->pg_init_retries > 0) * 2); + (m->pg_init_retries > 0) * 2 + + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); if (m->queue_if_no_path) DMEMIT("queue_if_no_path "); if (m->pg_init_retries) DMEMIT("pg_init_retries %u ", m->pg_init_retries); + if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) + DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); } if (!m->hw_handler_name || type == STATUSTYPE_INFO) @@ -1655,7 +1671,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, @@ -1687,7 +1703,7 @@ static int __init dm_multipath_init(void) return -EINVAL; } - kmultipathd = create_workqueue("kmpathd"); + kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); if (!kmultipathd) { DMERR("failed to create workqueue kmpathd"); dm_unregister_target(&multipath_target); @@ -1701,7 +1717,8 @@ static int __init dm_multipath_init(void) * old workqueue would also create a bottleneck in the * path of the storage hardware device activation. */ - kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); + kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", + WQ_MEM_RECLAIM); if (!kmpath_handlerd) { DMERR("failed to create workqueue kmpath_handlerd"); destroy_workqueue(kmultipathd); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c new file mode 100644 index 00000000000..b9e1e15ef11 --- /dev/null +++ b/drivers/md/dm-raid.c @@ -0,0 +1,697 @@ +/* + * Copyright (C) 2010-2011 Neil Brown + * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include <linux/slab.h> + +#include "md.h" +#include "raid5.h" +#include "dm.h" +#include "bitmap.h" + +#define DM_MSG_PREFIX "raid" + +/* + * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then + * make it so the flag doesn't set anything. + */ +#ifndef MD_SYNC_STATE_FORCED +#define MD_SYNC_STATE_FORCED 0 +#endif + +struct raid_dev { + /* + * Two DM devices, one to hold metadata and one to hold the + * actual data/parity. The reason for this is to not confuse + * ti->len and give more flexibility in altering size and + * characteristics. + * + * While it is possible for this device to be associated + * with a different physical device than the data_dev, it + * is intended for it to be the same. + * |--------- Physical Device ---------| + * |- meta_dev -|------ data_dev ------| + */ + struct dm_dev *meta_dev; + struct dm_dev *data_dev; + struct mdk_rdev_s rdev; +}; + +/* + * Flags for rs->print_flags field. + */ +#define DMPF_DAEMON_SLEEP 0x1 +#define DMPF_MAX_WRITE_BEHIND 0x2 +#define DMPF_SYNC 0x4 +#define DMPF_NOSYNC 0x8 +#define DMPF_STRIPE_CACHE 0x10 +#define DMPF_MIN_RECOVERY_RATE 0x20 +#define DMPF_MAX_RECOVERY_RATE 0x40 + +struct raid_set { + struct dm_target *ti; + + uint64_t print_flags; + + struct mddev_s md; + struct raid_type *raid_type; + struct dm_target_callbacks callbacks; + + struct raid_dev dev[0]; +}; + +/* Supported raid types and properties. */ +static struct raid_type { + const char *name; /* RAID algorithm. */ + const char *descr; /* Descriptor text for logging. */ + const unsigned parity_devs; /* # of parity devices. */ + const unsigned minimal_devs; /* minimal # of devices in set. */ + const unsigned level; /* RAID level. */ + const unsigned algorithm; /* RAID algorithm. */ +} raid_types[] = { + {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, + {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, + {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, + {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, + {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, + {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, + {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, + {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} +}; + +static struct raid_type *get_raid_type(char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(raid_types); i++) + if (!strcmp(raid_types[i].name, name)) + return &raid_types[i]; + + return NULL; +} + +static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) +{ + unsigned i; + struct raid_set *rs; + sector_t sectors_per_dev; + + if (raid_devs <= raid_type->parity_devs) { + ti->error = "Insufficient number of devices"; + return ERR_PTR(-EINVAL); + } + + sectors_per_dev = ti->len; + if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { + ti->error = "Target length not divisible by number of data devices"; + return ERR_PTR(-EINVAL); + } + + rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); + if (!rs) { + ti->error = "Cannot allocate raid context"; + return ERR_PTR(-ENOMEM); + } + + mddev_init(&rs->md); + + rs->ti = ti; + rs->raid_type = raid_type; + rs->md.raid_disks = raid_devs; + rs->md.level = raid_type->level; + rs->md.new_level = rs->md.level; + rs->md.dev_sectors = sectors_per_dev; + rs->md.layout = raid_type->algorithm; + rs->md.new_layout = rs->md.layout; + rs->md.delta_disks = 0; + rs->md.recovery_cp = 0; + + for (i = 0; i < raid_devs; i++) + md_rdev_init(&rs->dev[i].rdev); + + /* + * Remaining items to be initialized by further RAID params: + * rs->md.persistent + * rs->md.external + * rs->md.chunk_sectors + * rs->md.new_chunk_sectors + */ + + return rs; +} + +static void context_free(struct raid_set *rs) +{ + int i; + + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev) + dm_put_device(rs->ti, rs->dev[i].data_dev); + + kfree(rs); +} + +/* + * For every device we have two words + * <meta_dev>: meta device name or '-' if missing + * <data_dev>: data device name or '-' if missing + * + * This code parses those words. + */ +static int dev_parms(struct raid_set *rs, char **argv) +{ + int i; + int rebuild = 0; + int metadata_available = 0; + int ret = 0; + + for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { + rs->dev[i].rdev.raid_disk = i; + + rs->dev[i].meta_dev = NULL; + rs->dev[i].data_dev = NULL; + + /* + * There are no offsets, since there is a separate device + * for data and metadata. + */ + rs->dev[i].rdev.data_offset = 0; + rs->dev[i].rdev.mddev = &rs->md; + + if (strcmp(argv[0], "-")) { + rs->ti->error = "Metadata devices not supported"; + return -EINVAL; + } + + if (!strcmp(argv[1], "-")) { + if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && + (!rs->dev[i].rdev.recovery_offset)) { + rs->ti->error = "Drive designated for rebuild not specified"; + return -EINVAL; + } + + continue; + } + + ret = dm_get_device(rs->ti, argv[1], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].data_dev); + if (ret) { + rs->ti->error = "RAID device lookup failure"; + return ret; + } + + rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; + list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild++; + } + + if (metadata_available) { + rs->md.external = 0; + rs->md.persistent = 1; + rs->md.major_version = 2; + } else if (rebuild && !rs->md.recovery_cp) { + /* + * Without metadata, we will not be able to tell if the array + * is in-sync or not - we must assume it is not. Therefore, + * it is impossible to rebuild a drive. + * + * Even if there is metadata, the on-disk information may + * indicate that the array is not in-sync and it will then + * fail at that time. + * + * User could specify 'nosync' option if desperate. + */ + DMERR("Unable to rebuild drive while array is not in-sync"); + rs->ti->error = "RAID device lookup failure"; + return -EINVAL; + } + + return 0; +} + +/* + * Possible arguments are... + * RAID456: + * <chunk_size> [optional_args] + * + * Optional args: + * [[no]sync] Force or prevent recovery of the entire array + * [rebuild <idx>] Rebuild the drive indicated by the index + * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits + * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) + * [stripe_cache <sectors>] Stripe cache size for higher RAIDs + */ +static int parse_raid_params(struct raid_set *rs, char **argv, + unsigned num_raid_params) +{ + unsigned i, rebuild_cnt = 0; + unsigned long value; + char *key; + + /* + * First, parse the in-order required arguments + */ + if ((strict_strtoul(argv[0], 10, &value) < 0) || + !is_power_of_2(value) || (value < 8)) { + rs->ti->error = "Bad chunk size"; + return -EINVAL; + } + + rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; + argv++; + num_raid_params--; + + /* + * Second, parse the unordered optional arguments + */ + for (i = 0; i < rs->md.raid_disks; i++) + set_bit(In_sync, &rs->dev[i].rdev.flags); + + for (i = 0; i < num_raid_params; i++) { + if (!strcmp(argv[i], "nosync")) { + rs->md.recovery_cp = MaxSector; + rs->print_flags |= DMPF_NOSYNC; + rs->md.flags |= MD_SYNC_STATE_FORCED; + continue; + } + if (!strcmp(argv[i], "sync")) { + rs->md.recovery_cp = 0; + rs->print_flags |= DMPF_SYNC; + rs->md.flags |= MD_SYNC_STATE_FORCED; + continue; + } + + /* The rest of the optional arguments come in key/value pairs */ + if ((i + 1) >= num_raid_params) { + rs->ti->error = "Wrong number of raid parameters given"; + return -EINVAL; + } + + key = argv[i++]; + if (strict_strtoul(argv[i], 10, &value) < 0) { + rs->ti->error = "Bad numerical argument given in raid params"; + return -EINVAL; + } + + if (!strcmp(key, "rebuild")) { + if (++rebuild_cnt > rs->raid_type->parity_devs) { + rs->ti->error = "Too many rebuild drives given"; + return -EINVAL; + } + if (value > rs->md.raid_disks) { + rs->ti->error = "Invalid rebuild index given"; + return -EINVAL; + } + clear_bit(In_sync, &rs->dev[value].rdev.flags); + rs->dev[value].rdev.recovery_offset = 0; + } else if (!strcmp(key, "max_write_behind")) { + rs->print_flags |= DMPF_MAX_WRITE_BEHIND; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + if (value > COUNTER_MAX) { + rs->ti->error = "Max write-behind limit out of range"; + return -EINVAL; + } + rs->md.bitmap_info.max_write_behind = value; + } else if (!strcmp(key, "daemon_sleep")) { + rs->print_flags |= DMPF_DAEMON_SLEEP; + if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { + rs->ti->error = "daemon sleep period out of range"; + return -EINVAL; + } + rs->md.bitmap_info.daemon_sleep = value; + } else if (!strcmp(key, "stripe_cache")) { + rs->print_flags |= DMPF_STRIPE_CACHE; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + + if (rs->raid_type->level < 5) { + rs->ti->error = "Inappropriate argument: stripe_cache"; + return -EINVAL; + } + if (raid5_set_cache_size(&rs->md, (int)value)) { + rs->ti->error = "Bad stripe_cache size"; + return -EINVAL; + } + } else if (!strcmp(key, "min_recovery_rate")) { + rs->print_flags |= DMPF_MIN_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "min_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_min = (int)value; + } else if (!strcmp(key, "max_recovery_rate")) { + rs->print_flags |= DMPF_MAX_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "max_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_max = (int)value; + } else { + DMERR("Unable to parse RAID parameter: %s", key); + rs->ti->error = "Unable to parse RAID parameters"; + return -EINVAL; + } + } + + /* Assume there are no metadata devices until the drives are parsed */ + rs->md.persistent = 0; + rs->md.external = 1; + + return 0; +} + +static void do_table_event(struct work_struct *ws) +{ + struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); + + dm_table_event(rs->ti->table); +} + +static int raid_is_congested(struct dm_target_callbacks *cb, int bits) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + return md_raid5_congested(&rs->md, bits); +} + +static void raid_unplug(struct dm_target_callbacks *cb) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + md_raid5_unplug_device(rs->md.private); +} + +/* + * Construct a RAID4/5/6 mapping: + * Args: + * <raid_type> <#raid_params> <raid_params> \ + * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } + * + * ** metadata devices are not supported yet, use '-' instead ** + * + * <raid_params> varies by <raid_type>. See 'parse_raid_params' for + * details on possible <raid_params>. + */ +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int ret; + struct raid_type *rt; + unsigned long num_raid_params, num_raid_devs; + struct raid_set *rs = NULL; + + /* Must have at least <raid_type> <#raid_params> */ + if (argc < 2) { + ti->error = "Too few arguments"; + return -EINVAL; + } + + /* raid type */ + rt = get_raid_type(argv[0]); + if (!rt) { + ti->error = "Unrecognised raid_type"; + return -EINVAL; + } + argc--; + argv++; + + /* number of RAID parameters */ + if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { + ti->error = "Cannot understand number of RAID parameters"; + return -EINVAL; + } + argc--; + argv++; + + /* Skip over RAID params for now and find out # of devices */ + if (num_raid_params + 1 > argc) { + ti->error = "Arguments do not agree with counts given"; + return -EINVAL; + } + + if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || + (num_raid_devs >= INT_MAX)) { + ti->error = "Cannot understand number of raid devices"; + return -EINVAL; + } + + rs = context_alloc(ti, rt, (unsigned)num_raid_devs); + if (IS_ERR(rs)) + return PTR_ERR(rs); + + ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); + if (ret) + goto bad; + + ret = -EINVAL; + + argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ + argv += num_raid_params + 1; + + if (argc != (num_raid_devs * 2)) { + ti->error = "Supplied RAID devices does not match the count given"; + goto bad; + } + + ret = dev_parms(rs, argv); + if (ret) + goto bad; + + INIT_WORK(&rs->md.event_work, do_table_event); + ti->split_io = rs->md.chunk_sectors; + ti->private = rs; + + mutex_lock(&rs->md.reconfig_mutex); + ret = md_run(&rs->md); + rs->md.in_sync = 0; /* Assume already marked dirty */ + mutex_unlock(&rs->md.reconfig_mutex); + + if (ret) { + ti->error = "Fail to run raid array"; + goto bad; + } + + rs->callbacks.congested_fn = raid_is_congested; + rs->callbacks.unplug_fn = raid_unplug; + dm_table_add_target_callbacks(ti->table, &rs->callbacks); + + return 0; + +bad: + context_free(rs); + + return ret; +} + +static void raid_dtr(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + list_del_init(&rs->callbacks.list); + md_stop(&rs->md); + context_free(rs); +} + +static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) +{ + struct raid_set *rs = ti->private; + mddev_t *mddev = &rs->md; + + mddev->pers->make_request(mddev, bio); + + return DM_MAPIO_SUBMITTED; +} + +static int raid_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + struct raid_set *rs = ti->private; + unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ + unsigned sz = 0; + int i; + sector_t sync; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); + + for (i = 0; i < rs->md.raid_disks; i++) { + if (test_bit(Faulty, &rs->dev[i].rdev.flags)) + DMEMIT("D"); + else if (test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT("A"); + else + DMEMIT("a"); + } + + if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) + sync = rs->md.curr_resync_completed; + else + sync = rs->md.recovery_cp; + + if (sync > rs->md.resync_max_sectors) + sync = rs->md.resync_max_sectors; + + DMEMIT(" %llu/%llu", + (unsigned long long) sync, + (unsigned long long) rs->md.resync_max_sectors); + + break; + case STATUSTYPE_TABLE: + /* The string you would use to construct this array */ + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + raid_param_cnt++; /* for rebuilds */ + + raid_param_cnt += (hweight64(rs->print_flags) * 2); + if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) + raid_param_cnt--; + + DMEMIT("%s %u %u", rs->raid_type->name, + raid_param_cnt, rs->md.chunk_sectors); + + if ((rs->print_flags & DMPF_SYNC) && + (rs->md.recovery_cp == MaxSector)) + DMEMIT(" sync"); + if (rs->print_flags & DMPF_NOSYNC) + DMEMIT(" nosync"); + + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT(" rebuild %u", i); + + if (rs->print_flags & DMPF_DAEMON_SLEEP) + DMEMIT(" daemon_sleep %lu", + rs->md.bitmap_info.daemon_sleep); + + if (rs->print_flags & DMPF_MIN_RECOVERY_RATE) + DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min); + + if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) + DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); + + if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) + DMEMIT(" max_write_behind %lu", + rs->md.bitmap_info.max_write_behind); + + if (rs->print_flags & DMPF_STRIPE_CACHE) { + raid5_conf_t *conf = rs->md.private; + + /* convert from kiB to sectors */ + DMEMIT(" stripe_cache %d", + conf ? conf->max_nr_stripes * 2 : 0); + } + + DMEMIT(" %d", rs->md.raid_disks); + for (i = 0; i < rs->md.raid_disks; i++) { + DMEMIT(" -"); /* metadata device */ + + if (rs->dev[i].data_dev) + DMEMIT(" %s", rs->dev[i].data_dev->name); + else + DMEMIT(" -"); + } + } + + return 0; +} + +static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +{ + struct raid_set *rs = ti->private; + unsigned i; + int ret = 0; + + for (i = 0; !ret && i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev) + ret = fn(ti, + rs->dev[i].data_dev, + 0, /* No offset on data devs */ + rs->md.dev_sectors, + data); + + return ret; +} + +static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct raid_set *rs = ti->private; + unsigned chunk_size = rs->md.chunk_sectors << 9; + raid5_conf_t *conf = rs->md.private; + + blk_limits_io_min(limits, chunk_size); + blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); +} + +static void raid_presuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + md_stop_writes(&rs->md); +} + +static void raid_postsuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_suspend(&rs->md); +} + +static void raid_resume(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_resume(&rs->md); +} + +static struct target_type raid_target = { + .name = "raid", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = raid_ctr, + .dtr = raid_dtr, + .map = raid_map, + .status = raid_status, + .iterate_devices = raid_iterate_devices, + .io_hints = raid_io_hints, + .presuspend = raid_presuspend, + .postsuspend = raid_postsuspend, + .resume = raid_resume, +}; + +static int __init dm_raid_init(void) +{ + return dm_register_target(&raid_target); +} + +static void __exit dm_raid_exit(void) +{ + dm_unregister_target(&raid_target); +} + +module_init(dm_raid_init); +module_exit(dm_raid_exit); + +MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); +MODULE_ALIAS("dm-raid4"); +MODULE_ALIAS("dm-raid5"); +MODULE_ALIAS("dm-raid6"); +MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 19a59b041c2..dee326775c6 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti) struct dm_io_request io_req = { .bi_rw = WRITE_FLUSH, .mem.type = DM_IO_KMEM, - .mem.ptr.bvec = NULL, + .mem.ptr.addr = NULL, .client = ms->io_client, }; @@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio) .client = ms->io_client, }; + if (bio->bi_rw & REQ_DISCARD) { + io_req.bi_rw |= REQ_DISCARD; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = NULL; + } + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) map_region(dest++, m, bio); @@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if (bio->bi_rw & REQ_FLUSH) { + if ((bio->bi_rw & REQ_FLUSH) || + (bio->bi_rw & REQ_DISCARD)) { bio_list_add(&sync, bio); continue; } @@ -1076,8 +1083,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = ms; ti->split_io = dm_rh_get_region_size(ms->rh); ti->num_flush_requests = 1; + ti->num_discard_requests = 1; - ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); + ms->kmirrord_wq = alloc_workqueue("kmirrord", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { DMERR("couldn't start kmirrord"); r = -ENOMEM; @@ -1130,7 +1139,7 @@ static void mirror_dtr(struct dm_target *ti) del_timer_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); - flush_scheduled_work(); + flush_work_sync(&ms->trigger_event); dm_kcopyd_client_destroy(ms->kcopyd_client); destroy_workqueue(ms->kmirrord_wq); free_context(ms, ti, ms->nr_mirrors); @@ -1406,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 12, 0}, + .version = {1, 12, 1}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2129cdb115d..95891dfcbca 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, */ INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); - flush_workqueue(ps->metadata_wq); + flush_work(&req.work); return req.result; } @@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store, atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; - ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); + ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); if (!ps->metadata_wq) { kfree(ps); DMERR("couldn't start header metadata update thread"); diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 53cf79d8bcb..fdde53cd12b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -19,7 +19,6 @@ #include <linux/vmalloc.h> #include <linux/log2.h> #include <linux/dm-kcopyd.h> -#include <linux/workqueue.h> #include "dm-exception-store.h" @@ -80,9 +79,6 @@ struct dm_snapshot { /* Origin writes don't trigger exceptions until this is set */ int active; - /* Whether or not owning mapped_device is suspended */ - int suspended; - atomic_t pending_exceptions_count; mempool_t *pending_pool; @@ -106,10 +102,6 @@ struct dm_snapshot { struct dm_kcopyd_client *kcopyd_client; - /* Queue of snapshot writes for ksnapd to flush */ - struct bio_list queued_bios; - struct work_struct queued_bios_work; - /* Wait for events based on state_bits */ unsigned long state_bits; @@ -160,9 +152,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s) } EXPORT_SYMBOL(dm_snap_cow); -static struct workqueue_struct *ksnapd; -static void flush_queued_bios(struct work_struct *work); - static sector_t chunk_to_sector(struct dm_exception_store *store, chunk_t chunk) { @@ -1110,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->ti = ti; s->valid = 1; s->active = 0; - s->suspended = 0; atomic_set(&s->pending_exceptions_count, 0); init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); @@ -1153,9 +1141,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&s->tracked_chunk_lock); - bio_list_init(&s->queued_bios); - INIT_WORK(&s->queued_bios_work, flush_queued_bios); - ti->private = s; ti->num_flush_requests = num_flush_requests; @@ -1279,8 +1264,6 @@ static void snapshot_dtr(struct dm_target *ti) struct dm_snapshot *s = ti->private; struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; - flush_workqueue(ksnapd); - down_read(&_origins_lock); /* Check whether exception handover must be cancelled */ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); @@ -1342,20 +1325,6 @@ static void flush_bios(struct bio *bio) } } -static void flush_queued_bios(struct work_struct *work) -{ - struct dm_snapshot *s = - container_of(work, struct dm_snapshot, queued_bios_work); - struct bio *queued_bios; - unsigned long flags; - - spin_lock_irqsave(&s->pe_lock, flags); - queued_bios = bio_list_get(&s->queued_bios); - spin_unlock_irqrestore(&s->pe_lock, flags); - - flush_bios(queued_bios); -} - static int do_origin(struct dm_dev *origin, struct bio *bio); /* @@ -1760,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti) stop_merge(s); } -static void snapshot_postsuspend(struct dm_target *ti) -{ - struct dm_snapshot *s = ti->private; - - down_write(&s->lock); - s->suspended = 1; - up_write(&s->lock); -} - static int snapshot_preresume(struct dm_target *ti) { int r = 0; @@ -1783,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti) DMERR("Unable to resume snapshot source until " "handover completes."); r = -EINVAL; - } else if (!snap_src->suspended) { + } else if (!dm_suspended(snap_src->ti)) { DMERR("Unable to perform snapshot handover until " "source is suspended."); r = -EINVAL; @@ -1816,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti) down_write(&s->lock); s->active = 1; - s->suspended = 0; up_write(&s->lock); } @@ -2194,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 7, 0}, + .version = {1, 7, 1}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -2207,13 +2166,12 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_map, .end_io = snapshot_end_io, - .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, .resume = snapshot_resume, .status = snapshot_status, @@ -2222,14 +2180,13 @@ static struct target_type snapshot_target = { static struct target_type merge_target = { .name = dm_snapshot_merge_target_name, - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_merge_map, .end_io = snapshot_end_io, .presuspend = snapshot_merge_presuspend, - .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, .resume = snapshot_merge_resume, .status = snapshot_status, @@ -2291,17 +2248,8 @@ static int __init dm_snapshot_init(void) goto bad_tracked_chunk_cache; } - ksnapd = create_singlethread_workqueue("ksnapd"); - if (!ksnapd) { - DMERR("Failed to create ksnapd workqueue."); - r = -ENOMEM; - goto bad_pending_pool; - } - return 0; -bad_pending_pool: - kmem_cache_destroy(tracked_chunk_cache); bad_tracked_chunk_cache: kmem_cache_destroy(pending_cache); bad_pending_cache: @@ -2322,8 +2270,6 @@ bad_register_snapshot_target: static void __exit dm_snapshot_exit(void) { - destroy_workqueue(ksnapd); - dm_unregister_target(&snapshot_target); dm_unregister_target(&origin_target); dm_unregister_target(&merge_target); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index f0371b4c4fb..dddfa14f298 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -39,23 +39,20 @@ struct stripe_c { struct dm_target *ti; /* Work struct used for triggering events*/ - struct work_struct kstriped_ws; + struct work_struct trigger_event; struct stripe stripe[0]; }; -static struct workqueue_struct *kstriped; - /* * An event is triggered whenever a drive * drops out of a stripe volume. */ static void trigger_event(struct work_struct *work) { - struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); - + struct stripe_c *sc = container_of(work, struct stripe_c, + trigger_event); dm_table_event(sc->ti->table); - } static inline struct stripe_c *alloc_context(unsigned int stripes) @@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - INIT_WORK(&sc->kstriped_ws, trigger_event); + INIT_WORK(&sc->trigger_event, trigger_event); /* Set pointer to dm target; used in trigger_event */ sc->ti = ti; @@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti) for (i = 0; i < sc->stripes; i++) dm_put_device(ti, sc->stripe[i].dev); - flush_workqueue(kstriped); + flush_work_sync(&sc->trigger_event); kfree(sc); } @@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, atomic_inc(&(sc->stripe[i].error_count)); if (atomic_read(&(sc->stripe[i].error_count)) < DM_IO_ERROR_THRESHOLD) - queue_work(kstriped, &sc->kstriped_ws); + schedule_work(&sc->trigger_event); } return error; @@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti, static struct target_type stripe_target = { .name = "striped", - .version = {1, 3, 0}, + .version = {1, 3, 1}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, @@ -422,20 +419,10 @@ int __init dm_stripe_init(void) return r; } - kstriped = create_singlethread_workqueue("kstriped"); - if (!kstriped) { - DMERR("failed to create workqueue kstriped"); - dm_unregister_target(&stripe_target); - return -ENOMEM; - } - return r; } void dm_stripe_exit(void) { dm_unregister_target(&stripe_target); - destroy_workqueue(kstriped); - - return; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 985c20a4f30..38e4eb1bb96 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -71,6 +71,8 @@ struct dm_table { void *event_context; struct dm_md_mempools *mempools; + + struct list_head target_callbacks; }; /* @@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); + INIT_LIST_HEAD(&t->target_callbacks); atomic_set(&t->holders, 0); t->discards_supported = 1; @@ -347,6 +350,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) if (!d->dm_dev.bdev) return; + bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md)); blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); d->dm_dev.bdev = NULL; } @@ -1225,10 +1229,17 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) +{ + list_add(&cb->list, &t->target_callbacks); +} +EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); + int dm_table_any_congested(struct dm_table *t, int bdi_bits) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; int r = 0; list_for_each_entry(dd, devices, list) { @@ -1243,6 +1254,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) bdevname(dd->dm_dev.bdev, b)); } + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->congested_fn) + r |= cb->congested_fn(cb, bdi_bits); + return r; } @@ -1264,6 +1279,7 @@ void dm_table_unplug_all(struct dm_table *t) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); @@ -1276,6 +1292,10 @@ void dm_table_unplug_all(struct dm_table *t) dm_device_name(t->md), bdevname(dd->dm_dev.bdev, b)); } + + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->unplug_fn) + cb->unplug_fn(cb); } struct mapped_device *dm_table_get_md(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f48a2f359ac..eaa3af0e063 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -32,7 +32,6 @@ #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" #define DM_COOKIE_LENGTH 24 -static DEFINE_MUTEX(dm_mutex); static const char *_name = DM_NAME; static unsigned int major = 0; @@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) { struct mapped_device *md; - mutex_lock(&dm_mutex); spin_lock(&_minor_lock); md = bdev->bd_disk->private_data; @@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) out: spin_unlock(&_minor_lock); - mutex_unlock(&dm_mutex); return md ? 0 : -ENXIO; } @@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode) { struct mapped_device *md = disk->private_data; - mutex_lock(&dm_mutex); + spin_lock(&_minor_lock); + atomic_dec(&md->open_count); dm_put(md); - mutex_unlock(&dm_mutex); + + spin_unlock(&_minor_lock); return 0; } @@ -1638,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q) if (map_request(ti, clone, md)) goto requeued; - spin_lock_irq(q->queue_lock); + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); } goto out; requeued: - spin_lock_irq(q->queue_lock); + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); plug_and_out: if (!elv_queue_empty(q)) @@ -1884,7 +1885,8 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = create_singlethread_workqueue("kdmflush"); + md->wq = alloc_workqueue("kdmflush", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!md->wq) goto bad_thread; @@ -1992,13 +1994,14 @@ static void event_callback(void *context) wake_up(&md->eventq); } +/* + * Protected by md->suspend_lock obtained by dm_swap_table(). + */ static void __set_size(struct mapped_device *md, sector_t size) { set_capacity(md->disk, size); - mutex_lock(&md->bdev->bd_inode->i_mutex); i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - mutex_unlock(&md->bdev->bd_inode->i_mutex); } /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 7fc090ac9e2..0cc30ecda4c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -287,11 +287,14 @@ static int md_make_request(struct request_queue *q, struct bio *bio) mddev_t *mddev = q->queuedata; int rv; int cpu; + unsigned int sectors; - if (mddev == NULL || mddev->pers == NULL) { + if (mddev == NULL || mddev->pers == NULL + || !mddev->ready) { bio_io_error(bio); return 0; } + smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); if (mddev->suspended) { DEFINE_WAIT(__wait); @@ -309,12 +312,16 @@ static int md_make_request(struct request_queue *q, struct bio *bio) atomic_inc(&mddev->active_io); rcu_read_unlock(); + /* + * save the sectors now since our bio can + * go away inside make_request + */ + sectors = bio_sectors(bio); rv = mddev->pers->make_request(mddev, bio); cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); part_stat_unlock(); if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) @@ -703,9 +710,9 @@ static struct mdk_personality *find_pers(int level, char *clevel) } /* return the offset of the super block in 512byte sectors */ -static inline sector_t calc_dev_sboffset(struct block_device *bdev) +static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev) { - sector_t num_sectors = i_size_read(bdev->bd_inode) / 512; + sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; return MD_NEW_SIZE_SECTORS(num_sectors); } @@ -763,7 +770,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, */ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); - bio->bi_bdev = rdev->bdev; + bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; @@ -793,7 +800,7 @@ static void bi_complete(struct bio *bio, int error) } int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, - struct page *page, int rw) + struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); struct completion event; @@ -801,8 +808,12 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, rw |= REQ_SYNC | REQ_UNPLUG; - bio->bi_bdev = rdev->bdev; - bio->bi_sector = sector; + bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? + rdev->meta_bdev : rdev->bdev; + if (metadata_op) + bio->bi_sector = sector + rdev->sb_start; + else + bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); init_completion(&event); bio->bi_private = &event; @@ -827,7 +838,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) return 0; - if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ)) + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) goto fail; rdev->sb_loaded = 1; return 0; @@ -989,7 +1000,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version * * It also happens to be a multiple of 4Kb. */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; @@ -1330,7 +1341,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -1906,6 +1917,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) MD_BUG(); return; } + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; @@ -1940,8 +1952,6 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) __bdevname(dev, b)); return PTR_ERR(bdev); } - if (!shared) - set_bit(AllReserved, &rdev->flags); rdev->bdev = bdev; return err; } @@ -2458,6 +2468,9 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev->raid_disk != -1) return -EBUSY; + if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) + return -EBUSY; + if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; @@ -2465,6 +2478,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev2->raid_disk == slot) return -EEXIST; + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) + return -ENOSPC; + rdev->raid_disk = slot; if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = slot; @@ -2482,7 +2499,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. */ } else { - if (slot >= rdev->mddev->raid_disks) + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; /* assume it is working */ @@ -2598,12 +2616,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) mddev_lock(mddev); list_for_each_entry(rdev2, &mddev->disks, same_set) - if (test_bit(AllReserved, &rdev2->flags) || - (rdev->bdev == rdev2->bdev && - rdev != rdev2 && - overlaps(rdev->data_offset, rdev->sectors, - rdev2->data_offset, - rdev2->sectors))) { + if (rdev->bdev == rdev2->bdev && + rdev != rdev2 && + overlaps(rdev->data_offset, rdev->sectors, + rdev2->data_offset, + rdev2->sectors)) { overlap = 1; break; } @@ -3107,7 +3124,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) char nm[20]; if (rdev->raid_disk < 0) continue; - if (rdev->new_raid_disk > mddev->raid_disks) + if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; @@ -3736,6 +3753,8 @@ action_show(mddev_t *mddev, char *page) return sprintf(page, "%s\n", type); } +static void reap_sync_thread(mddev_t *mddev); + static ssize_t action_store(mddev_t *mddev, const char *page, size_t len) { @@ -3750,9 +3769,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - mddev->recovery = 0; + reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -3904,7 +3921,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(mddev_t *mddev, char *page) { - unsigned long max_sectors, resync; + unsigned long long max_sectors, resync; if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); @@ -3915,7 +3932,7 @@ sync_completed_show(mddev_t *mddev, char *page) max_sectors = mddev->dev_sectors; resync = mddev->curr_resync_completed; - return sprintf(page, "%lu / %lu\n", resync, max_sectors); + return sprintf(page, "%llu / %llu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); @@ -4002,19 +4019,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_lo; if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if (new >= mddev->suspend_hi || - (new > mddev->suspend_lo && new < mddev->suspend_hi)) { - mddev->suspend_lo = new; + + mddev->suspend_lo = new; + if (new >= old) + /* Shrinking suspended region */ mddev->pers->quiesce(mddev, 2); - return len; - } else - return -EINVAL; + else { + /* Expanding suspended region - need to wait */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -4031,20 +4053,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_hi; if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || - (new > mddev->suspend_lo && new > mddev->suspend_hi)) { - mddev->suspend_hi = new; + + mddev->suspend_hi = new; + if (new <= old) + /* Shrinking suspended region */ + mddev->pers->quiesce(mddev, 2); + else { + /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); - return len; - } else - return -EINVAL; + } + return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); @@ -4422,7 +4448,9 @@ int md_run(mddev_t *mddev) * We don't want the data to overlap the metadata, * Internal Bitmap issues have been handled elsewhere. */ - if (rdev->data_offset < rdev->sb_start) { + if (rdev->meta_bdev) { + /* Nothing to check */; + } else if (rdev->data_offset < rdev->sb_start) { if (mddev->dev_sectors && rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { @@ -4556,7 +4584,8 @@ int md_run(mddev_t *mddev) mddev->safemode_timer.data = (unsigned long) mddev; mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - + smp_wmb(); + mddev->ready = 1; list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) { char nm[20]; @@ -4693,13 +4722,12 @@ static void md_clean(mddev_t *mddev) mddev->plug = NULL; } -void md_stop_writes(mddev_t *mddev) +static void __md_stop_writes(mddev_t *mddev) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; + reap_sync_thread(mddev); } del_timer_sync(&mddev->safemode_timer); @@ -4713,10 +4741,18 @@ void md_stop_writes(mddev_t *mddev) md_update_sb(mddev, 1); } } + +void md_stop_writes(mddev_t *mddev) +{ + mddev_lock(mddev); + __md_stop_writes(mddev); + mddev_unlock(mddev); +} EXPORT_SYMBOL_GPL(md_stop_writes); void md_stop(mddev_t *mddev) { + mddev->ready = 0; mddev->pers->stop(mddev); if (mddev->pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; @@ -4736,7 +4772,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) goto out; } if (mddev->pers) { - md_stop_writes(mddev); + __md_stop_writes(mddev); err = -ENXIO; if (mddev->ro==1) @@ -4773,7 +4809,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mddev->ro) set_disk_ro(disk, 0); - md_stop_writes(mddev); + __md_stop_writes(mddev); md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; @@ -5151,9 +5187,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<<MD_DISK_SYNC) && - info->raid_disk < mddev->raid_disks) + info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - else + set_bit(In_sync, &rdev->flags); + } else rdev->raid_disk = -1; } else super_types[mddev->major_version]. @@ -5230,7 +5267,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) printk(KERN_INFO "md: nonpersistent superblock ...\n"); rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; } else - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); @@ -5297,7 +5334,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } if (mddev->persistent) - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); else rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; @@ -5510,7 +5547,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) * sb_start or, if that is <data_offset, it must fit before the size * of each device. If num_sectors is zero, we find the largest size * that fits. - */ if (mddev->sync_thread) return -EBUSY; @@ -5547,6 +5583,8 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) mddev->delta_disks = raid_disks - mddev->raid_disks; rv = mddev->pers->check_reshape(mddev); + if (rv < 0) + mddev->delta_disks = 0; return rv; } @@ -6033,7 +6071,8 @@ static int md_thread(void * arg) || kthread_should_stop(), thread->timeout); - if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags)) + clear_bit(THREAD_WAKEUP, &thread->flags); + if (!kthread_should_stop()) thread->run(thread->mddev); } @@ -6799,7 +6838,7 @@ void md_do_sync(mddev_t *mddev) desc, mdname(mddev)); mddev->curr_resync = j; } - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = j; while (j < max_sectors) { sector_t sectors; @@ -6817,8 +6856,7 @@ void md_do_sync(mddev_t *mddev) md_unplug(mddev); wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); - mddev->curr_resync_completed = - mddev->curr_resync; + mddev->curr_resync_completed = j; set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } @@ -6954,9 +6992,6 @@ void md_do_sync(mddev_t *mddev) } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; mddev->curr_resync = 0; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - mddev->curr_resync_completed = 0; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -6997,7 +7032,7 @@ static int remove_and_add_spares(mddev_t *mddev) } } - if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) { + if (mddev->degraded && !mddev->recovery_disabled) { list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && @@ -7023,6 +7058,45 @@ static int remove_and_add_spares(mddev_t *mddev) } return spares; } + +static void reap_sync_thread(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + md_update_sb(mddev, 1); + + /* if array is no-longer degraded, then any saved_raid_disk + * information must be scrapped + */ + if (!mddev->degraded) + list_for_each_entry(rdev, &mddev->disks, same_set) + rdev->saved_raid_disk = -1; + + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); +} + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -7047,9 +7121,6 @@ static int remove_and_add_spares(mddev_t *mddev) */ void md_check_recovery(mddev_t *mddev) { - mdk_rdev_t *rdev; - - if (mddev->bitmap) bitmap_daemon_work(mddev); @@ -7084,7 +7155,20 @@ void md_check_recovery(mddev_t *mddev) /* Only thing we do on a ro array is remove * failed devices. */ - remove_and_add_spares(mddev); + mdk_rdev_t *rdev; + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + test_bit(Faulty, &rdev->flags) && + atomic_read(&rdev->nr_pending)==0) { + if (mddev->pers->hot_remove_disk( + mddev, rdev->raid_disk)==0) { + char nm[20]; + sprintf(nm,"rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + rdev->raid_disk = -1; + } + } clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } @@ -7117,34 +7201,7 @@ void md_check_recovery(mddev_t *mddev) goto unlock; } if (mddev->sync_thread) { - /* resync has finished, collect result */ - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) - mddev->pers->finish_reshape(mddev); - md_update_sb(mddev, 1); - - /* if array is no-longer degraded, then any saved_raid_disk - * information must be scrapped - */ - if (!mddev->degraded) - list_for_each_entry(rdev, &mddev->disks, same_set) - rdev->saved_raid_disk = -1; - - mddev->recovery = 0; - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); + reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid @@ -7202,7 +7259,11 @@ void md_check_recovery(mddev_t *mddev) " thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ - mddev->recovery = 0; + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); } else md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); diff --git a/drivers/md/md.h b/drivers/md/md.h index d05bab55df4..7e90b8593b2 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -60,6 +60,12 @@ struct mdk_rdev_s mddev_t *mddev; /* RAID array if running */ int last_events; /* IO event timestamp */ + /* + * If meta_bdev is non-NULL, it means that a separate device is + * being used to store the metadata (superblock/bitmap) which + * would otherwise be contained on the same device as the data (bdev). + */ + struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ struct page *sb_page; @@ -87,8 +93,6 @@ struct mdk_rdev_s #define Faulty 1 /* device is known to have a fault */ #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ -#define AllReserved 6 /* If whole device is reserved for - * one array */ #define AutoDetected 7 /* added by auto-detect */ #define Blocked 8 /* An error occured on an externally * managed array, don't allow writes @@ -148,7 +152,8 @@ struct mddev_s * are happening, so run/ * takeover/stop are not safe */ - + int ready; /* See when safe to pass + * IO requests down */ struct gendisk *gendisk; struct kobject kobj; @@ -497,8 +502,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, - struct page *page, int rw); +extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, + struct page *page, int rw, bool metadata_op); extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index a39f4c355e5..637a96855ed 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -179,6 +179,14 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) rdev1->new_raid_disk = j; } + if (mddev->level == 1) { + /* taiking over a raid1 array- + * we have only one active disk + */ + j = 0; + rdev1->new_raid_disk = j; + } + if (j < 0 || j >= mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: bad disk number %d - " "aborting!\n", mdname(mddev), j); @@ -644,12 +652,38 @@ static void *raid0_takeover_raid10(mddev_t *mddev) return priv_conf; } +static void *raid0_takeover_raid1(mddev_t *mddev) +{ + raid0_conf_t *priv_conf; + + /* Check layout: + * - (N - 1) mirror drives must be already faulty + */ + if ((mddev->raid_disks - 1) != mddev->degraded) { + printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */ + mddev->delta_disks = 1 - mddev->raid_disks; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} + static void *raid0_takeover(mddev_t *mddev) { /* raid0 can take over: * raid4 - if all data disks are active. * raid5 - providing it is Raid4 layout and one disk is faulty * raid10 - assuming we have all necessary active disks + * raid1 - with (N -1) mirror drives faulty */ if (mddev->level == 4) return raid0_takeover_raid45(mddev); @@ -665,6 +699,12 @@ static void *raid0_takeover(mddev_t *mddev) if (mddev->level == 10) return raid0_takeover_raid10(mddev); + if (mddev->level == 1) + return raid0_takeover_raid1(mddev); + + printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", + mddev->level); + return ERR_PTR(-EINVAL); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 845cf95b612..a23ffa397ba 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid1:%s: Disk failure on %s, disabling device.\n" + "md/raid1:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } @@ -1364,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - READ)) { + READ, false)) { success = 1; break; } @@ -1390,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) rdev = conf->mirrors[d].rdev; atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - WRITE) == 0) + WRITE, false) == 0) md_error(mddev, rdev); } d = start; @@ -1405,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) continue; rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - READ) == 0) + READ, false) == 0) md_error(mddev, rdev); } } else { @@ -1488,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) + sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false)) success = 1; else { d++; @@ -1514,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, WRITE, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); @@ -1531,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0641674827f..3b607b28741 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid10:%s: Disk failure on %s, disabling device.\n" + "md/raid10:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } @@ -1559,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); success = sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, - conf->tmppage, READ); + conf->tmppage, READ, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); if (success) @@ -1598,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + sect, + s<<9, conf->tmppage, WRITE, false) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE @@ -1635,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); if (sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, conf->tmppage, - READ) == 0) { + READ, false) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE "md/raid10:%s: unable to read back " @@ -2462,11 +2463,13 @@ static void *raid10_takeover_raid0(mddev_t *mddev) mddev->recovery_cp = MaxSector; conf = setup_conf(mddev); - if (!IS_ERR(conf)) + if (!IS_ERR(conf)) { list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) rdev->new_raid_disk = rdev->raid_disk * 2; - + conf->barrier = 1; + } + return conf; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dc574f303f8..70281282419 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), @@ -4237,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -4338,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; + mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -5339,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); @@ -5518,7 +5517,6 @@ static int raid5_start_reshape(mddev_t *mddev) raid5_conf_t *conf = mddev->private; mdk_rdev_t *rdev; int spares = 0; - int added_devices = 0; unsigned long flags; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -5528,8 +5526,8 @@ static int raid5_start_reshape(mddev_t *mddev) return -ENOSPC; list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk < 0 && - !test_bit(Faulty, &rdev->flags)) + if (!test_bit(In_sync, &rdev->flags) + && !test_bit(Faulty, &rdev->flags)) spares++; if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) @@ -5572,29 +5570,35 @@ static int raid5_start_reshape(mddev_t *mddev) * to correctly record the "partially reconstructed" state of * such devices during the reshape and confusion could result. */ - if (mddev->delta_disks >= 0) - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk < 0 && - !test_bit(Faulty, &rdev->flags)) { - if (raid5_add_disk(mddev, rdev) == 0) { - char nm[20]; - if (rdev->raid_disk >= conf->previous_raid_disks) { - set_bit(In_sync, &rdev->flags); - added_devices++; - } else - rdev->recovery_offset = 0; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, - &rdev->kobj, nm)) - /* Failure here is OK */; - } else - break; - } + if (mddev->delta_disks >= 0) { + int added_devices = 0; + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid5_add_disk(mddev, rdev) == 0) { + char nm[20]; + if (rdev->raid_disk + >= conf->previous_raid_disks) { + set_bit(In_sync, &rdev->flags); + added_devices++; + } else + rdev->recovery_offset = 0; + sprintf(nm, "rd%d", rdev->raid_disk); + if (sysfs_create_link(&mddev->kobj, + &rdev->kobj, nm)) + /* Failure here is OK */; + } + } else if (rdev->raid_disk >= conf->previous_raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + added_devices++; + } - /* When a reshape changes the number of devices, ->degraded - * is measured against the larger of the pre and post number of - * devices.*/ - if (mddev->delta_disks > 0) { + /* When a reshape changes the number of devices, + * ->degraded is measured against the larger of the + * pre and post number of devices. + */ spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) - added_devices; |