Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/.gitignore               4
-rw-r--r--  drivers/md/bitmap.c                12
-rw-r--r--  drivers/md/dm-crypt.c             344
-rw-r--r--  drivers/md/dm-delay.c               6
-rw-r--r--  drivers/md/dm-exception-store.c     4
-rw-r--r--  drivers/md/dm-exception-store.h     3
-rw-r--r--  drivers/md/dm-io.c                 20
-rw-r--r--  drivers/md/dm-ioctl.c             208
-rw-r--r--  drivers/md/dm-linear.c              3
-rw-r--r--  drivers/md/dm-log.c                 2
-rw-r--r--  drivers/md/dm-mpath.c              11
-rw-r--r--  drivers/md/dm-raid1.c              10
-rw-r--r--  drivers/md/dm-region-hash.c        16
-rw-r--r--  drivers/md/dm-snap-persistent.c     8
-rw-r--r--  drivers/md/dm-snap.c               70
-rw-r--r--  drivers/md/dm-stripe.c             89
-rw-r--r--  drivers/md/dm-table.c             104
-rw-r--r--  drivers/md/dm-target.c              5
-rw-r--r--  drivers/md/dm-zero.c                5
-rw-r--r--  drivers/md/dm.c                   697
-rw-r--r--  drivers/md/dm.h                    14
-rw-r--r--  drivers/md/linear.c                 4
-rw-r--r--  drivers/md/md.c                   201
-rw-r--r--  drivers/md/md.h                    25
-rw-r--r--  drivers/md/multipath.c              4
-rw-r--r--  drivers/md/raid0.c                  4
-rw-r--r--  drivers/md/raid1.c                197
-rw-r--r--  drivers/md/raid1.h                  2
-rw-r--r--  drivers/md/raid10.c                24
-rw-r--r--  drivers/md/raid5.c                 56
-rw-r--r--  drivers/md/raid5.h                  1
31 files changed, 1090 insertions, 1063 deletions
diff --git a/drivers/md/.gitignore b/drivers/md/.gitignore
deleted file mode 100644
index a7afec6b19c..00000000000
--- a/drivers/md/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-mktables
-raid6altivec*.c
-raid6int*.c
-raid6tables.c
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1ba1e122e94..e4fb58db545 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1000,10 +1000,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
page = bitmap->sb_page;
offset = sizeof(bitmap_super_t);
if (!file)
- read_sb_page(bitmap->mddev,
- bitmap->mddev->bitmap_info.offset,
- page,
- index, count);
+ page = read_sb_page(
+ bitmap->mddev,
+ bitmap->mddev->bitmap_info.offset,
+ page,
+ index, count);
} else if (file) {
page = read_page(file, index, bitmap, count);
offset = 0;
@@ -1542,8 +1543,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
atomic_read(&bitmap->mddev->recovery_active) == 0);
bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
- if (bitmap->mddev->persistent)
- set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
+ set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) {
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3bdbb611570..d5b0e4c0e70 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -107,11 +107,10 @@ struct crypt_config {
struct workqueue_struct *io_queue;
struct workqueue_struct *crypt_queue;
- /*
- * crypto related data
- */
+ char *cipher;
+ char *cipher_mode;
+
struct crypt_iv_operations *iv_gen_ops;
- char *iv_mode;
union {
struct iv_essiv_private essiv;
struct iv_benbi_private benbi;
@@ -135,8 +134,6 @@ struct crypt_config {
unsigned int dmreq_start;
struct ablkcipher_request *req;
- char cipher[CRYPTO_MAX_ALG_NAME];
- char chainmode[CRYPTO_MAX_ALG_NAME];
struct crypto_ablkcipher *tfm;
unsigned long flags;
unsigned int key_size;
@@ -999,82 +996,135 @@ static int crypt_wipe_key(struct crypt_config *cc)
return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}
-/*
- * Construct an encryption mapping:
- * <cipher> <key> <iv_offset> <dev_path> <start>
- */
-static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+static void crypt_dtr(struct dm_target *ti)
{
- struct crypt_config *cc;
- struct crypto_ablkcipher *tfm;
- char *tmp;
- char *cipher;
- char *chainmode;
- char *ivmode;
- char *ivopts;
- unsigned int key_size;
- unsigned long long tmpll;
+ struct crypt_config *cc = ti->private;
- if (argc != 5) {
- ti->error = "Not enough arguments";
+ ti->private = NULL;
+
+ if (!cc)
+ return;
+
+ if (cc->io_queue)
+ destroy_workqueue(cc->io_queue);
+ if (cc->crypt_queue)
+ destroy_workqueue(cc->crypt_queue);
+
+ if (cc->bs)
+ bioset_free(cc->bs);
+
+ if (cc->page_pool)
+ mempool_destroy(cc->page_pool);
+ if (cc->req_pool)
+ mempool_destroy(cc->req_pool);
+ if (cc->io_pool)
+ mempool_destroy(cc->io_pool);
+
+ if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
+ cc->iv_gen_ops->dtr(cc);
+
+ if (cc->tfm && !IS_ERR(cc->tfm))
+ crypto_free_ablkcipher(cc->tfm);
+
+ if (cc->dev)
+ dm_put_device(ti, cc->dev);
+
+ kzfree(cc->cipher);
+ kzfree(cc->cipher_mode);
+
+ /* Must zero key material before freeing */
+ kzfree(cc);
+}
+
+static int crypt_ctr_cipher(struct dm_target *ti,
+ char *cipher_in, char *key)
+{
+ struct crypt_config *cc = ti->private;
+ char *tmp, *cipher, *chainmode, *ivmode, *ivopts;
+ char *cipher_api = NULL;
+ int ret = -EINVAL;
+
+ /* Convert to crypto api definition? */
+ if (strchr(cipher_in, '(')) {
+ ti->error = "Bad cipher specification";
return -EINVAL;
}
- tmp = argv[0];
+ /*
+ * Legacy dm-crypt cipher specification
+ * cipher-mode-iv:ivopts
+ */
+ tmp = cipher_in;
cipher = strsep(&tmp, "-");
+
+ cc->cipher = kstrdup(cipher, GFP_KERNEL);
+ if (!cc->cipher)
+ goto bad_mem;
+
+ if (tmp) {
+ cc->cipher_mode = kstrdup(tmp, GFP_KERNEL);
+ if (!cc->cipher_mode)
+ goto bad_mem;
+ }
+
chainmode = strsep(&tmp, "-");
ivopts = strsep(&tmp, "-");
ivmode = strsep(&ivopts, ":");
if (tmp)
- DMWARN("Unexpected additional cipher options");
-
- key_size = strlen(argv[1]) >> 1;
-
- cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
- if (cc == NULL) {
- ti->error =
- "Cannot allocate transparent encryption context";
- return -ENOMEM;
- }
+ DMWARN("Ignoring unexpected additional cipher options");
- /* Compatibility mode for old dm-crypt cipher strings */
- if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
+ /* Compatibility mode for old dm-crypt mappings */
+ if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
+ kfree(cc->cipher_mode);
+ cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL);
chainmode = "cbc";
ivmode = "plain";
}
if (strcmp(chainmode, "ecb") && !ivmode) {
- ti->error = "This chaining mode requires an IV mechanism";
- goto bad_cipher;
+ ti->error = "IV mechanism required";
+ return -EINVAL;
}
- if (snprintf(cc->cipher, CRYPTO_MAX_ALG_NAME, "%s(%s)",
- chainmode, cipher) >= CRYPTO_MAX_ALG_NAME) {
- ti->error = "Chain mode + cipher name is too long";
- goto bad_cipher;
+ cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL);
+ if (!cipher_api)
+ goto bad_mem;
+
+ ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME,
+ "%s(%s)", chainmode, cipher);
+ if (ret < 0) {
+ kfree(cipher_api);
+ goto bad_mem;
}
- tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0);
- if (IS_ERR(tfm)) {
+ /* Allocate cipher */
+ cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
+ if (IS_ERR(cc->tfm)) {
+ ret = PTR_ERR(cc->tfm);
ti->error = "Error allocating crypto tfm";
- goto bad_cipher;
+ goto bad;
}
- strcpy(cc->cipher, cipher);
- strcpy(cc->chainmode, chainmode);
- cc->tfm = tfm;
-
- if (crypt_set_key(cc, argv[1]) < 0) {
+ /* Initialize and set key */
+ ret = crypt_set_key(cc, key);
+ if (ret < 0) {
ti->error = "Error decoding and setting key";
- goto bad_ivmode;
+ goto bad;
}
- /*
- * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
- * See comments at iv code
- */
+ /* Initialize IV */
+ cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm);
+ if (cc->iv_size)
+ /* at least a 64 bit sector number should fit in our buffer */
+ cc->iv_size = max(cc->iv_size,
+ (unsigned int)(sizeof(u64) / sizeof(u8)));
+ else if (ivmode) {
+ DMWARN("Selected cipher does not support IVs");
+ ivmode = NULL;
+ }
+ /* Choose ivmode, see comments at iv code. */
if (ivmode == NULL)
cc->iv_gen_ops = NULL;
else if (strcmp(ivmode, "plain") == 0)
@@ -1088,159 +1138,138 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
else if (strcmp(ivmode, "null") == 0)
cc->iv_gen_ops = &crypt_iv_null_ops;
else {
+ ret = -EINVAL;
ti->error = "Invalid IV mode";
- goto bad_ivmode;
+ goto bad;
}
- if (cc->iv_gen_ops && cc->iv_gen_ops->ctr &&
- cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
- goto bad_ivmode;
-
- if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
- cc->iv_gen_ops->init(cc) < 0) {
- ti->error = "Error initialising IV";
- goto bad_slab_pool;
+ /* Allocate IV */
+ if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) {
+ ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
+ if (ret < 0) {
+ ti->error = "Error creating IV";
+ goto bad;
+ }
}
- cc->iv_size = crypto_ablkcipher_ivsize(tfm);
- if (cc->iv_size)
- /* at least a 64 bit sector number should fit in our buffer */
- cc->iv_size = max(cc->iv_size,
- (unsigned int)(sizeof(u64) / sizeof(u8)));
- else {
- if (cc->iv_gen_ops) {
- DMWARN("Selected cipher does not support IVs");
- if (cc->iv_gen_ops->dtr)
- cc->iv_gen_ops->dtr(cc);
- cc->iv_gen_ops = NULL;
+ /* Initialize IV (set keys for ESSIV etc) */
+ if (cc->iv_gen_ops && cc->iv_gen_ops->init) {
+ ret = cc->iv_gen_ops->init(cc);
+ if (ret < 0) {
+ ti->error = "Error initialising IV";
+ goto bad;
}
}
+ ret = 0;
+bad:
+ kfree(cipher_api);
+ return ret;
+
+bad_mem:
+ ti->error = "Cannot allocate cipher strings";
+ return -ENOMEM;
+}
+
+/*
+ * Construct an encryption mapping:
+ * <cipher> <key> <iv_offset> <dev_path> <start>
+ */
+static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct crypt_config *cc;
+ unsigned int key_size;
+ unsigned long long tmpll;
+ int ret;
+
+ if (argc != 5) {
+ ti->error = "Not enough arguments";
+ return -EINVAL;
+ }
+
+ key_size = strlen(argv[1]) >> 1;
+
+ cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
+ if (!cc) {
+ ti->error = "Cannot allocate encryption context";
+ return -ENOMEM;
+ }
+
+ ti->private = cc;
+ ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
+ if (ret < 0)
+ goto bad;
+
+ ret = -ENOMEM;
cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
if (!cc->io_pool) {
ti->error = "Cannot allocate crypt io mempool";
- goto bad_slab_pool;
+ goto bad;
}
cc->dmreq_start = sizeof(struct ablkcipher_request);
- cc->dmreq_start += crypto_ablkcipher_reqsize(tfm);
+ cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm);
cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
- cc->dmreq_start += crypto_ablkcipher_alignmask(tfm) &
+ cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) &
~(crypto_tfm_ctx_alignment() - 1);
cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
sizeof(struct dm_crypt_request) + cc->iv_size);
if (!cc->req_pool) {
ti->error = "Cannot allocate crypt request mempool";
- goto bad_req_pool;
+ goto bad;
}
cc->req = NULL;
cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
if (!cc->page_pool) {
ti->error = "Cannot allocate page mempool";
- goto bad_page_pool;
+ goto bad;
}
cc->bs = bioset_create(MIN_IOS, 0);
if (!cc->bs) {
ti->error = "Cannot allocate crypt bioset";
- goto bad_bs;
+ goto bad;
}
+ ret = -EINVAL;
if (sscanf(argv[2], "%llu", &tmpll) != 1) {
ti->error = "Invalid iv_offset sector";
- goto bad_device;
+ goto bad;
}
cc->iv_offset = tmpll;
- if (sscanf(argv[4], "%llu", &tmpll) != 1) {
- ti->error = "Invalid device sector";
- goto bad_device;
- }
- cc->start = tmpll;
-
if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) {
ti->error = "Device lookup failed";
- goto bad_device;
+ goto bad;
}
- if (ivmode && cc->iv_gen_ops) {
- if (ivopts)
- *(ivopts - 1) = ':';
- cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL);
- if (!cc->iv_mode) {
- ti->error = "Error kmallocing iv_mode string";
- goto bad_ivmode_string;
- }
- strcpy(cc->iv_mode, ivmode);
- } else
- cc->iv_mode = NULL;
+ if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+ ti->error = "Invalid device sector";
+ goto bad;
+ }
+ cc->start = tmpll;
+ ret = -ENOMEM;
cc->io_queue = create_singlethread_workqueue("kcryptd_io");
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
- goto bad_io_queue;
+ goto bad;
}
cc->crypt_queue = create_singlethread_workqueue("kcryptd");
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
- goto bad_crypt_queue;
+ goto bad;
}
ti->num_flush_requests = 1;
- ti->private = cc;
return 0;
-bad_crypt_queue:
- destroy_workqueue(cc->io_queue);
-bad_io_queue:
- kfree(cc->iv_mode);
-bad_ivmode_string:
- dm_put_device(ti, cc->dev);
-bad_device:
- bioset_free(cc->bs);
-bad_bs:
- mempool_destroy(cc->page_pool);
-bad_page_pool:
- mempool_destroy(cc->req_pool);
-bad_req_pool:
- mempool_destroy(cc->io_pool);
-bad_slab_pool:
- if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
- cc->iv_gen_ops->dtr(cc);
-bad_ivmode:
- crypto_free_ablkcipher(tfm);
-bad_cipher:
- /* Must zero key material before freeing */
- kzfree(cc);
- return -EINVAL;
-}
-
-static void crypt_dtr(struct dm_target *ti)
-{
- struct crypt_config *cc = (struct crypt_config *) ti->private;
-
- destroy_workqueue(cc->io_queue);
- destroy_workqueue(cc->crypt_queue);
-
- if (cc->req)
- mempool_free(cc->req, cc->req_pool);
-
- bioset_free(cc->bs);
- mempool_destroy(cc->page_pool);
- mempool_destroy(cc->req_pool);
- mempool_destroy(cc->io_pool);
-
- kfree(cc->iv_mode);
- if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
- cc->iv_gen_ops->dtr(cc);
- crypto_free_ablkcipher(cc->tfm);
- dm_put_device(ti, cc->dev);
-
- /* Must zero key material before freeing */
- kzfree(cc);
+bad:
+ crypt_dtr(ti);
+ return ret;
}
static int crypt_map(struct dm_target *ti, struct bio *bio,
@@ -1249,13 +1278,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
struct dm_crypt_io *io;
struct crypt_config *cc;
- if (unlikely(bio_empty_barrier(bio))) {
+ if (bio->bi_rw & REQ_FLUSH) {
cc = ti->private;
bio->bi_bdev = cc->dev->bdev;
return DM_MAPIO_REMAPPED;
}
- io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin);
+ io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
if (bio_data_dir(io->base_bio) == READ)
kcryptd_queue_io(io);
@@ -1268,7 +1297,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
static int crypt_status(struct dm_target *ti, status_type_t type,
char *result, unsigned int maxlen)
{
- struct crypt_config *cc = (struct crypt_config *) ti->private;
+ struct crypt_config *cc = ti->private;
unsigned int sz = 0;
switch (type) {
@@ -1277,11 +1306,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
- if (cc->iv_mode)
- DMEMIT("%s-%s-%s ", cc->cipher, cc->chainmode,
- cc->iv_mode);
+ if (cc->cipher_mode)
+ DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode);
else
- DMEMIT("%s-%s ", cc->cipher, cc->chainmode);
+ DMEMIT("%s ", cc->cipher);
if (cc->key_size > 0) {
if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -1378,7 +1406,7 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return max_size;
bvm->bi_bdev = cc->dev->bdev;
- bvm->bi_sector = cc->start + bvm->bi_sector - ti->begin;
+ bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector);
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
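
The reworked constructor above funnels every failure through a single bad: label that calls crypt_dtr(), which in turn must tolerate a partially built crypt_config (it NULL-checks every member before freeing). A minimal standalone C sketch of that construct/teardown pattern follows; the names (my_ctx, my_ctx_ctr, my_ctx_dtr) and the use of malloc/free are illustrative only, not the kernel APIs.

/*
 * Standalone sketch (not kernel code): a destructor that tolerates a
 * half-built context, so every constructor error path can simply
 * "goto bad" and reuse it, as the new crypt_ctr() does.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct my_ctx {
	char *cipher;
	char *mode;
};

static void my_ctx_dtr(struct my_ctx *c)
{
	if (!c)
		return;
	free(c->mode);		/* free(NULL) is a no-op, like kzfree(NULL) */
	free(c->cipher);
	free(c);
}

static struct my_ctx *my_ctx_ctr(const char *cipher, const char *mode)
{
	struct my_ctx *c = calloc(1, sizeof(*c));

	if (!c)
		return NULL;
	c->cipher = strdup(cipher);
	if (!c->cipher)
		goto bad;
	c->mode = strdup(mode);
	if (!c->mode)
		goto bad;
	return c;
bad:
	my_ctx_dtr(c);		/* one unwind path for every failure */
	return NULL;
}

int main(void)
{
	struct my_ctx *c = my_ctx_ctr("aes", "cbc-essiv:sha256");

	if (c)
		printf("%s-%s\n", c->cipher, c->mode);
	my_ctx_dtr(c);
	return 0;
}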
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 852052880d7..baa11912cc9 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -198,6 +198,7 @@ out:
atomic_set(&dc->may_delay, 1);
ti->num_flush_requests = 1;
+ ti->num_discard_requests = 1;
ti->private = dc;
return 0;
@@ -281,14 +282,13 @@ static int delay_map(struct dm_target *ti, struct bio *bio,
bio->bi_bdev = dc->dev_write->bdev;
if (bio_sectors(bio))
bio->bi_sector = dc->start_write +
- (bio->bi_sector - ti->begin);
+ dm_target_offset(ti, bio->bi_sector);
return delay_bio(dc, dc->write_delay, bio);
}
bio->bi_bdev = dc->dev_read->bdev;
- bio->bi_sector = dc->start_read +
- (bio->bi_sector - ti->begin);
+ bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector);
return delay_bio(dc, dc->read_delay, bio);
}
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 2b7907b6dd0..0bdb201c2c2 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -173,7 +173,9 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
/* Validate the chunk size against the device block size */
if (chunk_size %
- (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
+ (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9) ||
+ chunk_size %
+ (bdev_logical_block_size(dm_snap_origin(store->snap)->bdev) >> 9)) {
*error = "Chunk size is not a multiple of device blocksize";
return -EINVAL;
}
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index e8dfa06af3b..0b2536247cf 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -126,8 +126,9 @@ struct dm_exception_store {
};
/*
- * Obtain the cow device used by a given snapshot.
+ * Obtain the origin or cow device used by a given snapshot.
*/
+struct dm_dev *dm_snap_origin(struct dm_snapshot *snap);
struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
/*
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 0590c75b0ab..136d4f71a11 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -31,7 +31,6 @@ struct dm_io_client {
*/
struct io {
unsigned long error_bits;
- unsigned long eopnotsupp_bits;
atomic_t count;
struct task_struct *sleeper;
struct dm_io_client *client;
@@ -130,11 +129,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
*---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
{
- if (error) {
+ if (error)
set_bit(region, &io->error_bits);
- if (error == -EOPNOTSUPP)
- set_bit(region, &io->eopnotsupp_bits);
- }
if (atomic_dec_and_test(&io->count)) {
if (io->sleeper)
@@ -310,8 +306,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
sector_t remaining = where->count;
/*
- * where->count may be zero if rw holds a write barrier and we
- * need to send a zero-sized barrier.
+ * where->count may be zero if rw holds a flush and we need to
+ * send a zero-sized flush.
*/
do {
/*
@@ -364,7 +360,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
*/
for (i = 0; i < num_regions; i++) {
*dp = old_pages;
- if (where[i].count || (rw & REQ_HARDBARRIER))
+ if (where[i].count || (rw & REQ_FLUSH))
do_region(rw, i, where + i, dp, io);
}
@@ -393,9 +389,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
return -EIO;
}
-retry:
io->error_bits = 0;
- io->eopnotsupp_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->sleeper = current;
io->client = client;
@@ -412,11 +406,6 @@ retry:
}
set_current_state(TASK_RUNNING);
- if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
- rw &= ~REQ_HARDBARRIER;
- goto retry;
- }
-
if (error_bits)
*error_bits = io->error_bits;
@@ -437,7 +426,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
- io->eopnotsupp_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->sleeper = NULL;
io->client = client;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index d7500e1c26f..4b54618b415 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -249,55 +249,66 @@ static void __hash_remove(struct hash_cell *hc)
static void dm_hash_remove_all(int keep_open_devices)
{
- int i, dev_skipped, dev_removed;
+ int i, dev_skipped;
struct hash_cell *hc;
- struct list_head *tmp, *n;
+ struct mapped_device *md;
+
+retry:
+ dev_skipped = 0;
down_write(&_hash_lock);
-retry:
- dev_skipped = dev_removed = 0;
for (i = 0; i < NUM_BUCKETS; i++) {
- list_for_each_safe (tmp, n, _name_buckets + i) {
- hc = list_entry(tmp, struct hash_cell, name_list);
+ list_for_each_entry(hc, _name_buckets + i, name_list) {
+ md = hc->md;
+ dm_get(md);
- if (keep_open_devices &&
- dm_lock_for_deletion(hc->md)) {
+ if (keep_open_devices && dm_lock_for_deletion(md)) {
+ dm_put(md);
dev_skipped++;
continue;
}
+
__hash_remove(hc);
- dev_removed = 1;
- }
- }
- /*
- * Some mapped devices may be using other mapped devices, so if any
- * still exist, repeat until we make no further progress.
- */
- if (dev_skipped) {
- if (dev_removed)
- goto retry;
+ up_write(&_hash_lock);
- DMWARN("remove_all left %d open device(s)", dev_skipped);
+ dm_put(md);
+ if (likely(keep_open_devices))
+ dm_destroy(md);
+ else
+ dm_destroy_immediate(md);
+
+ /*
+ * Some mapped devices may be using other mapped
+ * devices, so repeat until we make no further
+ * progress. If a new mapped device is created
+ * here it will also get removed.
+ */
+ goto retry;
+ }
}
up_write(&_hash_lock);
+
+ if (dev_skipped)
+ DMWARN("remove_all left %d open device(s)", dev_skipped);
}
-static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old,
- const char *new)
+static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
+ const char *new)
{
char *new_name, *old_name;
struct hash_cell *hc;
struct dm_table *table;
+ struct mapped_device *md;
/*
* duplicate new.
*/
new_name = kstrdup(new, GFP_KERNEL);
if (!new_name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
down_write(&_hash_lock);
@@ -306,24 +317,24 @@ static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old,
*/
hc = __get_name_cell(new);
if (hc) {
- DMWARN("asked to rename to an already existing name %s -> %s",
- old, new);
+ DMWARN("asked to rename to an already-existing name %s -> %s",
+ param->name, new);
dm_put(hc->md);
up_write(&_hash_lock);
kfree(new_name);
- return -EBUSY;
+ return ERR_PTR(-EBUSY);
}
/*
* Is there such a device as 'old' ?
*/
- hc = __get_name_cell(old);
+ hc = __get_name_cell(param->name);
if (!hc) {
- DMWARN("asked to rename a non existent device %s -> %s",
- old, new);
+ DMWARN("asked to rename a non-existent device %s -> %s",
+ param->name, new);
up_write(&_hash_lock);
kfree(new_name);
- return -ENXIO;
+ return ERR_PTR(-ENXIO);
}
/*
@@ -345,13 +356,14 @@ static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old,
dm_table_put(table);
}
- if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie))
- *flags |= DM_UEVENT_GENERATED_FLAG;
+ if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr))
+ param->flags |= DM_UEVENT_GENERATED_FLAG;
- dm_put(hc->md);
+ md = hc->md;
up_write(&_hash_lock);
kfree(old_name);
- return 0;
+
+ return md;
}
/*-----------------------------------------------------------------
@@ -573,7 +585,7 @@ static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
* Fills in a dm_ioctl structure, ready for sending back to
* userland.
*/
-static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
+static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
{
struct gendisk *disk = dm_disk(md);
struct dm_table *table;
@@ -617,8 +629,6 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
dm_table_put(table);
}
}
-
- return 0;
}
static int dev_create(struct dm_ioctl *param, size_t param_size)
@@ -640,15 +650,17 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
if (r) {
dm_put(md);
+ dm_destroy(md);
return r;
}
param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
- r = __dev_status(md, param);
+ __dev_status(md, param);
+
dm_put(md);
- return r;
+ return 0;
}
/*
@@ -742,6 +754,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
param->flags |= DM_UEVENT_GENERATED_FLAG;
dm_put(md);
+ dm_destroy(md);
return 0;
}
@@ -762,6 +775,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
{
int r;
char *new_name = (char *) param + param->data_start;
+ struct mapped_device *md;
if (new_name < param->data ||
invalid_str(new_name, (void *) param + param_size) ||
@@ -774,10 +788,14 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
if (r)
return r;
- param->data_size = 0;
+ md = dm_hash_rename(param, new_name);
+ if (IS_ERR(md))
+ return PTR_ERR(md);
+
+ __dev_status(md, param);
+ dm_put(md);
- return dm_hash_rename(param->event_nr, &param->flags, param->name,
- new_name);
+ return 0;
}
static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
@@ -818,8 +836,6 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
geometry.start = indata[3];
r = dm_set_geometry(md, &geometry);
- if (!r)
- r = __dev_status(md, param);
param->data_size = 0;
@@ -843,13 +859,17 @@ static int do_suspend(struct dm_ioctl *param)
if (param->flags & DM_NOFLUSH_FLAG)
suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
- if (!dm_suspended_md(md))
+ if (!dm_suspended_md(md)) {
r = dm_suspend(md, suspend_flags);
+ if (r)
+ goto out;
+ }
- if (!r)
- r = __dev_status(md, param);
+ __dev_status(md, param);
+out:
dm_put(md);
+
return r;
}
@@ -911,7 +931,7 @@ static int do_resume(struct dm_ioctl *param)
dm_table_destroy(old_map);
if (!r)
- r = __dev_status(md, param);
+ __dev_status(md, param);
dm_put(md);
return r;
@@ -935,16 +955,16 @@ static int dev_suspend(struct dm_ioctl *param, size_t param_size)
*/
static int dev_status(struct dm_ioctl *param, size_t param_size)
{
- int r;
struct mapped_device *md;
md = find_device(param);
if (!md)
return -ENXIO;
- r = __dev_status(md, param);
+ __dev_status(md, param);
dm_put(md);
- return r;
+
+ return 0;
}
/*
@@ -1019,7 +1039,7 @@ static void retrieve_status(struct dm_table *table,
*/
static int dev_wait(struct dm_ioctl *param, size_t param_size)
{
- int r;
+ int r = 0;
struct mapped_device *md;
struct dm_table *table;
@@ -1040,9 +1060,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
* changed to trigger the event, so we may as well tell
* him and save an ioctl.
*/
- r = __dev_status(md, param);
- if (r)
- goto out;
+ __dev_status(md, param);
table = dm_get_live_or_inactive_table(md, param);
if (table) {
@@ -1050,8 +1068,9 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
dm_table_put(table);
}
- out:
+out:
dm_put(md);
+
return r;
}
@@ -1112,28 +1131,9 @@ static int populate_table(struct dm_table *table,
next = spec->next;
}
- r = dm_table_set_type(table);
- if (r) {
- DMWARN("unable to set table type");
- return r;
- }
-
return dm_table_complete(table);
}
-static int table_prealloc_integrity(struct dm_table *t,
- struct mapped_device *md)
-{
- struct list_head *devices = dm_table_get_devices(t);
- struct dm_dev_internal *dd;
-
- list_for_each_entry(dd, devices, list)
- if (bdev_get_integrity(dd->dm_dev.bdev))
- return blk_integrity_register(dm_disk(md), NULL);
-
- return 0;
-}
-
static int table_load(struct dm_ioctl *param, size_t param_size)
{
int r;
@@ -1155,21 +1155,30 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
goto out;
}
- r = table_prealloc_integrity(t, md);
- if (r) {
- DMERR("%s: could not register integrity profile.",
- dm_device_name(md));
+ /* Protect md->type and md->queue against concurrent table loads. */
+ dm_lock_md_type(md);
+ if (dm_get_md_type(md) == DM_TYPE_NONE)
+ /* Initial table load: acquire type of table. */
+ dm_set_md_type(md, dm_table_get_type(t));
+ else if (dm_get_md_type(md) != dm_table_get_type(t)) {
+ DMWARN("can't change device type after initial table load.");
dm_table_destroy(t);
+ dm_unlock_md_type(md);
+ r = -EINVAL;
goto out;
}
- r = dm_table_alloc_md_mempools(t);
+ /* setup md->queue to reflect md's type (may block) */
+ r = dm_setup_md_queue(md);
if (r) {
- DMWARN("unable to allocate mempools for this table");
+ DMWARN("unable to set up device queue for new table.");
dm_table_destroy(t);
+ dm_unlock_md_type(md);
goto out;
}
+ dm_unlock_md_type(md);
+ /* stage inactive table */
down_write(&_hash_lock);
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
@@ -1186,7 +1195,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
up_write(&_hash_lock);
param->flags |= DM_INACTIVE_PRESENT_FLAG;
- r = __dev_status(md, param);
+ __dev_status(md, param);
out:
dm_put(md);
@@ -1196,7 +1205,6 @@ out:
static int table_clear(struct dm_ioctl *param, size_t param_size)
{
- int r;
struct hash_cell *hc;
struct mapped_device *md;
@@ -1216,11 +1224,12 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
- r = __dev_status(hc->md, param);
+ __dev_status(hc->md, param);
md = hc->md;
up_write(&_hash_lock);
dm_put(md);
- return r;
+
+ return 0;
}
/*
@@ -1265,7 +1274,6 @@ static void retrieve_deps(struct dm_table *table,
static int table_deps(struct dm_ioctl *param, size_t param_size)
{
- int r = 0;
struct mapped_device *md;
struct dm_table *table;
@@ -1273,9 +1281,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
if (!md)
return -ENXIO;
- r = __dev_status(md, param);
- if (r)
- goto out;
+ __dev_status(md, param);
table = dm_get_live_or_inactive_table(md, param);
if (table) {
@@ -1283,9 +1289,9 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
dm_table_put(table);
}
- out:
dm_put(md);
- return r;
+
+ return 0;
}
/*
@@ -1294,7 +1300,6 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
*/
static int table_status(struct dm_ioctl *param, size_t param_size)
{
- int r;
struct mapped_device *md;
struct dm_table *table;
@@ -1302,9 +1307,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
if (!md)
return -ENXIO;
- r = __dev_status(md, param);
- if (r)
- goto out;
+ __dev_status(md, param);
table = dm_get_live_or_inactive_table(md, param);
if (table) {
@@ -1312,9 +1315,9 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
dm_table_put(table);
}
-out:
dm_put(md);
- return r;
+
+ return 0;
}
/*
@@ -1333,10 +1336,6 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
if (!md)
return -ENXIO;
- r = __dev_status(md, param);
- if (r)
- goto out;
-
if (tmsg < (struct dm_target_msg *) param->data ||
invalid_str(tmsg->message, (void *) param + param_size)) {
DMWARN("Invalid target message parameters.");
@@ -1593,18 +1592,23 @@ static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
#endif
static const struct file_operations _ctl_fops = {
+ .open = nonseekable_open,
.unlocked_ioctl = dm_ctl_ioctl,
.compat_ioctl = dm_compat_ctl_ioctl,
.owner = THIS_MODULE,
+ .llseek = noop_llseek,
};
static struct miscdevice _dm_misc = {
- .minor = MISC_DYNAMIC_MINOR,
+ .minor = MAPPER_CTRL_MINOR,
.name = DM_NAME,
- .nodename = "mapper/control",
+ .nodename = DM_DIR "/" DM_CONTROL_NODE,
.fops = &_ctl_fops
};
+MODULE_ALIAS_MISCDEV(MAPPER_CTRL_MINOR);
+MODULE_ALIAS("devname:" DM_DIR "/" DM_CONTROL_NODE);
+
/*
* Create misc character device and link to DM_DIR/control.
*/
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9200dbf2391..3921e3bb43c 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ti->num_flush_requests = 1;
+ ti->num_discard_requests = 1;
ti->private = lc;
return 0;
@@ -73,7 +74,7 @@ static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector)
{
struct linear_c *lc = ti->private;
- return lc->start + (bi_sector - ti->begin);
+ return lc->start + dm_target_offset(ti, bi_sector);
}
static void linear_map_bio(struct dm_target *ti, struct bio *bio)
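
Several targets in this patch (dm-crypt, dm-delay, dm-linear, dm-raid1, dm-stripe) replace the open-coded bi_sector - ti->begin with dm_target_offset(). A small standalone sketch of that remap arithmetic is shown below, using simplified types rather than the kernel's struct dm_target.

/*
 * Illustrative sketch of the remapping that dm_target_offset() wraps
 * (sector minus the target's start within the mapped device).
 * Standalone C with simplified types; not the kernel definitions.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

struct dm_target_sketch {
	sector_t begin;	/* start of this target within the mapped device */
	sector_t len;
};

/* What dm_target_offset() returns: offset from the start of the target. */
static sector_t target_offset(const struct dm_target_sketch *ti, sector_t sector)
{
	return sector - ti->begin;
}

/* linear_map_sector() equivalent: remap onto the underlying device. */
static sector_t linear_remap(const struct dm_target_sketch *ti,
			     sector_t dev_start, sector_t bi_sector)
{
	return dev_start + target_offset(ti, bi_sector);
}

int main(void)
{
	struct dm_target_sketch ti = { .begin = 2048, .len = 8192 };

	/* Sector 2304 of the dm device is 256 sectors into the target,
	 * so it lands at dev_start + 256 = 4352 on the backing device. */
	printf("%llu\n", (unsigned long long)linear_remap(&ti, 4096, 2304));
	return 0;
}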
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 5a08be0222d..33420e68d15 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc)
.count = 0,
};
- lc->io_req.bi_rw = WRITE_BARRIER;
+ lc->io_req.bi_rw = WRITE_FLUSH;
return dm_io(&lc->io_req, 1, &null_location, NULL);
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 826bce7343b..487ecda90ad 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -706,6 +706,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
if (as->argc < nr_params) {
ti->error = "not enough path parameters";
+ r = -EINVAL;
goto bad;
}
@@ -892,6 +893,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
}
ti->num_flush_requests = 1;
+ ti->num_discard_requests = 1;
return 0;
@@ -1271,6 +1273,15 @@ static int do_end_io(struct multipath *m, struct request *clone,
if (error == -EOPNOTSUPP)
return error;
+ if (clone->cmd_flags & REQ_DISCARD)
+ /*
+ * Pass all discard request failures up.
+ * FIXME: only fail_path if the discard failed due to a
+ * transport problem. This requires precise understanding
+ * of the underlying failure (e.g. the SCSI sense).
+ */
+ return error;
+
if (mpio->pgpath)
fail_path(mpio->pgpath);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 74136262d65..19a59b041c2 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -259,7 +259,7 @@ static int mirror_flush(struct dm_target *ti)
struct dm_io_region io[ms->nr_mirrors];
struct mirror *m;
struct dm_io_request io_req = {
- .bi_rw = WRITE_BARRIER,
+ .bi_rw = WRITE_FLUSH,
.mem.type = DM_IO_KMEM,
.mem.ptr.bvec = NULL,
.client = ms->io_client,
@@ -445,7 +445,7 @@ static sector_t map_sector(struct mirror *m, struct bio *bio)
{
if (unlikely(!bio->bi_size))
return 0;
- return m->offset + (bio->bi_sector - m->ms->ti->begin);
+ return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector);
}
static void map_bio(struct mirror *m, struct bio *bio)
@@ -629,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
struct dm_io_region io[ms->nr_mirrors], *dest = io;
struct mirror *m;
struct dm_io_request io_req = {
- .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
+ .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
.mem.type = DM_IO_BVEC,
.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
.notify.fn = write_callback,
@@ -670,7 +670,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
bio_list_init(&requeue);
while ((bio = bio_list_pop(writes))) {
- if (unlikely(bio_empty_barrier(bio))) {
+ if (bio->bi_rw & REQ_FLUSH) {
bio_list_add(&sync, bio);
continue;
}
@@ -1203,7 +1203,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
* We need to dec pending if this was a write.
*/
if (rw == WRITE) {
- if (likely(!bio_empty_barrier(bio)))
+ if (!(bio->bi_rw & REQ_FLUSH))
dm_rh_dec(ms->rh, map_context->ll);
return error;
}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index bd5c58b2886..dad011aed0c 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -81,9 +81,9 @@ struct dm_region_hash {
struct list_head failed_recovered_regions;
/*
- * If there was a barrier failure no regions can be marked clean.
+ * If there was a flush failure no regions can be marked clean.
*/
- int barrier_failure;
+ int flush_failure;
void *context;
sector_t target_begin;
@@ -217,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create(
INIT_LIST_HEAD(&rh->quiesced_regions);
INIT_LIST_HEAD(&rh->recovered_regions);
INIT_LIST_HEAD(&rh->failed_recovered_regions);
- rh->barrier_failure = 0;
+ rh->flush_failure = 0;
rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
sizeof(struct dm_region));
@@ -399,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
region_t region = dm_rh_bio_to_region(rh, bio);
int recovering = 0;
- if (bio_empty_barrier(bio)) {
- rh->barrier_failure = 1;
+ if (bio->bi_rw & REQ_FLUSH) {
+ rh->flush_failure = 1;
return;
}
@@ -524,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
struct bio *bio;
for (bio = bios->head; bio; bio = bio->bi_next) {
- if (bio_empty_barrier(bio))
+ if (bio->bi_rw & REQ_FLUSH)
continue;
rh_inc(rh, dm_rh_bio_to_region(rh, bio));
}
@@ -555,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
*/
/* do nothing for DM_RH_NOSYNC */
- if (unlikely(rh->barrier_failure)) {
+ if (unlikely(rh->flush_failure)) {
/*
- * If a write barrier failed some time ago, we
+ * If a write flush failed some time ago, we
* don't know whether or not this write made it
* to the disk, so we must resync the device.
*/
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index c097d8a4823..0b61792a278 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -266,7 +266,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
*/
static chunk_t area_location(struct pstore *ps, chunk_t area)
{
- return 1 + ((ps->exceptions_per_area + 1) * area);
+ return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area);
}
/*
@@ -687,7 +687,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
/*
* Commit exceptions to disk.
*/
- if (ps->valid && area_io(ps, WRITE_BARRIER))
+ if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
ps->valid = 0;
/*
@@ -780,8 +780,8 @@ static int persistent_commit_merge(struct dm_exception_store *store,
* ps->current_area does not get reduced by prepare_merge() until
* after commit_merge() has removed the nr_merged previous exceptions.
*/
- ps->next_free = (area_location(ps, ps->current_area) - 1) +
- (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS;
+ ps->next_free = area_location(ps, ps->current_area) +
+ ps->current_committed + 1;
return 0;
}
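
The area_location()/next_free changes above are a pure simplification of the COW-device layout arithmetic: NUM_SNAPSHOT_HDR_CHUNKS header chunk(s) come first, then each area occupies one metadata chunk plus exceptions_per_area data chunks. A standalone check of the algebra (assuming the header is one chunk, as in dm-snap-persistent.c) is sketched below.

/*
 * Worked check of the on-disk layout arithmetic touched above.
 * Standalone sketch, not the kernel code; HDR stands in for
 * NUM_SNAPSHOT_HDR_CHUNKS and is assumed to be 1.
 */
#include <assert.h>
#include <stdio.h>

#define HDR 1ULL

static unsigned long long area_location(unsigned long long epa,
					unsigned long long area)
{
	return HDR + (epa + 1) * area;	/* new, explicit form */
}

int main(void)
{
	unsigned long long epa = 255, area = 3, committed = 17;

	/* Old expression: (area_location() - 1) + (committed + 1) + HDR,
	 * with the header hard-coded as "1 +" inside area_location(). */
	unsigned long long old_next = (1 + (epa + 1) * area - 1) + (committed + 1) + HDR;
	/* New expression: area_location() + committed + 1. */
	unsigned long long new_next = area_location(epa, area) + committed + 1;

	assert(old_next == new_next);	/* the simplification is purely algebraic */
	printf("next_free = %llu\n", new_next);
	return 0;
}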
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 54853773510..53cf79d8bcb 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -148,6 +148,12 @@ struct dm_snapshot {
#define RUNNING_MERGE 0
#define SHUTDOWN_MERGE 1
+struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
+{
+ return s->origin;
+}
+EXPORT_SYMBOL(dm_snap_origin);
+
struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
{
return s->cow;
@@ -700,8 +706,6 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
return 0;
}
-#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
-
/*
* Return a minimum chunk size of all snapshots that have the specified origin.
* Return zero if the origin has no snapshots.
@@ -1065,10 +1069,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
origin_mode = FMODE_WRITE;
}
- origin_path = argv[0];
- argv++;
- argc--;
-
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
ti->error = "Cannot allocate snapshot context private "
@@ -1077,6 +1077,16 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
+ origin_path = argv[0];
+ argv++;
+ argc--;
+
+ r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
+ if (r) {
+ ti->error = "Cannot get origin device";
+ goto bad_origin;
+ }
+
cow_path = argv[0];
argv++;
argc--;
@@ -1097,12 +1107,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
argv += args_used;
argc -= args_used;
- r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
- if (r) {
- ti->error = "Cannot get origin device";
- goto bad_origin;
- }
-
s->ti = ti;
s->valid = 1;
s->active = 0;
@@ -1212,15 +1216,15 @@ bad_kcopyd:
dm_exception_table_exit(&s->complete, exception_cache);
bad_hash_tables:
- dm_put_device(ti, s->origin);
-
-bad_origin:
dm_exception_store_destroy(s->store);
bad_store:
dm_put_device(ti, s->cow);
bad_cow:
+ dm_put_device(ti, s->origin);
+
+bad_origin:
kfree(s);
bad:
@@ -1314,12 +1318,12 @@ static void snapshot_dtr(struct dm_target *ti)
mempool_destroy(s->pending_pool);
- dm_put_device(ti, s->origin);
-
dm_exception_store_destroy(s->store);
dm_put_device(ti, s->cow);
+ dm_put_device(ti, s->origin);
+
kfree(s);
}
@@ -1581,7 +1585,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
chunk_t chunk;
struct dm_snap_pending_exception *pe = NULL;
- if (unlikely(bio_empty_barrier(bio))) {
+ if (bio->bi_rw & REQ_FLUSH) {
bio->bi_bdev = s->cow->bdev;
return DM_MAPIO_REMAPPED;
}
@@ -1685,8 +1689,8 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
int r = DM_MAPIO_REMAPPED;
chunk_t chunk;
- if (unlikely(bio_empty_barrier(bio))) {
- if (!map_context->flush_request)
+ if (bio->bi_rw & REQ_FLUSH) {
+ if (!map_context->target_request_nr)
bio->bi_bdev = s->origin->bdev;
else
bio->bi_bdev = s->cow->bdev;
@@ -1899,8 +1903,14 @@ static int snapshot_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct dm_snapshot *snap = ti->private;
+ int r;
+
+ r = fn(ti, snap->origin, 0, ti->len, data);
+
+ if (!r)
+ r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data);
- return fn(ti, snap->origin, 0, ti->len, data);
+ return r;
}
@@ -2123,7 +2133,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
struct dm_dev *dev = ti->private;
bio->bi_bdev = dev->bdev;
- if (unlikely(bio_empty_barrier(bio)))
+ if (bio->bi_rw & REQ_FLUSH)
return DM_MAPIO_REMAPPED;
/* Only tell snapshots if this is a write */
@@ -2159,6 +2169,21 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
return 0;
}
+static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct dm_dev *dev = ti->private;
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = dev->bdev;
+ bvm->bi_sector = bvm->bi_sector;
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
static int origin_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
@@ -2176,6 +2201,7 @@ static struct target_type origin_target = {
.map = origin_map,
.resume = origin_resume,
.status = origin_status,
+ .merge = origin_merge,
.iterate_devices = origin_iterate_devices,
};
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index d6e28d732b4..f0371b4c4fb 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -25,6 +25,8 @@ struct stripe {
struct stripe_c {
uint32_t stripes;
+ int stripes_shift;
+ sector_t stripes_mask;
/* The size of this target / num. stripes */
sector_t stripe_width;
@@ -162,16 +164,22 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
/* Set pointer to dm target; used in trigger_event */
sc->ti = ti;
-
sc->stripes = stripes;
sc->stripe_width = width;
+
+ if (stripes & (stripes - 1))
+ sc->stripes_shift = -1;
+ else {
+ sc->stripes_shift = ffs(stripes) - 1;
+ sc->stripes_mask = ((sector_t) stripes) - 1;
+ }
+
ti->split_io = chunk_size;
ti->num_flush_requests = stripes;
+ ti->num_discard_requests = stripes;
+ sc->chunk_shift = ffs(chunk_size) - 1;
sc->chunk_mask = ((sector_t) chunk_size) - 1;
- for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
- chunk_size >>= 1;
- sc->chunk_shift--;
/*
* Get the stripe destinations.
@@ -207,26 +215,79 @@ static void stripe_dtr(struct dm_target *ti)
kfree(sc);
}
+static void stripe_map_sector(struct stripe_c *sc, sector_t sector,
+ uint32_t *stripe, sector_t *result)
+{
+ sector_t offset = dm_target_offset(sc->ti, sector);
+ sector_t chunk = offset >> sc->chunk_shift;
+
+ if (sc->stripes_shift < 0)
+ *stripe = sector_div(chunk, sc->stripes);
+ else {
+ *stripe = chunk & sc->stripes_mask;
+ chunk >>= sc->stripes_shift;
+ }
+
+ *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask);
+}
+
+static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
+ uint32_t target_stripe, sector_t *result)
+{
+ uint32_t stripe;
+
+ stripe_map_sector(sc, sector, &stripe, result);
+ if (stripe == target_stripe)
+ return;
+ *result &= ~sc->chunk_mask; /* round down */
+ if (target_stripe < stripe)
+ *result += sc->chunk_mask + 1; /* next chunk */
+}
+
+static int stripe_map_discard(struct stripe_c *sc, struct bio *bio,
+ uint32_t target_stripe)
+{
+ sector_t begin, end;
+
+ stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin);
+ stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio),
+ target_stripe, &end);
+ if (begin < end) {
+ bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
+ bio->bi_sector = begin + sc->stripe[target_stripe].physical_start;
+ bio->bi_size = to_bytes(end - begin);
+ return DM_MAPIO_REMAPPED;
+ } else {
+ /* The range doesn't map to the target stripe */
+ bio_endio(bio, 0);
+ return DM_MAPIO_SUBMITTED;
+ }
+}
+
static int stripe_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
- struct stripe_c *sc = (struct stripe_c *) ti->private;
- sector_t offset, chunk;
+ struct stripe_c *sc = ti->private;
uint32_t stripe;
+ unsigned target_request_nr;
- if (unlikely(bio_empty_barrier(bio))) {
- BUG_ON(map_context->flush_request >= sc->stripes);
- bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev;
+ if (bio->bi_rw & REQ_FLUSH) {
+ target_request_nr = map_context->target_request_nr;
+ BUG_ON(target_request_nr >= sc->stripes);
+ bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
return DM_MAPIO_REMAPPED;
}
+ if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+ target_request_nr = map_context->target_request_nr;
+ BUG_ON(target_request_nr >= sc->stripes);
+ return stripe_map_discard(sc, bio, target_request_nr);
+ }
- offset = bio->bi_sector - ti->begin;
- chunk = offset >> sc->chunk_shift;
- stripe = sector_div(chunk, sc->stripes);
+ stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector);
+ bio->bi_sector += sc->stripe[stripe].physical_start;
bio->bi_bdev = sc->stripe[stripe].dev->bdev;
- bio->bi_sector = sc->stripe[stripe].physical_start +
- (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
+
return DM_MAPIO_REMAPPED;
}
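
The new stripe_map_sector() above adds a fast path: when the stripe count is a power of two, the stripe index and the chunk within it are computed with masks and shifts instead of sector_div(). A standalone sketch of that addressing, with simplified types in place of the kernel's struct stripe_c, follows.

/*
 * Standalone sketch of the stripe addressing introduced above: chunk and
 * stripe come from shifts/masks when the stripe count is a power of two,
 * with a division fallback otherwise.  Not the kernel code.
 */
#include <stdint.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

typedef uint64_t sector_t;

struct stripe_sketch {
	uint32_t stripes;
	int stripes_shift;	/* -1 when stripes is not a power of two */
	sector_t stripes_mask;
	int chunk_shift;	/* chunk_size must be a power of two */
	sector_t chunk_mask;
};

static void stripe_map(const struct stripe_sketch *sc, sector_t offset,
		       uint32_t *stripe, sector_t *result)
{
	sector_t chunk = offset >> sc->chunk_shift;

	if (sc->stripes_shift < 0) {
		*stripe = chunk % sc->stripes;	/* sector_div() in the kernel */
		chunk /= sc->stripes;
	} else {
		*stripe = chunk & sc->stripes_mask;
		chunk >>= sc->stripes_shift;
	}
	*result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask);
}

int main(void)
{
	/* 4 stripes, 8-sector chunks: both counts are powers of two. */
	struct stripe_sketch sc = {
		.stripes = 4, .stripes_shift = ffs(4) - 1, .stripes_mask = 4 - 1,
		.chunk_shift = ffs(8) - 1, .chunk_mask = 8 - 1,
	};
	uint32_t stripe;
	sector_t where;

	/* Offset 100 is chunk 12 -> stripe 0, third chunk on that stripe. */
	stripe_map(&sc, 100, &stripe, &where);
	printf("stripe %u, sector %llu\n", stripe, (unsigned long long)where);
	return 0;
}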
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 9924ea23032..90267f8d64e 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -54,6 +54,8 @@ struct dm_table {
sector_t *highs;
struct dm_target *targets;
+ unsigned discards_supported:1;
+
/*
* Indicates the rw permissions for the new logical
* device. This should be a combination of FMODE_READ
@@ -203,6 +205,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
INIT_LIST_HEAD(&t->devices);
atomic_set(&t->holders, 0);
+ t->discards_supported = 1;
if (!num_targets)
num_targets = KEYS_PER_NODE;
@@ -245,7 +248,7 @@ void dm_table_destroy(struct dm_table *t)
msleep(1);
smp_mb();
- /* free the indexes (see dm_table_complete) */
+ /* free the indexes */
if (t->depth >= 2)
vfree(t->index[t->depth - 2]);
@@ -483,11 +486,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
return 0;
}
-/*
- * Returns the minimum that is _not_ zero, unless both are zero.
- */
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
-
int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -770,6 +768,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
+ if (!tgt->num_discard_requests)
+ t->discards_supported = 0;
+
return 0;
bad:
@@ -778,7 +779,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
return r;
}
-int dm_table_set_type(struct dm_table *t)
+static int dm_table_set_type(struct dm_table *t)
{
unsigned i;
unsigned bio_based = 0, request_based = 0;
@@ -900,7 +901,7 @@ static int setup_indexes(struct dm_table *t)
/*
* Builds the btree to index the map.
*/
-int dm_table_complete(struct dm_table *t)
+static int dm_table_build_index(struct dm_table *t)
{
int r = 0;
unsigned int leaf_nodes;
@@ -919,6 +920,55 @@ int dm_table_complete(struct dm_table *t)
return r;
}
+/*
+ * Register the mapped device for blk_integrity support if
+ * the underlying devices support it.
+ */
+static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
+{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *dd;
+
+ list_for_each_entry(dd, devices, list)
+ if (bdev_get_integrity(dd->dm_dev.bdev))
+ return blk_integrity_register(dm_disk(md), NULL);
+
+ return 0;
+}
+
+/*
+ * Prepares the table for use by building the indices,
+ * setting the type, and allocating mempools.
+ */
+int dm_table_complete(struct dm_table *t)
+{
+ int r;
+
+ r = dm_table_set_type(t);
+ if (r) {
+ DMERR("unable to set table type");
+ return r;
+ }
+
+ r = dm_table_build_index(t);
+ if (r) {
+ DMERR("unable to build btrees");
+ return r;
+ }
+
+ r = dm_table_prealloc_integrity(t, t->md);
+ if (r) {
+ DMERR("could not register integrity profile.");
+ return r;
+ }
+
+ r = dm_table_alloc_md_mempools(t);
+ if (r)
+ DMERR("unable to allocate mempools");
+
+ return r;
+}
+
static DEFINE_MUTEX(_event_lock);
void dm_table_event_callback(struct dm_table *t,
void (*fn)(void *), void *context)
@@ -1086,6 +1136,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
+ if (!dm_table_supports_discards(t))
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+
dm_table_set_integrity(t);
/*
@@ -1232,6 +1287,39 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
return t->md;
}
+static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && blk_queue_discard(q);
+}
+
+bool dm_table_supports_discards(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ if (!t->discards_supported)
+ return 0;
+
+ /*
+ * Ensure that at least one underlying device supports discards.
+ * t->devices includes internal dm devices such as mirror logs
+ * so we need to use iterate_devices here, which targets
+ * supporting discard must provide.
+ */
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, device_discard_capable, NULL))
+ return 1;
+ }
+
+ return 0;
+}
+
EXPORT_SYMBOL(dm_vcalloc);
EXPORT_SYMBOL(dm_get_device);
EXPORT_SYMBOL(dm_put_device);
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 11dea11dc0b..8da366cf381 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -113,6 +113,11 @@ void dm_unregister_target(struct target_type *tt)
*/
static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
{
+ /*
+ * Return error for discards instead of -EOPNOTSUPP
+ */
+ tt->num_discard_requests = 1;
+
return 0;
}
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index bbc97030c0c..cc2b3cb8194 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -22,6 +22,11 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -EINVAL;
}
+ /*
+ * Silently drop discards, avoiding -EOPNOTSUPP.
+ */
+ ti->num_discard_requests = 1;
+
return 0;
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a3f21dc02bd..7cb1352f7e7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -15,11 +15,11 @@
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
+#include <linux/delay.h>
#include <trace/events/block.h>
@@ -32,6 +32,7 @@
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24
+static DEFINE_MUTEX(dm_mutex);
static const char *_name = DM_NAME;
static unsigned int major = 0;
@@ -109,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_QUEUE_IO_TO_THREAD 6
/*
* Work processed by per-device workqueue.
@@ -124,6 +124,10 @@ struct mapped_device {
unsigned long flags;
struct request_queue *queue;
+ unsigned type;
+ /* Protect queue and type against concurrent access. */
+ struct mutex type_lock;
+
struct gendisk *disk;
char name[16];
@@ -139,24 +143,9 @@ struct mapped_device {
spinlock_t deferred_lock;
/*
- * An error from the barrier request currently being processed.
- */
- int barrier_error;
-
- /*
- * Protect barrier_error from concurrent endio processing
- * in request-based dm.
- */
- spinlock_t barrier_error_lock;
-
- /*
- * Processing queue (flush/barriers)
+ * Processing queue (flush)
*/
struct workqueue_struct *wq;
- struct work_struct barrier_work;
-
- /* A pointer to the currently processing pre/post flush request */
- struct request *flush_request;
/*
* The current mapping.
@@ -195,8 +184,8 @@ struct mapped_device {
/* sysfs handle */
struct kobject kobj;
- /* zero-length barrier that will be cloned and submitted to targets */
- struct bio barrier_bio;
+ /* zero-length flush that will be cloned and submitted to targets */
+ struct bio flush_bio;
};
/*
@@ -339,7 +328,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
struct mapped_device *md;
- lock_kernel();
+ mutex_lock(&dm_mutex);
spin_lock(&_minor_lock);
md = bdev->bd_disk->private_data;
@@ -357,7 +346,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
out:
spin_unlock(&_minor_lock);
- unlock_kernel();
+ mutex_unlock(&dm_mutex);
return md ? 0 : -ENXIO;
}
@@ -366,10 +355,10 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
struct mapped_device *md = disk->private_data;
- lock_kernel();
+ mutex_lock(&dm_mutex);
atomic_dec(&md->open_count);
dm_put(md);
- unlock_kernel();
+ mutex_unlock(&dm_mutex);
return 0;
}
@@ -507,7 +496,7 @@ static void end_io_acct(struct dm_io *io)
/*
* After this is decremented the bio must not be touched if it is
- * a barrier.
+ * a flush.
*/
dm_disk(md)->part0.in_flight[rw] = pending =
atomic_dec_return(&md->pending[rw]);
@@ -523,16 +512,12 @@ static void end_io_acct(struct dm_io *io)
*/
static void queue_io(struct mapped_device *md, struct bio *bio)
{
- down_write(&md->io_lock);
+ unsigned long flags;
- spin_lock_irq(&md->deferred_lock);
+ spin_lock_irqsave(&md->deferred_lock, flags);
bio_list_add(&md->deferred, bio);
- spin_unlock_irq(&md->deferred_lock);
-
- if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
- queue_work(md->wq, &md->work);
-
- up_write(&md->io_lock);
+ spin_unlock_irqrestore(&md->deferred_lock, flags);
+ queue_work(md->wq, &md->work);
}
/*
@@ -620,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error)
* Target requested pushing back the I/O.
*/
spin_lock_irqsave(&md->deferred_lock, flags);
- if (__noflush_suspending(md)) {
- if (!(io->bio->bi_rw & REQ_HARDBARRIER))
- bio_list_add_head(&md->deferred,
- io->bio);
- } else
+ if (__noflush_suspending(md))
+ bio_list_add_head(&md->deferred, io->bio);
+ else
/* noflush suspend was interrupted. */
io->error = -EIO;
spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -632,26 +615,23 @@ static void dec_pending(struct dm_io *io, int error)
io_error = io->error;
bio = io->bio;
+ end_io_acct(io);
+ free_io(md, io);
+
+ if (io_error == DM_ENDIO_REQUEUE)
+ return;
- if (bio->bi_rw & REQ_HARDBARRIER) {
+ if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
/*
- * There can be just one barrier request so we use
- * a per-device variable for error reporting.
- * Note that you can't touch the bio after end_io_acct
+ * Preflush done for flush with data, reissue
+ * without REQ_FLUSH.
*/
- if (!md->barrier_error && io_error != -EOPNOTSUPP)
- md->barrier_error = io_error;
- end_io_acct(io);
- free_io(md, io);
+ bio->bi_rw &= ~REQ_FLUSH;
+ queue_io(md, bio);
} else {
- end_io_acct(io);
- free_io(md, io);
-
- if (io_error != DM_ENDIO_REQUEUE) {
- trace_block_bio_complete(md->queue, bio);
-
- bio_endio(bio, io_error);
- }
+ /* done with normal IO or empty flush */
+ trace_block_bio_complete(md->queue, bio);
+ bio_endio(bio, io_error);
}
}
}
@@ -744,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error)
blk_update_request(tio->orig, 0, nr_bytes);
}
-static void store_barrier_error(struct mapped_device *md, int error)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&md->barrier_error_lock, flags);
- /*
- * Basically, the first error is taken, but:
- * -EOPNOTSUPP supersedes any I/O error.
- * Requeue request supersedes any I/O error but -EOPNOTSUPP.
- */
- if (!md->barrier_error || error == -EOPNOTSUPP ||
- (md->barrier_error != -EOPNOTSUPP &&
- error == DM_ENDIO_REQUEUE))
- md->barrier_error = error;
- spin_unlock_irqrestore(&md->barrier_error_lock, flags);
-}
-
/*
* Don't touch any member of the md after calling this function because
* the md may be freed in dm_put() at the end of this function.
@@ -798,13 +761,11 @@ static void free_rq_clone(struct request *clone)
static void dm_end_request(struct request *clone, int error)
{
int rw = rq_data_dir(clone);
- int run_queue = 1;
- bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
struct dm_rq_target_io *tio = clone->end_io_data;
struct mapped_device *md = tio->md;
struct request *rq = tio->orig;
- if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
rq->errors = clone->errors;
rq->resid_len = clone->resid_len;
@@ -818,15 +779,8 @@ static void dm_end_request(struct request *clone, int error)
}
free_rq_clone(clone);
-
- if (unlikely(is_barrier)) {
- if (unlikely(error))
- store_barrier_error(md, error);
- run_queue = 0;
- } else
- blk_end_request_all(rq, error);
-
- rq_completed(md, rw, run_queue);
+ blk_end_request_all(rq, error);
+ rq_completed(md, rw, true);
}
static void dm_unprep_request(struct request *rq)
@@ -851,16 +805,6 @@ void dm_requeue_unmapped_request(struct request *clone)
struct request_queue *q = rq->q;
unsigned long flags;
- if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
- /*
- * Barrier clones share an original request.
- * Leave it to dm_end_request(), which handles this special
- * case.
- */
- dm_end_request(clone, DM_ENDIO_REQUEUE);
- return;
- }
-
dm_unprep_request(rq);
spin_lock_irqsave(q->queue_lock, flags);
@@ -950,19 +894,6 @@ static void dm_complete_request(struct request *clone, int error)
struct dm_rq_target_io *tio = clone->end_io_data;
struct request *rq = tio->orig;
- if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
- /*
- * Barrier clones share an original request. So can't use
- * softirq_done with the original.
- * Pass the clone to dm_done() directly in this special case.
- * It is safe (even if clone->q->queue_lock is held here)
- * because there is no I/O dispatching during the completion
- * of barrier clone.
- */
- dm_done(clone, error, true);
- return;
- }
-
tio->error = error;
rq->completion_data = clone;
blk_complete_request(rq);
@@ -979,17 +910,6 @@ void dm_kill_unmapped_request(struct request *clone, int error)
struct dm_rq_target_io *tio = clone->end_io_data;
struct request *rq = tio->orig;
- if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
- /*
- * Barrier clones share an original request.
- * Leave it to dm_end_request(), which handles this special
- * case.
- */
- BUG_ON(error > 0);
- dm_end_request(clone, error);
- return;
- }
-
rq->cmd_flags |= REQ_FAILED;
dm_complete_request(clone, error);
}
@@ -1019,17 +939,27 @@ static void end_clone_request(struct request *clone, int error)
dm_complete_request(clone, error);
}
-static sector_t max_io_len(struct mapped_device *md,
- sector_t sector, struct dm_target *ti)
+/*
+ * Return maximum size of I/O possible at the supplied sector up to the current
+ * target boundary.
+ */
+static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
- sector_t offset = sector - ti->begin;
- sector_t len = ti->len - offset;
+ sector_t target_offset = dm_target_offset(ti, sector);
+
+ return ti->len - target_offset;
+}
+
+static sector_t max_io_len(sector_t sector, struct dm_target *ti)
+{
+ sector_t len = max_io_len_target_boundary(sector, ti);
/*
* Does the target need to split even further ?
*/
if (ti->split_io) {
sector_t boundary;
+ sector_t offset = dm_target_offset(ti, sector);
boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
- offset;
if (len > boundary)
@@ -1098,7 +1028,7 @@ static void dm_bio_destructor(struct bio *bio)
}
/*
- * Creates a little bio that is just does part of a bvec.
+ * Creates a little bio that just does part of a bvec.
*/
static struct bio *split_bvec(struct bio *bio, sector_t sector,
unsigned short idx, unsigned int offset,
@@ -1113,7 +1043,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
clone->bi_sector = sector;
clone->bi_bdev = bio->bi_bdev;
- clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
+ clone->bi_rw = bio->bi_rw;
clone->bi_vcnt = 1;
clone->bi_size = to_bytes(len);
clone->bi_io_vec->bv_offset = offset;
@@ -1140,7 +1070,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
__bio_clone(clone, bio);
- clone->bi_rw &= ~REQ_HARDBARRIER;
clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
@@ -1171,32 +1100,91 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
return tio;
}
-static void __flush_target(struct clone_info *ci, struct dm_target *ti,
- unsigned flush_nr)
+static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
+ unsigned request_nr, sector_t len)
{
struct dm_target_io *tio = alloc_tio(ci, ti);
struct bio *clone;
- tio->info.flush_request = flush_nr;
+ tio->info.target_request_nr = request_nr;
- clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
+ /*
+ * Discard requests require the bio's inline iovecs be initialized.
+ * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
+ * and discard, so no need for concern about wasted bvec allocations.
+ */
+ clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
__bio_clone(clone, ci->bio);
clone->bi_destructor = dm_bio_destructor;
+ if (len) {
+ clone->bi_sector = ci->sector;
+ clone->bi_size = to_bytes(len);
+ }
__map_bio(ti, clone, tio);
}
-static int __clone_and_map_empty_barrier(struct clone_info *ci)
+static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
+ unsigned num_requests, sector_t len)
{
- unsigned target_nr = 0, flush_nr;
+ unsigned request_nr;
+
+ for (request_nr = 0; request_nr < num_requests; request_nr++)
+ __issue_target_request(ci, ti, request_nr, len);
+}
+
+static int __clone_and_map_empty_flush(struct clone_info *ci)
+{
+ unsigned target_nr = 0;
struct dm_target *ti;
+ BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
- for (flush_nr = 0; flush_nr < ti->num_flush_requests;
- flush_nr++)
- __flush_target(ci, ti, flush_nr);
+ __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
+ return 0;
+}
+
+/*
+ * Perform all io with a single clone.
+ */
+static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
+{
+ struct bio *clone, *bio = ci->bio;
+ struct dm_target_io *tio;
+
+ tio = alloc_tio(ci, ti);
+ clone = clone_bio(bio, ci->sector, ci->idx,
+ bio->bi_vcnt - ci->idx, ci->sector_count,
+ ci->md->bs);
+ __map_bio(ti, clone, tio);
ci->sector_count = 0;
+}
+
+static int __clone_and_map_discard(struct clone_info *ci)
+{
+ struct dm_target *ti;
+ sector_t len;
+
+ do {
+ ti = dm_table_find_target(ci->map, ci->sector);
+ if (!dm_target_is_valid(ti))
+ return -EIO;
+
+ /*
+ * Even though the device advertised discard support,
+ * reconfiguration might have changed that since the
+ * check was performed.
+ */
+ if (!ti->num_discard_requests)
+ return -EOPNOTSUPP;
+
+ len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
+
+ __issue_target_requests(ci, ti, ti->num_discard_requests, len);
+
+ ci->sector += len;
+ } while (ci->sector_count -= len);
return 0;
}
@@ -1208,30 +1196,21 @@ static int __clone_and_map(struct clone_info *ci)
sector_t len = 0, max;
struct dm_target_io *tio;
- if (unlikely(bio_empty_barrier(bio)))
- return __clone_and_map_empty_barrier(ci);
+ if (unlikely(bio->bi_rw & REQ_DISCARD))
+ return __clone_and_map_discard(ci);
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
return -EIO;
- max = max_io_len(ci->md, ci->sector, ti);
-
- /*
- * Allocate a target io object.
- */
- tio = alloc_tio(ci, ti);
+ max = max_io_len(ci->sector, ti);
if (ci->sector_count <= max) {
/*
* Optimise for the simple case where we can do all of
* the remaining io with a single clone.
*/
- clone = clone_bio(bio, ci->sector, ci->idx,
- bio->bi_vcnt - ci->idx, ci->sector_count,
- ci->md->bs);
- __map_bio(ti, clone, tio);
- ci->sector_count = 0;
+ __clone_and_map_simple(ci, ti);
} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
/*
@@ -1252,6 +1231,7 @@ static int __clone_and_map(struct clone_info *ci)
len += bv_len;
}
+ tio = alloc_tio(ci, ti);
clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
ci->md->bs);
__map_bio(ti, clone, tio);
@@ -1274,13 +1254,12 @@ static int __clone_and_map(struct clone_info *ci)
if (!dm_target_is_valid(ti))
return -EIO;
- max = max_io_len(ci->md, ci->sector, ti);
-
- tio = alloc_tio(ci, ti);
+ max = max_io_len(ci->sector, ti);
}
len = min(remaining, max);
+ tio = alloc_tio(ci, ti);
clone = split_bvec(bio, ci->sector, ci->idx,
bv->bv_offset + offset, len,
ci->md->bs);
@@ -1308,16 +1287,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
ci.map = dm_get_live_table(md);
if (unlikely(!ci.map)) {
- if (!(bio->bi_rw & REQ_HARDBARRIER))
- bio_io_error(bio);
- else
- if (!md->barrier_error)
- md->barrier_error = -EIO;
+ bio_io_error(bio);
return;
}
ci.md = md;
- ci.bio = bio;
ci.io = alloc_io(md);
ci.io->error = 0;
atomic_set(&ci.io->io_count, 1);
@@ -1325,14 +1299,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
ci.io->md = md;
spin_lock_init(&ci.io->endio_lock);
ci.sector = bio->bi_sector;
- ci.sector_count = bio_sectors(bio);
- if (unlikely(bio_empty_barrier(bio)))
- ci.sector_count = 1;
ci.idx = bio->bi_idx;
start_io_acct(ci.io);
- while (ci.sector_count && !error)
- error = __clone_and_map(&ci);
+ if (bio->bi_rw & REQ_FLUSH) {
+ ci.bio = &ci.md->flush_bio;
+ ci.sector_count = 0;
+ error = __clone_and_map_empty_flush(&ci);
+ /* dec_pending submits any data associated with flush */
+ } else {
+ ci.bio = bio;
+ ci.sector_count = bio_sectors(bio);
+ while (ci.sector_count && !error)
+ error = __clone_and_map(&ci);
+ }
/* drop the extra reference count */
dec_pending(ci.io, error);
@@ -1362,7 +1342,7 @@ static int dm_merge_bvec(struct request_queue *q,
/*
* Find maximum amount of I/O that won't need splitting
*/
- max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
+ max_sectors = min(max_io_len(bvm->bi_sector, ti),
(sector_t) BIO_MAX_SECTORS);
max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
if (max_size < 0)
@@ -1416,22 +1396,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
part_stat_unlock();
- /*
- * If we're suspended or the thread is processing barriers
- * we have to queue this io for later.
- */
- if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
- unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+ /* if we're suspended, we have to queue this io for later */
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
up_read(&md->io_lock);
- if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
- bio_rw(bio) == READA) {
+ if (bio_rw(bio) != READA)
+ queue_io(md, bio);
+ else
bio_io_error(bio);
- return 0;
- }
-
- queue_io(md, bio);
-
return 0;
}
@@ -1462,14 +1434,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
return _dm_request(q, bio);
}
-static bool dm_rq_is_flush_request(struct request *rq)
-{
- if (rq->cmd_flags & REQ_FLUSH)
- return true;
- else
- return false;
-}
-
void dm_dispatch_request(struct request *rq)
{
int r;
@@ -1517,22 +1481,15 @@ static int setup_clone(struct request *clone, struct request *rq,
{
int r;
- if (dm_rq_is_flush_request(rq)) {
- blk_rq_init(NULL, clone);
- clone->cmd_type = REQ_TYPE_FS;
- clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
- } else {
- r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
- dm_rq_bio_constructor, tio);
- if (r)
- return r;
-
- clone->cmd = rq->cmd;
- clone->cmd_len = rq->cmd_len;
- clone->sense = rq->sense;
- clone->buffer = rq->buffer;
- }
+ r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+ dm_rq_bio_constructor, tio);
+ if (r)
+ return r;
+ clone->cmd = rq->cmd;
+ clone->cmd_len = rq->cmd_len;
+ clone->sense = rq->sense;
+ clone->buffer = rq->buffer;
clone->end_io = end_clone_request;
clone->end_io_data = tio;
@@ -1573,9 +1530,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
struct mapped_device *md = q->queuedata;
struct request *clone;
- if (unlikely(dm_rq_is_flush_request(rq)))
- return BLKPREP_OK;
-
if (unlikely(rq->special)) {
DMWARN("Already has something in rq->special.");
return BLKPREP_KILL;
@@ -1652,6 +1606,7 @@ static void dm_request_fn(struct request_queue *q)
struct dm_table *map = dm_get_live_table(md);
struct dm_target *ti;
struct request *rq, *clone;
+ sector_t pos;
/*
* For suspend, check blk_queue_stopped() and increment
@@ -1664,15 +1619,14 @@ static void dm_request_fn(struct request_queue *q)
if (!rq)
goto plug_and_out;
- if (unlikely(dm_rq_is_flush_request(rq))) {
- BUG_ON(md->flush_request);
- md->flush_request = rq;
- blk_start_request(rq);
- queue_work(md->wq, &md->barrier_work);
- goto out;
- }
+ /* always use block 0 to find the target for flushes for now */
+ pos = 0;
+ if (!(rq->cmd_flags & REQ_FLUSH))
+ pos = blk_rq_pos(rq);
+
+ ti = dm_table_find_target(map, pos);
+ BUG_ON(!dm_target_is_valid(ti));
- ti = dm_table_find_target(map, blk_rq_pos(rq));
if (ti->type->busy && ti->type->busy(ti))
goto plug_and_out;
@@ -1843,7 +1797,29 @@ out:
static const struct block_device_operations dm_blk_dops;
static void dm_wq_work(struct work_struct *work);
-static void dm_rq_barrier_work(struct work_struct *work);
+
+static void dm_init_md_queue(struct mapped_device *md)
+{
+ /*
+ * Request-based dm devices cannot be stacked on top of bio-based dm
+ * devices. The type of this dm device has not been decided yet.
+ * The type is decided at the first table loading time.
+ * To prevent problematic device stacking, clear the queue flag
+ * for request stacking support until then.
+ *
+ * This queue is new, so no concurrency on the queue_flags.
+ */
+ queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+
+ md->queue->queuedata = md;
+ md->queue->backing_dev_info.congested_fn = dm_any_congested;
+ md->queue->backing_dev_info.congested_data = md;
+ blk_queue_make_request(md->queue, dm_request);
+ blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
+ md->queue->unplug_fn = dm_unplug_all;
+ blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+ blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
+}
/*
* Allocate and initialise a blank device with a given minor.
@@ -1870,10 +1846,11 @@ static struct mapped_device *alloc_dev(int minor)
if (r < 0)
goto bad_minor;
+ md->type = DM_TYPE_NONE;
init_rwsem(&md->io_lock);
mutex_init(&md->suspend_lock);
+ mutex_init(&md->type_lock);
spin_lock_init(&md->deferred_lock);
- spin_lock_init(&md->barrier_error_lock);
rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
@@ -1882,33 +1859,11 @@ static struct mapped_device *alloc_dev(int minor)
INIT_LIST_HEAD(&md->uevent_list);
spin_lock_init(&md->uevent_lock);
- md->queue = blk_init_queue(dm_request_fn, NULL);
+ md->queue = blk_alloc_queue(GFP_KERNEL);
if (!md->queue)
goto bad_queue;
- /*
- * Request-based dm devices cannot be stacked on top of bio-based dm
- * devices. The type of this dm device has not been decided yet,
- * although we initialized the queue using blk_init_queue().
- * The type is decided at the first table loading time.
- * To prevent problematic device stacking, clear the queue flag
- * for request stacking support until then.
- *
- * This queue is new, so no concurrency on the queue_flags.
- */
- queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
- md->saved_make_request_fn = md->queue->make_request_fn;
- md->queue->queuedata = md;
- md->queue->backing_dev_info.congested_fn = dm_any_congested;
- md->queue->backing_dev_info.congested_data = md;
- blk_queue_make_request(md->queue, dm_request);
- blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
- md->queue->unplug_fn = dm_unplug_all;
- blk_queue_merge_bvec(md->queue, dm_merge_bvec);
- blk_queue_softirq_done(md->queue, dm_softirq_done);
- blk_queue_prep_rq(md->queue, dm_prep_fn);
- blk_queue_lld_busy(md->queue, dm_lld_busy);
- blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
+ dm_init_md_queue(md);
md->disk = alloc_disk(1);
if (!md->disk)
@@ -1918,7 +1873,6 @@ static struct mapped_device *alloc_dev(int minor)
atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
- INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
init_waitqueue_head(&md->eventq);
md->disk->major = _major;
@@ -1938,6 +1892,10 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->bdev)
goto bad_bdev;
+ bio_init(&md->flush_bio);
+ md->flush_bio.bi_bdev = md->bdev;
+ md->flush_bio.bi_rw = WRITE_FLUSH;
+
/* Populate the mapping, nobody knows we exist yet */
spin_lock(&_minor_lock);
old_md = idr_replace(&_minor_idr, md, minor);
@@ -2123,6 +2081,71 @@ int dm_create(int minor, struct mapped_device **result)
return 0;
}
+/*
+ * Functions to manage md->type.
+ * All are required to hold md->type_lock.
+ */
+void dm_lock_md_type(struct mapped_device *md)
+{
+ mutex_lock(&md->type_lock);
+}
+
+void dm_unlock_md_type(struct mapped_device *md)
+{
+ mutex_unlock(&md->type_lock);
+}
+
+void dm_set_md_type(struct mapped_device *md, unsigned type)
+{
+ md->type = type;
+}
+
+unsigned dm_get_md_type(struct mapped_device *md)
+{
+ return md->type;
+}
+
+/*
+ * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
+ */
+static int dm_init_request_based_queue(struct mapped_device *md)
+{
+ struct request_queue *q = NULL;
+
+ if (md->queue->elevator)
+ return 1;
+
+ /* Fully initialize the queue */
+ q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
+ if (!q)
+ return 0;
+
+ md->queue = q;
+ md->saved_make_request_fn = md->queue->make_request_fn;
+ dm_init_md_queue(md);
+ blk_queue_softirq_done(md->queue, dm_softirq_done);
+ blk_queue_prep_rq(md->queue, dm_prep_fn);
+ blk_queue_lld_busy(md->queue, dm_lld_busy);
+
+ elv_register_queue(md->queue);
+
+ return 1;
+}
+
+/*
+ * Setup the DM device's queue based on md's type
+ */
+int dm_setup_md_queue(struct mapped_device *md)
+{
+ if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
+ !dm_init_request_based_queue(md)) {
+ DMWARN("Cannot initialize queue for request-based mapped device");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static struct mapped_device *dm_find_md(dev_t dev)
{
struct mapped_device *md;
@@ -2136,6 +2159,7 @@ static struct mapped_device *dm_find_md(dev_t dev)
md = idr_find(&_minor_idr, minor);
if (md && (md == MINOR_ALLOCED ||
(MINOR(disk_devt(dm_disk(md))) != minor) ||
+ dm_deleting_md(md) ||
test_bit(DMF_FREEING, &md->flags))) {
md = NULL;
goto out;
@@ -2170,6 +2194,7 @@ void dm_set_mdptr(struct mapped_device *md, void *ptr)
void dm_get(struct mapped_device *md)
{
atomic_inc(&md->holders);
+ BUG_ON(test_bit(DMF_FREEING, &md->flags));
}
const char *dm_device_name(struct mapped_device *md)
@@ -2178,27 +2203,55 @@ const char *dm_device_name(struct mapped_device *md)
}
EXPORT_SYMBOL_GPL(dm_device_name);
-void dm_put(struct mapped_device *md)
+static void __dm_destroy(struct mapped_device *md, bool wait)
{
struct dm_table *map;
- BUG_ON(test_bit(DMF_FREEING, &md->flags));
+ might_sleep();
- if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
- map = dm_get_live_table(md);
- idr_replace(&_minor_idr, MINOR_ALLOCED,
- MINOR(disk_devt(dm_disk(md))));
- set_bit(DMF_FREEING, &md->flags);
- spin_unlock(&_minor_lock);
- if (!dm_suspended_md(md)) {
- dm_table_presuspend_targets(map);
- dm_table_postsuspend_targets(map);
- }
- dm_sysfs_exit(md);
- dm_table_put(map);
- dm_table_destroy(__unbind(md));
- free_dev(md);
+ spin_lock(&_minor_lock);
+ map = dm_get_live_table(md);
+ idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
+ set_bit(DMF_FREEING, &md->flags);
+ spin_unlock(&_minor_lock);
+
+ if (!dm_suspended_md(md)) {
+ dm_table_presuspend_targets(map);
+ dm_table_postsuspend_targets(map);
}
+
+ /*
+ * Rare, but there may be I/O requests still going to complete,
+ * for example. Wait for all references to disappear.
+ * No one should increment the reference count of the mapped_device,
+ * after the mapped_device state becomes DMF_FREEING.
+ */
+ if (wait)
+ while (atomic_read(&md->holders))
+ msleep(1);
+ else if (atomic_read(&md->holders))
+ DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
+ dm_device_name(md), atomic_read(&md->holders));
+
+ dm_sysfs_exit(md);
+ dm_table_put(map);
+ dm_table_destroy(__unbind(md));
+ free_dev(md);
+}
+
+void dm_destroy(struct mapped_device *md)
+{
+ __dm_destroy(md, true);
+}
+
+void dm_destroy_immediate(struct mapped_device *md)
+{
+ __dm_destroy(md, false);
+}
+
+void dm_put(struct mapped_device *md)
+{
+ atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);
@@ -2233,38 +2286,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
return r;
}
-static void dm_flush(struct mapped_device *md)
-{
- dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
- bio_init(&md->barrier_bio);
- md->barrier_bio.bi_bdev = md->bdev;
- md->barrier_bio.bi_rw = WRITE_BARRIER;
- __split_and_process_bio(md, &md->barrier_bio);
-
- dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-}
-
-static void process_barrier(struct mapped_device *md, struct bio *bio)
-{
- md->barrier_error = 0;
-
- dm_flush(md);
-
- if (!bio_empty_barrier(bio)) {
- __split_and_process_bio(md, bio);
- dm_flush(md);
- }
-
- if (md->barrier_error != DM_ENDIO_REQUEUE)
- bio_endio(bio, md->barrier_error);
- else {
- spin_lock_irq(&md->deferred_lock);
- bio_list_add_head(&md->deferred, bio);
- spin_unlock_irq(&md->deferred_lock);
- }
-}
-
/*
* Process the deferred bios
*/
@@ -2274,33 +2295,27 @@ static void dm_wq_work(struct work_struct *work)
work);
struct bio *c;
- down_write(&md->io_lock);
+ down_read(&md->io_lock);
while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
spin_lock_irq(&md->deferred_lock);
c = bio_list_pop(&md->deferred);
spin_unlock_irq(&md->deferred_lock);
- if (!c) {
- clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+ if (!c)
break;
- }
- up_write(&md->io_lock);
+ up_read(&md->io_lock);
if (dm_request_based(md))
generic_make_request(c);
- else {
- if (c->bi_rw & REQ_HARDBARRIER)
- process_barrier(md, c);
- else
- __split_and_process_bio(md, c);
- }
+ else
+ __split_and_process_bio(md, c);
- down_write(&md->io_lock);
+ down_read(&md->io_lock);
}
- up_write(&md->io_lock);
+ up_read(&md->io_lock);
}
static void dm_queue_flush(struct mapped_device *md)
@@ -2310,73 +2325,6 @@ static void dm_queue_flush(struct mapped_device *md)
queue_work(md->wq, &md->work);
}
-static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
-{
- struct dm_rq_target_io *tio = clone->end_io_data;
-
- tio->info.flush_request = flush_nr;
-}
-
-/* Issue barrier requests to targets and wait for their completion. */
-static int dm_rq_barrier(struct mapped_device *md)
-{
- int i, j;
- struct dm_table *map = dm_get_live_table(md);
- unsigned num_targets = dm_table_get_num_targets(map);
- struct dm_target *ti;
- struct request *clone;
-
- md->barrier_error = 0;
-
- for (i = 0; i < num_targets; i++) {
- ti = dm_table_get_target(map, i);
- for (j = 0; j < ti->num_flush_requests; j++) {
- clone = clone_rq(md->flush_request, md, GFP_NOIO);
- dm_rq_set_flush_nr(clone, j);
- atomic_inc(&md->pending[rq_data_dir(clone)]);
- map_request(ti, clone, md);
- }
- }
-
- dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
- dm_table_put(map);
-
- return md->barrier_error;
-}
-
-static void dm_rq_barrier_work(struct work_struct *work)
-{
- int error;
- struct mapped_device *md = container_of(work, struct mapped_device,
- barrier_work);
- struct request_queue *q = md->queue;
- struct request *rq;
- unsigned long flags;
-
- /*
- * Hold the md reference here and leave it at the last part so that
- * the md can't be deleted by device opener when the barrier request
- * completes.
- */
- dm_get(md);
-
- error = dm_rq_barrier(md);
-
- rq = md->flush_request;
- md->flush_request = NULL;
-
- if (error == DM_ENDIO_REQUEUE) {
- spin_lock_irqsave(q->queue_lock, flags);
- blk_requeue_request(q, rq);
- spin_unlock_irqrestore(q->queue_lock, flags);
- } else
- blk_end_request_all(rq, error);
-
- blk_run_queue(q);
-
- dm_put(md);
-}
-
/*
* Swap in a new table, returning the old one for the caller to destroy.
*/
@@ -2398,13 +2346,6 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
goto out;
}
- /* cannot change the device type, once a table is bound */
- if (md->map &&
- (dm_table_get_type(md->map) != dm_table_get_type(table))) {
- DMWARN("can't change the device type after a table is bound");
- goto out;
- }
-
map = __bind(md, table, &limits);
out:
@@ -2506,23 +2447,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
*
* To get all processes out of __split_and_process_bio in dm_request,
* we take the write lock. To prevent any process from reentering
- * __split_and_process_bio from dm_request, we set
- * DMF_QUEUE_IO_TO_THREAD.
- *
- * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
- * and call flush_workqueue(md->wq). flush_workqueue will wait until
- * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
- * further calls to __split_and_process_bio from dm_wq_work.
+ * __split_and_process_bio from dm_request and quiesce the thread
+ * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
+ * flush_workqueue(md->wq).
*/
down_write(&md->io_lock);
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
- set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
up_write(&md->io_lock);
/*
- * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
- * can be kicked until md->queue is stopped. So stop md->queue before
- * flushing md->wq.
+ * Stop md->queue before flushing md->wq in case request-based
+ * dm defers requests to md->wq from md->queue.
*/
if (dm_request_based(md))
stop_queue(md->queue);
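Taken together, the dm.c hunks above replace the per-device barrier machinery with a two-phase flush: a REQ_FLUSH bio is first turned into one empty flush clone per target (the preflush), and any data payload is reissued, without REQ_FLUSH, only after dec_pending() sees the preflush complete. A condensed sketch of that flow, assembled from the hunks above (locking, accounting and error paths elided, so this is not a compilable excerpt):

	/* Sketch: bio-based flush handling after this patch. */
	static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
	{
		struct clone_info ci;
		int error = 0;

		if (bio->bi_rw & REQ_FLUSH) {
			/* Preflush: clone md->flush_bio once per target flush request. */
			ci.bio = &md->flush_bio;
			ci.sector_count = 0;
			error = __clone_and_map_empty_flush(&ci);
			/* dec_pending() submits any data associated with the flush. */
		} else {
			ci.bio = bio;
			ci.sector_count = bio_sectors(bio);
			while (ci.sector_count && !error)
				error = __clone_and_map(&ci);
		}
		dec_pending(ci.io, error);
	}

	/* In dec_pending(): a flush that still carries data is resubmitted as a
	 * plain write once its preflush has completed; everything else ends here. */
	if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
		bio->bi_rw &= ~REQ_FLUSH;
		queue_io(md, bio);
	} else {
		trace_block_bio_complete(md->queue, bio);
		bio_endio(bio, io_error);
	}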
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index bad1724d486..0c2dd5f4af7 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -59,13 +59,20 @@ void dm_table_postsuspend_targets(struct dm_table *t);
int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
int dm_table_any_busy_target(struct dm_table *t);
-int dm_table_set_type(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
+bool dm_table_supports_discards(struct dm_table *t);
int dm_table_alloc_md_mempools(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
+void dm_lock_md_type(struct mapped_device *md);
+void dm_unlock_md_type(struct mapped_device *md);
+void dm_set_md_type(struct mapped_device *md, unsigned type);
+unsigned dm_get_md_type(struct mapped_device *md);
+
+int dm_setup_md_queue(struct mapped_device *md);
+
/*
* To check the return value from dm_table_find_target().
*/
@@ -122,6 +129,11 @@ void dm_linear_exit(void);
int dm_stripe_init(void);
void dm_stripe_exit(void);
+/*
+ * mapped_device operations
+ */
+void dm_destroy(struct mapped_device *md);
+void dm_destroy_immediate(struct mapped_device *md);
int dm_open_count(struct mapped_device *md);
int dm_lock_for_deletion(struct mapped_device *md);
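The new type-locking helpers declared here pair with dm_setup_md_queue(): the first table load fixes the device type and, for request-based devices, finishes the queue initialisation that alloc_dev() now only begins. This is also presumably why the "cannot change the device type" check disappears from dm_swap_table() above, as the decision moves to table-load time. The real caller sits in dm-ioctl.c, which is not reproduced in this section, so the following is only a hypothetical usage sketch built from the helpers declared above:

	/* Hypothetical caller sketch (the actual code lives in dm-ioctl.c's
	 * table-load path, not shown here). */
	dm_lock_md_type(md);

	if (dm_get_md_type(md) == DM_TYPE_NONE)
		/* the first table loaded decides the device type */
		dm_set_md_type(md, dm_table_get_type(t));
	else if (dm_get_md_type(md) != dm_table_get_type(t)) {
		DMWARN("can't change device type after initial table load");
		dm_unlock_md_type(md);
		return -EINVAL;
	}

	/* For DM_TYPE_REQUEST_BASED this completes ->elevator/->request_fn setup. */
	r = dm_setup_md_queue(md);
	dm_unlock_md_type(md);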
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ba19060bcf3..8a2f767f26d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
dev_info_t *tmp_dev;
sector_t start_sector;
- if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
- md_barrier_request(mddev, bio);
+ if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bio);
return 0;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 11567c7999a..225815197a3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -36,7 +36,7 @@
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/ctype.h>
@@ -57,6 +57,7 @@
#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))
+static DEFINE_MUTEX(md_mutex);
#ifndef MODULE
static void autostart_arrays(int part);
@@ -226,12 +227,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
return 0;
}
rcu_read_lock();
- if (mddev->suspended || mddev->barrier) {
+ if (mddev->suspended) {
DEFINE_WAIT(__wait);
for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
TASK_UNINTERRUPTIBLE);
- if (!mddev->suspended && !mddev->barrier)
+ if (!mddev->suspended)
break;
rcu_read_unlock();
schedule();
@@ -282,40 +283,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
int mddev_congested(mddev_t *mddev, int bits)
{
- if (mddev->barrier)
- return 1;
return mddev->suspended;
}
EXPORT_SYMBOL(mddev_congested);
/*
- * Generic barrier handling for md
+ * Generic flush handling for md
*/
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
{
mdk_rdev_t *rdev = bio->bi_private;
mddev_t *mddev = rdev->mddev;
- if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
- set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->flush_pending)) {
- if (mddev->barrier == POST_REQUEST_BARRIER) {
- /* This was a post-request barrier */
- mddev->barrier = NULL;
- wake_up(&mddev->sb_wait);
- } else
- /* The pre-request barrier has finished */
- schedule_work(&mddev->barrier_work);
+ /* The pre-request flush has finished */
+ schedule_work(&mddev->flush_work);
}
bio_put(bio);
}
-static void submit_barriers(mddev_t *mddev)
+static void submit_flushes(mddev_t *mddev)
{
mdk_rdev_t *rdev;
@@ -332,60 +322,56 @@ static void submit_barriers(mddev_t *mddev)
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
bi = bio_alloc(GFP_KERNEL, 0);
- bi->bi_end_io = md_end_barrier;
+ bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
bi->bi_bdev = rdev->bdev;
atomic_inc(&mddev->flush_pending);
- submit_bio(WRITE_BARRIER, bi);
+ submit_bio(WRITE_FLUSH, bi);
rcu_read_lock();
rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
}
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
{
- mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
- struct bio *bio = mddev->barrier;
+ mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+ struct bio *bio = mddev->flush_bio;
atomic_set(&mddev->flush_pending, 1);
- if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
- bio_endio(bio, -EOPNOTSUPP);
- else if (bio->bi_size == 0)
+ if (bio->bi_size == 0)
/* an empty barrier - all done */
bio_endio(bio, 0);
else {
- bio->bi_rw &= ~REQ_HARDBARRIER;
+ bio->bi_rw &= ~REQ_FLUSH;
if (mddev->pers->make_request(mddev, bio))
generic_make_request(bio);
- mddev->barrier = POST_REQUEST_BARRIER;
- submit_barriers(mddev);
}
if (atomic_dec_and_test(&mddev->flush_pending)) {
- mddev->barrier = NULL;
+ mddev->flush_bio = NULL;
wake_up(&mddev->sb_wait);
}
}
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
{
spin_lock_irq(&mddev->write_lock);
wait_event_lock_irq(mddev->sb_wait,
- !mddev->barrier,
+ !mddev->flush_bio,
mddev->write_lock, /*nothing*/);
- mddev->barrier = bio;
+ mddev->flush_bio = bio;
spin_unlock_irq(&mddev->write_lock);
atomic_set(&mddev->flush_pending, 1);
- INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+ INIT_WORK(&mddev->flush_work, md_submit_flush_data);
- submit_barriers(mddev);
+ submit_flushes(mddev);
if (atomic_dec_and_test(&mddev->flush_pending))
- schedule_work(&mddev->barrier_work);
+ schedule_work(&mddev->flush_work);
}
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
/* Support for plugging.
* This mirrors the plugging support in request_queue, but does not
@@ -696,31 +682,6 @@ static void super_written(struct bio *bio, int error)
bio_put(bio);
}
-static void super_written_barrier(struct bio *bio, int error)
-{
- struct bio *bio2 = bio->bi_private;
- mdk_rdev_t *rdev = bio2->bi_private;
- mddev_t *mddev = rdev->mddev;
-
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
- error == -EOPNOTSUPP) {
- unsigned long flags;
- /* barriers don't appear to be supported :-( */
- set_bit(BarriersNotsupp, &rdev->flags);
- mddev->barriers_work = 0;
- spin_lock_irqsave(&mddev->write_lock, flags);
- bio2->bi_next = mddev->biolist;
- mddev->biolist = bio2;
- spin_unlock_irqrestore(&mddev->write_lock, flags);
- wake_up(&mddev->sb_wait);
- bio_put(bio);
- } else {
- bio_put(bio2);
- bio->bi_private = rdev;
- super_written(bio, error);
- }
-}
-
void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page)
{
@@ -729,51 +690,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
* and decrement it on completion, waking up sb_wait
* if zero is reached.
* If an error occurred, call md_error
- *
- * As we might need to resubmit the request if REQ_HARDBARRIER
- * causes ENOTSUPP, we allocate a spare bio...
*/
struct bio *bio = bio_alloc(GFP_NOIO, 1);
- int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
bio->bi_bdev = rdev->bdev;
bio->bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
- bio->bi_rw = rw;
atomic_inc(&mddev->pending_writes);
- if (!test_bit(BarriersNotsupp, &rdev->flags)) {
- struct bio *rbio;
- rw |= REQ_HARDBARRIER;
- rbio = bio_clone(bio, GFP_NOIO);
- rbio->bi_private = bio;
- rbio->bi_end_io = super_written_barrier;
- submit_bio(rw, rbio);
- } else
- submit_bio(rw, bio);
+ submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+ bio);
}
void md_super_wait(mddev_t *mddev)
{
- /* wait for all superblock writes that were scheduled to complete.
- * if any had to be retried (due to BARRIER problems), retry them
- */
+ /* wait for all superblock writes that were scheduled to complete */
DEFINE_WAIT(wq);
for(;;) {
prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
if (atomic_read(&mddev->pending_writes)==0)
break;
- while (mddev->biolist) {
- struct bio *bio;
- spin_lock_irq(&mddev->write_lock);
- bio = mddev->biolist;
- mddev->biolist = bio->bi_next ;
- bio->bi_next = NULL;
- spin_unlock_irq(&mddev->write_lock);
- submit_bio(bio->bi_rw, bio);
- }
schedule();
}
finish_wait(&mddev->sb_wait, &wq);
@@ -1070,7 +1008,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
- clear_bit(BarriersNotsupp, &rdev->flags);
if (mddev->raid_disks == 0) {
mddev->major_version = 0;
@@ -1485,7 +1422,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
- clear_bit(BarriersNotsupp, &rdev->flags);
if (mddev->raid_disks == 0) {
mddev->major_version = 1;
@@ -1643,7 +1579,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
if (rdev->sb_size & bmask)
rdev->sb_size = (rdev->sb_size | bmask) + 1;
- }
+ } else
+ max_dev = le32_to_cpu(sb->max_dev);
+
for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -2136,16 +2074,6 @@ static void sync_sbs(mddev_t * mddev, int nospares)
* with the rest of the array)
*/
mdk_rdev_t *rdev;
-
- /* First make sure individual recovery_offsets are correct */
- list_for_each_entry(rdev, &mddev->disks, same_set) {
- if (rdev->raid_disk >= 0 &&
- mddev->delta_disks >= 0 &&
- !test_bit(In_sync, &rdev->flags) &&
- mddev->curr_resync_completed > rdev->recovery_offset)
- rdev->recovery_offset = mddev->curr_resync_completed;
-
- }
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->sb_events == mddev->events ||
(nospares &&
@@ -2167,13 +2095,27 @@ static void md_update_sb(mddev_t * mddev, int force_change)
int sync_req;
int nospares = 0;
- mddev->utime = get_seconds();
- if (mddev->external)
- return;
repeat:
+ /* First make sure individual recovery_offsets are correct */
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (rdev->raid_disk >= 0 &&
+ mddev->delta_disks >= 0 &&
+ !test_bit(In_sync, &rdev->flags) &&
+ mddev->curr_resync_completed > rdev->recovery_offset)
+ rdev->recovery_offset = mddev->curr_resync_completed;
+
+ }
+ if (!mddev->persistent) {
+ clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ clear_bit(MD_CHANGE_DEVS, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ return;
+ }
+
spin_lock_irq(&mddev->write_lock);
- set_bit(MD_CHANGE_PENDING, &mddev->flags);
+ mddev->utime = get_seconds();
+
if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
force_change = 1;
if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
@@ -2221,19 +2163,6 @@ repeat:
MD_BUG();
mddev->events --;
}
-
- /*
- * do not write anything to disk if using
- * nonpersistent superblocks
- */
- if (!mddev->persistent) {
- if (!mddev->external)
- clear_bit(MD_CHANGE_PENDING, &mddev->flags);
-
- spin_unlock_irq(&mddev->write_lock);
- wake_up(&mddev->sb_wait);
- return;
- }
sync_sbs(mddev, nospares);
spin_unlock_irq(&mddev->write_lock);
@@ -3379,7 +3308,7 @@ array_state_show(mddev_t *mddev, char *page)
case 0:
if (mddev->in_sync)
st = clean;
- else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+ else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
st = write_pending;
else if (mddev->safemode)
st = active_idle;
@@ -3460,9 +3389,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
mddev->in_sync = 1;
if (mddev->safemode == 1)
mddev->safemode = 0;
- if (mddev->persistent)
- set_bit(MD_CHANGE_CLEAN,
- &mddev->flags);
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
}
err = 0;
} else
@@ -3474,8 +3401,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
case active:
if (mddev->pers) {
restart_array(mddev);
- if (mddev->external)
- clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
wake_up(&mddev->sb_wait);
err = 0;
} else {
@@ -4514,7 +4440,6 @@ int md_run(mddev_t *mddev)
/* may be over-ridden by personality */
mddev->resync_max_sectors = mddev->dev_sectors;
- mddev->barriers_work = 1;
mddev->ok_start_degraded = start_dirty_degraded;
if (start_readonly && mddev->ro == 0)
@@ -4693,7 +4618,6 @@ static void md_clean(mddev_t *mddev)
mddev->recovery = 0;
mddev->in_sync = 0;
mddev->degraded = 0;
- mddev->barriers_work = 0;
mddev->safemode = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
@@ -5961,7 +5885,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
mddev_t *mddev = mddev_find(bdev->bd_dev);
int err;
- lock_kernel();
+ mutex_lock(&md_mutex);
if (mddev->gendisk != bdev->bd_disk) {
/* we are racing with mddev_put which is discarding this
* bd_disk.
@@ -5970,7 +5894,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
/* Wait until bdev->bd_disk is definitely gone */
flush_scheduled_work();
/* Then retry the open from the top */
- unlock_kernel();
+ mutex_unlock(&md_mutex);
return -ERESTARTSYS;
}
BUG_ON(mddev != bdev->bd_disk->private_data);
@@ -5984,7 +5908,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
check_disk_size_change(mddev->gendisk, bdev);
out:
- unlock_kernel();
+ mutex_unlock(&md_mutex);
return err;
}
@@ -5993,10 +5917,10 @@ static int md_release(struct gendisk *disk, fmode_t mode)
mddev_t *mddev = disk->private_data;
BUG_ON(!mddev);
- lock_kernel();
+ mutex_lock(&md_mutex);
atomic_dec(&mddev->openers);
mddev_put(mddev);
- unlock_kernel();
+ mutex_unlock(&md_mutex);
return 0;
}
@@ -6580,6 +6504,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
if (mddev->in_sync) {
mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
md_wakeup_thread(mddev->thread);
did_change = 1;
}
@@ -6588,7 +6513,6 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
wait_event(mddev->sb_wait,
- !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
}
@@ -6624,6 +6548,7 @@ int md_allow_write(mddev_t *mddev)
if (mddev->in_sync) {
mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
if (mddev->safemode_delay &&
mddev->safemode == 0)
mddev->safemode = 1;
@@ -6633,7 +6558,7 @@ int md_allow_write(mddev_t *mddev)
} else
spin_unlock_irq(&mddev->write_lock);
- if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+ if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
return -EAGAIN;
else
return 0;
@@ -6831,8 +6756,7 @@ void md_do_sync(mddev_t *mddev)
atomic_read(&mddev->recovery_active) == 0);
mddev->curr_resync_completed =
mddev->curr_resync;
- if (mddev->persistent)
- set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
@@ -7081,7 +7005,7 @@ void md_check_recovery(mddev_t *mddev)
if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
return;
if ( ! (
- (mddev->flags && !mddev->external) ||
+ (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
(mddev->external == 0 && mddev->safemode == 1) ||
@@ -7111,8 +7035,7 @@ void md_check_recovery(mddev_t *mddev)
mddev->recovery_cp == MaxSector) {
mddev->in_sync = 1;
did_change = 1;
- if (mddev->persistent)
- set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
}
if (mddev->safemode == 1)
mddev->safemode = 0;
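The md-level replacement for barrier handling follows the same two-phase pattern: md_flush_request() records the incoming bio as mddev->flush_bio, submit_flushes() sends an empty WRITE_FLUSH bio to every member device, and the last completion schedules md_submit_flush_data(), which strips REQ_FLUSH and hands any data portion back to the personality. A condensed sketch of the path, simplified from the hunks above rather than a compilable excerpt:

	/* Sketch of md's generic flush path after this patch. */
	void md_flush_request(mddev_t *mddev, struct bio *bio)
	{
		spin_lock_irq(&mddev->write_lock);
		wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio,
				    mddev->write_lock, /*nothing*/);
		mddev->flush_bio = bio;		/* one flush in flight at a time */
		spin_unlock_irq(&mddev->write_lock);

		atomic_set(&mddev->flush_pending, 1);
		INIT_WORK(&mddev->flush_work, md_submit_flush_data);
		submit_flushes(mddev);		/* empty WRITE_FLUSH bio to every rdev */
		if (atomic_dec_and_test(&mddev->flush_pending))
			schedule_work(&mddev->flush_work);
	}

	/* md_submit_flush_data(): once all member flushes have completed,
	 * an empty flush is simply ended; a flush with data is re-driven
	 * through the personality without the REQ_FLUSH flag. */
	if (bio->bi_size == 0)
		bio_endio(bio, 0);
	else {
		bio->bi_rw &= ~REQ_FLUSH;
		if (mddev->pers->make_request(mddev, bio))
			generic_make_request(bio);
	}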
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a953fe2808a..112a2c32db0 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -87,7 +87,6 @@ struct mdk_rdev_s
#define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
-#define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */
#define AllReserved 6 /* If whole device is reserved for
* one array */
#define AutoDetected 7 /* added by auto-detect */
@@ -140,7 +139,7 @@ struct mddev_s
unsigned long flags;
#define MD_CHANGE_DEVS 0 /* Some device status has changed */
#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
-#define MD_CHANGE_PENDING 2 /* superblock update in progress */
+#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */
int suspended;
atomic_t active_io;
@@ -273,13 +272,6 @@ struct mddev_s
int degraded; /* whether md should consider
* adding a spare
*/
- int barriers_work; /* initialised to true, cleared as soon
- * as a barrier request to slave
- * fails. Only supported
- */
- struct bio *biolist; /* bios that need to be retried
- * because REQ_HARDBARRIER is not supported
- */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
@@ -339,16 +331,13 @@ struct mddev_s
struct attribute_group *to_remove;
struct plug_handle *plug; /* if used by personality */
- /* Generic barrier handling.
- * If there is a pending barrier request, all other
- * writes are blocked while the devices are flushed.
- * The last to finish a flush schedules a worker to
- * submit the barrier request (without the barrier flag),
- * then submit more flush requests.
+ /* Generic flush handling.
+ * The last to finish preflush schedules a worker to submit
+ * the rest of the request (without the REQ_FLUSH flag).
*/
- struct bio *barrier;
+ struct bio *flush_bio;
atomic_t flush_pending;
- struct work_struct barrier_work;
+ struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
};
@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
extern int mddev_congested(mddev_t *mddev, int bits);
-extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
+extern void md_flush_request(mddev_t *mddev, struct bio *bio);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 0307d217e7a..6d7ddf32ef2 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
struct multipath_bh * mp_bh;
struct multipath_info *multipath;
- if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
- md_barrier_request(mddev, bio);
+ if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bio);
return 0;
}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6f7af46d623..a39f4c355e5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
struct strip_zone *zone;
mdk_rdev_t *tmp_dev;
- if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
- md_barrier_request(mddev, bio);
+ if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bio);
return 0;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 73cc74ffc26..378a25894c5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
if (r1_bio->bios[mirror] == bio)
break;
- if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
- set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
- set_bit(R1BIO_BarrierRetry, &r1_bio->state);
- r1_bio->mddev->barriers_work = 0;
- /* Don't rdev_dec_pending in this branch - keep it for the retry */
- } else {
+ /*
+ * 'one mirror IO has finished' event handler:
+ */
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
+ if (!uptodate) {
+ md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+ /* an I/O failed, we can't clear the bitmap */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ } else
/*
- * this branch is our 'one mirror IO has finished' event handler:
+ * Set R1BIO_Uptodate in our master bio, so that we
+ * will return a good error code to the higher
+ * levels even if IO on some other mirrored buffer
+ * fails.
+ *
+ * The 'master' represents the composite IO operation
+ * to user-side. So if something waits for IO, then it
+ * will wait for the 'master' bio.
*/
- r1_bio->bios[mirror] = NULL;
- to_put = bio;
- if (!uptodate) {
- md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
- /* an I/O failed, we can't clear the bitmap */
- set_bit(R1BIO_Degraded, &r1_bio->state);
- } else
- /*
- * Set R1BIO_Uptodate in our master bio, so that
- * we will return a good error code for to the higher
- * levels even if IO on some other mirrored buffer fails.
- *
- * The 'master' represents the composite IO operation to
- * user-side. So if something waits for IO, then it will
- * wait for the 'master' bio.
- */
- set_bit(R1BIO_Uptodate, &r1_bio->state);
-
- update_head_pos(mirror, r1_bio);
-
- if (behind) {
- if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
- atomic_dec(&r1_bio->behind_remaining);
-
- /* In behind mode, we ACK the master bio once the I/O has safely
- * reached all non-writemostly disks. Setting the Returned bit
- * ensures that this gets done only once -- we don't ever want to
- * return -EIO here, instead we'll wait */
-
- if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
- test_bit(R1BIO_Uptodate, &r1_bio->state)) {
- /* Maybe we can return now */
- if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
- struct bio *mbio = r1_bio->master_bio;
- PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
- (unsigned long long) mbio->bi_sector,
- (unsigned long long) mbio->bi_sector +
- (mbio->bi_size >> 9) - 1);
- bio_endio(mbio, 0);
- }
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+ update_head_pos(mirror, r1_bio);
+
+ if (behind) {
+ if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+ atomic_dec(&r1_bio->behind_remaining);
+
+ /*
+ * In behind mode, we ACK the master bio once the I/O
+ * has safely reached all non-writemostly
+ * disks. Setting the Returned bit ensures that this
+ * gets done only once -- we don't ever want to return
+ * -EIO here, instead we'll wait
+ */
+ if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+ test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+ /* Maybe we can return now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ struct bio *mbio = r1_bio->master_bio;
+ PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+ (unsigned long long) mbio->bi_sector,
+ (unsigned long long) mbio->bi_sector +
+ (mbio->bi_size >> 9) - 1);
+ bio_endio(mbio, 0);
}
}
- rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
}
+ rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+
/*
- *
* Let's see if all mirrored write operations have finished
* already.
*/
if (atomic_dec_and_test(&r1_bio->remaining)) {
- if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- /* it really is the end of this request */
- if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- /* free extra copy of the data pages */
- int i = bio->bi_vcnt;
- while (i--)
- safe_put_page(bio->bi_io_vec[i].bv_page);
- }
- /* clear the bitmap if all writes complete successfully */
- bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
- r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state),
- behind);
- md_write_end(r1_bio->mddev);
- raid_end_bio_io(r1_bio);
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ /* free extra copy of the data pages */
+ int i = bio->bi_vcnt;
+ while (i--)
+ safe_put_page(bio->bi_io_vec[i].bv_page);
}
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ !test_bit(R1BIO_Degraded, &r1_bio->state),
+ behind);
+ md_write_end(r1_bio->mddev);
+ raid_end_bio_io(r1_bio);
}
if (to_put)
@@ -787,17 +778,14 @@ static int make_request(mddev_t *mddev, struct bio * bio)
struct bio_list bl;
struct page **behind_pages = NULL;
const int rw = bio_data_dir(bio);
- const bool do_sync = (bio->bi_rw & REQ_SYNC);
- bool do_barriers;
+ const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
+ const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
mdk_rdev_t *blocked_rdev;
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
* Continue immediately if no resync is active currently.
- * We test barriers_work *after* md_write_start as md_write_start
- * may cause the first superblock write, and that will check out
- * if barriers work.
*/
md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
}
finish_wait(&conf->wait_barrier, &w);
}
- if (unlikely(!mddev->barriers_work &&
- (bio->bi_rw & REQ_HARDBARRIER))) {
- if (rw == WRITE)
- md_write_end(mddev);
- bio_endio(bio, -EOPNOTSUPP);
- return 0;
- }
wait_barrier(conf);
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
atomic_set(&r1_bio->remaining, 0);
atomic_set(&r1_bio->behind_remaining, 0);
- do_barriers = bio->bi_rw & REQ_HARDBARRIER;
- if (do_barriers)
- set_bit(R1BIO_Barrier, &r1_bio->state);
-
bio_list_init(&bl);
for (i = 0; i < disks; i++) {
struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_rw = WRITE | do_barriers | do_sync;
+ mbio->bi_rw = WRITE | do_flush_fua | do_sync;
mbio->bi_private = r1_bio;
if (behind_pages) {
@@ -1120,6 +1097,8 @@ static int raid1_spare_active(mddev_t *mddev)
{
int i;
conf_t *conf = mddev->private;
+ int count = 0;
+ unsigned long flags;
/*
* Find all failed disks within the RAID1 configuration
@@ -1131,15 +1110,16 @@ static int raid1_spare_active(mddev_t *mddev)
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_and_set_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded--;
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ count++;
+ sysfs_notify_dirent(rdev->sysfs_state);
}
}
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded -= count;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
print_conf(conf);
- return 0;
+ return count;
}
@@ -1631,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
sync_request_write(mddev, r1_bio);
unplug = 1;
- } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
- /* some requests in the r1bio were REQ_HARDBARRIER
- * requests which failed with -EOPNOTSUPP. Hohumm..
- * Better resubmit without the barrier.
- * We know which devices to resubmit for, because
- * all others have had their bios[] entry cleared.
- * We already have a nr_pending reference on these rdevs.
- */
- int i;
- const bool do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
- clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
- clear_bit(R1BIO_Barrier, &r1_bio->state);
- for (i=0; i < conf->raid_disks; i++)
- if (r1_bio->bios[i])
- atomic_inc(&r1_bio->remaining);
- for (i=0; i < conf->raid_disks; i++)
- if (r1_bio->bios[i]) {
- struct bio_vec *bvec;
- int j;
-
- bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
- /* copy pages from the failed bio, as
- * this might be a write-behind device */
- __bio_for_each_segment(bvec, bio, j, 0)
- bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
- bio_put(r1_bio->bios[i]);
- bio->bi_sector = r1_bio->sector +
- conf->mirrors[i].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[i].rdev->bdev;
- bio->bi_end_io = raid1_end_write_request;
- bio->bi_rw = WRITE | do_sync;
- bio->bi_private = r1_bio;
- r1_bio->bios[i] = bio;
- generic_make_request(bio);
- }
} else {
int disk;
@@ -1696,7 +1641,7 @@ static void raid1d(mddev_t *mddev)
(unsigned long long)r1_bio->sector);
raid_end_bio_io(r1_bio);
} else {
- const bool do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
+ const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
r1_bio->bios[r1_bio->read_disk] =
mddev->ro ? IO_BLOCKED : NULL;
r1_bio->read_disk = disk;
@@ -1836,7 +1781,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
/* take from bio_init */
bio->bi_next = NULL;
+ bio->bi_flags &= ~(BIO_POOL_MASK-1);
bio->bi_flags |= 1 << BIO_UPTODATE;
+ bio->bi_comp_cpu = -1;
bio->bi_rw = READ;
bio->bi_vcnt = 0;
bio->bi_idx = 0;
@@ -1909,7 +1856,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
break;
BUG_ON(sync_blocks < (PAGE_SIZE>>9));
- if (len > (sync_blocks<<9))
+ if ((len >> 9) > sync_blocks)
len = sync_blocks<<9;
}
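Unlike linear, raid0, multipath, raid10 and raid5, raid1 does not route REQ_FLUSH through md_flush_request(): every mirror receives the full bio anyway, so it is sufficient to carry the REQ_FLUSH and REQ_FUA bits straight onto each per-mirror write, which is what the new do_flush_fua mask above does. In simplified form (per-mirror loop body only, taken from the hunks above):

	/* raid1 write path (simplified): flush/FUA flags ride along on every mirror. */
	const unsigned long do_sync      = bio->bi_rw & REQ_SYNC;
	const unsigned long do_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);

	mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
	mbio->bi_bdev   = conf->mirrors[i].rdev->bdev;
	mbio->bi_end_io = raid1_end_write_request;
	mbio->bi_rw     = WRITE | do_flush_fua | do_sync;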
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5f2d443ae28..adf8cfd7331 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -117,8 +117,6 @@ struct r1bio_s {
#define R1BIO_IsSync 1
#define R1BIO_Degraded 2
#define R1BIO_BehindIO 3
-#define R1BIO_Barrier 4
-#define R1BIO_BarrierRetry 5
/* For write-behind requests, we call bi_end_io when
* the last non-write-behind device completes, providing
* any write was successful. Otherwise we call when
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a88aeb5198c..f0d082f749b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -799,13 +799,14 @@ static int make_request(mddev_t *mddev, struct bio * bio)
int i;
int chunk_sects = conf->chunk_mask + 1;
const int rw = bio_data_dir(bio);
- const bool do_sync = (bio->bi_rw & REQ_SYNC);
+ const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
+ const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
struct bio_list bl;
unsigned long flags;
mdk_rdev_t *blocked_rdev;
- if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
- md_barrier_request(mddev, bio);
+ if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bio);
return 0;
}
@@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
conf->mirrors[d].rdev->data_offset;
mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync;
+ mbio->bi_rw = WRITE | do_sync | do_fua;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -1116,6 +1117,8 @@ static int raid10_spare_active(mddev_t *mddev)
int i;
conf_t *conf = mddev->private;
mirror_info_t *tmp;
+ int count = 0;
+ unsigned long flags;
/*
* Find all non-in_sync disks within the RAID10 configuration
@@ -1126,15 +1129,16 @@ static int raid10_spare_active(mddev_t *mddev)
if (tmp->rdev
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded--;
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ count++;
+ sysfs_notify_dirent(tmp->rdev->sysfs_state);
}
}
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded -= count;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
print_conf(conf);
- return 0;
+ return count;
}
@@ -1734,7 +1738,7 @@ static void raid10d(mddev_t *mddev)
raid_end_bio_io(r10_bio);
bio_put(bio);
} else {
- const bool do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
+ const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
bio_put(bio);
rdev = conf->mirrors[mirror].rdev;
if (printk_ratelimit())
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 866d4b5a144..31140d1259d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
int rw;
struct bio *bi;
mdk_rdev_t *rdev;
- if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
- rw = WRITE;
- else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+ if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
+ if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
+ rw = WRITE_FUA;
+ else
+ rw = WRITE;
+ } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ;
else
continue;
@@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
+ if (wbi->bi_rw & REQ_FUA)
+ set_bit(R5_WantFUA, &dev->flags);
tx = async_copy_data(1, wbi, dev->page,
dev->sector, tx);
wbi = r5_next_bio(wbi, dev->sector);
@@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx;
int i;
+ bool fua = false;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ for (i = disks; i--; )
+ fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
+
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (dev->written || i == pd_idx || i == qd_idx)
+ if (dev->written || i == pd_idx || i == qd_idx) {
set_bit(R5_UPTODATE, &dev->flags);
+ if (fua)
+ set_bit(R5_WantFUA, &dev->flags);
+ }
}
if (sh->reconstruct_state == reconstruct_state_drain_run)
@@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh)
if (dec_preread_active) {
/* We delay this until after ops_run_io so that if make_request
- * is waiting on a barrier, it won't continue until the writes
+ * is waiting on a flush, it won't continue until the writes
* have actually been submitted.
*/
atomic_dec(&conf->preread_active_stripes);
@@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh)
if (dec_preread_active) {
/* We delay this until after ops_run_io so that if make_request
- * is waiting on a barrier, it won't continue until the writes
+ * is waiting on a flush, it won't continue until the writes
* have actually been submitted.
*/
atomic_dec(&conf->preread_active_stripes);
@@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi)
const int rw = bio_data_dir(bi);
int remaining;
- if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) {
- /* Drain all pending writes. We only really need
- * to ensure they have been submitted, but this is
- * easier.
- */
- mddev->pers->quiesce(mddev, 1);
- mddev->pers->quiesce(mddev, 0);
- md_barrier_request(mddev, bi);
+ if (unlikely(bi->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bi);
return 0;
}
@@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
finish_wait(&conf->wait_for_overlap, &w);
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
- if (mddev->barrier &&
+ if ((bi->bi_rw & REQ_SYNC) &&
!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
release_stripe(sh);
@@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
bio_endio(bi, 0);
}
- if (mddev->barrier) {
- /* We need to wait for the stripes to all be handled.
- * So: wait for preread_active_stripes to drop to 0.
- */
- wait_event(mddev->thread->wqueue,
- atomic_read(&conf->preread_active_stripes) == 0);
- }
return 0;
}
@@ -5330,6 +5329,8 @@ static int raid5_spare_active(mddev_t *mddev)
int i;
raid5_conf_t *conf = mddev->private;
struct disk_info *tmp;
+ int count = 0;
+ unsigned long flags;
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->disks + i;
@@ -5337,14 +5338,15 @@ static int raid5_spare_active(mddev_t *mddev)
&& tmp->rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded--;
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ count++;
+ sysfs_notify_dirent(tmp->rdev->sysfs_state);
}
}
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded -= count;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
print_raid5_conf(conf);
- return 0;
+ return count;
}
static int raid5_remove_disk(mddev_t *mddev, int number)
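
[Editor's note] raid10_spare_active() and raid5_spare_active() are restructured the same way in this patch: instead of taking device_lock once per re-activated member, the scan loop only counts, a single locked subtraction adjusts mddev->degraded afterwards, and the count is returned, presumably so the caller can tell whether the array configuration actually changed. A minimal user-space sketch of that pattern, with a pthread mutex standing in for device_lock and illustrative names:

        #include <pthread.h>
        #include <stdio.h>

        struct member { int faulty; int in_sync; };

        static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
        static int degraded = 3;

        static int spare_active(struct member *m, int nr)
        {
                int count = 0;

                for (int i = 0; i < nr; i++) {
                        if (!m[i].faulty && !m[i].in_sync) {
                                m[i].in_sync = 1;
                                count++;        /* defer the shared update */
                        }
                }

                /* One critical section instead of one per recovered member. */
                pthread_mutex_lock(&device_lock);
                degraded -= count;
                pthread_mutex_unlock(&device_lock);

                return count;           /* non-zero: configuration changed */
        }

        int main(void)
        {
                struct member disks[4] = {
                        { .in_sync = 1 }, { 0 }, { .faulty = 1 }, { 0 },
                };

                printf("reactivated %d, degraded now %d\n",
                       spare_active(disks, 4), degraded);
                return 0;
        }
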
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 36eaed5dfd6..2ace0582b40 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -275,6 +275,7 @@ struct r6_state {
* filling
*/
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
+#define R5_WantFUA 14 /* Write should be FUA */
/*
* Write method
*/
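
[Editor's note] The new R5_WantFUA device flag defined above carries REQ_FUA from the incoming bio (noted per-device in ops_run_biodrain) through ops_complete_reconstruct, which also tags the parity devices, so ops_run_io later issues those writes as WRITE_FUA. A small user-space sketch of the reconstruct-completion step, with illustrative flag values and a simplified r5dev:

        #include <stdbool.h>
        #include <stdio.h>

        #define R5_UPTODATE (1u << 0)   /* illustrative values only */
        #define R5_WantFUA  (1u << 1)

        struct r5dev { unsigned flags; int written; };

        static void complete_reconstruct(struct r5dev *dev, int disks,
                                         int pd_idx, int qd_idx)
        {
                bool fua = false;

                /* Pass 1: did any drained write request FUA? */
                for (int i = disks; i--; )
                        fua |= dev[i].flags & R5_WantFUA;

                /* Pass 2: mark written data and both parity devices up to
                 * date, and carry FUA over so their writes go out as FUA too.
                 */
                for (int i = disks; i--; ) {
                        if (dev[i].written || i == pd_idx || i == qd_idx) {
                                dev[i].flags |= R5_UPTODATE;
                                if (fua)
                                        dev[i].flags |= R5_WantFUA;
                        }
                }
        }

        int main(void)
        {
                struct r5dev dev[4] = {
                        [0] = { .written = 1, .flags = R5_WantFUA },
                        [1] = { .written = 1 },
                        /* [2] and [3] play the P and Q parity roles here */
                };

                complete_reconstruct(dev, 4, 2, 3);
                for (int i = 0; i < 4; i++)
                        printf("dev %d flags %#x\n", i, dev[i].flags);
                return 0;
        }
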