From 5e3283e2920a0bd8a806964d80274b8756e0dd7f Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Tue, 8 Apr 2014 11:08:41 +0100
Subject: dm thin: irqsave must always be used with the pool->lock spinlock

Commit c140e1c4e23 ("dm thin: use per thin device deferred bio lists")
incorrectly stopped disabling irqs when taking the pool's spinlock.

Irqs must be disabled when taking the pool's spinlock otherwise a thread
could spin_lock(), then get interrupted to service thin_endio() in
interrupt context, which would then deadlock in spin_lock_irqsave().

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/md')
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 53728be84de..ae5fd0b9c75 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3101,6 +3101,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	struct thin_c *tc;
 	struct dm_dev *pool_dev, *origin_dev;
 	struct mapped_device *pool_md;
+	unsigned long flags;
 
 	mutex_lock(&dm_thin_pool_table.mutex);
 
@@ -3191,9 +3192,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
 
-	spin_lock(&tc->pool->lock);
+	spin_lock_irqsave(&tc->pool->lock, flags);
 	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
-	spin_unlock(&tc->pool->lock);
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
 	/*
 	 * This synchronize_rcu() call is needed here otherwise we risk a
 	 * wake_worker() call finding no bios to process (because the newly
-- 
cgit v1.2.3-70-g09d2


From b10ebd34cccae1b431caf1be54919aede2be7cbe Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Tue, 8 Apr 2014 11:29:01 +0100
Subject: dm thin: fix rcu_read_lock being held in code that can sleep

Commit c140e1c4e23 ("dm thin: use per thin device deferred bio lists")
introduced the use of an rculist for all active thin devices.  The use
of rcu_read_lock() in process_deferred_bios() can result in a BUG if a
dm_bio_prison_cell must be allocated as a side-effect of bio_detain():

 BUG: sleeping function called from invalid context at mm/mempool.c:203
 in_atomic(): 1, irqs_disabled(): 0, pid: 6, name: kworker/u8:0
 3 locks held by kworker/u8:0/6:
   #0:  ("dm-" "thin"){.+.+..}, at: [<ffffffff8106be42>] process_one_work+0x192/0x550
   #1:  ((&pool->worker)){+.+...}, at: [<ffffffff8106be42>] process_one_work+0x192/0x550
   #2:  (rcu_read_lock){.+.+..}, at: [<ffffffff816360b5>] do_worker+0x5/0x4d0

We can't process deferred bios with the rcu lock held, since
dm_bio_prison_cell allocation may block if the bio-prison's cell mempool
is exhausted.

To fix:

- Introduce a refcount and completion field to each thin_c

- Add thin_get/put methods for adjusting the refcount.  If the refcount
  hits zero then the completion is triggered.

- Initialise refcount to 1 when creating thin_c

- When iterating the active_thins list we thin_get() whilst the rcu
  lock is held.

- After the rcu lock is dropped we process the deferred bios for that
  thin.

- When destroying a thin_c we thin_put() and then wait for the
  completion -- to avoid a race between the worker thread iterating
  from that thin_c and destroying the thin_c.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 67 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index ae5fd0b9c75..28fc282b61b 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -232,6 +232,13 @@ struct thin_c {
 	struct bio_list deferred_bio_list;
 	struct bio_list retry_on_resume_list;
 	struct rb_root sort_bio_list; /* sorted list of deferred bios */
+
+	/*
+	 * Ensures the thin is not destroyed until the worker has finished
+	 * iterating the active_thins list.
+	 */
+	atomic_t refcount;
+	struct completion can_destroy;
 };
 
 /*----------------------------------------------------------------*/
@@ -1486,6 +1493,45 @@ static void process_thin_deferred_bios(struct thin_c *tc)
 	blk_finish_plug(&plug);
 }
 
+static void thin_get(struct thin_c *tc);
+static void thin_put(struct thin_c *tc);
+
+/*
+ * We can't hold rcu_read_lock() around code that can block.  So we
+ * find a thin with the rcu lock held; bump a refcount; then drop
+ * the lock.
+ */
+static struct thin_c *get_first_thin(struct pool *pool)
+{
+	struct thin_c *tc = NULL;
+
+	rcu_read_lock();
+	if (!list_empty(&pool->active_thins)) {
+		tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
+		thin_get(tc);
+	}
+	rcu_read_unlock();
+
+	return tc;
+}
+
+static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
+{
+	struct thin_c *old_tc = tc;
+
+	rcu_read_lock();
+	list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
+		thin_get(tc);
+		thin_put(old_tc);
+		rcu_read_unlock();
+		return tc;
+	}
+	thin_put(old_tc);
+	rcu_read_unlock();
+
+	return NULL;
+}
+
 static void process_deferred_bios(struct pool *pool)
 {
 	unsigned long flags;
@@ -1493,10 +1539,11 @@ static void process_deferred_bios(struct pool *pool)
 	struct bio_list bios;
 	struct thin_c *tc;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tc, &pool->active_thins, list)
+	tc = get_first_thin(pool);
+	while (tc) {
 		process_thin_deferred_bios(tc);
-	rcu_read_unlock();
+		tc = get_next_thin(pool, tc);
+	}
 
 	/*
 	 * If there are any deferred flush bios, we must commit
@@ -3061,11 +3108,25 @@ static struct target_type pool_target = {
 /*----------------------------------------------------------------
  * Thin target methods
  *--------------------------------------------------------------*/
+static void thin_get(struct thin_c *tc)
+{
+	atomic_inc(&tc->refcount);
+}
+
+static void thin_put(struct thin_c *tc)
+{
+	if (atomic_dec_and_test(&tc->refcount))
+		complete(&tc->can_destroy);
+}
+
 static void thin_dtr(struct dm_target *ti)
 {
 	struct thin_c *tc = ti->private;
 	unsigned long flags;
 
+	thin_put(tc);
+	wait_for_completion(&tc->can_destroy);
+
 	spin_lock_irqsave(&tc->pool->lock, flags);
 	list_del_rcu(&tc->list);
 	spin_unlock_irqrestore(&tc->pool->lock, flags);
@@ -3192,6 +3253,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
 
+	atomic_set(&tc->refcount, 1);
+	init_completion(&tc->can_destroy);
+
 	spin_lock_irqsave(&tc->pool->lock, flags);
 	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
 	spin_unlock_irqrestore(&tc->pool->lock, flags);
-- 
cgit v1.2.3-70-g09d2


From 3a7745215e7f73a5c7d9bcdc50661a55b39052a3 Mon Sep 17 00:00:00 2001
From: Milan Broz <gmazyland@gmail.com>
Date: Mon, 14 Apr 2014 22:02:30 +0200
Subject: dm verity: fix biovecs hash calculation regression

Commit 003b5c5719f159f4f4bf97511c4702a0638313dd ("block: Convert drivers
to immutable biovecs") incorrectly converted biovec iteration in
dm-verity to always calculate the hash from a full biovec, but the
function only needs to calculate the hash from part of the biovec (up to
the calculated "todo" value).

Fix this issue by limiting hash input to only the requested data size.

This problem was identified using the cryptsetup regression test for
veritysetup (verity-compat-test).

Signed-off-by: Milan Broz <gmazyland@gmail.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 3.14+
---
 drivers/md/dm-verity.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 796007a5e0e..7a7bab8947a 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -330,15 +330,17 @@ test_block_hash:
 				return r;
 			}
 		}
-
 		todo = 1 << v->data_dev_block_bits;
-		while (io->iter.bi_size) {
+		do {
 			u8 *page;
+			unsigned len;
 			struct bio_vec bv = bio_iter_iovec(bio, io->iter);
 
 			page = kmap_atomic(bv.bv_page);
-			r = crypto_shash_update(desc, page + bv.bv_offset,
-						bv.bv_len);
+			len = bv.bv_len;
+			if (likely(len >= todo))
+				len = todo;
+			r = crypto_shash_update(desc, page + bv.bv_offset, len);
 			kunmap_atomic(page);
 
 			if (r < 0) {
@@ -346,8 +348,9 @@ test_block_hash:
 				return r;
 			}
 
-			bio_advance_iter(bio, &io->iter, bv.bv_len);
-		}
+			bio_advance_iter(bio, &io->iter, len);
+			todo -= len;
+		} while (todo);
 
 		if (!v->version) {
 			r = crypto_shash_update(desc, v->salt, v->salt_size);
-- 
cgit v1.2.3-70-g09d2


From fbcde3d8b9c2d97704b8ca299e5266147b24c8ee Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 29 Apr 2014 11:22:04 -0400
Subject: dm thin: use INIT_WORK_ONSTACK in noflush_work to avoid ODEBUG
 warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use INIT_WORK_ONSTACK to silence "ODEBUG: object is on stack, but not
annotated".

Reported-by: Zdeněk Kabeláč <zkabelac@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
---
 drivers/md/dm-thin.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 28fc282b61b..13abade76ad 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1625,7 +1625,7 @@ static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
 {
 	struct noflush_work w;
 
-	INIT_WORK(&w.worker, fn);
+	INIT_WORK_ONSTACK(&w.worker, fn);
 	w.tc = tc;
 	atomic_set(&w.complete, 0);
 	init_waitqueue_head(&w.wait);
-- 
cgit v1.2.3-70-g09d2


From 131cd131a9ff63d4b84f3fe15073a2984ac30066 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 1 May 2014 16:14:24 -0400
Subject: dm cache: fix writethrough mode quiescing in cache_map

Commit 2ee57d58735 ("dm cache: add passthrough mode") inadvertently
removed the deferred set reference that was taken in cache_map()'s
writethrough mode support.  Restore taking this reference.

This issue was found with code inspection.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Cc: stable@vger.kernel.org # 3.13+
---
 drivers/md/dm-cache-target.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1bf4a71919e..9380be7b189 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2488,6 +2488,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 
 		} else {
 			inc_hit_counter(cache, bio);
+			pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 
 			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
 			    !is_dirty(cache, lookup_result.cblock))
-- 
cgit v1.2.3-70-g09d2


From cc13b1d1500656a20e41960668f3392dda9fa6e2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 5 May 2014 13:34:37 +1000
Subject: md/raid10: call wait_barrier() for each request submitted.

wait_barrier() includes a counter, so we must call it precisely once
(unless balanced by allow_barrier()) for each request submitted.

Since
commit 20d0189b1012a37d2533a87fb451f7852f2418d1
    block: Introduce new bio_split()
in 3.14-rc1, we don't call it for the extra requests generated when
we need to split a bio.

When this happens the counter goes negative, any resync/recovery will
never start, and  "mdadm --stop" will hang.

Reported-by: Chris Murphy <lists@colorremedies.com>
Fixes: 20d0189b1012a37d2533a87fb451f7852f2418d1
Cc: stable@vger.kernel.org (3.14+)
Cc: Kent Overstreet <kmo@daterainc.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 33fc408e5ea..cb882aae9e2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1172,6 +1172,13 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	int max_sectors;
 	int sectors;
 
+	/*
+	 * Register the new request and wait if the reconstruction
+	 * thread has put up a bar for new requests.
+	 * Continue immediately if no resync is active currently.
+	 */
+	wait_barrier(conf);
+
 	sectors = bio_sectors(bio);
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    bio->bi_iter.bi_sector < conf->reshape_progress &&
@@ -1552,12 +1559,6 @@ static void make_request(struct mddev *mddev, struct bio *bio)
 
 	md_write_start(mddev, bio);
 
-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-	wait_barrier(conf);
 
 	do {
 
-- 
cgit v1.2.3-70-g09d2


From 0f62fb220aa4ebabe8547d3a9ce4a16d3c045f21 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 6 May 2014 09:36:08 +1000
Subject: md: avoid possible spinning md thread at shutdown.

If an md array with externally managed metadata (e.g. DDF or IMSM)
is in use, then we should not set safemode==2 at shutdown because:

1/ this is ineffective: user-space need to be involved in any 'safemode' handling,
2/ The safemode management code doesn't cope with safemode==2 on external metadata
   and md_check_recover enters an infinite loop.

Even at shutdown, an infinite-looping process can be problematic, so this
could cause shutdown to hang.

Cc: stable@vger.kernel.org (any kernel)
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8fda38d23e3..237b7e0ddc7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8516,7 +8516,8 @@ static int md_notify_reboot(struct notifier_block *this,
 		if (mddev_trylock(mddev)) {
 			if (mddev->pers)
 				__md_stop_writes(mddev);
-			mddev->safemode = 2;
+			if (mddev->persistent)
+				mddev->safemode = 2;
 			mddev_unlock(mddev);
 		}
 		need_delay = 1;
-- 
cgit v1.2.3-70-g09d2


From 610f2de3559c383caf8fbbf91e9968102dff7ca0 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 20 Feb 2014 18:01:01 -0500
Subject: dm crypt: fix cpu hotplug crash by removing per-cpu structure

The DM crypt target used per-cpu structures to hold pointers to a
ablkcipher_request structure.  The code assumed that the work item keeps
executing on a single CPU, so it didn't use synchronization when
accessing this structure.

If a CPU is disabled by writing 0 to /sys/devices/system/cpu/cpu*/online,
the work item could be moved to another CPU.  This causes dm-crypt
crashes, like the following, because the code starts using an incorrect
ablkcipher_request:

 smpboot: CPU 7 is now offline
 BUG: unable to handle kernel NULL pointer dereference at 0000000000000130
 IP: [<ffffffffa1862b3d>] crypt_convert+0x12d/0x3c0 [dm_crypt]
 ...
 Call Trace:
  [<ffffffffa1864415>] ? kcryptd_crypt+0x305/0x470 [dm_crypt]
  [<ffffffff81062060>] ? finish_task_switch+0x40/0xc0
  [<ffffffff81052a28>] ? process_one_work+0x168/0x470
  [<ffffffff8105366b>] ? worker_thread+0x10b/0x390
  [<ffffffff81053560>] ? manage_workers.isra.26+0x290/0x290
  [<ffffffff81058d9f>] ? kthread+0xaf/0xc0
  [<ffffffff81058cf0>] ? kthread_create_on_node+0x120/0x120
  [<ffffffff813464ac>] ? ret_from_fork+0x7c/0xb0
  [<ffffffff81058cf0>] ? kthread_create_on_node+0x120/0x120

Fix this bug by removing the per-cpu definition.  The structure
ablkcipher_request is accessed via a pointer from convert_context.
Consequently, if the work item is rescheduled to a different CPU, the
thread still uses the same ablkcipher_request.

This change may undermine performance improvements intended by commit
c0297721 ("dm crypt: scale to multiple cpus") on select hardware.  In
practice no performance difference was observed on recent hardware.  But
regardless, correctness is more important than performance.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/dm-crypt.c | 61 ++++++++++-----------------------------------------
 1 file changed, 12 insertions(+), 49 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 784695d22fd..53b213226c0 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -19,7 +19,6 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <linux/backing-dev.h>
-#include <linux/percpu.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
@@ -43,6 +42,7 @@ struct convert_context {
 	struct bvec_iter iter_out;
 	sector_t cc_sector;
 	atomic_t cc_pending;
+	struct ablkcipher_request *req;
 };
 
 /*
@@ -111,15 +111,7 @@ struct iv_tcw_private {
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
 
 /*
- * Duplicated per-CPU state for cipher.
- */
-struct crypt_cpu {
-	struct ablkcipher_request *req;
-};
-
-/*
- * The fields in here must be read only after initialization,
- * changing state should be in crypt_cpu.
+ * The fields in here must be read only after initialization.
  */
 struct crypt_config {
 	struct dm_dev *dev;
@@ -150,12 +142,6 @@ struct crypt_config {
 	sector_t iv_offset;
 	unsigned int iv_size;
 
-	/*
-	 * Duplicated per cpu state. Access through
-	 * per_cpu_ptr() only.
-	 */
-	struct crypt_cpu __percpu *cpu;
-
 	/* ESSIV: struct crypto_cipher *essiv_tfm */
 	void *iv_private;
 	struct crypto_ablkcipher **tfms;
@@ -192,11 +178,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
 
-static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
-{
-	return this_cpu_ptr(cc->cpu);
-}
-
 /*
  * Use this to access cipher attributes that are the same for each CPU.
  */
@@ -903,16 +884,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
 
-	if (!this_cc->req)
-		this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+	if (!ctx->req)
+		ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
-	ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
-	ablkcipher_request_set_callback(this_cc->req,
+	ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+	ablkcipher_request_set_callback(ctx->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-	    kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
+	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
 /*
@@ -921,7 +901,6 @@ static void crypt_alloc_req(struct crypt_config *cc,
 static int crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	int r;
 
 	atomic_set(&ctx->cc_pending, 1);
@@ -932,7 +911,7 @@ static int crypt_convert(struct crypt_config *cc,
 
 		atomic_inc(&ctx->cc_pending);
 
-		r = crypt_convert_block(cc, ctx, this_cc->req);
+		r = crypt_convert_block(cc, ctx, ctx->req);
 
 		switch (r) {
 		/* async */
@@ -941,7 +920,7 @@ static int crypt_convert(struct crypt_config *cc,
 			reinit_completion(&ctx->restart);
 			/* fall through*/
 		case -EINPROGRESS:
-			this_cc->req = NULL;
+			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
 
@@ -1040,6 +1019,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
 	io->sector = sector;
 	io->error = 0;
 	io->base_io = NULL;
+	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
 
 	return io;
@@ -1065,6 +1045,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 	if (!atomic_dec_and_test(&io->io_pending))
 		return;
 
+	if (io->ctx.req)
+		mempool_free(io->ctx.req, cc->req_pool);
 	mempool_free(io, cc->io_pool);
 
 	if (likely(!base_io))
@@ -1492,8 +1474,6 @@ static int crypt_wipe_key(struct crypt_config *cc)
 static void crypt_dtr(struct dm_target *ti)
 {
 	struct crypt_config *cc = ti->private;
-	struct crypt_cpu *cpu_cc;
-	int cpu;
 
 	ti->private = NULL;
 
@@ -1505,13 +1485,6 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->crypt_queue)
 		destroy_workqueue(cc->crypt_queue);
 
-	if (cc->cpu)
-		for_each_possible_cpu(cpu) {
-			cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-			if (cpu_cc->req)
-				mempool_free(cpu_cc->req, cc->req_pool);
-		}
-
 	crypt_free_tfms(cc);
 
 	if (cc->bs)
@@ -1530,9 +1503,6 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->dev)
 		dm_put_device(ti, cc->dev);
 
-	if (cc->cpu)
-		free_percpu(cc->cpu);
-
 	kzfree(cc->cipher);
 	kzfree(cc->cipher_string);
 
@@ -1588,13 +1558,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");
 
-	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
-				 __alignof__(struct crypt_cpu));
-	if (!cc->cpu) {
-		ti->error = "Cannot allocate per cpu state";
-		goto bad_mem;
-	}
-
 	/*
 	 * For compatibility with the original dm-crypt mapping format, if
 	 * only the cipher name is supplied, use cbc-plain.
-- 
cgit v1.2.3-70-g09d2


From 8d07e8a5f5bc7b90f755d9b427ea930024f4c986 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Tue, 6 May 2014 16:28:14 +0100
Subject: dm thin: allow metadata commit if pool is in PM_OUT_OF_DATA_SPACE
 mode

Commit 3e1a0699 ("dm thin: fix out of data space handling") introduced
a regression in the metadata commit() method by returning an error if
the pool is in PM_OUT_OF_DATA_SPACE mode.  This oversight caused a thin
device to return errors even if the default queue_if_no_space ENOSPC
handling mode is used.

Fix commit() to only fail if pool is in PM_READ_ONLY or PM_FAIL mode.

Reported-by: qindehua@163.com
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 3.14+
---
 drivers/md/dm-thin.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 13abade76ad..15f748ba675 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -935,7 +935,7 @@ static int commit(struct pool *pool)
 {
 	int r;
 
-	if (get_pool_mode(pool) != PM_WRITE)
+	if (get_pool_mode(pool) >= PM_READ_ONLY)
 		return -EINVAL;
 
 	r = dm_pool_commit_metadata(pool->pmd);
-- 
cgit v1.2.3-70-g09d2


From 85ad643b7e7e52d37620fb272a9fd577a8095647 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 9 May 2014 15:59:38 +0100
Subject: dm thin: add timeout to stop out-of-data-space mode holding IO
 forever

If the pool runs out of data space, dm-thin can be configured to
either error IOs that would trigger provisioning, or hold those IOs
until the pool is resized.  Unfortunately, holding IOs until the pool is
resized can result in a cascade of tasks hitting the hung_task_timeout,
which may render the system unavailable.

Add a fixed timeout so IOs can only be held for a maximum of 60 seconds.
If LVM is going to resize a thin-pool that is out of data space it needs
to be prompt about it.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 3.14+
---
 drivers/md/dm-thin.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 15f748ba675..2e71de8e004 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -27,6 +27,7 @@
 #define MAPPING_POOL_SIZE 1024
 #define PRISON_CELLS 1024
 #define COMMIT_PERIOD HZ
+#define NO_SPACE_TIMEOUT (HZ * 60)
 
 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
 		"A percentage of time allocated for copy on write");
@@ -175,6 +176,7 @@ struct pool {
 	struct workqueue_struct *wq;
 	struct work_struct worker;
 	struct delayed_work waker;
+	struct delayed_work no_space_timeout;
 
 	unsigned long last_commit_jiffies;
 	unsigned ref_count;
@@ -1590,6 +1592,20 @@ static void do_waker(struct work_struct *ws)
 	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
 }
 
+/*
+ * We're holding onto IO to allow userland time to react.  After the
+ * timeout either the pool will have been resized (and thus back in
+ * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ */
+static void do_no_space_timeout(struct work_struct *ws)
+{
+	struct pool *pool = container_of(to_delayed_work(ws), struct pool,
+					 no_space_timeout);
+
+	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
+		set_pool_mode(pool, PM_READ_ONLY);
+}
+
 /*----------------------------------------------------------------*/
 
 struct noflush_work {
@@ -1715,6 +1731,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_discard = process_discard;
 		pool->process_prepared_mapping = process_prepared_mapping;
 		pool->process_prepared_discard = process_prepared_discard_passdown;
+
+		if (!pool->pf.error_if_no_space)
+			queue_delayed_work(pool->wq, &pool->no_space_timeout, NO_SPACE_TIMEOUT);
 		break;
 
 	case PM_WRITE:
@@ -2100,6 +2119,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 
 	INIT_WORK(&pool->worker, do_worker);
 	INIT_DELAYED_WORK(&pool->waker, do_waker);
+	INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
 	spin_lock_init(&pool->lock);
 	bio_list_init(&pool->deferred_flush_bios);
 	INIT_LIST_HEAD(&pool->prepared_mappings);
@@ -2662,6 +2682,7 @@ static void pool_postsuspend(struct dm_target *ti)
 	struct pool *pool = pt->pool;
 
 	cancel_delayed_work(&pool->waker);
+	cancel_delayed_work(&pool->no_space_timeout);
 	flush_workqueue(pool->wq);
 	(void) commit(pool);
 }
-- 
cgit v1.2.3-70-g09d2


From 4cdd2ad78098244c1bc9ec4374ea1c225fd1cd6f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 13 May 2014 13:49:39 -0400
Subject: dm mpath: fix lock order inconsistency in multipath_ioctl

Commit 3e9f1be1b40 ("dm mpath: remove process_queued_ios()") did not
consistently take the multipath device's spinlock (m->lock) before
calling dm_table_run_md_queue_async() -- which takes the q->queue_lock.

Found with code inspection using hint from reported lockdep warning.

Reported-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index aa009e86587..fa0f6cbd6a4 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1566,8 +1566,8 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 		}
 		if (m->pg_init_required)
 			__pg_init_all_paths(m);
-		spin_unlock_irqrestore(&m->lock, flags);
 		dm_table_run_md_queue_async(m->ti->table);
+		spin_unlock_irqrestore(&m->lock, flags);
 	}
 
 	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
-- 
cgit v1.2.3-70-g09d2