From 5e3283e2920a0bd8a806964d80274b8756e0dd7f Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 8 Apr 2014 11:08:41 +0100 Subject: dm thin: irqsave must always be used with the pool->lock spinlock Commit c140e1c4e23 ("dm thin: use per thin device deferred bio lists") incorrectly stopped disabling irqs when taking the pool's spinlock. Irqs must be disabled when taking the pool's spinlock otherwise a thread could spin_lock(), then get interrupted to service thin_endio() in interrupt context, which would then deadlock in spin_lock_irqsave(). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 53728be84de..ae5fd0b9c75 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3101,6 +3101,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) struct thin_c *tc; struct dm_dev *pool_dev, *origin_dev; struct mapped_device *pool_md; + unsigned long flags; mutex_lock(&dm_thin_pool_table.mutex); @@ -3191,9 +3192,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) mutex_unlock(&dm_thin_pool_table.mutex); - spin_lock(&tc->pool->lock); + spin_lock_irqsave(&tc->pool->lock, flags); list_add_tail_rcu(&tc->list, &tc->pool->active_thins); - spin_unlock(&tc->pool->lock); + spin_unlock_irqrestore(&tc->pool->lock, flags); /* * This synchronize_rcu() call is needed here otherwise we risk a * wake_worker() call finding no bios to process (because the newly -- cgit v1.2.3-70-g09d2 From b10ebd34cccae1b431caf1be54919aede2be7cbe Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 8 Apr 2014 11:29:01 +0100 Subject: dm thin: fix rcu_read_lock being held in code that can sleep Commit c140e1c4e23 ("dm thin: use per thin device deferred bio lists") introduced the use of an rculist for all active thin devices. The use of rcu_read_lock() in process_deferred_bios() can result in a BUG if a dm_bio_prison_cell must be allocated as a side-effect of bio_detain(): BUG: sleeping function called from invalid context at mm/mempool.c:203 in_atomic(): 1, irqs_disabled(): 0, pid: 6, name: kworker/u8:0 3 locks held by kworker/u8:0/6: #0: ("dm-" "thin"){.+.+..}, at: [] process_one_work+0x192/0x550 #1: ((&pool->worker)){+.+...}, at: [] process_one_work+0x192/0x550 #2: (rcu_read_lock){.+.+..}, at: [] do_worker+0x5/0x4d0 We can't process deferred bios with the rcu lock held, since dm_bio_prison_cell allocation may block if the bio-prison's cell mempool is exhausted. To fix: - Introduce a refcount and completion field to each thin_c - Add thin_get/put methods for adjusting the refcount. If the refcount hits zero then the completion is triggered. - Initialise refcount to 1 when creating thin_c - When iterating the active_thins list we thin_get() whilst the rcu lock is held. - After the rcu lock is dropped we process the deferred bios for that thin. - When destroying a thin_c we thin_put() and then wait for the completion -- to avoid a race between the worker thread iterating from that thin_c and destroying the thin_c. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ae5fd0b9c75..28fc282b61b 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -232,6 +232,13 @@ struct thin_c { struct bio_list deferred_bio_list; struct bio_list retry_on_resume_list; struct rb_root sort_bio_list; /* sorted list of deferred bios */ + + /* + * Ensures the thin is not destroyed until the worker has finished + * iterating the active_thins list. + */ + atomic_t refcount; + struct completion can_destroy; }; /*----------------------------------------------------------------*/ @@ -1486,6 +1493,45 @@ static void process_thin_deferred_bios(struct thin_c *tc) blk_finish_plug(&plug); } +static void thin_get(struct thin_c *tc); +static void thin_put(struct thin_c *tc); + +/* + * We can't hold rcu_read_lock() around code that can block. So we + * find a thin with the rcu lock held; bump a refcount; then drop + * the lock. + */ +static struct thin_c *get_first_thin(struct pool *pool) +{ + struct thin_c *tc = NULL; + + rcu_read_lock(); + if (!list_empty(&pool->active_thins)) { + tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list); + thin_get(tc); + } + rcu_read_unlock(); + + return tc; +} + +static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc) +{ + struct thin_c *old_tc = tc; + + rcu_read_lock(); + list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) { + thin_get(tc); + thin_put(old_tc); + rcu_read_unlock(); + return tc; + } + thin_put(old_tc); + rcu_read_unlock(); + + return NULL; +} + static void process_deferred_bios(struct pool *pool) { unsigned long flags; @@ -1493,10 +1539,11 @@ static void process_deferred_bios(struct pool *pool) struct bio_list bios; struct thin_c *tc; - rcu_read_lock(); - list_for_each_entry_rcu(tc, &pool->active_thins, list) + tc = get_first_thin(pool); + while (tc) { process_thin_deferred_bios(tc); - rcu_read_unlock(); + tc = get_next_thin(pool, tc); + } /* * If there are any deferred flush bios, we must commit @@ -3061,11 +3108,25 @@ static struct target_type pool_target = { /*---------------------------------------------------------------- * Thin target methods *--------------------------------------------------------------*/ +static void thin_get(struct thin_c *tc) +{ + atomic_inc(&tc->refcount); +} + +static void thin_put(struct thin_c *tc) +{ + if (atomic_dec_and_test(&tc->refcount)) + complete(&tc->can_destroy); +} + static void thin_dtr(struct dm_target *ti) { struct thin_c *tc = ti->private; unsigned long flags; + thin_put(tc); + wait_for_completion(&tc->can_destroy); + spin_lock_irqsave(&tc->pool->lock, flags); list_del_rcu(&tc->list); spin_unlock_irqrestore(&tc->pool->lock, flags); @@ -3192,6 +3253,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) mutex_unlock(&dm_thin_pool_table.mutex); + atomic_set(&tc->refcount, 1); + init_completion(&tc->can_destroy); + spin_lock_irqsave(&tc->pool->lock, flags); list_add_tail_rcu(&tc->list, &tc->pool->active_thins); spin_unlock_irqrestore(&tc->pool->lock, flags); -- cgit v1.2.3-70-g09d2 From 3a7745215e7f73a5c7d9bcdc50661a55b39052a3 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Mon, 14 Apr 2014 22:02:30 +0200 Subject: dm verity: fix biovecs hash calculation regression Commit 003b5c5719f159f4f4bf97511c4702a0638313dd ("block: Convert drivers to immutable biovecs") incorrectly converted biovec iteration in dm-verity to always calculate the hash from a full biovec, but the function only needs to calculate the hash from part of the biovec (up to the calculated "todo" value). Fix this issue by limiting hash input to only the requested data size. This problem was identified using the cryptsetup regression test for veritysetup (verity-compat-test). Signed-off-by: Milan Broz Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.14+ --- drivers/md/dm-verity.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 796007a5e0e..7a7bab8947a 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -330,15 +330,17 @@ test_block_hash: return r; } } - todo = 1 << v->data_dev_block_bits; - while (io->iter.bi_size) { + do { u8 *page; + unsigned len; struct bio_vec bv = bio_iter_iovec(bio, io->iter); page = kmap_atomic(bv.bv_page); - r = crypto_shash_update(desc, page + bv.bv_offset, - bv.bv_len); + len = bv.bv_len; + if (likely(len >= todo)) + len = todo; + r = crypto_shash_update(desc, page + bv.bv_offset, len); kunmap_atomic(page); if (r < 0) { @@ -346,8 +348,9 @@ test_block_hash: return r; } - bio_advance_iter(bio, &io->iter, bv.bv_len); - } + bio_advance_iter(bio, &io->iter, len); + todo -= len; + } while (todo); if (!v->version) { r = crypto_shash_update(desc, v->salt, v->salt_size); -- cgit v1.2.3-70-g09d2 From fbcde3d8b9c2d97704b8ca299e5266147b24c8ee Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 29 Apr 2014 11:22:04 -0400 Subject: dm thin: use INIT_WORK_ONSTACK in noflush_work to avoid ODEBUG warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use INIT_WORK_ONSTACK to silence "ODEBUG: object is on stack, but not annotated". Reported-by: Zdeněk Kabeláč Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 28fc282b61b..13abade76ad 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1625,7 +1625,7 @@ static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) { struct noflush_work w; - INIT_WORK(&w.worker, fn); + INIT_WORK_ONSTACK(&w.worker, fn); w.tc = tc; atomic_set(&w.complete, 0); init_waitqueue_head(&w.wait); -- cgit v1.2.3-70-g09d2 From 131cd131a9ff63d4b84f3fe15073a2984ac30066 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 1 May 2014 16:14:24 -0400 Subject: dm cache: fix writethrough mode quiescing in cache_map Commit 2ee57d58735 ("dm cache: add passthrough mode") inadvertently removed the deferred set reference that was taken in cache_map()'s writethrough mode support. Restore taking this reference. This issue was found with code inspection. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Cc: stable@vger.kernel.org # 3.13+ --- drivers/md/dm-cache-target.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1bf4a71919e..9380be7b189 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2488,6 +2488,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio) } else { inc_hit_counter(cache, bio); + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && !is_dirty(cache, lookup_result.cblock)) -- cgit v1.2.3-70-g09d2 From cc13b1d1500656a20e41960668f3392dda9fa6e2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 5 May 2014 13:34:37 +1000 Subject: md/raid10: call wait_barrier() for each request submitted. wait_barrier() includes a counter, so we must call it precisely once (unless balanced by allow_barrier()) for each request submitted. Since commit 20d0189b1012a37d2533a87fb451f7852f2418d1 block: Introduce new bio_split() in 3.14-rc1, we don't call it for the extra requests generated when we need to split a bio. When this happens the counter goes negative, any resync/recovery will never start, and "mdadm --stop" will hang. Reported-by: Chris Murphy Fixes: 20d0189b1012a37d2533a87fb451f7852f2418d1 Cc: stable@vger.kernel.org (3.14+) Cc: Kent Overstreet Signed-off-by: NeilBrown --- drivers/md/raid10.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 33fc408e5ea..cb882aae9e2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1172,6 +1172,13 @@ static void __make_request(struct mddev *mddev, struct bio *bio) int max_sectors; int sectors; + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ + wait_barrier(conf); + sectors = bio_sectors(bio); while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && @@ -1552,12 +1559,6 @@ static void make_request(struct mddev *mddev, struct bio *bio) md_write_start(mddev, bio); - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. - */ - wait_barrier(conf); do { -- cgit v1.2.3-70-g09d2 From 0f62fb220aa4ebabe8547d3a9ce4a16d3c045f21 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 6 May 2014 09:36:08 +1000 Subject: md: avoid possible spinning md thread at shutdown. If an md array with externally managed metadata (e.g. DDF or IMSM) is in use, then we should not set safemode==2 at shutdown because: 1/ this is ineffective: user-space need to be involved in any 'safemode' handling, 2/ The safemode management code doesn't cope with safemode==2 on external metadata and md_check_recover enters an infinite loop. Even at shutdown, an infinite-looping process can be problematic, so this could cause shutdown to hang. Cc: stable@vger.kernel.org (any kernel) Signed-off-by: NeilBrown --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8fda38d23e3..237b7e0ddc7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8516,7 +8516,8 @@ static int md_notify_reboot(struct notifier_block *this, if (mddev_trylock(mddev)) { if (mddev->pers) __md_stop_writes(mddev); - mddev->safemode = 2; + if (mddev->persistent) + mddev->safemode = 2; mddev_unlock(mddev); } need_delay = 1; -- cgit v1.2.3-70-g09d2 From 610f2de3559c383caf8fbbf91e9968102dff7ca0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 20 Feb 2014 18:01:01 -0500 Subject: dm crypt: fix cpu hotplug crash by removing per-cpu structure The DM crypt target used per-cpu structures to hold pointers to a ablkcipher_request structure. The code assumed that the work item keeps executing on a single CPU, so it didn't use synchronization when accessing this structure. If a CPU is disabled by writing 0 to /sys/devices/system/cpu/cpu*/online, the work item could be moved to another CPU. This causes dm-crypt crashes, like the following, because the code starts using an incorrect ablkcipher_request: smpboot: CPU 7 is now offline BUG: unable to handle kernel NULL pointer dereference at 0000000000000130 IP: [] crypt_convert+0x12d/0x3c0 [dm_crypt] ... Call Trace: [] ? kcryptd_crypt+0x305/0x470 [dm_crypt] [] ? finish_task_switch+0x40/0xc0 [] ? process_one_work+0x168/0x470 [] ? worker_thread+0x10b/0x390 [] ? manage_workers.isra.26+0x290/0x290 [] ? kthread+0xaf/0xc0 [] ? kthread_create_on_node+0x120/0x120 [] ? ret_from_fork+0x7c/0xb0 [] ? kthread_create_on_node+0x120/0x120 Fix this bug by removing the per-cpu definition. The structure ablkcipher_request is accessed via a pointer from convert_context. Consequently, if the work item is rescheduled to a different CPU, the thread still uses the same ablkcipher_request. This change may undermine performance improvements intended by commit c0297721 ("dm crypt: scale to multiple cpus") on select hardware. In practice no performance difference was observed on recent hardware. But regardless, correctness is more important than performance. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-crypt.c | 61 ++++++++++----------------------------------------- 1 file changed, 12 insertions(+), 49 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 784695d22fd..53b213226c0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -43,6 +42,7 @@ struct convert_context { struct bvec_iter iter_out; sector_t cc_sector; atomic_t cc_pending; + struct ablkcipher_request *req; }; /* @@ -111,15 +111,7 @@ struct iv_tcw_private { enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; /* - * Duplicated per-CPU state for cipher. - */ -struct crypt_cpu { - struct ablkcipher_request *req; -}; - -/* - * The fields in here must be read only after initialization, - * changing state should be in crypt_cpu. + * The fields in here must be read only after initialization. */ struct crypt_config { struct dm_dev *dev; @@ -150,12 +142,6 @@ struct crypt_config { sector_t iv_offset; unsigned int iv_size; - /* - * Duplicated per cpu state. Access through - * per_cpu_ptr() only. - */ - struct crypt_cpu __percpu *cpu; - /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; struct crypto_ablkcipher **tfms; @@ -192,11 +178,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); -static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) -{ - return this_cpu_ptr(cc->cpu); -} - /* * Use this to access cipher attributes that are the same for each CPU. */ @@ -903,16 +884,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { - struct crypt_cpu *this_cc = this_crypt_config(cc); unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); - if (!this_cc->req) - this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + if (!ctx->req) + ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]); - ablkcipher_request_set_callback(this_cc->req, + ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); + ablkcipher_request_set_callback(ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); + kcryptd_async_done, dmreq_of_req(cc, ctx->req)); } /* @@ -921,7 +901,6 @@ static void crypt_alloc_req(struct crypt_config *cc, static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { - struct crypt_cpu *this_cc = this_crypt_config(cc); int r; atomic_set(&ctx->cc_pending, 1); @@ -932,7 +911,7 @@ static int crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->cc_pending); - r = crypt_convert_block(cc, ctx, this_cc->req); + r = crypt_convert_block(cc, ctx, ctx->req); switch (r) { /* async */ @@ -941,7 +920,7 @@ static int crypt_convert(struct crypt_config *cc, reinit_completion(&ctx->restart); /* fall through*/ case -EINPROGRESS: - this_cc->req = NULL; + ctx->req = NULL; ctx->cc_sector++; continue; @@ -1040,6 +1019,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, io->sector = sector; io->error = 0; io->base_io = NULL; + io->ctx.req = NULL; atomic_set(&io->io_pending, 0); return io; @@ -1065,6 +1045,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io) if (!atomic_dec_and_test(&io->io_pending)) return; + if (io->ctx.req) + mempool_free(io->ctx.req, cc->req_pool); mempool_free(io, cc->io_pool); if (likely(!base_io)) @@ -1492,8 +1474,6 @@ static int crypt_wipe_key(struct crypt_config *cc) static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = ti->private; - struct crypt_cpu *cpu_cc; - int cpu; ti->private = NULL; @@ -1505,13 +1485,6 @@ static void crypt_dtr(struct dm_target *ti) if (cc->crypt_queue) destroy_workqueue(cc->crypt_queue); - if (cc->cpu) - for_each_possible_cpu(cpu) { - cpu_cc = per_cpu_ptr(cc->cpu, cpu); - if (cpu_cc->req) - mempool_free(cpu_cc->req, cc->req_pool); - } - crypt_free_tfms(cc); if (cc->bs) @@ -1530,9 +1503,6 @@ static void crypt_dtr(struct dm_target *ti) if (cc->dev) dm_put_device(ti, cc->dev); - if (cc->cpu) - free_percpu(cc->cpu); - kzfree(cc->cipher); kzfree(cc->cipher_string); @@ -1588,13 +1558,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); - cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)), - __alignof__(struct crypt_cpu)); - if (!cc->cpu) { - ti->error = "Cannot allocate per cpu state"; - goto bad_mem; - } - /* * For compatibility with the original dm-crypt mapping format, if * only the cipher name is supplied, use cbc-plain. -- cgit v1.2.3-70-g09d2 From 8d07e8a5f5bc7b90f755d9b427ea930024f4c986 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 6 May 2014 16:28:14 +0100 Subject: dm thin: allow metadata commit if pool is in PM_OUT_OF_DATA_SPACE mode Commit 3e1a0699 ("dm thin: fix out of data space handling") introduced a regression in the metadata commit() method by returning an error if the pool is in PM_OUT_OF_DATA_SPACE mode. This oversight caused a thin device to return errors even if the default queue_if_no_space ENOSPC handling mode is used. Fix commit() to only fail if pool is in PM_READ_ONLY or PM_FAIL mode. Reported-by: qindehua@163.com Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.14+ --- drivers/md/dm-thin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 13abade76ad..15f748ba675 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -935,7 +935,7 @@ static int commit(struct pool *pool) { int r; - if (get_pool_mode(pool) != PM_WRITE) + if (get_pool_mode(pool) >= PM_READ_ONLY) return -EINVAL; r = dm_pool_commit_metadata(pool->pmd); -- cgit v1.2.3-70-g09d2 From 85ad643b7e7e52d37620fb272a9fd577a8095647 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 9 May 2014 15:59:38 +0100 Subject: dm thin: add timeout to stop out-of-data-space mode holding IO forever If the pool runs out of data space, dm-thin can be configured to either error IOs that would trigger provisioning, or hold those IOs until the pool is resized. Unfortunately, holding IOs until the pool is resized can result in a cascade of tasks hitting the hung_task_timeout, which may render the system unavailable. Add a fixed timeout so IOs can only be held for a maximum of 60 seconds. If LVM is going to resize a thin-pool that is out of data space it needs to be prompt about it. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.14+ --- drivers/md/dm-thin.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 15f748ba675..2e71de8e004 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -27,6 +27,7 @@ #define MAPPING_POOL_SIZE 1024 #define PRISON_CELLS 1024 #define COMMIT_PERIOD HZ +#define NO_SPACE_TIMEOUT (HZ * 60) DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, "A percentage of time allocated for copy on write"); @@ -175,6 +176,7 @@ struct pool { struct workqueue_struct *wq; struct work_struct worker; struct delayed_work waker; + struct delayed_work no_space_timeout; unsigned long last_commit_jiffies; unsigned ref_count; @@ -1590,6 +1592,20 @@ static void do_waker(struct work_struct *ws) queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); } +/* + * We're holding onto IO to allow userland time to react. After the + * timeout either the pool will have been resized (and thus back in + * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO. + */ +static void do_no_space_timeout(struct work_struct *ws) +{ + struct pool *pool = container_of(to_delayed_work(ws), struct pool, + no_space_timeout); + + if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) + set_pool_mode(pool, PM_READ_ONLY); +} + /*----------------------------------------------------------------*/ struct noflush_work { @@ -1715,6 +1731,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) pool->process_discard = process_discard; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard_passdown; + + if (!pool->pf.error_if_no_space) + queue_delayed_work(pool->wq, &pool->no_space_timeout, NO_SPACE_TIMEOUT); break; case PM_WRITE: @@ -2100,6 +2119,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_WORK(&pool->worker, do_worker); INIT_DELAYED_WORK(&pool->waker, do_waker); + INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); spin_lock_init(&pool->lock); bio_list_init(&pool->deferred_flush_bios); INIT_LIST_HEAD(&pool->prepared_mappings); @@ -2662,6 +2682,7 @@ static void pool_postsuspend(struct dm_target *ti) struct pool *pool = pt->pool; cancel_delayed_work(&pool->waker); + cancel_delayed_work(&pool->no_space_timeout); flush_workqueue(pool->wq); (void) commit(pool); } -- cgit v1.2.3-70-g09d2 From 4cdd2ad78098244c1bc9ec4374ea1c225fd1cd6f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 May 2014 13:49:39 -0400 Subject: dm mpath: fix lock order inconsistency in multipath_ioctl Commit 3e9f1be1b40 ("dm mpath: remove process_queued_ios()") did not consistently take the multipath device's spinlock (m->lock) before calling dm_table_run_md_queue_async() -- which takes the q->queue_lock. Found with code inspection using hint from reported lockdep warning. Reported-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index aa009e86587..fa0f6cbd6a4 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1566,8 +1566,8 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, } if (m->pg_init_required) __pg_init_all_paths(m); - spin_unlock_irqrestore(&m->lock, flags); dm_table_run_md_queue_async(m->ti->table); + spin_unlock_irqrestore(&m->lock, flags); } return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); -- cgit v1.2.3-70-g09d2