diff options
Diffstat (limited to 'drivers/md/dm-thin.c')
-rw-r--r-- | drivers/md/dm-thin.c | 197 |
1 files changed, 158 insertions, 39 deletions
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 53728be84de..fc9c848a60c 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -27,6 +27,9 @@ #define MAPPING_POOL_SIZE 1024 #define PRISON_CELLS 1024 #define COMMIT_PERIOD HZ +#define NO_SPACE_TIMEOUT_SECS 60 + +static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS; DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, "A percentage of time allocated for copy on write"); @@ -175,6 +178,7 @@ struct pool { struct workqueue_struct *wq; struct work_struct worker; struct delayed_work waker; + struct delayed_work no_space_timeout; unsigned long last_commit_jiffies; unsigned ref_count; @@ -232,6 +236,13 @@ struct thin_c { struct bio_list deferred_bio_list; struct bio_list retry_on_resume_list; struct rb_root sort_bio_list; /* sorted list of deferred bios */ + + /* + * Ensures the thin is not destroyed until the worker has finished + * iterating the active_thins list. + */ + atomic_t refcount; + struct completion can_destroy; }; /*----------------------------------------------------------------*/ @@ -299,13 +310,18 @@ static void cell_defer_no_holder_no_free(struct thin_c *tc, wake_worker(pool); } -static void cell_error(struct pool *pool, - struct dm_bio_prison_cell *cell) +static void cell_error_with_code(struct pool *pool, + struct dm_bio_prison_cell *cell, int error_code) { - dm_cell_error(pool->prison, cell); + dm_cell_error(pool->prison, cell, error_code); dm_bio_prison_free_cell(pool->prison, cell); } +static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, -EIO); +} + /*----------------------------------------------------------------*/ /* @@ -928,7 +944,7 @@ static int commit(struct pool *pool) { int r; - if (get_pool_mode(pool) != PM_WRITE) + if (get_pool_mode(pool) >= PM_READ_ONLY) return -EINVAL; r = dm_pool_commit_metadata(pool->pmd); @@ -1016,7 +1032,7 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&tc->lock, flags); } -static bool should_error_unserviceable_bio(struct pool *pool) +static int should_error_unserviceable_bio(struct pool *pool) { enum pool_mode m = get_pool_mode(pool); @@ -1024,25 +1040,27 @@ static bool should_error_unserviceable_bio(struct pool *pool) case PM_WRITE: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); - return true; + return -EIO; case PM_OUT_OF_DATA_SPACE: - return pool->pf.error_if_no_space; + return pool->pf.error_if_no_space ? -ENOSPC : 0; case PM_READ_ONLY: case PM_FAIL: - return true; + return -EIO; default: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); - return true; + return -EIO; } } static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) { - if (should_error_unserviceable_bio(pool)) - bio_io_error(bio); + int error = should_error_unserviceable_bio(pool); + + if (error) + bio_endio(bio, error); else retry_on_resume(bio); } @@ -1051,18 +1069,21 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c { struct bio *bio; struct bio_list bios; + int error; - if (should_error_unserviceable_bio(pool)) { - cell_error(pool, cell); + error = should_error_unserviceable_bio(pool); + if (error) { + cell_error_with_code(pool, cell, error); return; } bio_list_init(&bios); cell_release(pool, cell, &bios); - if (should_error_unserviceable_bio(pool)) + error = should_error_unserviceable_bio(pool); + if (error) while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); + bio_endio(bio, error); else while ((bio = bio_list_pop(&bios))) retry_on_resume(bio); @@ -1486,6 +1507,45 @@ static void process_thin_deferred_bios(struct thin_c *tc) blk_finish_plug(&plug); } +static void thin_get(struct thin_c *tc); +static void thin_put(struct thin_c *tc); + +/* + * We can't hold rcu_read_lock() around code that can block. So we + * find a thin with the rcu lock held; bump a refcount; then drop + * the lock. + */ +static struct thin_c *get_first_thin(struct pool *pool) +{ + struct thin_c *tc = NULL; + + rcu_read_lock(); + if (!list_empty(&pool->active_thins)) { + tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list); + thin_get(tc); + } + rcu_read_unlock(); + + return tc; +} + +static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc) +{ + struct thin_c *old_tc = tc; + + rcu_read_lock(); + list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) { + thin_get(tc); + thin_put(old_tc); + rcu_read_unlock(); + return tc; + } + thin_put(old_tc); + rcu_read_unlock(); + + return NULL; +} + static void process_deferred_bios(struct pool *pool) { unsigned long flags; @@ -1493,10 +1553,11 @@ static void process_deferred_bios(struct pool *pool) struct bio_list bios; struct thin_c *tc; - rcu_read_lock(); - list_for_each_entry_rcu(tc, &pool->active_thins, list) + tc = get_first_thin(pool); + while (tc) { process_thin_deferred_bios(tc); - rcu_read_unlock(); + tc = get_next_thin(pool, tc); + } /* * If there are any deferred flush bios, we must commit @@ -1543,49 +1604,79 @@ static void do_waker(struct work_struct *ws) queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); } +/* + * We're holding onto IO to allow userland time to react. After the + * timeout either the pool will have been resized (and thus back in + * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO. + */ +static void do_no_space_timeout(struct work_struct *ws) +{ + struct pool *pool = container_of(to_delayed_work(ws), struct pool, + no_space_timeout); + + if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) + set_pool_mode(pool, PM_READ_ONLY); +} + /*----------------------------------------------------------------*/ -struct noflush_work { +struct pool_work { struct work_struct worker; - struct thin_c *tc; + struct completion complete; +}; + +static struct pool_work *to_pool_work(struct work_struct *ws) +{ + return container_of(ws, struct pool_work, worker); +} + +static void pool_work_complete(struct pool_work *pw) +{ + complete(&pw->complete); +} + +static void pool_work_wait(struct pool_work *pw, struct pool *pool, + void (*fn)(struct work_struct *)) +{ + INIT_WORK_ONSTACK(&pw->worker, fn); + init_completion(&pw->complete); + queue_work(pool->wq, &pw->worker); + wait_for_completion(&pw->complete); +} + +/*----------------------------------------------------------------*/ - atomic_t complete; - wait_queue_head_t wait; +struct noflush_work { + struct pool_work pw; + struct thin_c *tc; }; -static void complete_noflush_work(struct noflush_work *w) +static struct noflush_work *to_noflush(struct work_struct *ws) { - atomic_set(&w->complete, 1); - wake_up(&w->wait); + return container_of(to_pool_work(ws), struct noflush_work, pw); } static void do_noflush_start(struct work_struct *ws) { - struct noflush_work *w = container_of(ws, struct noflush_work, worker); + struct noflush_work *w = to_noflush(ws); w->tc->requeue_mode = true; requeue_io(w->tc); - complete_noflush_work(w); + pool_work_complete(&w->pw); } static void do_noflush_stop(struct work_struct *ws) { - struct noflush_work *w = container_of(ws, struct noflush_work, worker); + struct noflush_work *w = to_noflush(ws); w->tc->requeue_mode = false; - complete_noflush_work(w); + pool_work_complete(&w->pw); } static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) { struct noflush_work w; - INIT_WORK(&w.worker, fn); w.tc = tc; - atomic_set(&w.complete, 0); - init_waitqueue_head(&w.wait); - - queue_work(tc->pool->wq, &w.worker); - - wait_event(w.wait, atomic_read(&w.complete)); + pool_work_wait(&w.pw, tc->pool, fn); } /*----------------------------------------------------------------*/ @@ -1607,6 +1698,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) struct pool_c *pt = pool->ti->private; bool needs_check = dm_pool_metadata_needs_check(pool->pmd); enum pool_mode old_mode = get_pool_mode(pool); + unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ; /* * Never allow the pool to transition to PM_WRITE mode if user @@ -1668,6 +1760,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) pool->process_discard = process_discard; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard_passdown; + + if (!pool->pf.error_if_no_space && no_space_timeout) + queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout); break; case PM_WRITE: @@ -2053,6 +2148,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_WORK(&pool->worker, do_worker); INIT_DELAYED_WORK(&pool->waker, do_waker); + INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); spin_lock_init(&pool->lock); bio_list_init(&pool->deferred_flush_bios); INIT_LIST_HEAD(&pool->prepared_mappings); @@ -2615,6 +2711,7 @@ static void pool_postsuspend(struct dm_target *ti) struct pool *pool = pt->pool; cancel_delayed_work(&pool->waker); + cancel_delayed_work(&pool->no_space_timeout); flush_workqueue(pool->wq); (void) commit(pool); } @@ -2997,7 +3094,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) */ if (pt->adjusted_pf.discard_passdown) { data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; - limits->discard_granularity = data_limits->discard_granularity; + limits->discard_granularity = max(data_limits->discard_granularity, + pool->sectors_per_block << SECTOR_SHIFT); } else limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; } @@ -3061,11 +3159,25 @@ static struct target_type pool_target = { /*---------------------------------------------------------------- * Thin target methods *--------------------------------------------------------------*/ +static void thin_get(struct thin_c *tc) +{ + atomic_inc(&tc->refcount); +} + +static void thin_put(struct thin_c *tc) +{ + if (atomic_dec_and_test(&tc->refcount)) + complete(&tc->can_destroy); +} + static void thin_dtr(struct dm_target *ti) { struct thin_c *tc = ti->private; unsigned long flags; + thin_put(tc); + wait_for_completion(&tc->can_destroy); + spin_lock_irqsave(&tc->pool->lock, flags); list_del_rcu(&tc->list); spin_unlock_irqrestore(&tc->pool->lock, flags); @@ -3101,6 +3213,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) struct thin_c *tc; struct dm_dev *pool_dev, *origin_dev; struct mapped_device *pool_md; + unsigned long flags; mutex_lock(&dm_thin_pool_table.mutex); @@ -3191,9 +3304,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) mutex_unlock(&dm_thin_pool_table.mutex); - spin_lock(&tc->pool->lock); + atomic_set(&tc->refcount, 1); + init_completion(&tc->can_destroy); + + spin_lock_irqsave(&tc->pool->lock, flags); list_add_tail_rcu(&tc->list, &tc->pool->active_thins); - spin_unlock(&tc->pool->lock); + spin_unlock_irqrestore(&tc->pool->lock, flags); /* * This synchronize_rcu() call is needed here otherwise we risk a * wake_worker() call finding no bios to process (because the newly @@ -3422,6 +3538,9 @@ static void dm_thin_exit(void) module_init(dm_thin_init); module_exit(dm_thin_exit); +module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds"); + MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); |