diff options
author | Ingo Molnar <mingo@kernel.org> | 2012-04-14 13:18:27 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2012-04-14 13:19:04 +0200 |
commit | 6ac1ef482d7ae0c690f1640bf6eb818ff9a2d91e (patch) | |
tree | 021cc9f6b477146fcebe6f3be4752abfa2ba18a9 /drivers/md/dm-thin.c | |
parent | 682968e0c425c60f0dde37977e5beb2b12ddc4cc (diff) | |
parent | a385ec4f11bdcf81af094c03e2444ee9b7fad2e5 (diff) |
Merge branch 'perf/core' into perf/uprobes
Merge in latest upstream (and the latest perf development tree),
to prepare for tooling changes, and also to pick up v3.4 MM
changes that the uprobes code needs to take care of.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'drivers/md/dm-thin.c')
-rw-r--r-- | drivers/md/dm-thin.c | 680 |
1 files changed, 508 insertions, 172 deletions
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c3087575fef..213ae32a0fc 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -23,6 +23,7 @@ #define DEFERRED_SET_SIZE 64 #define MAPPING_POOL_SIZE 1024 #define PRISON_CELLS 1024 +#define COMMIT_PERIOD HZ /* * The block size of the device holding pool data must be @@ -32,16 +33,6 @@ #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) /* - * The metadata device is currently limited in size. The limitation is - * checked lower down in dm-space-map-metadata, but we also check it here - * so we can fail early. - * - * We have one block of index, which can hold 255 index entries. Each - * index entry contains allocation info about 16k metadata blocks. - */ -#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) - -/* * Device id is restricted to 24 bits. */ #define MAX_DEV_ID ((1 << 24) - 1) @@ -72,7 +63,7 @@ * missed out if the io covers the block. (schedule_copy). * * iv) insert the new mapping into the origin's btree - * (process_prepared_mappings). This act of inserting breaks some + * (process_prepared_mapping). This act of inserting breaks some * sharing of btree nodes between the two devices. Breaking sharing only * effects the btree of that specific device. Btrees for the other * devices that share the block never change. The btree for the origin @@ -124,7 +115,7 @@ struct cell { struct hlist_node list; struct bio_prison *prison; struct cell_key key; - unsigned count; + struct bio *holder; struct bio_list bios; }; @@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket, * This may block if a new cell needs allocating. You must ensure that * cells will be unlocked even if the calling thread is blocked. * - * Returns the number of entries in the cell prior to the new addition - * or < 0 on failure. + * Returns 1 if the cell was already held, 0 if @inmate is the new holder. */ static int bio_detain(struct bio_prison *prison, struct cell_key *key, struct bio *inmate, struct cell **ref) { - int r; + int r = 1; unsigned long flags; uint32_t hash = hash_key(prison, key); - struct cell *uninitialized_var(cell), *cell2 = NULL; + struct cell *cell, *cell2; BUG_ON(hash > prison->nr_buckets); spin_lock_irqsave(&prison->lock, flags); + cell = __search_bucket(prison->cells + hash, key); + if (cell) { + bio_list_add(&cell->bios, inmate); + goto out; + } - if (!cell) { - /* - * Allocate a new cell - */ - spin_unlock_irqrestore(&prison->lock, flags); - cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); - spin_lock_irqsave(&prison->lock, flags); + /* + * Allocate a new cell + */ + spin_unlock_irqrestore(&prison->lock, flags); + cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); + spin_lock_irqsave(&prison->lock, flags); - /* - * We've been unlocked, so we have to double check that - * nobody else has inserted this cell in the meantime. - */ - cell = __search_bucket(prison->cells + hash, key); + /* + * We've been unlocked, so we have to double check that + * nobody else has inserted this cell in the meantime. + */ + cell = __search_bucket(prison->cells + hash, key); + if (cell) { + mempool_free(cell2, prison->cell_pool); + bio_list_add(&cell->bios, inmate); + goto out; + } - if (!cell) { - cell = cell2; - cell2 = NULL; + /* + * Use new cell. + */ + cell = cell2; - cell->prison = prison; - memcpy(&cell->key, key, sizeof(cell->key)); - cell->count = 0; - bio_list_init(&cell->bios); - hlist_add_head(&cell->list, prison->cells + hash); - } - } + cell->prison = prison; + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = inmate; + bio_list_init(&cell->bios); + hlist_add_head(&cell->list, prison->cells + hash); - r = cell->count++; - bio_list_add(&cell->bios, inmate); - spin_unlock_irqrestore(&prison->lock, flags); + r = 0; - if (cell2) - mempool_free(cell2, prison->cell_pool); +out: + spin_unlock_irqrestore(&prison->lock, flags); *ref = cell; @@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates) hlist_del(&cell->list); - if (inmates) - bio_list_merge(inmates, &cell->bios); + bio_list_add(inmates, cell->holder); + bio_list_merge(inmates, &cell->bios); mempool_free(cell, prison->cell_pool); } @@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios) * bio may be in the cell. This function releases the cell, and also does * a sanity check. */ +static void __cell_release_singleton(struct cell *cell, struct bio *bio) +{ + hlist_del(&cell->list); + BUG_ON(cell->holder != bio); + BUG_ON(!bio_list_empty(&cell->bios)); +} + static void cell_release_singleton(struct cell *cell, struct bio *bio) { - struct bio_prison *prison = cell->prison; - struct bio_list bios; - struct bio *b; unsigned long flags; - - bio_list_init(&bios); + struct bio_prison *prison = cell->prison; spin_lock_irqsave(&prison->lock, flags); - __cell_release(cell, &bios); + __cell_release_singleton(cell, bio); spin_unlock_irqrestore(&prison->lock, flags); +} + +/* + * Sometimes we don't want the holder, just the additional bios. + */ +static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) +{ + struct bio_prison *prison = cell->prison; + + hlist_del(&cell->list); + bio_list_merge(inmates, &cell->bios); - b = bio_list_pop(&bios); - BUG_ON(b != bio); - BUG_ON(!bio_list_empty(&bios)); + mempool_free(cell, prison->cell_pool); +} + +static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) +{ + unsigned long flags; + struct bio_prison *prison = cell->prison; + + spin_lock_irqsave(&prison->lock, flags); + __cell_release_no_holder(cell, inmates); + spin_unlock_irqrestore(&prison->lock, flags); } static void cell_error(struct cell *cell) @@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, * devices. */ struct new_mapping; + +struct pool_features { + unsigned zero_new_blocks:1; + unsigned discard_enabled:1; + unsigned discard_passdown:1; +}; + struct pool { struct list_head list; struct dm_target *ti; /* Only set if a pool target is bound */ @@ -484,7 +509,7 @@ struct pool { dm_block_t offset_mask; dm_block_t low_water_blocks; - unsigned zero_new_blocks:1; + struct pool_features pf; unsigned low_water_triggered:1; /* A dm event has been sent */ unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ @@ -493,17 +518,21 @@ struct pool { struct workqueue_struct *wq; struct work_struct worker; + struct delayed_work waker; unsigned ref_count; + unsigned long last_commit_jiffies; spinlock_t lock; struct bio_list deferred_bios; struct bio_list deferred_flush_bios; struct list_head prepared_mappings; + struct list_head prepared_discards; struct bio_list retry_on_resume_list; - struct deferred_set ds; /* FIXME: move to thin_c */ + struct deferred_set shared_read_ds; + struct deferred_set all_io_ds; struct new_mapping *next_mapping; mempool_t *mapping_pool; @@ -521,7 +550,7 @@ struct pool_c { struct dm_target_callbacks callbacks; dm_block_t low_water_blocks; - unsigned zero_new_blocks:1; + struct pool_features pf; }; /* @@ -529,6 +558,7 @@ struct pool_c { */ struct thin_c { struct dm_dev *pool_dev; + struct dm_dev *origin_dev; dm_thin_id dev_id; struct pool *pool; @@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev /*----------------------------------------------------------------*/ +struct endio_hook { + struct thin_c *tc; + struct deferred_entry *shared_read_entry; + struct deferred_entry *all_io_entry; + struct new_mapping *overwrite_mapping; +}; + static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) { struct bio *bio; @@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) bio_list_init(master); while ((bio = bio_list_pop(&bios))) { - if (dm_get_mapinfo(bio)->ptr == tc) + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + if (h->tc == tc) bio_endio(bio, DM_ENDIO_REQUEUE); else bio_list_add(master, bio); @@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) (bio->bi_sector & pool->offset_mask); } -static void remap_and_issue(struct thin_c *tc, struct bio *bio, - dm_block_t block) +static void remap_to_origin(struct thin_c *tc, struct bio *bio) +{ + bio->bi_bdev = tc->origin_dev->bdev; +} + +static void issue(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; unsigned long flags; - remap(tc, bio, block); - /* * Batch together any FUA/FLUSH bios we find and then issue * a single commit for them in process_deferred_bios(). @@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, generic_make_request(bio); } +static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) +{ + remap_to_origin(tc, bio); + issue(tc, bio); +} + +static void remap_and_issue(struct thin_c *tc, struct bio *bio, + dm_block_t block) +{ + remap(tc, bio, block); + issue(tc, bio); +} + /* * wake_worker() is used when new work is queued and when pool_resume is * ready to continue deferred IO processing. @@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool) /* * Bio endio functions. */ -struct endio_hook { - struct thin_c *tc; - bio_end_io_t *saved_bi_end_io; - struct deferred_entry *entry; -}; - struct new_mapping { struct list_head list; - int prepared; + unsigned quiesced:1; + unsigned prepared:1; + unsigned pass_discard:1; struct thin_c *tc; dm_block_t virt_block; dm_block_t data_block; - struct cell *cell; + struct cell *cell, *cell2; int err; /* @@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m) { struct pool *pool = m->tc->pool; - if (list_empty(&m->list) && m->prepared) { + if (m->quiesced && m->prepared) { list_add(&m->list, &pool->prepared_mappings); wake_worker(pool); } @@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) static void overwrite_endio(struct bio *bio, int err) { unsigned long flags; - struct new_mapping *m = dm_get_mapinfo(bio)->ptr; + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct new_mapping *m = h->overwrite_mapping; struct pool *pool = m->tc->pool; m->err = err; @@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err) spin_unlock_irqrestore(&pool->lock, flags); } -static void shared_read_endio(struct bio *bio, int err) -{ - struct list_head mappings; - struct new_mapping *m, *tmp; - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - unsigned long flags; - struct pool *pool = h->tc->pool; - - bio->bi_end_io = h->saved_bi_end_io; - bio_endio(bio, err); - - INIT_LIST_HEAD(&mappings); - ds_dec(h->entry, &mappings); - - spin_lock_irqsave(&pool->lock, flags); - list_for_each_entry_safe(m, tmp, &mappings, list) { - list_del(&m->list); - INIT_LIST_HEAD(&m->list); - __maybe_add_mapping(m); - } - spin_unlock_irqrestore(&pool->lock, flags); - - mempool_free(h, pool->endio_hook_pool); -} - /*----------------------------------------------------------------*/ /* @@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell, * Same as cell_defer above, except it omits one particular detainee, * a write bio that covers the block and has already been processed. */ -static void cell_defer_except(struct thin_c *tc, struct cell *cell, - struct bio *exception) +static void cell_defer_except(struct thin_c *tc, struct cell *cell) { struct bio_list bios; - struct bio *bio; struct pool *pool = tc->pool; unsigned long flags; bio_list_init(&bios); - cell_release(cell, &bios); spin_lock_irqsave(&pool->lock, flags); - while ((bio = bio_list_pop(&bios))) - if (bio != exception) - bio_list_add(&pool->deferred_bios, bio); + cell_release_no_holder(cell, &pool->deferred_bios); spin_unlock_irqrestore(&pool->lock, flags); wake_worker(pool); @@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m) * the bios in the cell. */ if (bio) { - cell_defer_except(tc, m->cell, bio); + cell_defer_except(tc, m->cell); bio_endio(bio, 0); } else cell_defer(tc, m->cell, m->data_block); @@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m) mempool_free(m, tc->pool->mapping_pool); } -static void process_prepared_mappings(struct pool *pool) +static void process_prepared_discard(struct new_mapping *m) +{ + int r; + struct thin_c *tc = m->tc; + + r = dm_thin_remove_block(tc->td, m->virt_block); + if (r) + DMERR("dm_thin_remove_block() failed"); + + /* + * Pass the discard down to the underlying device? + */ + if (m->pass_discard) + remap_and_issue(tc, m->bio, m->data_block); + else + bio_endio(m->bio, 0); + + cell_defer_except(tc, m->cell); + cell_defer_except(tc, m->cell2); + mempool_free(m, tc->pool->mapping_pool); +} + +static void process_prepared(struct pool *pool, struct list_head *head, + void (*fn)(struct new_mapping *)) { unsigned long flags; struct list_head maps; @@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool) INIT_LIST_HEAD(&maps); spin_lock_irqsave(&pool->lock, flags); - list_splice_init(&pool->prepared_mappings, &maps); + list_splice_init(head, &maps); spin_unlock_irqrestore(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &maps, list) - process_prepared_mapping(m); + fn(m); } /* * Deferred bio jobs. */ -static int io_overwrites_block(struct pool *pool, struct bio *bio) +static int io_overlaps_block(struct pool *pool, struct bio *bio) { - return ((bio_data_dir(bio) == WRITE) && - !(bio->bi_sector & pool->offset_mask)) && + return !(bio->bi_sector & pool->offset_mask) && (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); + +} + +static int io_overwrites_block(struct pool *pool, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + io_overlaps_block(pool, bio); } static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, @@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool) } static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, - dm_block_t data_origin, dm_block_t data_dest, + struct dm_dev *origin, dm_block_t data_origin, + dm_block_t data_dest, struct cell *cell, struct bio *bio) { int r; @@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct new_mapping *m = get_next_mapping(pool); INIT_LIST_HEAD(&m->list); + m->quiesced = 0; m->prepared = 0; m->tc = tc; m->virt_block = virt_block; @@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, m->err = 0; m->bio = NULL; - ds_add_work(&pool->ds, &m->list); + if (!ds_add_work(&pool->shared_read_ds, &m->list)) + m->quiesced = 1; /* * IO to pool_dev remaps to the pool target's data_dev. @@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, * bio immediately. Otherwise we use kcopyd to clone the data first. */ if (io_overwrites_block(pool, bio)) { + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + h->overwrite_mapping = m; m->bio = bio; save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - dm_get_mapinfo(bio)->ptr = m; remap_and_issue(tc, bio, data_dest); } else { struct dm_io_region from, to; - from.bdev = tc->pool_dev->bdev; + from.bdev = origin->bdev; from.sector = data_origin * pool->sectors_per_block; from.count = pool->sectors_per_block; @@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, } } +static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_origin, dm_block_t data_dest, + struct cell *cell, struct bio *bio) +{ + schedule_copy(tc, virt_block, tc->pool_dev, + data_origin, data_dest, cell, bio); +} + +static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_dest, + struct cell *cell, struct bio *bio) +{ + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio); +} + static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, dm_block_t data_block, struct cell *cell, struct bio *bio) @@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, struct new_mapping *m = get_next_mapping(pool); INIT_LIST_HEAD(&m->list); + m->quiesced = 1; m->prepared = 0; m->tc = tc; m->virt_block = virt_block; @@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, * zeroing pre-existing data, we can issue the bio immediately. * Otherwise we use kcopyd to zero the data first. */ - if (!pool->zero_new_blocks) + if (!pool->pf.zero_new_blocks) process_prepared_mapping(m); else if (io_overwrites_block(pool, bio)) { + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + h->overwrite_mapping = m; m->bio = bio; save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - dm_get_mapinfo(bio)->ptr = m; remap_and_issue(tc, bio, data_block); } else { @@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) */ static void retry_on_resume(struct bio *bio) { - struct thin_c *tc = dm_get_mapinfo(bio)->ptr; + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct thin_c *tc = h->tc; struct pool *pool = tc->pool; unsigned long flags; @@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell) retry_on_resume(bio); } +static void process_discard(struct thin_c *tc, struct bio *bio) +{ + int r; + struct pool *pool = tc->pool; + struct cell *cell, *cell2; + struct cell_key key, key2; + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_lookup_result lookup_result; + struct new_mapping *m; + + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool->prison, &key, bio, &cell)) + return; + + r = dm_thin_find_block(tc->td, block, 1, &lookup_result); + switch (r) { + case 0: + /* + * Check nobody is fiddling with this pool block. This can + * happen if someone's in the process of breaking sharing + * on this block. + */ + build_data_key(tc->td, lookup_result.block, &key2); + if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { + cell_release_singleton(cell, bio); + break; + } + + if (io_overlaps_block(pool, bio)) { + /* + * IO may still be going to the destination block. We must + * quiesce before we can do the removal. + */ + m = get_next_mapping(pool); + m->tc = tc; + m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; + m->virt_block = block; + m->data_block = lookup_result.block; + m->cell = cell; + m->cell2 = cell2; + m->err = 0; + m->bio = bio; + + if (!ds_add_work(&pool->all_io_ds, &m->list)) { + list_add(&m->list, &pool->prepared_discards); + wake_worker(pool); + } + } else { + /* + * This path is hit if people are ignoring + * limits->discard_granularity. It ignores any + * part of the discard that is in a subsequent + * block. + */ + sector_t offset = bio->bi_sector - (block << pool->block_shift); + unsigned remaining = (pool->sectors_per_block - offset) << 9; + bio->bi_size = min(bio->bi_size, remaining); + + cell_release_singleton(cell, bio); + cell_release_singleton(cell2, bio); + remap_and_issue(tc, bio, lookup_result.block); + } + break; + + case -ENODATA: + /* + * It isn't provisioned, just forget it. + */ + cell_release_singleton(cell, bio); + bio_endio(bio, 0); + break; + + default: + DMERR("discard: find block unexpectedly returned %d", r); + cell_release_singleton(cell, bio); + bio_io_error(bio); + break; + } +} + static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct cell_key *key, struct dm_thin_lookup_result *lookup_result, @@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, r = alloc_data_block(tc, &data_block); switch (r) { case 0: - schedule_copy(tc, block, lookup_result->block, - data_block, cell, bio); + schedule_internal_copy(tc, block, lookup_result->block, + data_block, cell, bio); break; case -ENOSPC: @@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, if (bio_data_dir(bio) == WRITE) break_sharing(tc, bio, block, &key, lookup_result, cell); else { - struct endio_hook *h; - h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - h->tc = tc; - h->entry = ds_inc(&pool->ds); - save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio); - dm_get_mapinfo(bio)->ptr = h; + h->shared_read_entry = ds_inc(&pool->shared_read_ds); cell_release_singleton(cell, bio); remap_and_issue(tc, bio, lookup_result->block); @@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block r = alloc_data_block(tc, &data_block); switch (r) { case 0: - schedule_zero(tc, block, data_block, cell, bio); + if (tc->origin_dev) + schedule_external_copy(tc, block, data_block, cell, bio); + else + schedule_zero(tc, block, data_block, cell, bio); break; case -ENOSPC: @@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio) break; case -ENODATA: - provision_block(tc, bio, block, cell); + if (bio_data_dir(bio) == READ && tc->origin_dev) { + cell_release_singleton(cell, bio); + remap_to_origin_and_issue(tc, bio); + } else + provision_block(tc, bio, block, cell); break; default: DMERR("dm_thin_find_block() failed, error = %d", r); + cell_release_singleton(cell, bio); bio_io_error(bio); break; } } +static int need_commit_due_to_time(struct pool *pool) +{ + return jiffies < pool->last_commit_jiffies || + jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; +} + static void process_deferred_bios(struct pool *pool) { unsigned long flags; @@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool) spin_unlock_irqrestore(&pool->lock, flags); while ((bio = bio_list_pop(&bios))) { - struct thin_c *tc = dm_get_mapinfo(bio)->ptr; + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct thin_c *tc = h->tc; + /* * If we've got no free new_mapping structs, and processing * this bio might require one, we pause until there are some @@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool) break; } - process_bio(tc, bio); + + if (bio->bi_rw & REQ_DISCARD) + process_discard(tc, bio); + else + process_bio(tc, bio); } /* @@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool) bio_list_init(&pool->deferred_flush_bios); spin_unlock_irqrestore(&pool->lock, flags); - if (bio_list_empty(&bios)) + if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) return; r = dm_pool_commit_metadata(pool->pmd); @@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool) bio_io_error(bio); return; } + pool->last_commit_jiffies = jiffies; while ((bio = bio_list_pop(&bios))) generic_make_request(bio); @@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws) { struct pool *pool = container_of(ws, struct pool, worker); - process_prepared_mappings(pool); + process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); + process_prepared(pool, &pool->prepared_discards, process_prepared_discard); process_deferred_bios(pool); } +/* + * We want to commit periodically so that not too much + * unwritten data builds up. + */ +static void do_waker(struct work_struct *ws) +{ + struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); + wake_worker(pool); + queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); +} + /*----------------------------------------------------------------*/ /* @@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) wake_worker(pool); } +static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); + + h->tc = tc; + h->shared_read_entry = NULL; + h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); + h->overwrite_mapping = NULL; + + return h; +} + /* * Non-blocking function called from the thin target's map function. */ @@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio, struct dm_thin_device *td = tc->td; struct dm_thin_lookup_result result; - /* - * Save the thin context for easy access from the deferred bio later. - */ - map_context->ptr = tc; - - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { + map_context->ptr = thin_hook_bio(tc, bio); + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { thin_defer_bio(tc, bio); return DM_MAPIO_SUBMITTED; } @@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) pool->ti = ti; pool->low_water_blocks = pt->low_water_blocks; - pool->zero_new_blocks = pt->zero_new_blocks; + pool->pf = pt->pf; return 0; } @@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti) /*---------------------------------------------------------------- * Pool creation *--------------------------------------------------------------*/ +/* Initialize pool features. */ +static void pool_features_init(struct pool_features *pf) +{ + pf->zero_new_blocks = 1; + pf->discard_enabled = 1; + pf->discard_passdown = 1; +} + static void __pool_destroy(struct pool *pool) { __pool_table_remove(pool); @@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->block_shift = ffs(block_size) - 1; pool->offset_mask = block_size - 1; pool->low_water_blocks = 0; - pool->zero_new_blocks = 1; + pool_features_init(&pool->pf); pool->prison = prison_create(PRISON_CELLS); if (!pool->prison) { *error = "Error creating pool's bio prison"; @@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md, } INIT_WORK(&pool->worker, do_worker); + INIT_DELAYED_WORK(&pool->waker, do_waker); spin_lock_init(&pool->lock); bio_list_init(&pool->deferred_bios); bio_list_init(&pool->deferred_flush_bios); INIT_LIST_HEAD(&pool->prepared_mappings); + INIT_LIST_HEAD(&pool->prepared_discards); pool->low_water_triggered = 0; pool->no_free_space = 0; bio_list_init(&pool->retry_on_resume_list); - ds_init(&pool->ds); + ds_init(&pool->shared_read_ds); + ds_init(&pool->all_io_ds); pool->next_mapping = NULL; pool->mapping_pool = @@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, goto bad_endio_hook_pool; } pool->ref_count = 1; + pool->last_commit_jiffies = jiffies; pool->pool_md = pool_md; pool->md_dev = metadata_dev; __pool_table_insert(pool); @@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool) static struct pool *__pool_find(struct mapped_device *pool_md, struct block_device *metadata_dev, - unsigned long block_size, char **error) + unsigned long block_size, char **error, + int *created) { struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); @@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md, return ERR_PTR(-EINVAL); __pool_inc(pool); - } else + } else { pool = pool_create(pool_md, metadata_dev, block_size, error); + *created = 1; + } } return pool; @@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti) mutex_unlock(&dm_thin_pool_table.mutex); } -struct pool_features { - unsigned zero_new_blocks:1; -}; - static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, struct dm_target *ti) { @@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, const char *arg_name; static struct dm_arg _args[] = { - {0, 1, "Invalid number of pool feature arguments"}, + {0, 3, "Invalid number of pool feature arguments"}, }; /* @@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, if (!strcasecmp(arg_name, "skip_block_zeroing")) { pf->zero_new_blocks = 0; continue; + } else if (!strcasecmp(arg_name, "ignore_discard")) { + pf->discard_enabled = 0; + continue; + } else if (!strcasecmp(arg_name, "no_discard_passdown")) { + pf->discard_passdown = 0; + continue; } ti->error = "Unrecognised pool feature requested"; @@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, * * Optional feature arguments are: * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. + * ignore_discard: disable discard + * no_discard_passdown: don't pass discards down to the data device */ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) { - int r; + int r, pool_created = 0; struct pool_c *pt; struct pool *pool; struct pool_features pf; @@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) dm_block_t low_water_blocks; struct dm_dev *metadata_dev; sector_t metadata_dev_size; + char b[BDEVNAME_SIZE]; /* * FIXME Remove validation from scope of lock. @@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) } metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; - if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { - ti->error = "Metadata device is too large"; - r = -EINVAL; - goto out_metadata; - } + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) + DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", + bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); if (r) { @@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) /* * Set default pool features. */ - memset(&pf, 0, sizeof(pf)); - pf.zero_new_blocks = 1; + pool_features_init(&pf); dm_consume_args(&as, 4); r = parse_pool_features(&as, &pf, ti); @@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) } pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, - block_size, &ti->error); + block_size, &ti->error, &pool_created); if (IS_ERR(pool)) { r = PTR_ERR(pool); goto out_free_pt; } + /* + * 'pool_created' reflects whether this is the first table load. + * Top level discard support is not allowed to be changed after + * initial load. This would require a pool reload to trigger thin + * device changes. + */ + if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { + ti->error = "Discard support cannot be disabled once enabled"; + r = -EINVAL; + goto out_flags_changed; + } + + /* + * If discard_passdown was enabled verify that the data device + * supports discards. Disable discard_passdown if not; otherwise + * -EOPNOTSUPP will be returned. + */ + if (pf.discard_passdown) { + struct request_queue *q = bdev_get_queue(data_dev->bdev); + if (!q || !blk_queue_discard(q)) { + DMWARN("Discard unsupported by data device: Disabling discard passdown."); + pf.discard_passdown = 0; + } + } + pt->pool = pool; pt->ti = ti; pt->metadata_dev = metadata_dev; pt->data_dev = data_dev; pt->low_water_blocks = low_water_blocks; - pt->zero_new_blocks = pf.zero_new_blocks; + pt->pf = pf; ti->num_flush_requests = 1; - ti->num_discard_requests = 0; + /* + * Only need to enable discards if the pool should pass + * them down to the data device. The thin device's discard + * processing will cause mappings to be removed from the btree. + */ + if (pf.discard_enabled && pf.discard_passdown) { + ti->num_discard_requests = 1; + /* + * Setting 'discards_supported' circumvents the normal + * stacking of discard limits (this keeps the pool and + * thin devices' discard limits consistent). + */ + ti->discards_supported = 1; + } ti->private = pt; pt->callbacks.congested_fn = pool_is_congested; @@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) return 0; +out_flags_changed: + __pool_dec(pool); out_free_pt: kfree(pt); out: @@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti) __requeue_bios(pool); spin_unlock_irqrestore(&pool->lock, flags); - wake_worker(pool); + do_waker(&pool->waker.work); } static void pool_postsuspend(struct dm_target *ti) @@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti) struct pool_c *pt = ti->private; struct pool *pool = pt->pool; + cancel_delayed_work(&pool->waker); flush_workqueue(pool->wq); r = dm_pool_commit_metadata(pool->pmd); @@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) static int pool_status(struct dm_target *ti, status_type_t type, char *result, unsigned maxlen) { - int r; + int r, count; unsigned sz = 0; uint64_t transaction_id; dm_block_t nr_free_blocks_data; @@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type, (unsigned long)pool->sectors_per_block, (unsigned long long)pt->low_water_blocks); - DMEMIT("%u ", !pool->zero_new_blocks); + count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + + !pool->pf.discard_passdown; + DMEMIT("%u ", count); - if (!pool->zero_new_blocks) + if (!pool->pf.zero_new_blocks) DMEMIT("skip_block_zeroing "); + + if (!pool->pf.discard_enabled) + DMEMIT("ignore_discard "); + + if (!pool->pf.discard_passdown) + DMEMIT("no_discard_passdown "); + break; } @@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static void set_discard_limits(struct pool *pool, struct queue_limits *limits) +{ + /* + * FIXME: these limits may be incompatible with the pool's data device + */ + limits->max_discard_sectors = pool->sectors_per_block; + + /* + * This is just a hint, and not enforced. We have to cope with + * bios that overlap 2 blocks. + */ + limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; + limits->discard_zeroes_data = pool->pf.zero_new_blocks; +} + static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct pool_c *pt = ti->private; @@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, 0); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); + if (pool->pf.discard_enabled) + set_discard_limits(pool, limits); } static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti) __pool_dec(tc->pool); dm_pool_close_thin_device(tc->td); dm_put_device(ti, tc->pool_dev); + if (tc->origin_dev) + dm_put_device(ti, tc->origin_dev); kfree(tc); mutex_unlock(&dm_thin_pool_table.mutex); @@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti) /* * Thin target parameters: * - * <pool_dev> <dev_id> + * <pool_dev> <dev_id> [origin_dev] * * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) * dev_id: the internal device identifier + * origin_dev: a device external to the pool that should act as the origin + * + * If the pool device has discards disabled, they get disabled for the thin + * device as well. */ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) { int r; struct thin_c *tc; - struct dm_dev *pool_dev; + struct dm_dev *pool_dev, *origin_dev; struct mapped_device *pool_md; mutex_lock(&dm_thin_pool_table.mutex); - if (argc != 2) { + if (argc != 2 && argc != 3) { ti->error = "Invalid argument count"; r = -EINVAL; goto out_unlock; @@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out_unlock; } + if (argc == 3) { + r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); + if (r) { + ti->error = "Error opening origin device"; + goto bad_origin_dev; + } + tc->origin_dev = origin_dev; + } + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); if (r) { ti->error = "Error opening pool device"; @@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->split_io = tc->pool->sectors_per_block; ti->num_flush_requests = 1; - ti->num_discard_requests = 0; - ti->discards_supported = 0; + + /* In case the pool supports discards, pass them on. */ + if (tc->pool->pf.discard_enabled) { + ti->discards_supported = 1; + ti->num_discard_requests = 1; + } dm_put(pool_md); @@ -2289,6 +2582,9 @@ bad_pool_lookup: bad_common: dm_put_device(ti, tc->pool_dev); bad_pool_dev: + if (tc->origin_dev) + dm_put_device(ti, tc->origin_dev); +bad_origin_dev: kfree(tc); out_unlock: mutex_unlock(&dm_thin_pool_table.mutex); @@ -2299,11 +2595,46 @@ out_unlock: static int thin_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { - bio->bi_sector -= ti->begin; + bio->bi_sector = dm_target_offset(ti, bio->bi_sector); return thin_bio_map(ti, bio, map_context); } +static int thin_endio(struct dm_target *ti, + struct bio *bio, int err, + union map_info *map_context) +{ + unsigned long flags; + struct endio_hook *h = map_context->ptr; + struct list_head work; + struct new_mapping *m, *tmp; + struct pool *pool = h->tc->pool; + + if (h->shared_read_entry) { + INIT_LIST_HEAD(&work); + ds_dec(h->shared_read_entry, &work); + + spin_lock_irqsave(&pool->lock, flags); + list_for_each_entry_safe(m, tmp, &work, list) { + list_del(&m->list); + m->quiesced = 1; + __maybe_add_mapping(m); + } + spin_unlock_irqrestore(&pool->lock, flags); + } + + if (h->all_io_entry) { + INIT_LIST_HEAD(&work); + ds_dec(h->all_io_entry, &work); + list_for_each_entry_safe(m, tmp, &work, list) + list_add(&m->list, &pool->prepared_discards); + } + + mempool_free(h, pool->endio_hook_pool); + + return 0; +} + static void thin_postsuspend(struct dm_target *ti) { if (dm_noflush_suspending(ti)) @@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type, DMEMIT("%s %lu", format_dev_t(buf, tc->pool_dev->bdev->bd_dev), (unsigned long) tc->dev_id); + if (tc->origin_dev) + DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); break; } } @@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti, static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct thin_c *tc = ti->private; + struct pool *pool = tc->pool; blk_limits_io_min(limits, 0); - blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); + blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); + set_discard_limits(pool, limits); } static struct target_type thin_target = { .name = "thin", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, .map = thin_map, + .end_io = thin_endio, .postsuspend = thin_postsuspend, .status = thin_status, .iterate_devices = thin_iterate_devices, |