Diffstat (limited to 'drivers/md')
-rw-r--r--   drivers/md/Kconfig                                 |   8
-rw-r--r--   drivers/md/Makefile                                |   1
-rw-r--r--   drivers/md/bitmap.c                                |  17
-rw-r--r--   drivers/md/dm-bio-prison.c                         | 415
-rw-r--r--   drivers/md/dm-bio-prison.h                         |  72
-rw-r--r--   drivers/md/dm-bufio.c                              |  13
-rw-r--r--   drivers/md/dm-crypt.c                              |  16
-rw-r--r--   drivers/md/dm-io.c                                 |  11
-rw-r--r--   drivers/md/dm-mpath.c                              |   3
-rw-r--r--   drivers/md/dm-raid.c                               | 124
-rw-r--r--   drivers/md/dm-thin.c                               | 521
-rw-r--r--   drivers/md/dm-verity.c                             |   2
-rw-r--r--   drivers/md/dm.c                                    | 178
-rw-r--r--   drivers/md/linear.c                                |  25
-rw-r--r--   drivers/md/md.c                                    | 189
-rw-r--r--   drivers/md/md.h                                    |   9
-rw-r--r--   drivers/md/multipath.c                             |   3
-rw-r--r--   drivers/md/persistent-data/dm-space-map-common.c   |   4
-rw-r--r--   drivers/md/raid0.c                                 |  20
-rw-r--r--   drivers/md/raid1.c                                 |  37
-rw-r--r--   drivers/md/raid10.c                                |  95
-rw-r--r--   drivers/md/raid5.c                                 | 219
-rw-r--r--   drivers/md/raid5.h                                 |   1
23 files changed, 1228 insertions, 755 deletions
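
The largest change below factors dm-thin's bio prison and deferred set out into a standalone dm-bio-prison module with an exported API. As a rough orientation only, here is a minimal caller sketch based on the declarations added in dm-bio-prison.h; it is not code from this series, and the function names and key values used are illustrative.

/*
 * Illustrative only: how a hypothetical target might serialise I/O to a
 * data block using the API exported by dm-bio-prison.
 */
#include "dm-bio-prison.h"

static int example_detain_bio(struct dm_bio_prison *prison,
			      dm_thin_id dev, dm_block_t block,
			      struct bio *bio,
			      struct dm_bio_prison_cell **cell)
{
	struct dm_cell_key key = {
		.virtual = 0,		/* key a physical data block */
		.dev = dev,
		.block = block,
	};

	/*
	 * Returns 1 if the cell was already held (the bio is queued behind
	 * the holder), 0 if this bio becomes the new holder.
	 */
	return dm_bio_detain(prison, &key, bio, cell);
}

static void example_unlock(struct dm_bio_prison_cell *cell,
			   struct bio_list *deferred)
{
	/* The holder and any queued inmates are moved onto @deferred. */
	dm_cell_release(cell, deferred);
}
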
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index d949b781f6f..91a02eeeb31 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -216,6 +216,13 @@ config DM_BUFIO as a cache, holding recently-read blocks in memory and performing delayed writes. +config DM_BIO_PRISON + tristate + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + Some bio locking schemes used by other device-mapper targets + including thin provisioning. + source "drivers/md/persistent-data/Kconfig" config DM_CRYPT @@ -247,6 +254,7 @@ config DM_THIN_PROVISIONING tristate "Thin provisioning target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL select DM_PERSISTENT_DATA + select DM_BIO_PRISON ---help--- Provides thin provisioning and snapshots that share a data store. diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 8b2e0dffe82..94dce8b4932 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o +obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 94e7f6ba2e1..7155945f8eb 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -163,20 +163,17 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde * As devices are only added or removed when raid_disk is < 0 and * nr_pending is 0 and In_sync is clear, the entries we return will * still be in the same position on the list when we re-enter - * list_for_each_continue_rcu. + * list_for_each_entry_continue_rcu. */ - struct list_head *pos; rcu_read_lock(); if (rdev == NULL) /* start at the beginning */ - pos = &mddev->disks; + rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set); else { /* release the previous rdev and start from there. */ rdev_dec_pending(rdev, mddev); - pos = &rdev->same_set; } - list_for_each_continue_rcu(pos, &mddev->disks) { - rdev = list_entry(pos, struct md_rdev, same_set); + list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */ @@ -473,14 +470,10 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) { bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; - int err = -EINVAL; bitmap->storage.sb_page = alloc_page(GFP_KERNEL); - if (IS_ERR(bitmap->storage.sb_page)) { - err = PTR_ERR(bitmap->storage.sb_page); - bitmap->storage.sb_page = NULL; - return err; - } + if (bitmap->storage.sb_page == NULL) + return -ENOMEM; bitmap->storage.sb_page->index = 0; sb = kmap_atomic(bitmap->storage.sb_page); diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c new file mode 100644 index 00000000000..e4e84156745 --- /dev/null +++ b/drivers/md/dm-bio-prison.c @@ -0,0 +1,415 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#include "dm.h" +#include "dm-bio-prison.h" + +#include <linux/spinlock.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/slab.h> + +/*----------------------------------------------------------------*/ + +struct dm_bio_prison_cell { + struct hlist_node list; + struct dm_bio_prison *prison; + struct dm_cell_key key; + struct bio *holder; + struct bio_list bios; +}; + +struct dm_bio_prison { + spinlock_t lock; + mempool_t *cell_pool; + + unsigned nr_buckets; + unsigned hash_mask; + struct hlist_head *cells; +}; + +/*----------------------------------------------------------------*/ + +static uint32_t calc_nr_buckets(unsigned nr_cells) +{ + uint32_t n = 128; + + nr_cells /= 4; + nr_cells = min(nr_cells, 8192u); + + while (n < nr_cells) + n <<= 1; + + return n; +} + +static struct kmem_cache *_cell_cache; + +/* + * @nr_cells should be the number of cells you want in use _concurrently_. + * Don't confuse it with the number of distinct keys. + */ +struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) +{ + unsigned i; + uint32_t nr_buckets = calc_nr_buckets(nr_cells); + size_t len = sizeof(struct dm_bio_prison) + + (sizeof(struct hlist_head) * nr_buckets); + struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL); + + if (!prison) + return NULL; + + spin_lock_init(&prison->lock); + prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); + if (!prison->cell_pool) { + kfree(prison); + return NULL; + } + + prison->nr_buckets = nr_buckets; + prison->hash_mask = nr_buckets - 1; + prison->cells = (struct hlist_head *) (prison + 1); + for (i = 0; i < nr_buckets; i++) + INIT_HLIST_HEAD(prison->cells + i); + + return prison; +} +EXPORT_SYMBOL_GPL(dm_bio_prison_create); + +void dm_bio_prison_destroy(struct dm_bio_prison *prison) +{ + mempool_destroy(prison->cell_pool); + kfree(prison); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_destroy); + +static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key) +{ + const unsigned long BIG_PRIME = 4294967291UL; + uint64_t hash = key->block * BIG_PRIME; + + return (uint32_t) (hash & prison->hash_mask); +} + +static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs) +{ + return (lhs->virtual == rhs->virtual) && + (lhs->dev == rhs->dev) && + (lhs->block == rhs->block); +} + +static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket, + struct dm_cell_key *key) +{ + struct dm_bio_prison_cell *cell; + struct hlist_node *tmp; + + hlist_for_each_entry(cell, tmp, bucket, list) + if (keys_equal(&cell->key, key)) + return cell; + + return NULL; +} + +/* + * This may block if a new cell needs allocating. You must ensure that + * cells will be unlocked even if the calling thread is blocked. + * + * Returns 1 if the cell was already held, 0 if @inmate is the new holder. 
+ */ +int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key, + struct bio *inmate, struct dm_bio_prison_cell **ref) +{ + int r = 1; + unsigned long flags; + uint32_t hash = hash_key(prison, key); + struct dm_bio_prison_cell *cell, *cell2; + + BUG_ON(hash > prison->nr_buckets); + + spin_lock_irqsave(&prison->lock, flags); + + cell = __search_bucket(prison->cells + hash, key); + if (cell) { + bio_list_add(&cell->bios, inmate); + goto out; + } + + /* + * Allocate a new cell + */ + spin_unlock_irqrestore(&prison->lock, flags); + cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); + spin_lock_irqsave(&prison->lock, flags); + + /* + * We've been unlocked, so we have to double check that + * nobody else has inserted this cell in the meantime. + */ + cell = __search_bucket(prison->cells + hash, key); + if (cell) { + mempool_free(cell2, prison->cell_pool); + bio_list_add(&cell->bios, inmate); + goto out; + } + + /* + * Use new cell. + */ + cell = cell2; + + cell->prison = prison; + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = inmate; + bio_list_init(&cell->bios); + hlist_add_head(&cell->list, prison->cells + hash); + + r = 0; + +out: + spin_unlock_irqrestore(&prison->lock, flags); + + *ref = cell; + + return r; +} +EXPORT_SYMBOL_GPL(dm_bio_detain); + +/* + * @inmates must have been initialised prior to this call + */ +static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates) +{ + struct dm_bio_prison *prison = cell->prison; + + hlist_del(&cell->list); + + if (inmates) { + bio_list_add(inmates, cell->holder); + bio_list_merge(inmates, &cell->bios); + } + + mempool_free(cell, prison->cell_pool); +} + +void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios) +{ + unsigned long flags; + struct dm_bio_prison *prison = cell->prison; + + spin_lock_irqsave(&prison->lock, flags); + __cell_release(cell, bios); + spin_unlock_irqrestore(&prison->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_release); + +/* + * There are a couple of places where we put a bio into a cell briefly + * before taking it out again. In these situations we know that no other + * bio may be in the cell. This function releases the cell, and also does + * a sanity check. + */ +static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) +{ + BUG_ON(cell->holder != bio); + BUG_ON(!bio_list_empty(&cell->bios)); + + __cell_release(cell, NULL); +} + +void dm_cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) +{ + unsigned long flags; + struct dm_bio_prison *prison = cell->prison; + + spin_lock_irqsave(&prison->lock, flags); + __cell_release_singleton(cell, bio); + spin_unlock_irqrestore(&prison->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_release_singleton); + +/* + * Sometimes we don't want the holder, just the additional bios. 
+ */ +static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates) +{ + struct dm_bio_prison *prison = cell->prison; + + hlist_del(&cell->list); + bio_list_merge(inmates, &cell->bios); + + mempool_free(cell, prison->cell_pool); +} + +void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates) +{ + unsigned long flags; + struct dm_bio_prison *prison = cell->prison; + + spin_lock_irqsave(&prison->lock, flags); + __cell_release_no_holder(cell, inmates); + spin_unlock_irqrestore(&prison->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); + +void dm_cell_error(struct dm_bio_prison_cell *cell) +{ + struct dm_bio_prison *prison = cell->prison; + struct bio_list bios; + struct bio *bio; + unsigned long flags; + + bio_list_init(&bios); + + spin_lock_irqsave(&prison->lock, flags); + __cell_release(cell, &bios); + spin_unlock_irqrestore(&prison->lock, flags); + + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); +} +EXPORT_SYMBOL_GPL(dm_cell_error); + +/*----------------------------------------------------------------*/ + +#define DEFERRED_SET_SIZE 64 + +struct dm_deferred_entry { + struct dm_deferred_set *ds; + unsigned count; + struct list_head work_items; +}; + +struct dm_deferred_set { + spinlock_t lock; + unsigned current_entry; + unsigned sweeper; + struct dm_deferred_entry entries[DEFERRED_SET_SIZE]; +}; + +struct dm_deferred_set *dm_deferred_set_create(void) +{ + int i; + struct dm_deferred_set *ds; + + ds = kmalloc(sizeof(*ds), GFP_KERNEL); + if (!ds) + return NULL; + + spin_lock_init(&ds->lock); + ds->current_entry = 0; + ds->sweeper = 0; + for (i = 0; i < DEFERRED_SET_SIZE; i++) { + ds->entries[i].ds = ds; + ds->entries[i].count = 0; + INIT_LIST_HEAD(&ds->entries[i].work_items); + } + + return ds; +} +EXPORT_SYMBOL_GPL(dm_deferred_set_create); + +void dm_deferred_set_destroy(struct dm_deferred_set *ds) +{ + kfree(ds); +} +EXPORT_SYMBOL_GPL(dm_deferred_set_destroy); + +struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds) +{ + unsigned long flags; + struct dm_deferred_entry *entry; + + spin_lock_irqsave(&ds->lock, flags); + entry = ds->entries + ds->current_entry; + entry->count++; + spin_unlock_irqrestore(&ds->lock, flags); + + return entry; +} +EXPORT_SYMBOL_GPL(dm_deferred_entry_inc); + +static unsigned ds_next(unsigned index) +{ + return (index + 1) % DEFERRED_SET_SIZE; +} + +static void __sweep(struct dm_deferred_set *ds, struct list_head *head) +{ + while ((ds->sweeper != ds->current_entry) && + !ds->entries[ds->sweeper].count) { + list_splice_init(&ds->entries[ds->sweeper].work_items, head); + ds->sweeper = ds_next(ds->sweeper); + } + + if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) + list_splice_init(&ds->entries[ds->sweeper].work_items, head); +} + +void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head) +{ + unsigned long flags; + + spin_lock_irqsave(&entry->ds->lock, flags); + BUG_ON(!entry->count); + --entry->count; + __sweep(entry->ds, head); + spin_unlock_irqrestore(&entry->ds->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_deferred_entry_dec); + +/* + * Returns 1 if deferred or 0 if no pending items to delay job. 
+ */ +int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) +{ + int r = 1; + unsigned long flags; + unsigned next_entry; + + spin_lock_irqsave(&ds->lock, flags); + if ((ds->sweeper == ds->current_entry) && + !ds->entries[ds->current_entry].count) + r = 0; + else { + list_add(work, &ds->entries[ds->current_entry].work_items); + next_entry = ds_next(ds->current_entry); + if (!ds->entries[next_entry].count) + ds->current_entry = next_entry; + } + spin_unlock_irqrestore(&ds->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_deferred_set_add_work); + +/*----------------------------------------------------------------*/ + +static int __init dm_bio_prison_init(void) +{ + _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); + if (!_cell_cache) + return -ENOMEM; + + return 0; +} + +static void __exit dm_bio_prison_exit(void) +{ + kmem_cache_destroy(_cell_cache); + _cell_cache = NULL; +} + +/* + * module hooks + */ +module_init(dm_bio_prison_init); +module_exit(dm_bio_prison_exit); + +MODULE_DESCRIPTION(DM_NAME " bio prison"); +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h new file mode 100644 index 00000000000..4e0ac376700 --- /dev/null +++ b/drivers/md/dm-bio-prison.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2011-2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_BIO_PRISON_H +#define DM_BIO_PRISON_H + +#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ +#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ + +#include <linux/list.h> +#include <linux/bio.h> + +/*----------------------------------------------------------------*/ + +/* + * Sometimes we can't deal with a bio straight away. We put them in prison + * where they can't cause any mischief. Bios are put in a cell identified + * by a key, multiple bios can be in the same cell. When the cell is + * subsequently unlocked the bios become available. + */ +struct dm_bio_prison; +struct dm_bio_prison_cell; + +/* FIXME: this needs to be more abstract */ +struct dm_cell_key { + int virtual; + dm_thin_id dev; + dm_block_t block; +}; + +struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells); +void dm_bio_prison_destroy(struct dm_bio_prison *prison); + +/* + * This may block if a new cell needs allocating. You must ensure that + * cells will be unlocked even if the calling thread is blocked. + * + * Returns 1 if the cell was already held, 0 if @inmate is the new holder. + */ +int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key, + struct bio *inmate, struct dm_bio_prison_cell **ref); + +void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios); +void dm_cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio); // FIXME: bio arg not needed +void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates); +void dm_cell_error(struct dm_bio_prison_cell *cell); + +/*----------------------------------------------------------------*/ + +/* + * We use the deferred set to keep track of pending reads to shared blocks. + * We do this to ensure the new mapping caused by a write isn't performed + * until these prior reads have completed. Otherwise the insertion of the + * new mapping could free the old block that the read bios are mapped to. 
+ */ + +struct dm_deferred_set; +struct dm_deferred_entry; + +struct dm_deferred_set *dm_deferred_set_create(void); +void dm_deferred_set_destroy(struct dm_deferred_set *ds); + +struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds); +void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head); +int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index cc06a1e5242..651ca79881d 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -280,9 +280,7 @@ static void __cache_size_refresh(void) BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); BUG_ON(dm_bufio_client_count < 0); - dm_bufio_cache_size_latch = dm_bufio_cache_size; - - barrier(); + dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size); /* * Use default if set to 0 and report the actual cache size used. @@ -441,8 +439,7 @@ static void __relink_lru(struct dm_buffer *b, int dirty) c->n_buffers[b->list_mode]--; c->n_buffers[dirty]++; b->list_mode = dirty; - list_del(&b->lru_list); - list_add(&b->lru_list, &c->lru[dirty]); + list_move(&b->lru_list, &c->lru[dirty]); } /*---------------------------------------------------------------- @@ -813,7 +810,7 @@ static void __get_memory_limit(struct dm_bufio_client *c, { unsigned long buffers; - if (dm_bufio_cache_size != dm_bufio_cache_size_latch) { + if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) { mutex_lock(&dm_bufio_clients_lock); __cache_size_refresh(); mutex_unlock(&dm_bufio_clients_lock); @@ -1591,11 +1588,9 @@ EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); static void cleanup_old_buffers(void) { - unsigned long max_age = dm_bufio_max_age; + unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age); struct dm_bufio_client *c; - barrier(); - if (max_age > ULONG_MAX / HZ) max_age = ULONG_MAX / HZ; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 664743d6a6c..bbf459bca61 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -798,14 +798,6 @@ static int crypt_convert(struct crypt_config *cc, return 0; } -static void dm_crypt_bio_destructor(struct bio *bio) -{ - struct dm_crypt_io *io = bio->bi_private; - struct crypt_config *cc = io->cc; - - bio_free(bio, cc->bs); -} - /* * Generate a new unfragmented bio with the given size * This should never violate the device limitations @@ -974,7 +966,6 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; clone->bi_rw = io->base_bio->bi_rw; - clone->bi_destructor = dm_crypt_bio_destructor; } static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) @@ -988,19 +979,14 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) * copy the required bvecs because we need the original * one in order to decrypt the whole bio data *afterwards*. 
*/ - clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); + clone = bio_clone_bioset(base_bio, gfp, cc->bs); if (!clone) return 1; crypt_inc_pending(io); clone_init(io, clone); - clone->bi_idx = 0; - clone->bi_vcnt = bio_segments(base_bio); - clone->bi_size = base_bio->bi_size; clone->bi_sector = cc->start + io->sector; - memcpy(clone->bi_io_vec, bio_iovec(base_bio), - sizeof(struct bio_vec) * clone->bi_vcnt); generic_make_request(clone); return 0; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ea5dd289fe2..1c46f97d666 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -249,16 +249,6 @@ static void vm_dp_init(struct dpages *dp, void *data) dp->context_ptr = data; } -static void dm_bio_destructor(struct bio *bio) -{ - unsigned region; - struct io *io; - - retrieve_io_and_region_from_bio(bio, &io, ®ion); - - bio_free(bio, io->client->bios); -} - /* * Functions for getting the pages from kernel memory. */ @@ -317,7 +307,6 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, bio->bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; bio->bi_end_io = endio; - bio->bi_destructor = dm_bio_destructor; store_io_and_region_in_bio(bio, io, region); if (rw & REQ_DISCARD) { diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d778563a4ff..573bd04591b 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1309,13 +1309,14 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, { struct multipath *m = ti->private; struct dm_mpath_io *mpio = map_context->ptr; - struct pgpath *pgpath = mpio->pgpath; + struct pgpath *pgpath; struct path_selector *ps; int r; BUG_ON(!mpio); r = do_end_io(m, clone, error, mpio); + pgpath = mpio->pgpath; if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 982e3e390c4..45d94a7e7f6 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -338,6 +338,84 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) } /* + * validate_rebuild_devices + * @rs + * + * Determine if the devices specified for rebuild can result in a valid + * usable array that is capable of rebuilding the given devices. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_rebuild_devices(struct raid_set *rs) +{ + unsigned i, rebuild_cnt = 0; + unsigned rebuilds_per_group, copies, d; + + if (!(rs->print_flags & DMPF_REBUILD)) + return 0; + + for (i = 0; i < rs->md.raid_disks; i++) + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild_cnt++; + + switch (rs->raid_type->level) { + case 1: + if (rebuild_cnt >= rs->md.raid_disks) + goto too_many; + break; + case 4: + case 5: + case 6: + if (rebuild_cnt > rs->raid_type->parity_devs) + goto too_many; + break; + case 10: + copies = raid10_md_layout_to_copies(rs->md.layout); + if (rebuild_cnt < copies) + break; + + /* + * It is possible to have a higher rebuild count for RAID10, + * as long as the failed devices occur in different mirror + * groups (i.e. different stripes). + * + * Right now, we only allow for "near" copies. When other + * formats are added, we will have to check those too. + * + * When checking "near" format, make sure no adjacent devices + * have failed beyond what can be handled. In addition to the + * simple case where the number of devices is a multiple of the + * number of copies, we must also handle cases where the number + * of devices is not a multiple of the number of copies. + * E.g. 
dev1 dev2 dev3 dev4 dev5 + * A A B B C + * C D D E E + */ + rebuilds_per_group = 0; + for (i = 0; i < rs->md.raid_disks * copies; i++) { + d = i % rs->md.raid_disks; + if (!test_bit(In_sync, &rs->dev[d].rdev.flags) && + (++rebuilds_per_group >= copies)) + goto too_many; + if (!((i + 1) % copies)) + rebuilds_per_group = 0; + } + break; + default: + DMERR("The rebuild parameter is not supported for %s", + rs->raid_type->name); + rs->ti->error = "Rebuild not supported for this RAID type"; + return -EINVAL; + } + + return 0; + +too_many: + rs->ti->error = "Too many rebuild devices specified"; + return -EINVAL; +} + +/* * Possible arguments are... * <chunk_size> [optional_args] * @@ -365,7 +443,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, { char *raid10_format = "near"; unsigned raid10_copies = 2; - unsigned i, rebuild_cnt = 0; + unsigned i; unsigned long value, region_size = 0; sector_t sectors_per_dev = rs->ti->len; sector_t max_io_len; @@ -461,31 +539,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, /* Parameters that take a numeric value are checked here */ if (!strcasecmp(key, "rebuild")) { - rebuild_cnt++; - - switch (rs->raid_type->level) { - case 1: - if (rebuild_cnt >= rs->md.raid_disks) { - rs->ti->error = "Too many rebuild devices specified"; - return -EINVAL; - } - break; - case 4: - case 5: - case 6: - if (rebuild_cnt > rs->raid_type->parity_devs) { - rs->ti->error = "Too many rebuild devices specified for given RAID type"; - return -EINVAL; - } - break; - case 10: - default: - DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); - rs->ti->error = "Rebuild not supported for this RAID type"; - return -EINVAL; - } - - if (value > rs->md.raid_disks) { + if (value >= rs->md.raid_disks) { rs->ti->error = "Invalid rebuild index given"; return -EINVAL; } @@ -608,6 +662,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } rs->md.dev_sectors = sectors_per_dev; + if (validate_rebuild_devices(rs)) + return -EINVAL; + /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; rs->md.external = 1; @@ -960,6 +1017,19 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) freshest = NULL; rdev_for_each_safe(rdev, tmp, mddev) { + /* + * Skipping super_load due to DMPF_SYNC will cause + * the array to undergo initialization again as + * though it were new. This is the intended effect + * of the "sync" directive. + * + * When reshaping capability is added, we must ensure + * that the "sync" directive is disallowed during the + * reshape. + */ + if (rs->print_flags & DMPF_SYNC) + continue; + if (!rdev->meta_bdev) continue; @@ -1360,7 +1430,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 3, 0}, + .version = {1, 3, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c29410af1e2..058acf3a5ba 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -5,6 +5,7 @@ */ #include "dm-thin-metadata.h" +#include "dm-bio-prison.h" #include "dm.h" #include <linux/device-mapper.h> @@ -21,7 +22,6 @@ * Tunable constants */ #define ENDIO_HOOK_POOL_SIZE 1024 -#define DEFERRED_SET_SIZE 64 #define MAPPING_POOL_SIZE 1024 #define PRISON_CELLS 1024 #define COMMIT_PERIOD HZ @@ -58,7 +58,7 @@ * i) plug io further to this physical block. (see bio_prison code). * * ii) quiesce any read io to that shared data block. 
Obviously - * including all devices that share this block. (see deferred_set code) + * including all devices that share this block. (see dm_deferred_set code) * * iii) copy the data block to a newly allocate block. This step can be * missed out if the io covers the block. (schedule_copy). @@ -99,381 +99,10 @@ /*----------------------------------------------------------------*/ /* - * Sometimes we can't deal with a bio straight away. We put them in prison - * where they can't cause any mischief. Bios are put in a cell identified - * by a key, multiple bios can be in the same cell. When the cell is - * subsequently unlocked the bios become available. - */ -struct bio_prison; - -struct cell_key { - int virtual; - dm_thin_id dev; - dm_block_t block; -}; - -struct dm_bio_prison_cell { - struct hlist_node list; - struct bio_prison *prison; - struct cell_key key; - struct bio *holder; - struct bio_list bios; -}; - -struct bio_prison { - spinlock_t lock; - mempool_t *cell_pool; - - unsigned nr_buckets; - unsigned hash_mask; - struct hlist_head *cells; -}; - -static uint32_t calc_nr_buckets(unsigned nr_cells) -{ - uint32_t n = 128; - - nr_cells /= 4; - nr_cells = min(nr_cells, 8192u); - - while (n < nr_cells) - n <<= 1; - - return n; -} - -static struct kmem_cache *_cell_cache; - -/* - * @nr_cells should be the number of cells you want in use _concurrently_. - * Don't confuse it with the number of distinct keys. - */ -static struct bio_prison *prison_create(unsigned nr_cells) -{ - unsigned i; - uint32_t nr_buckets = calc_nr_buckets(nr_cells); - size_t len = sizeof(struct bio_prison) + - (sizeof(struct hlist_head) * nr_buckets); - struct bio_prison *prison = kmalloc(len, GFP_KERNEL); - - if (!prison) - return NULL; - - spin_lock_init(&prison->lock); - prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); - if (!prison->cell_pool) { - kfree(prison); - return NULL; - } - - prison->nr_buckets = nr_buckets; - prison->hash_mask = nr_buckets - 1; - prison->cells = (struct hlist_head *) (prison + 1); - for (i = 0; i < nr_buckets; i++) - INIT_HLIST_HEAD(prison->cells + i); - - return prison; -} - -static void prison_destroy(struct bio_prison *prison) -{ - mempool_destroy(prison->cell_pool); - kfree(prison); -} - -static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key) -{ - const unsigned long BIG_PRIME = 4294967291UL; - uint64_t hash = key->block * BIG_PRIME; - - return (uint32_t) (hash & prison->hash_mask); -} - -static int keys_equal(struct cell_key *lhs, struct cell_key *rhs) -{ - return (lhs->virtual == rhs->virtual) && - (lhs->dev == rhs->dev) && - (lhs->block == rhs->block); -} - -static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket, - struct cell_key *key) -{ - struct dm_bio_prison_cell *cell; - struct hlist_node *tmp; - - hlist_for_each_entry(cell, tmp, bucket, list) - if (keys_equal(&cell->key, key)) - return cell; - - return NULL; -} - -/* - * This may block if a new cell needs allocating. You must ensure that - * cells will be unlocked even if the calling thread is blocked. - * - * Returns 1 if the cell was already held, 0 if @inmate is the new holder. 
- */ -static int bio_detain(struct bio_prison *prison, struct cell_key *key, - struct bio *inmate, struct dm_bio_prison_cell **ref) -{ - int r = 1; - unsigned long flags; - uint32_t hash = hash_key(prison, key); - struct dm_bio_prison_cell *cell, *cell2; - - BUG_ON(hash > prison->nr_buckets); - - spin_lock_irqsave(&prison->lock, flags); - - cell = __search_bucket(prison->cells + hash, key); - if (cell) { - bio_list_add(&cell->bios, inmate); - goto out; - } - - /* - * Allocate a new cell - */ - spin_unlock_irqrestore(&prison->lock, flags); - cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); - spin_lock_irqsave(&prison->lock, flags); - - /* - * We've been unlocked, so we have to double check that - * nobody else has inserted this cell in the meantime. - */ - cell = __search_bucket(prison->cells + hash, key); - if (cell) { - mempool_free(cell2, prison->cell_pool); - bio_list_add(&cell->bios, inmate); - goto out; - } - - /* - * Use new cell. - */ - cell = cell2; - - cell->prison = prison; - memcpy(&cell->key, key, sizeof(cell->key)); - cell->holder = inmate; - bio_list_init(&cell->bios); - hlist_add_head(&cell->list, prison->cells + hash); - - r = 0; - -out: - spin_unlock_irqrestore(&prison->lock, flags); - - *ref = cell; - - return r; -} - -/* - * @inmates must have been initialised prior to this call - */ -static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates) -{ - struct bio_prison *prison = cell->prison; - - hlist_del(&cell->list); - - if (inmates) { - bio_list_add(inmates, cell->holder); - bio_list_merge(inmates, &cell->bios); - } - - mempool_free(cell, prison->cell_pool); -} - -static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios) -{ - unsigned long flags; - struct bio_prison *prison = cell->prison; - - spin_lock_irqsave(&prison->lock, flags); - __cell_release(cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); -} - -/* - * There are a couple of places where we put a bio into a cell briefly - * before taking it out again. In these situations we know that no other - * bio may be in the cell. This function releases the cell, and also does - * a sanity check. - */ -static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) -{ - BUG_ON(cell->holder != bio); - BUG_ON(!bio_list_empty(&cell->bios)); - - __cell_release(cell, NULL); -} - -static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) -{ - unsigned long flags; - struct bio_prison *prison = cell->prison; - - spin_lock_irqsave(&prison->lock, flags); - __cell_release_singleton(cell, bio); - spin_unlock_irqrestore(&prison->lock, flags); -} - -/* - * Sometimes we don't want the holder, just the additional bios. 
- */ -static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, - struct bio_list *inmates) -{ - struct bio_prison *prison = cell->prison; - - hlist_del(&cell->list); - bio_list_merge(inmates, &cell->bios); - - mempool_free(cell, prison->cell_pool); -} - -static void cell_release_no_holder(struct dm_bio_prison_cell *cell, - struct bio_list *inmates) -{ - unsigned long flags; - struct bio_prison *prison = cell->prison; - - spin_lock_irqsave(&prison->lock, flags); - __cell_release_no_holder(cell, inmates); - spin_unlock_irqrestore(&prison->lock, flags); -} - -static void cell_error(struct dm_bio_prison_cell *cell) -{ - struct bio_prison *prison = cell->prison; - struct bio_list bios; - struct bio *bio; - unsigned long flags; - - bio_list_init(&bios); - - spin_lock_irqsave(&prison->lock, flags); - __cell_release(cell, &bios); - spin_unlock_irqrestore(&prison->lock, flags); - - while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); -} - -/*----------------------------------------------------------------*/ - -/* - * We use the deferred set to keep track of pending reads to shared blocks. - * We do this to ensure the new mapping caused by a write isn't performed - * until these prior reads have completed. Otherwise the insertion of the - * new mapping could free the old block that the read bios are mapped to. - */ - -struct deferred_set; -struct deferred_entry { - struct deferred_set *ds; - unsigned count; - struct list_head work_items; -}; - -struct deferred_set { - spinlock_t lock; - unsigned current_entry; - unsigned sweeper; - struct deferred_entry entries[DEFERRED_SET_SIZE]; -}; - -static void ds_init(struct deferred_set *ds) -{ - int i; - - spin_lock_init(&ds->lock); - ds->current_entry = 0; - ds->sweeper = 0; - for (i = 0; i < DEFERRED_SET_SIZE; i++) { - ds->entries[i].ds = ds; - ds->entries[i].count = 0; - INIT_LIST_HEAD(&ds->entries[i].work_items); - } -} - -static struct deferred_entry *ds_inc(struct deferred_set *ds) -{ - unsigned long flags; - struct deferred_entry *entry; - - spin_lock_irqsave(&ds->lock, flags); - entry = ds->entries + ds->current_entry; - entry->count++; - spin_unlock_irqrestore(&ds->lock, flags); - - return entry; -} - -static unsigned ds_next(unsigned index) -{ - return (index + 1) % DEFERRED_SET_SIZE; -} - -static void __sweep(struct deferred_set *ds, struct list_head *head) -{ - while ((ds->sweeper != ds->current_entry) && - !ds->entries[ds->sweeper].count) { - list_splice_init(&ds->entries[ds->sweeper].work_items, head); - ds->sweeper = ds_next(ds->sweeper); - } - - if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) - list_splice_init(&ds->entries[ds->sweeper].work_items, head); -} - -static void ds_dec(struct deferred_entry *entry, struct list_head *head) -{ - unsigned long flags; - - spin_lock_irqsave(&entry->ds->lock, flags); - BUG_ON(!entry->count); - --entry->count; - __sweep(entry->ds, head); - spin_unlock_irqrestore(&entry->ds->lock, flags); -} - -/* - * Returns 1 if deferred or 0 if no pending items to delay job. 
- */ -static int ds_add_work(struct deferred_set *ds, struct list_head *work) -{ - int r = 1; - unsigned long flags; - unsigned next_entry; - - spin_lock_irqsave(&ds->lock, flags); - if ((ds->sweeper == ds->current_entry) && - !ds->entries[ds->current_entry].count) - r = 0; - else { - list_add(work, &ds->entries[ds->current_entry].work_items); - next_entry = ds_next(ds->current_entry); - if (!ds->entries[next_entry].count) - ds->current_entry = next_entry; - } - spin_unlock_irqrestore(&ds->lock, flags); - - return r; -} - -/*----------------------------------------------------------------*/ - -/* * Key building. */ static void build_data_key(struct dm_thin_device *td, - dm_block_t b, struct cell_key *key) + dm_block_t b, struct dm_cell_key *key) { key->virtual = 0; key->dev = dm_thin_dev_id(td); @@ -481,7 +110,7 @@ static void build_data_key(struct dm_thin_device *td, } static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, - struct cell_key *key) + struct dm_cell_key *key) { key->virtual = 1; key->dev = dm_thin_dev_id(td); @@ -534,7 +163,7 @@ struct pool { unsigned low_water_triggered:1; /* A dm event has been sent */ unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ - struct bio_prison *prison; + struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; struct workqueue_struct *wq; @@ -552,8 +181,8 @@ struct pool { struct bio_list retry_on_resume_list; - struct deferred_set shared_read_ds; - struct deferred_set all_io_ds; + struct dm_deferred_set *shared_read_ds; + struct dm_deferred_set *all_io_ds; struct dm_thin_new_mapping *next_mapping; mempool_t *mapping_pool; @@ -660,8 +289,8 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev struct dm_thin_endio_hook { struct thin_c *tc; - struct deferred_entry *shared_read_entry; - struct deferred_entry *all_io_entry; + struct dm_deferred_entry *shared_read_entry; + struct dm_deferred_entry *all_io_entry; struct dm_thin_new_mapping *overwrite_mapping; }; @@ -877,7 +506,7 @@ static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell, unsigned long flags; spin_lock_irqsave(&pool->lock, flags); - cell_release(cell, &pool->deferred_bios); + dm_cell_release(cell, &pool->deferred_bios); spin_unlock_irqrestore(&tc->pool->lock, flags); wake_worker(pool); @@ -896,7 +525,7 @@ static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell bio_list_init(&bios); spin_lock_irqsave(&pool->lock, flags); - cell_release_no_holder(cell, &pool->deferred_bios); + dm_cell_release_no_holder(cell, &pool->deferred_bios); spin_unlock_irqrestore(&pool->lock, flags); wake_worker(pool); @@ -906,7 +535,7 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { if (m->bio) m->bio->bi_end_io = m->saved_bi_end_io; - cell_error(m->cell); + dm_cell_error(m->cell); list_del(&m->list); mempool_free(m, m->tc->pool->mapping_pool); } @@ -921,7 +550,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) bio->bi_end_io = m->saved_bi_end_io; if (m->err) { - cell_error(m->cell); + dm_cell_error(m->cell); goto out; } @@ -933,7 +562,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); if (r) { DMERR("dm_thin_insert_block() failed"); - cell_error(m->cell); + dm_cell_error(m->cell); goto out; } @@ -1067,7 +696,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, m->err = 0; m->bio = NULL; - if (!ds_add_work(&pool->shared_read_ds, &m->list)) + if 
(!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) m->quiesced = 1; /* @@ -1099,7 +728,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, if (r < 0) { mempool_free(m, pool->mapping_pool); DMERR("dm_kcopyd_copy() failed"); - cell_error(cell); + dm_cell_error(cell); } } } @@ -1164,7 +793,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, if (r < 0) { mempool_free(m, pool->mapping_pool); DMERR("dm_kcopyd_zero() failed"); - cell_error(cell); + dm_cell_error(cell); } } } @@ -1276,7 +905,7 @@ static void no_space(struct dm_bio_prison_cell *cell) struct bio_list bios; bio_list_init(&bios); - cell_release(cell, &bios); + dm_cell_release(cell, &bios); while ((bio = bio_list_pop(&bios))) retry_on_resume(bio); @@ -1288,13 +917,13 @@ static void process_discard(struct thin_c *tc, struct bio *bio) unsigned long flags; struct pool *pool = tc->pool; struct dm_bio_prison_cell *cell, *cell2; - struct cell_key key, key2; + struct dm_cell_key key, key2; dm_block_t block = get_bio_block(tc, bio); struct dm_thin_lookup_result lookup_result; struct dm_thin_new_mapping *m; build_virtual_key(tc->td, block, &key); - if (bio_detain(tc->pool->prison, &key, bio, &cell)) + if (dm_bio_detain(tc->pool->prison, &key, bio, &cell)) return; r = dm_thin_find_block(tc->td, block, 1, &lookup_result); @@ -1306,8 +935,8 @@ static void process_discard(struct thin_c *tc, struct bio *bio) * on this block. */ build_data_key(tc->td, lookup_result.block, &key2); - if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { - cell_release_singleton(cell, bio); + if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) { + dm_cell_release_singleton(cell, bio); break; } @@ -1326,7 +955,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio) m->err = 0; m->bio = bio; - if (!ds_add_work(&pool->all_io_ds, &m->list)) { + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { spin_lock_irqsave(&pool->lock, flags); list_add(&m->list, &pool->prepared_discards); spin_unlock_irqrestore(&pool->lock, flags); @@ -1338,8 +967,8 @@ static void process_discard(struct thin_c *tc, struct bio *bio) * a block boundary. So we submit the discard of a * partial block appropriately. */ - cell_release_singleton(cell, bio); - cell_release_singleton(cell2, bio); + dm_cell_release_singleton(cell, bio); + dm_cell_release_singleton(cell2, bio); if ((!lookup_result.shared) && pool->pf.discard_passdown) remap_and_issue(tc, bio, lookup_result.block); else @@ -1351,20 +980,20 @@ static void process_discard(struct thin_c *tc, struct bio *bio) /* * It isn't provisioned, just forget it. 
*/ - cell_release_singleton(cell, bio); + dm_cell_release_singleton(cell, bio); bio_endio(bio, 0); break; default: DMERR("discard: find block unexpectedly returned %d", r); - cell_release_singleton(cell, bio); + dm_cell_release_singleton(cell, bio); bio_io_error(bio); break; } } static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, - struct cell_key *key, + struct dm_cell_key *key, struct dm_thin_lookup_result *lookup_result, struct dm_bio_prison_cell *cell) { @@ -1384,7 +1013,7 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, default: DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); - cell_error(cell); + dm_cell_error(cell); break; } } @@ -1395,14 +1024,14 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, { struct dm_bio_prison_cell *cell; struct pool *pool = tc->pool; - struct cell_key key; + struct dm_cell_key key; /* * If cell is already occupied, then sharing is already in the process * of being broken so we have nothing further to do here. */ build_data_key(tc->td, lookup_result->block, &key); - if (bio_detain(pool->prison, &key, bio, &cell)) + if (dm_bio_detain(pool->prison, &key, bio, &cell)) return; if (bio_data_dir(bio) == WRITE && bio->bi_size) @@ -1410,9 +1039,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, else { struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; - h->shared_read_entry = ds_inc(&pool->shared_read_ds); + h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); - cell_release_singleton(cell, bio); + dm_cell_release_singleton(cell, bio); remap_and_issue(tc, bio, lookup_result->block); } } @@ -1427,7 +1056,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block * Remap empty bios (flushes) immediately, without provisioning. */ if (!bio->bi_size) { - cell_release_singleton(cell, bio); + dm_cell_release_singleton(cell, bio); remap_and_issue(tc, bio, 0); return; } @@ -1437,7 +1066,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block */ if (bio_data_dir(bio) == READ) { zero_fill_bio(bio); - cell_release_singleton(cell, bio); + dm_cell_release_singleton(cell, bio); bio_endio(bio, 0); return; } @@ -1458,7 +1087,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block default: DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); set_pool_mode(tc->pool, PM_READ_ONLY); - cell_error(cell); + dm_cell_error(cell); break; } } @@ -1468,7 +1097,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) int r; dm_block_t block = get_bio_block(tc, bio); struct dm_bio_prison_cell *cell; - struct cell_key key; + struct dm_cell_key key; struct dm_thin_lookup_result lookup_result; /* @@ -1476,7 +1105,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) * being provisioned so we have nothing further to do here. */ build_virtual_key(tc->td, block, &key); - if (bio_detain(tc->pool->prison, &key, bio, &cell)) + if (dm_bio_detain(tc->pool->prison, &key, bio, &cell)) return; r = dm_thin_find_block(tc->td, block, 1, &lookup_result); @@ -1491,7 +1120,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) * TODO: this will probably have to change when discard goes * back in. 
*/ - cell_release_singleton(cell, bio); + dm_cell_release_singleton(cell, bio); if (lookup_result.shared) process_shared_bio(tc, bio, block, &lookup_result); @@ -1501,7 +1130,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) case -ENODATA: if (bio_data_dir(bio) == READ && tc->origin_dev) { - cell_release_singleton(cell, bio); + dm_cell_release_singleton(cell, bio); remap_to_origin_and_issue(tc, bio); } else provision_block(tc, bio, block, cell); @@ -1509,7 +1138,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) default: DMERR("dm_thin_find_block() failed, error = %d", r); - cell_release_singleton(cell, bio); + dm_cell_release_singleton(cell, bio); bio_io_error(bio); break; } @@ -1718,7 +1347,7 @@ static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *b h->tc = tc; h->shared_read_entry = NULL; - h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); + h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds); h->overwrite_mapping = NULL; return h; @@ -1928,7 +1557,7 @@ static void __pool_destroy(struct pool *pool) if (dm_pool_metadata_close(pool->pmd) < 0) DMWARN("%s: dm_pool_metadata_close() failed.", __func__); - prison_destroy(pool->prison); + dm_bio_prison_destroy(pool->prison); dm_kcopyd_client_destroy(pool->copier); if (pool->wq) @@ -1938,6 +1567,8 @@ static void __pool_destroy(struct pool *pool) mempool_free(pool->next_mapping, pool->mapping_pool); mempool_destroy(pool->mapping_pool); mempool_destroy(pool->endio_hook_pool); + dm_deferred_set_destroy(pool->shared_read_ds); + dm_deferred_set_destroy(pool->all_io_ds); kfree(pool); } @@ -1976,7 +1607,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->sectors_per_block_shift = __ffs(block_size); pool->low_water_blocks = 0; pool_features_init(&pool->pf); - pool->prison = prison_create(PRISON_CELLS); + pool->prison = dm_bio_prison_create(PRISON_CELLS); if (!pool->prison) { *error = "Error creating pool's bio prison"; err_p = ERR_PTR(-ENOMEM); @@ -2012,8 +1643,20 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->low_water_triggered = 0; pool->no_free_space = 0; bio_list_init(&pool->retry_on_resume_list); - ds_init(&pool->shared_read_ds); - ds_init(&pool->all_io_ds); + + pool->shared_read_ds = dm_deferred_set_create(); + if (!pool->shared_read_ds) { + *error = "Error creating pool's shared read deferred set"; + err_p = ERR_PTR(-ENOMEM); + goto bad_shared_read_ds; + } + + pool->all_io_ds = dm_deferred_set_create(); + if (!pool->all_io_ds) { + *error = "Error creating pool's all io deferred set"; + err_p = ERR_PTR(-ENOMEM); + goto bad_all_io_ds; + } pool->next_mapping = NULL; pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, @@ -2042,11 +1685,15 @@ static struct pool *pool_create(struct mapped_device *pool_md, bad_endio_hook_pool: mempool_destroy(pool->mapping_pool); bad_mapping_pool: + dm_deferred_set_destroy(pool->all_io_ds); +bad_all_io_ds: + dm_deferred_set_destroy(pool->shared_read_ds); +bad_shared_read_ds: destroy_workqueue(pool->wq); bad_wq: dm_kcopyd_client_destroy(pool->copier); bad_kcopyd_client: - prison_destroy(pool->prison); + dm_bio_prison_destroy(pool->prison); bad_prison: kfree(pool); bad_pool: @@ -2272,15 +1919,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out_flags_changed; } - /* - * The block layer requires discard_granularity to be a power of 2. 
- */ - if (pf.discard_enabled && !is_power_of_2(block_size)) { - ti->error = "Discard support must be disabled when the block size is not a power of 2"; - r = -EINVAL; - goto out_flags_changed; - } - pt->pool = pool; pt->ti = ti; pt->metadata_dev = metadata_dev; @@ -2762,6 +2400,11 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static bool block_size_is_power_of_two(struct pool *pool) +{ + return pool->sectors_per_block_shift >= 0; +} + static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) { struct pool *pool = pt->pool; @@ -2775,8 +2418,15 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) if (pt->adjusted_pf.discard_passdown) { data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; limits->discard_granularity = data_limits->discard_granularity; - } else + } else if (block_size_is_power_of_two(pool)) limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; + else + /* + * Use largest power of 2 that is a factor of sectors_per_block + * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS. + */ + limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1), + DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT; } static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) @@ -2804,7 +2454,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -2979,7 +2629,7 @@ static int thin_endio(struct dm_target *ti, if (h->shared_read_entry) { INIT_LIST_HEAD(&work); - ds_dec(h->shared_read_entry, &work); + dm_deferred_entry_dec(h->shared_read_entry, &work); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &work, list) { @@ -2992,7 +2642,7 @@ static int thin_endio(struct dm_target *ti, if (h->all_io_entry) { INIT_LIST_HEAD(&work); - ds_dec(h->all_io_entry, &work); + dm_deferred_entry_dec(h->all_io_entry, &work); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &work, list) list_add(&m->list, &pool->prepared_discards); @@ -3095,7 +2745,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, @@ -3125,10 +2775,6 @@ static int __init dm_thin_init(void) r = -ENOMEM; - _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); - if (!_cell_cache) - goto bad_cell_cache; - _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); if (!_new_mapping_cache) goto bad_new_mapping_cache; @@ -3142,8 +2788,6 @@ static int __init dm_thin_init(void) bad_endio_hook_cache: kmem_cache_destroy(_new_mapping_cache); bad_new_mapping_cache: - kmem_cache_destroy(_cell_cache); -bad_cell_cache: dm_unregister_target(&pool_target); bad_pool_target: dm_unregister_target(&thin_target); @@ -3156,7 +2800,6 @@ static void dm_thin_exit(void) dm_unregister_target(&thin_target); dm_unregister_target(&pool_target); - kmem_cache_destroy(_cell_cache); kmem_cache_destroy(_new_mapping_cache); kmem_cache_destroy(_endio_hook_cache); } diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 892ae2766aa..9e7328bb403 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -438,7 +438,7 @@ static void verity_prefetch_io(struct 
dm_verity *v, struct dm_verity_io *io) verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); if (!i) { - unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster; + unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); cluster >>= v->data_dev_block_bits; if (unlikely(!cluster)) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 67ffa391edc..02db9183ca0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -71,6 +71,7 @@ struct dm_target_io { struct dm_io *io; struct dm_target *ti; union map_info info; + struct bio clone; }; /* @@ -86,12 +87,17 @@ struct dm_rq_target_io { }; /* - * For request-based dm. - * One of these is allocated per bio. + * For request-based dm - the bio clones we allocate are embedded in these + * structs. + * + * We allocate these with bio_alloc_bioset, using the front_pad parameter when + * the bioset is created - this means the bio has to come at the end of the + * struct. */ struct dm_rq_clone_bio_info { struct bio *orig; struct dm_rq_target_io *tio; + struct bio clone; }; union map_info *dm_get_mapinfo(struct bio *bio) @@ -209,8 +215,12 @@ struct dm_md_mempools { #define MIN_IOS 256 static struct kmem_cache *_io_cache; -static struct kmem_cache *_tio_cache; static struct kmem_cache *_rq_tio_cache; + +/* + * Unused now, and needs to be deleted. But since io_pool is overloaded and it's + * still used for _io_cache, I'm leaving this for a later cleanup + */ static struct kmem_cache *_rq_bio_info_cache; static int __init local_init(void) @@ -222,14 +232,9 @@ static int __init local_init(void) if (!_io_cache) return r; - /* allocate a slab for the target ios */ - _tio_cache = KMEM_CACHE(dm_target_io, 0); - if (!_tio_cache) - goto out_free_io_cache; - _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); if (!_rq_tio_cache) - goto out_free_tio_cache; + goto out_free_io_cache; _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); if (!_rq_bio_info_cache) @@ -255,8 +260,6 @@ out_free_rq_bio_info_cache: kmem_cache_destroy(_rq_bio_info_cache); out_free_rq_tio_cache: kmem_cache_destroy(_rq_tio_cache); -out_free_tio_cache: - kmem_cache_destroy(_tio_cache); out_free_io_cache: kmem_cache_destroy(_io_cache); @@ -267,7 +270,6 @@ static void local_exit(void) { kmem_cache_destroy(_rq_bio_info_cache); kmem_cache_destroy(_rq_tio_cache); - kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); unregister_blkdev(_major, _name); dm_uevent_exit(); @@ -453,7 +455,7 @@ static void free_io(struct mapped_device *md, struct dm_io *io) static void free_tio(struct mapped_device *md, struct dm_target_io *tio) { - mempool_free(tio, md->tio_pool); + bio_put(&tio->clone); } static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, @@ -467,16 +469,6 @@ static void free_rq_tio(struct dm_rq_target_io *tio) mempool_free(tio, tio->md->tio_pool); } -static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) -{ - return mempool_alloc(md->io_pool, GFP_ATOMIC); -} - -static void free_bio_info(struct dm_rq_clone_bio_info *info) -{ - mempool_free(info, info->tio->md->io_pool); -} - static int md_in_flight(struct mapped_device *md) { return atomic_read(&md->pending[READ]) + @@ -681,13 +673,7 @@ static void clone_endio(struct bio *bio, int error) } } - /* - * Store md for cleanup instead of tio which is about to get freed. 
- */ - bio->bi_private = md->bs; - free_tio(md, tio); - bio_put(bio); dec_pending(io, error); } @@ -1007,12 +993,12 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) } EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); -static void __map_bio(struct dm_target *ti, struct bio *clone, - struct dm_target_io *tio) +static void __map_bio(struct dm_target *ti, struct dm_target_io *tio) { int r; sector_t sector; struct mapped_device *md; + struct bio *clone = &tio->clone; clone->bi_end_io = clone_endio; clone->bi_private = tio; @@ -1036,12 +1022,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, /* error the io and bail out, or requeue it if needed */ md = tio->io->md; dec_pending(tio->io, r); - /* - * Store bio_set for cleanup. - */ - clone->bi_end_io = NULL; - clone->bi_private = md->bs; - bio_put(clone); free_tio(md, tio); } else if (r) { DMWARN("unimplemented target map return value: %d", r); @@ -1059,25 +1039,16 @@ struct clone_info { unsigned short idx; }; -static void dm_bio_destructor(struct bio *bio) -{ - struct bio_set *bs = bio->bi_private; - - bio_free(bio, bs); -} - /* * Creates a little bio that just does part of a bvec. */ -static struct bio *split_bvec(struct bio *bio, sector_t sector, - unsigned short idx, unsigned int offset, - unsigned int len, struct bio_set *bs) +static void split_bvec(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned short idx, unsigned int offset, + unsigned int len, struct bio_set *bs) { - struct bio *clone; + struct bio *clone = &tio->clone; struct bio_vec *bv = bio->bi_io_vec + idx; - clone = bio_alloc_bioset(GFP_NOIO, 1, bs); - clone->bi_destructor = dm_bio_destructor; *clone->bi_io_vec = *bv; clone->bi_sector = sector; @@ -1090,26 +1061,23 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, clone->bi_flags |= 1 << BIO_CLONED; if (bio_integrity(bio)) { - bio_integrity_clone(clone, bio, GFP_NOIO, bs); + bio_integrity_clone(clone, bio, GFP_NOIO); bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); } - - return clone; } /* * Creates a bio that consists of range of complete bvecs. 
*/ -static struct bio *clone_bio(struct bio *bio, sector_t sector, - unsigned short idx, unsigned short bv_count, - unsigned int len, struct bio_set *bs) +static void clone_bio(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned short idx, + unsigned short bv_count, unsigned int len, + struct bio_set *bs) { - struct bio *clone; + struct bio *clone = &tio->clone; - clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); __bio_clone(clone, bio); - clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; clone->bi_vcnt = idx + bv_count; @@ -1117,20 +1085,22 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, clone->bi_flags &= ~(1 << BIO_SEG_VALID); if (bio_integrity(bio)) { - bio_integrity_clone(clone, bio, GFP_NOIO, bs); + bio_integrity_clone(clone, bio, GFP_NOIO); if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) bio_integrity_trim(clone, bio_sector_offset(bio, idx, 0), len); } - - return clone; } static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti) + struct dm_target *ti, int nr_iovecs) { - struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); + struct dm_target_io *tio; + struct bio *clone; + + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs); + tio = container_of(clone, struct dm_target_io, clone); tio->io = ci->io; tio->ti = ti; @@ -1142,8 +1112,8 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, unsigned request_nr, sector_t len) { - struct dm_target_io *tio = alloc_tio(ci, ti); - struct bio *clone; + struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs); + struct bio *clone = &tio->clone; tio->info.target_request_nr = request_nr; @@ -1152,15 +1122,14 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush * and discard, so no need for concern about wasted bvec allocations. 
*/ - clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); - __bio_clone(clone, ci->bio); - clone->bi_destructor = dm_bio_destructor; + + __bio_clone(clone, ci->bio); if (len) { clone->bi_sector = ci->sector; clone->bi_size = to_bytes(len); } - __map_bio(ti, clone, tio); + __map_bio(ti, tio); } static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, @@ -1189,14 +1158,13 @@ static int __clone_and_map_empty_flush(struct clone_info *ci) */ static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) { - struct bio *clone, *bio = ci->bio; + struct bio *bio = ci->bio; struct dm_target_io *tio; - tio = alloc_tio(ci, ti); - clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count, - ci->md->bs); - __map_bio(ti, clone, tio); + tio = alloc_tio(ci, ti, bio->bi_max_vecs); + clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, + ci->sector_count, ci->md->bs); + __map_bio(ti, tio); ci->sector_count = 0; } @@ -1234,7 +1202,7 @@ static int __clone_and_map_discard(struct clone_info *ci) static int __clone_and_map(struct clone_info *ci) { - struct bio *clone, *bio = ci->bio; + struct bio *bio = ci->bio; struct dm_target *ti; sector_t len = 0, max; struct dm_target_io *tio; @@ -1274,10 +1242,10 @@ static int __clone_and_map(struct clone_info *ci) len += bv_len; } - tio = alloc_tio(ci, ti); - clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, - ci->md->bs); - __map_bio(ti, clone, tio); + tio = alloc_tio(ci, ti, bio->bi_max_vecs); + clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len, + ci->md->bs); + __map_bio(ti, tio); ci->sector += len; ci->sector_count -= len; @@ -1302,12 +1270,11 @@ static int __clone_and_map(struct clone_info *ci) len = min(remaining, max); - tio = alloc_tio(ci, ti); - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + offset, len, - ci->md->bs); + tio = alloc_tio(ci, ti, 1); + split_bvec(tio, bio, ci->sector, ci->idx, + bv->bv_offset + offset, len, ci->md->bs); - __map_bio(ti, clone, tio); + __map_bio(ti, tio); ci->sector += len; ci->sector_count -= len; @@ -1484,30 +1451,17 @@ void dm_dispatch_request(struct request *rq) } EXPORT_SYMBOL_GPL(dm_dispatch_request); -static void dm_rq_bio_destructor(struct bio *bio) -{ - struct dm_rq_clone_bio_info *info = bio->bi_private; - struct mapped_device *md = info->tio->md; - - free_bio_info(info); - bio_free(bio, md->bs); -} - static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, void *data) { struct dm_rq_target_io *tio = data; - struct mapped_device *md = tio->md; - struct dm_rq_clone_bio_info *info = alloc_bio_info(md); - - if (!info) - return -ENOMEM; + struct dm_rq_clone_bio_info *info = + container_of(bio, struct dm_rq_clone_bio_info, clone); info->orig = bio_orig; info->tio = tio; bio->bi_end_io = end_clone_bio; bio->bi_private = info; - bio->bi_destructor = dm_rq_bio_destructor; return 0; } @@ -1988,7 +1942,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) { struct dm_md_mempools *p; - if (md->io_pool && md->tio_pool && md->bs) + if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs) /* the md already has necessary mempools */ goto out; @@ -2765,13 +2719,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity) if (!pools->io_pool) goto free_pools_and_out; - pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? 
- mempool_create_slab_pool(MIN_IOS, _tio_cache) : - mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); - if (!pools->tio_pool) - goto free_io_pool_and_out; + pools->tio_pool = NULL; + if (type == DM_TYPE_REQUEST_BASED) { + pools->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); + if (!pools->tio_pool) + goto free_io_pool_and_out; + } - pools->bs = bioset_create(pool_size, 0); + pools->bs = (type == DM_TYPE_BIO_BASED) ? + bioset_create(pool_size, + offsetof(struct dm_target_io, clone)) : + bioset_create(pool_size, + offsetof(struct dm_rq_clone_bio_info, clone)); if (!pools->bs) goto free_tio_pool_and_out; @@ -2784,7 +2743,8 @@ free_bioset_and_out: bioset_free(pools->bs); free_tio_pool_and_out: - mempool_destroy(pools->tio_pool); + if (pools->tio_pool) + mempool_destroy(pools->tio_pool); free_io_pool_and_out: mempool_destroy(pools->io_pool); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index fa211d80fc0..21014836bdb 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -138,6 +138,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) struct linear_conf *conf; struct md_rdev *rdev; int i, cnt; + bool discard_supported = false; conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), GFP_KERNEL); @@ -171,6 +172,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->array_sectors += rdev->sectors; cnt++; + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } if (cnt != raid_disks) { printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", @@ -178,6 +181,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) goto out; } + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + /* * Here we calculate the device offsets. 
*/ @@ -244,7 +252,9 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; - oldconf = rcu_dereference(mddev->private); + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held( + &mddev->reconfig_mutex)); mddev->raid_disks++; rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); @@ -256,7 +266,10 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) static int linear_stop (struct mddev *mddev) { - struct linear_conf *conf = mddev->private; + struct linear_conf *conf = + rcu_dereference_protected(mddev->private, + lockdep_is_held( + &mddev->reconfig_mutex)); /* * We do not require rcu protection here since @@ -326,6 +339,14 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) bio->bi_sector = bio->bi_sector - start_sector + tmp_dev->rdev->data_offset; rcu_read_unlock(); + + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + /* Just ignore it */ + bio_endio(bio, 0); + return; + } + generic_make_request(bio); } diff --git a/drivers/md/md.c b/drivers/md/md.c index 308e87b417e..9ab768acfb6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -155,32 +155,17 @@ static int start_readonly; * like bio_clone, but with a local bio set */ -static void mddev_bio_destructor(struct bio *bio) -{ - struct mddev *mddev, **mddevp; - - mddevp = (void*)bio; - mddev = mddevp[-1]; - - bio_free(bio, mddev->bio_set); -} - struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev) { struct bio *b; - struct mddev **mddevp; if (!mddev || !mddev->bio_set) return bio_alloc(gfp_mask, nr_iovecs); - b = bio_alloc_bioset(gfp_mask, nr_iovecs, - mddev->bio_set); + b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); if (!b) return NULL; - mddevp = (void*)b; - mddevp[-1] = mddev; - b->bi_destructor = mddev_bio_destructor; return b; } EXPORT_SYMBOL_GPL(bio_alloc_mddev); @@ -188,32 +173,10 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev); struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, struct mddev *mddev) { - struct bio *b; - struct mddev **mddevp; - if (!mddev || !mddev->bio_set) return bio_clone(bio, gfp_mask); - b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, - mddev->bio_set); - if (!b) - return NULL; - mddevp = (void*)b; - mddevp[-1] = mddev; - b->bi_destructor = mddev_bio_destructor; - __bio_clone(b, bio); - if (bio_integrity(bio)) { - int ret; - - ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); - - if (ret < 0) { - bio_put(b); - return NULL; - } - } - - return b; + return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); } EXPORT_SYMBOL_GPL(bio_clone_mddev); @@ -711,7 +674,18 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) return NULL; } -static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) +static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->desc_nr == nr) + return rdev; + + return NULL; +} + +static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; @@ -722,6 +696,17 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) return NULL; } +static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} + static struct md_personality *find_pers(int level, char *clevel) { struct md_personality 
*pers; @@ -2059,8 +2044,14 @@ EXPORT_SYMBOL(md_integrity_register); /* Disable data integrity if non-capable/non-matching disk is being added */ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { - struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); + struct blk_integrity *bi_rdev; + struct blk_integrity *bi_mddev; + + if (!mddev->gendisk) + return; + + bi_rdev = bdev_get_integrity(rdev->bdev); + bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ return; @@ -3791,6 +3782,8 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) return -EINVAL; mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); return len; } static struct md_sysfs_entry md_resync_start = @@ -4268,6 +4261,13 @@ action_store(struct mddev *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + md_wakeup_thread(mddev->sync_thread); + } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); @@ -4278,7 +4278,8 @@ static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", - (unsigned long long) mddev->resync_mismatches); + (unsigned long long) + atomic64_read(&mddev->resync_mismatches)); } static struct md_sysfs_entry md_scan_mode = @@ -4399,6 +4400,10 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); + if (mddev->curr_resync == 1 || + mddev->curr_resync == 2) + return sprintf(page, "delayed\n"); + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; @@ -5006,8 +5011,7 @@ int md_run(struct mddev *mddev) } if (mddev->bio_set == NULL) - mddev->bio_set = bioset_create(BIO_POOL_SIZE, - sizeof(struct mddev *)); + mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -5245,7 +5249,7 @@ static void md_clean(struct mddev *mddev) mddev->new_layout = 0; mddev->new_chunk_sectors = 0; mddev->curr_resync = 0; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; @@ -5547,8 +5551,9 @@ static int get_array_info(struct mddev * mddev, void __user * arg) int nr,working,insync,failed,spare; struct md_rdev *rdev; - nr=working=insync=failed=spare=0; - rdev_for_each(rdev, mddev) { + nr = working = insync = failed = spare = 0; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -5560,6 +5565,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) spare++; } } + rcu_read_unlock(); info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; @@ -5643,7 +5649,8 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; - rdev = find_rdev_nr(mddev, info.number); + rcu_read_lock(); + rdev = find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); 
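/*
 * A minimal illustrative sketch, not part of the patch: the two RCU access
 * modes the md.c and linear.c hunks switch to.  Plain readers use
 * rcu_read_lock() plus rcu_dereference(); code that already holds the
 * configuration mutex annotates that with rcu_dereference_protected() and
 * lockdep_is_held() instead of taking the read lock.  The my_* names are
 * illustrative only.
 */
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_conf {
	int raid_disks;
};

static DEFINE_MUTEX(my_reconfig_mutex);
static struct my_conf __rcu *my_private;

/* Reader side: no mutex, just an RCU read-side critical section. */
static int my_read_raid_disks(void)
{
	struct my_conf *conf;
	int n;

	rcu_read_lock();
	conf = rcu_dereference(my_private);
	n = conf ? conf->raid_disks : 0;
	rcu_read_unlock();
	return n;
}

/* Updater side: the mutex serialises writers, and lockdep can verify it. */
static void my_swap_conf(struct my_conf *newconf)
{
	struct my_conf *oldconf;

	mutex_lock(&my_reconfig_mutex);
	oldconf = rcu_dereference_protected(my_private,
					lockdep_is_held(&my_reconfig_mutex));
	rcu_assign_pointer(my_private, newconf);
	mutex_unlock(&my_reconfig_mutex);

	synchronize_rcu();	/* let existing readers finish */
	kfree(oldconf);
}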
@@ -5662,6 +5669,7 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) info.raid_disk = -1; info.state = (1<<MD_DISK_REMOVED); } + rcu_read_unlock(); if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; @@ -6270,18 +6278,22 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) static int set_disk_faulty(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; + int err = 0; if (mddev->pers == NULL) return -ENODEV; - rdev = find_rdev(mddev, dev); + rcu_read_lock(); + rdev = find_rdev_rcu(mddev, dev); if (!rdev) - return -ENODEV; - - md_error(mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) - return -EBUSY; - return 0; + err = -ENODEV; + else { + md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + err = -EBUSY; + } + rcu_read_unlock(); + return err; } /* @@ -6353,6 +6365,27 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } + /* Some actions do not requires the mutex */ + switch (cmd) { + case GET_ARRAY_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_array_info(mddev, argp); + goto abort; + + case GET_DISK_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_disk_info(mddev, argp); + goto abort; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, new_decode_dev(arg)); + goto abort; + } + err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -6425,18 +6458,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, */ switch (cmd) { - case GET_ARRAY_INFO: - err = get_array_info(mddev, argp); - goto done_unlock; - case GET_BITMAP_FILE: err = get_bitmap_file(mddev, argp); goto done_unlock; - case GET_DISK_INFO: - err = get_disk_info(mddev, argp); - goto done_unlock; - case RESTART_ARRAY_RW: err = restart_array(mddev); goto done_unlock; @@ -6518,10 +6543,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = hot_add_disk(mddev, new_decode_dev(arg)); goto done_unlock; - case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto done_unlock; - case RUN_ARRAY: err = do_md_run(mddev); goto done_unlock; @@ -6679,7 +6700,7 @@ static int md_thread(void * arg) clear_bit(THREAD_WAKEUP, &thread->flags); if (!kthread_should_stop()) - thread->run(thread->mddev); + thread->run(thread); } return 0; @@ -6694,8 +6715,8 @@ void md_wakeup_thread(struct md_thread *thread) } } -struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev, - const char *name) +struct md_thread *md_register_thread(void (*run) (struct md_thread *), + struct mddev *mddev, const char *name) { struct md_thread *thread; @@ -6790,7 +6811,11 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) int scale; unsigned int per_milli; - resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); + if (mddev->curr_resync <= 3) + resync = 0; + else + resync = mddev->curr_resync + - atomic_read(&mddev->recovery_active); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) @@ -7016,7 +7041,7 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev->curr_resync > 2) { status_resync(seq, mddev); seq_printf(seq, "\n "); - } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + } else if (mddev->curr_resync >= 1) seq_printf(seq, "\tresync=DELAYED\n "); else if (mddev->recovery_cp < MaxSector) seq_printf(seq, "\tresync=PENDING\n "); @@ -7244,8 +7269,9 @@ EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define 
SYNC_MARK_STEP (3*HZ) -void md_do_sync(struct mddev *mddev) +void md_do_sync(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct mddev *mddev2; unsigned int currspeed = 0, window; @@ -7349,7 +7375,7 @@ void md_do_sync(struct mddev *mddev) * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; @@ -7405,8 +7431,11 @@ void md_do_sync(struct mddev *mddev) "md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; - } + } else + mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + md_new_event(mddev); blk_start_plug(&plug); while (j < max_sectors) { @@ -7459,7 +7488,8 @@ void md_do_sync(struct mddev *mddev) break; j += sectors; - if (j>1) mddev->curr_resync = j; + if (j > 2) + mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7581,8 +7611,6 @@ static int remove_and_add_spares(struct mddev *mddev) int spares = 0; int removed = 0; - mddev->curr_resync_completed = 0; - rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && @@ -7777,6 +7805,7 @@ void md_check_recovery(struct mddev *mddev) /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ + mddev->curr_resync_completed = 0; set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); /* Clear some bits that don't mean anything, but * might be left set @@ -7790,7 +7819,7 @@ void md_check_recovery(struct mddev *mddev) /* no recovery is running. * remove any failed drives, then * add spares if possible. - * Spare are also removed and re-added, to allow + * Spares are also removed and re-added, to allow * the personality to fail the re-add. 
*/ diff --git a/drivers/md/md.h b/drivers/md/md.h index f385b038589..af443ab868d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -282,7 +282,7 @@ struct mddev { sector_t resync_max_sectors; /* may be set by personality */ - sector_t resync_mismatches; /* count of sectors where + atomic64_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ @@ -540,12 +540,13 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) struct md_thread { - void (*run) (struct mddev *mddev); + void (*run) (struct md_thread *thread); struct mddev *mddev; wait_queue_head_t wqueue; unsigned long flags; struct task_struct *tsk; unsigned long timeout; + void *private; }; #define THREAD_WAKEUP 0 @@ -584,7 +585,7 @@ static inline void safe_put_page(struct page *p) extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); extern struct md_thread *md_register_thread( - void (*run)(struct mddev *mddev), + void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); extern void md_unregister_thread(struct md_thread **threadp); @@ -603,7 +604,7 @@ extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, extern void md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op); -extern void md_do_sync(struct mddev *mddev); +extern void md_do_sync(struct md_thread *thread); extern void md_new_event(struct mddev *mddev); extern int md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 61a1833ebaf..1642eae75a3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -335,8 +335,9 @@ abort: * 3. Performs writes following reads for array syncronising. 
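/*
 * A minimal illustrative sketch, not part of the patch: the counter
 * conversion applied to resync_mismatches in md.h above.  A plain
 * "counter += n" is a non-atomic read-modify-write, so concurrent updaters
 * can lose increments; switching the field to atomic64_t makes the update,
 * reset and read safe without extra locking.  mismatch_cnt is an
 * illustrative name.
 */
#include <linux/atomic.h>

static atomic64_t mismatch_cnt = ATOMIC64_INIT(0);

static void note_mismatches(unsigned long sectors)
{
	atomic64_add(sectors, &mismatch_cnt);	/* was: cnt += sectors */
}

static void reset_mismatches(void)
{
	atomic64_set(&mismatch_cnt, 0);		/* was: cnt = 0 */
}

static unsigned long long show_mismatches(void)
{
	return (unsigned long long)atomic64_read(&mismatch_cnt);
}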
*/ -static void multipathd (struct mddev *mddev) +static void multipathd(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct multipath_bh *mp_bh; struct bio *bio; unsigned long flags; diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index d77602d63c8..f3a9af8cdec 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -434,14 +434,14 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, if (ref_count && !old) { *ev = SM_ALLOC; ll->nr_allocated++; - ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1); + le32_add_cpu(&ie_disk.nr_free, -1); if (le32_to_cpu(ie_disk.none_free_before) == bit) ie_disk.none_free_before = cpu_to_le32(bit + 1); } else if (old && !ref_count) { *ev = SM_FREE; ll->nr_allocated--; - ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1); + le32_add_cpu(&ie_disk.nr_free, 1); ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index de63a1fc373..24b359717a7 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -88,6 +88,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + bool discard_supported = false; if (!conf) return -ENOMEM; @@ -195,6 +196,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; + + if (blk_queue_discard(bdev_get_queue(rdev1->bdev))) + discard_supported = true; } if (cnt != mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " @@ -272,6 +276,11 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) blk_queue_io_opt(mddev->queue, (mddev->chunk_sectors << 9) * mddev->raid_disks); + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -422,6 +431,8 @@ static int raid0_run(struct mddev *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { @@ -509,7 +520,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) sector_t sector = bio->bi_sector; struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || bio->bi_idx != 0) goto bad_map; /* This is a one page bio that upper layers @@ -535,6 +546,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) bio->bi_sector = sector_offset + zone->dev_start + tmp_dev->data_offset; + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + /* Just ignore it */ + bio_endio(bio, 0); + return; + } + generic_make_request(bio); return; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 611b5f79761..8034fbd6190 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -333,9 +333,10 @@ static void 
raid1_end_read_request(struct bio *bio, int error) spin_unlock_irqrestore(&conf->device_lock, flags); } - if (uptodate) + if (uptodate) { raid_end_bio_io(r1_bio); - else { + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + } else { /* * oops, read error: */ @@ -349,9 +350,8 @@ static void raid1_end_read_request(struct bio *bio, int error) (unsigned long long)r1_bio->sector); set_bit(R1BIO_ReadError, &r1_bio->state); reschedule_retry(r1_bio); + /* don't drop the reference on read_disk yet */ } - - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } static void close_write(struct r1bio *r1_bio) @@ -781,7 +781,12 @@ static void flush_pending_writes(struct r1conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } } else @@ -994,6 +999,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1295,7 +1302,7 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync; + mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); @@ -1549,6 +1556,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; } @@ -1867,7 +1876,7 @@ static int process_checks(struct r1bio *r1_bio) } else j = 0; if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; + atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { /* No need to write to this device. 
*/ @@ -2220,6 +2229,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) unfreeze_array(conf); } else md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); bio = r1_bio->bios[r1_bio->read_disk]; bdevname(bio->bi_bdev, b); @@ -2285,8 +2295,9 @@ read_more: } } -static void raid1d(struct mddev *mddev) +static void raid1d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r1bio *r1_bio; unsigned long flags; struct r1conf *conf = mddev->private; @@ -2783,6 +2794,7 @@ static int run(struct mddev *mddev) int i; struct md_rdev *rdev; int ret; + bool discard_supported = false; if (mddev->level != 1) { printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", @@ -2812,6 +2824,8 @@ static int run(struct mddev *mddev) continue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } mddev->degraded = 0; @@ -2846,6 +2860,13 @@ static int run(struct mddev *mddev) mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); + + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); } ret = md_integrity_register(mddev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0138a727c1f..906ccbd0f7d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -911,7 +911,12 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } } else @@ -1050,6 +1055,44 @@ static sector_t choose_data_offset(struct r10bio *r10_bio, return rdev->new_data_offset; } +struct raid10_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r10conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. 
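/*
 * A minimal illustrative sketch, not part of the patch: the
 * blk_check_plugged() pattern raid10 adopts here.  A private struct embeds
 * the generic blk_plug_cb; blk_check_plugged() returns the existing callback
 * for this (function, data) pair on the current task's plug, or allocates a
 * zeroed one of the requested size, and the callback runs at unplug time and
 * frees itself.  The my_* names are illustrative only.
 */
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/slab.h>

struct my_plug_cb {
	struct blk_plug_cb cb;
	struct bio_list pending;	/* starts empty: the cb is zero-allocated */
};

static void my_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct my_plug_cb *plug = container_of(cb, struct my_plug_cb, cb);
	struct bio *bio = bio_list_get(&plug->pending);

	/* raid10_unplug() defers to the md thread when from_schedule is
	 * true; this sketch always submits directly. */
	while (bio) {
		struct bio *next = bio->bi_next;

		bio->bi_next = NULL;
		generic_make_request(bio);
		bio = next;
	}
	kfree(plug);
}

static void my_submit_plugged(void *owner, struct bio *bio)
{
	struct blk_plug_cb *cb = blk_check_plugged(my_unplug, owner,
						   sizeof(struct my_plug_cb));

	if (cb) {
		struct my_plug_cb *plug =
			container_of(cb, struct my_plug_cb, cb);

		bio_list_add(&plug->pending, bio);	/* batched until unplug */
	} else {
		generic_make_request(bio);	/* no plug active on this task */
	}
}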
*/ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r10conf *conf = mddev->private; @@ -1061,8 +1104,12 @@ static void make_request(struct mddev *mddev, struct bio * bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); unsigned long flags; struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid10_plug_cb *plug = NULL; int sectors_handled; int max_sectors; int sectors; @@ -1081,7 +1128,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) || conf->prev.near_copies < conf->prev.raid_disks))) { struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || bio->bi_idx != 0) goto bad_map; /* This is a one page bio that upper layers @@ -1410,15 +1457,26 @@ retry_write: conf->mirrors[d].rdev)); mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); if (!r10_bio->devs[i].repl_bio) @@ -1439,7 +1497,7 @@ retry_write: conf->mirrors[d].replacement)); mbio->bi_bdev = conf->mirrors[d].replacement->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1638,7 +1696,7 @@ static int raid10_spare_active(struct mddev *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); @@ -1725,6 +1783,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + print_conf(conf); return err; } @@ -1952,7 +2013,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) break; if (j == vcnt) continue; - mddev->resync_mismatches += r10_bio->sectors; + atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) /* Don't fix anything. 
*/ continue; @@ -2673,8 +2734,9 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } } -static void raid10d(struct mddev *mddev) +static void raid10d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r10bio *r10_bio; unsigned long flags; struct r10conf *conf = mddev->private; @@ -3158,7 +3220,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, else { bad_sectors -= (sector - first_bad); if (max_sync > bad_sectors) - max_sync = max_sync; + max_sync = bad_sectors; continue; } } @@ -3482,6 +3544,7 @@ static int run(struct mddev *mddev) sector_t size; sector_t min_offset_diff = 0; int first = 1; + bool discard_supported = false; if (mddev->private == NULL) { conf = setup_conf(mddev); @@ -3498,6 +3561,8 @@ static int run(struct mddev *mddev) chunk_size = mddev->chunk_sectors << 9; if (mddev->queue) { + blk_queue_max_discard_sectors(mddev->queue, + mddev->chunk_sectors); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); @@ -3543,8 +3608,16 @@ static int run(struct mddev *mddev) rdev->data_offset << 9); disk->head_position = 0; + + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0689173fd9f..c5439dce029 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -551,6 +551,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rw = WRITE_FUA; else rw = WRITE; + if (test_bit(R5_Discard, &sh->dev[i].flags)) + rw |= REQ_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; else if (test_and_clear_bit(R5_WantReplace, @@ -1174,8 +1176,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_rw & REQ_SYNC) set_bit(R5_SyncIO, &dev->flags); - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); + if (wbi->bi_rw & REQ_DISCARD) + set_bit(R5_Discard, &dev->flags); + else + tx = async_copy_data(1, wbi, dev->page, + dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); } } @@ -1191,7 +1196,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; int i; - bool fua = false, sync = false; + bool fua = false, sync = false, discard = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1199,13 +1204,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) for (i = disks; i--; ) { fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); + discard |= test_bit(R5_Discard, &sh->dev[i].flags); } for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - set_bit(R5_UPTODATE, &dev->flags); + if (!discard) + set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) @@ -1241,6 +1248,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (pd_idx == 
i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + set_bit(R5_Discard, &sh->dev[pd_idx].flags); + ops_complete_reconstruct(sh); + return; + } /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ @@ -1285,10 +1304,24 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, { struct async_submit_ctl submit; struct page **blocks = percpu->scribble; - int count; + int count, i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (sh->pd_idx == i || sh->qd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + ops_complete_reconstruct(sh); + return; + } + count = set_syndrome_sources(blocks, sh); atomic_inc(&sh->count); @@ -2408,11 +2441,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } - spin_unlock_irq(&sh->stripe_lock); pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)(*bip)->bi_sector, (unsigned long long)sh->sector, dd_idx); + spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap && firstwrite) { bitmap_startwrite(conf->mddev->bitmap, sh->sector, @@ -2479,10 +2512,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; spin_unlock_irq(&sh->stripe_lock); - if (bi) { - s->to_write--; + if (bi) bitmap_end = 1; - } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -2524,11 +2555,12 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && (!test_bit(R5_Insync, &sh->dev[i].flags) || test_bit(R5_ReadError, &sh->dev[i].flags))) { + spin_lock_irq(&sh->stripe_lock); bi = sh->dev[i].toread; sh->dev[i].toread = NULL; + spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - if (bi) s->to_read--; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = @@ -2741,7 +2773,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, if (sh->dev[i].written) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_and_clear_bit(R5_Discard, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); @@ -2775,12 +2808,25 @@ static void handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - if (conf->max_degraded == 2) { - /* RAID6 requires 'rcw' in current implementation - * Calculate the real rcw later - for now fake it + sector_t recovery_cp = conf->mddev->recovery_cp; + + /* RAID6 requires 'rcw' in current implementation. + * Otherwise, check whether resync is now happening or should start. + * If yes, then the array is dirty (after unclean shutdown or + * initial creation), so parity in some stripes might be inconsistent. + * In this case, we need to always do reconstruct-write, to ensure + * that in case of drive failure or read-error correction, we + * generate correct data from the parity. 
+ */ + if (conf->max_degraded == 2 || + (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { + /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; + pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->max_degraded, (unsigned long long)recovery_cp, + (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; @@ -2932,7 +2978,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, */ set_bit(STRIPE_INSYNC, &sh->state); else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -3084,7 +3130,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, */ } } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -3459,10 +3505,12 @@ static void handle_stripe(struct stripe_head *sh) if (s.written && (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) && !test_bit(R5_LOCKED, &pdev->flags) - && test_bit(R5_UPTODATE, &pdev->flags)))) && + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) - && test_bit(R5_UPTODATE, &qdev->flags))))) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) handle_stripe_clean_event(conf, sh, disks, &s.return_bi); /* Now we might consider reading some blocks, either to check/generate @@ -3489,9 +3537,11 @@ static void handle_stripe(struct stripe_head *sh) /* All the 'written' buffers and the parity block are ready to * be written back to disk */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); BUG_ON(sh->qd_idx >= 0 && - !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); + !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_LOCKED, &dev->flags) && @@ -4072,6 +4122,88 @@ static void release_stripe_plug(struct mddev *mddev, release_stripe(sh); } +static void make_discard_request(struct mddev *mddev, struct bio *bi) +{ + struct r5conf *conf = mddev->private; + sector_t logical_sector, last_sector; + struct stripe_head *sh; + int remaining; + int stripe_sectors; + + if (mddev->reshape_position != MaxSector) + /* Skip discard while reshape is happening */ + return; + + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_sector + (bi->bi_size>>9); + + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + + stripe_sectors = conf->chunk_sectors * + (conf->raid_disks - conf->max_degraded); + logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, + stripe_sectors); + sector_div(last_sector, stripe_sectors); + + logical_sector *= conf->chunk_sectors; + last_sector *= conf->chunk_sectors; + + for (; logical_sector < last_sector; + logical_sector += STRIPE_SECTORS) { + DEFINE_WAIT(w); + int d; + 
again: + sh = get_active_stripe(conf, logical_sector, 0, 0, 0); + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); + spin_lock_irq(&sh->stripe_lock); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + if (sh->dev[d].towrite || sh->dev[d].toread) { + set_bit(R5_Overlap, &sh->dev[d].flags); + spin_unlock_irq(&sh->stripe_lock); + release_stripe(sh); + schedule(); + goto again; + } + } + finish_wait(&conf->wait_for_overlap, &w); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + sh->dev[d].towrite = bi; + set_bit(R5_OVERWRITE, &sh->dev[d].flags); + raid5_inc_bi_active_stripes(bi); + } + spin_unlock_irq(&sh->stripe_lock); + if (conf->mddev->bitmap) { + for (d = 0; + d < conf->raid_disks - conf->max_degraded; + d++) + bitmap_startwrite(mddev->bitmap, + sh->sector, + STRIPE_SECTORS, + 0); + sh->bm_seq = conf->seq_flush + 1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe_plug(mddev, sh); + } + + remaining = raid5_dec_bi_active_stripes(bi); + if (remaining == 0) { + md_write_end(mddev); + bio_endio(bi, 0); + } +} + static void make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; @@ -4094,6 +4226,11 @@ static void make_request(struct mddev *mddev, struct bio * bi) chunk_aligned_read(mddev,bi)) return; + if (unlikely(bi->bi_rw & REQ_DISCARD)) { + make_discard_request(mddev, bi); + return; + } + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); bi->bi_next = NULL; @@ -4630,8 +4767,9 @@ static int handle_active_stripes(struct r5conf *conf) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. */ -static void raid5d(struct mddev *mddev) +static void raid5d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r5conf *conf = mddev->private; int handled; struct blk_plug plug; @@ -5366,6 +5504,7 @@ static int run(struct mddev *mddev) if (mddev->queue) { int chunk_size; + bool discard_supported = true; /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the * number of raid devices @@ -5385,13 +5524,48 @@ static int run(struct mddev *mddev) blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); + /* + * We can only discard a whole stripe. It doesn't make sense to + * discard data disk but write parity disk + */ + stripe = stripe * PAGE_SIZE; + mddev->queue->limits.discard_alignment = stripe; + mddev->queue->limits.discard_granularity = stripe; + /* + * unaligned part of discard request will be ignored, so can't + * guarantee discard_zerors_data + */ + mddev->queue->limits.discard_zeroes_data = 0; rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->new_data_offset << 9); + /* + * discard_zeroes_data is required, otherwise data + * could be lost. 
Consider a scenario: discard a stripe + * (the stripe could be inconsistent if + * discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again + * depending on which disks are used to calculate + * parity); the disk is broken; The stripe data of this + * disk is lost. + */ + if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || + !bdev_get_queue(rdev->bdev)-> + limits.discard_zeroes_data) + discard_supported = false; } + + if (discard_supported && + mddev->queue->limits.max_discard_sectors >= stripe && + mddev->queue->limits.discard_granularity >= stripe) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); } return 0; @@ -5702,7 +5876,8 @@ static int check_reshape(struct mddev *mddev) if (!check_stripe_cache(mddev)) return -ENOSPC; - return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); + return resize_stripes(conf, (conf->previous_raid_disks + + mddev->delta_disks)); } static int raid5_start_reshape(struct mddev *mddev) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a9fc24901ed..18b2c4a8a1f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -298,6 +298,7 @@ enum r5dev_flags { R5_WantReplace, /* We need to update the replacement, we have read * data in, and now is a good time to write it out. */ + R5_Discard, /* Discard the stripe */ }; /* |
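/*
 * A minimal illustrative sketch, not part of the patch: the discard
 * pass-through pattern the linear, raid0, raid1 and raid10 hunks above all
 * follow.  At assembly time the array advertises QUEUE_FLAG_DISCARD only if
 * a member device supports discard; at request time a REQ_DISCARD bio aimed
 * at a member that cannot discard is completed successfully and silently
 * dropped rather than failed.  The my_* names are illustrative only.
 */
#include <linux/bio.h>
#include <linux/blkdev.h>

static void my_update_discard_flag(struct request_queue *q,
				   struct block_device **members, int n)
{
	bool discard_supported = false;
	int i;

	for (i = 0; i < n; i++)
		if (blk_queue_discard(bdev_get_queue(members[i])))
			discard_supported = true;

	if (discard_supported)
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
	else
		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
}

static void my_forward_bio(struct bio *bio)
{
	if (unlikely((bio->bi_rw & REQ_DISCARD) &&
		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
		bio_endio(bio, 0);	/* member can't discard: just ignore */
		return;
	}
	generic_make_request(bio);
}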