diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/bcache.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/closure.h | 2 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-bufio.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 61 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 12 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 106 | ||||
-rw-r--r-- | drivers/md/dm-verity.c | 15 | ||||
-rw-r--r-- | drivers/md/dm.c | 3 | ||||
-rw-r--r-- | drivers/md/md.c | 20 | ||||
-rw-r--r-- | drivers/md/raid10.c | 13 | ||||
-rw-r--r-- | drivers/md/raid5.c | 163 | ||||
-rw-r--r-- | drivers/md/raid5.h | 4 |
15 files changed, 292 insertions, 130 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 82c9c5d3525..d2ebcf32309 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -828,7 +828,7 @@ static inline bool cached_dev_get(struct cached_dev *dc) return false; /* Paired with the mb in cached_dev_attach */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return true; } diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 7ef7461912b..a08e3eeac3c 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -243,7 +243,7 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, cl->fn = fn; cl->wq = wq; /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); } static inline void closure_queue(struct closure *cl) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 9a8e66ae04f..67f8b31e205 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -669,17 +669,13 @@ static inline unsigned long file_page_offset(struct bitmap_storage *store, /* * return a pointer to the page in the filemap that contains the given bit * - * this lookup is complicated by the fact that the bitmap sb might be exactly - * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page - * 0 or page 1 */ static inline struct page *filemap_get_page(struct bitmap_storage *store, unsigned long chunk) { if (file_page_index(store, chunk) >= store->file_pages) return NULL; - return store->filemap[file_page_index(store, chunk) - - file_page_index(store, 0)]; + return store->filemap[file_page_index(store, chunk)]; } static int bitmap_storage_alloc(struct bitmap_storage *store, diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 66c5d130c8c..4e84095833d 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -607,9 +607,9 @@ static void write_endio(struct bio *bio, int error) BUG_ON(!test_bit(B_WRITING, &b->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(B_WRITING, &b->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&b->state, B_WRITING); } @@ -997,9 +997,9 @@ static void read_endio(struct bio *bio, int error) BUG_ON(!test_bit(B_READING, &b->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(B_READING, &b->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&b->state, B_READING); } diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1bf4a71919e..5f054c44b48 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2178,6 +2178,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->num_discard_bios = 1; ti->discards_supported = true; ti->discard_zeroes_data_unsupported = true; + /* Discard bios must be split on a block boundary */ + ti->split_discard_bios = true; cache->features = ca->features; ti->per_bio_data_size = get_per_bio_data_size(cache); @@ -2488,6 +2490,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio) } else { inc_hit_counter(cache, bio); + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && !is_dirty(cache, lookup_result.cblock)) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 784695d22fd..53b213226c0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -19,7 +19,6 @@ #include <linux/crypto.h> #include <linux/workqueue.h> #include <linux/backing-dev.h> -#include <linux/percpu.h> #include <linux/atomic.h> #include <linux/scatterlist.h> #include <asm/page.h> @@ -43,6 +42,7 @@ struct convert_context { struct bvec_iter iter_out; sector_t cc_sector; atomic_t cc_pending; + struct ablkcipher_request *req; }; /* @@ -111,15 +111,7 @@ struct iv_tcw_private { enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; /* - * Duplicated per-CPU state for cipher. - */ -struct crypt_cpu { - struct ablkcipher_request *req; -}; - -/* - * The fields in here must be read only after initialization, - * changing state should be in crypt_cpu. + * The fields in here must be read only after initialization. */ struct crypt_config { struct dm_dev *dev; @@ -150,12 +142,6 @@ struct crypt_config { sector_t iv_offset; unsigned int iv_size; - /* - * Duplicated per cpu state. Access through - * per_cpu_ptr() only. - */ - struct crypt_cpu __percpu *cpu; - /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; struct crypto_ablkcipher **tfms; @@ -192,11 +178,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); -static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) -{ - return this_cpu_ptr(cc->cpu); -} - /* * Use this to access cipher attributes that are the same for each CPU. */ @@ -903,16 +884,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { - struct crypt_cpu *this_cc = this_crypt_config(cc); unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); - if (!this_cc->req) - this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + if (!ctx->req) + ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]); - ablkcipher_request_set_callback(this_cc->req, + ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); + ablkcipher_request_set_callback(ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); + kcryptd_async_done, dmreq_of_req(cc, ctx->req)); } /* @@ -921,7 +901,6 @@ static void crypt_alloc_req(struct crypt_config *cc, static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { - struct crypt_cpu *this_cc = this_crypt_config(cc); int r; atomic_set(&ctx->cc_pending, 1); @@ -932,7 +911,7 @@ static int crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->cc_pending); - r = crypt_convert_block(cc, ctx, this_cc->req); + r = crypt_convert_block(cc, ctx, ctx->req); switch (r) { /* async */ @@ -941,7 +920,7 @@ static int crypt_convert(struct crypt_config *cc, reinit_completion(&ctx->restart); /* fall through*/ case -EINPROGRESS: - this_cc->req = NULL; + ctx->req = NULL; ctx->cc_sector++; continue; @@ -1040,6 +1019,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, io->sector = sector; io->error = 0; io->base_io = NULL; + io->ctx.req = NULL; atomic_set(&io->io_pending, 0); return io; @@ -1065,6 +1045,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io) if (!atomic_dec_and_test(&io->io_pending)) return; + if (io->ctx.req) + mempool_free(io->ctx.req, cc->req_pool); mempool_free(io, cc->io_pool); if (likely(!base_io)) @@ -1492,8 +1474,6 @@ static int crypt_wipe_key(struct crypt_config *cc) static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = ti->private; - struct crypt_cpu *cpu_cc; - int cpu; ti->private = NULL; @@ -1505,13 +1485,6 @@ static void crypt_dtr(struct dm_target *ti) if (cc->crypt_queue) destroy_workqueue(cc->crypt_queue); - if (cc->cpu) - for_each_possible_cpu(cpu) { - cpu_cc = per_cpu_ptr(cc->cpu, cpu); - if (cpu_cc->req) - mempool_free(cpu_cc->req, cc->req_pool); - } - crypt_free_tfms(cc); if (cc->bs) @@ -1530,9 +1503,6 @@ static void crypt_dtr(struct dm_target *ti) if (cc->dev) dm_put_device(ti, cc->dev); - if (cc->cpu) - free_percpu(cc->cpu); - kzfree(cc->cipher); kzfree(cc->cipher_string); @@ -1588,13 +1558,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); - cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)), - __alignof__(struct crypt_cpu)); - if (!cc->cpu) { - ti->error = "Cannot allocate per cpu state"; - goto bad_mem; - } - /* * For compatibility with the original dm-crypt mapping format, if * only the cipher name is supplied, use cbc-plain. diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index aa009e86587..ebfa411d1a7 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -445,11 +445,11 @@ static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, else m->saved_queue_if_no_path = queue_if_no_path; m->queue_if_no_path = queue_if_no_path; - if (!m->queue_if_no_path) - dm_table_run_md_queue_async(m->ti->table); - spin_unlock_irqrestore(&m->lock, flags); + if (!queue_if_no_path) + dm_table_run_md_queue_async(m->ti->table); + return 0; } @@ -954,7 +954,7 @@ out: */ static int reinstate_path(struct pgpath *pgpath) { - int r = 0; + int r = 0, run_queue = 0; unsigned long flags; struct multipath *m = pgpath->pg->m; @@ -978,7 +978,7 @@ static int reinstate_path(struct pgpath *pgpath) if (!m->nr_valid_paths++) { m->current_pgpath = NULL; - dm_table_run_md_queue_async(m->ti->table); + run_queue = 1; } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) m->pg_init_in_progress++; @@ -991,6 +991,8 @@ static int reinstate_path(struct pgpath *pgpath) out: spin_unlock_irqrestore(&m->lock, flags); + if (run_queue) + dm_table_run_md_queue_async(m->ti->table); return r; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index ebddef5237e..8e0caed0bf7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -642,7 +642,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) struct dm_snapshot *s = pe->snap; mempool_free(pe, s->pending_pool); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&s->pending_exceptions_count); } @@ -783,7 +783,7 @@ static int init_hash_tables(struct dm_snapshot *s) static void merge_shutdown(struct dm_snapshot *s) { clear_bit_unlock(RUNNING_MERGE, &s->state_bits); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&s->state_bits, RUNNING_MERGE); } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 53728be84de..242ac2ea5f2 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -27,6 +27,9 @@ #define MAPPING_POOL_SIZE 1024 #define PRISON_CELLS 1024 #define COMMIT_PERIOD HZ +#define NO_SPACE_TIMEOUT_SECS 60 + +static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS; DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, "A percentage of time allocated for copy on write"); @@ -175,6 +178,7 @@ struct pool { struct workqueue_struct *wq; struct work_struct worker; struct delayed_work waker; + struct delayed_work no_space_timeout; unsigned long last_commit_jiffies; unsigned ref_count; @@ -232,6 +236,13 @@ struct thin_c { struct bio_list deferred_bio_list; struct bio_list retry_on_resume_list; struct rb_root sort_bio_list; /* sorted list of deferred bios */ + + /* + * Ensures the thin is not destroyed until the worker has finished + * iterating the active_thins list. + */ + atomic_t refcount; + struct completion can_destroy; }; /*----------------------------------------------------------------*/ @@ -928,7 +939,7 @@ static int commit(struct pool *pool) { int r; - if (get_pool_mode(pool) != PM_WRITE) + if (get_pool_mode(pool) >= PM_READ_ONLY) return -EINVAL; r = dm_pool_commit_metadata(pool->pmd); @@ -1486,6 +1497,45 @@ static void process_thin_deferred_bios(struct thin_c *tc) blk_finish_plug(&plug); } +static void thin_get(struct thin_c *tc); +static void thin_put(struct thin_c *tc); + +/* + * We can't hold rcu_read_lock() around code that can block. So we + * find a thin with the rcu lock held; bump a refcount; then drop + * the lock. + */ +static struct thin_c *get_first_thin(struct pool *pool) +{ + struct thin_c *tc = NULL; + + rcu_read_lock(); + if (!list_empty(&pool->active_thins)) { + tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list); + thin_get(tc); + } + rcu_read_unlock(); + + return tc; +} + +static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc) +{ + struct thin_c *old_tc = tc; + + rcu_read_lock(); + list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) { + thin_get(tc); + thin_put(old_tc); + rcu_read_unlock(); + return tc; + } + thin_put(old_tc); + rcu_read_unlock(); + + return NULL; +} + static void process_deferred_bios(struct pool *pool) { unsigned long flags; @@ -1493,10 +1543,11 @@ static void process_deferred_bios(struct pool *pool) struct bio_list bios; struct thin_c *tc; - rcu_read_lock(); - list_for_each_entry_rcu(tc, &pool->active_thins, list) + tc = get_first_thin(pool); + while (tc) { process_thin_deferred_bios(tc); - rcu_read_unlock(); + tc = get_next_thin(pool, tc); + } /* * If there are any deferred flush bios, we must commit @@ -1543,6 +1594,20 @@ static void do_waker(struct work_struct *ws) queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); } +/* + * We're holding onto IO to allow userland time to react. After the + * timeout either the pool will have been resized (and thus back in + * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO. + */ +static void do_no_space_timeout(struct work_struct *ws) +{ + struct pool *pool = container_of(to_delayed_work(ws), struct pool, + no_space_timeout); + + if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) + set_pool_mode(pool, PM_READ_ONLY); +} + /*----------------------------------------------------------------*/ struct noflush_work { @@ -1578,7 +1643,7 @@ static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) { struct noflush_work w; - INIT_WORK(&w.worker, fn); + INIT_WORK_ONSTACK(&w.worker, fn); w.tc = tc; atomic_set(&w.complete, 0); init_waitqueue_head(&w.wait); @@ -1607,6 +1672,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) struct pool_c *pt = pool->ti->private; bool needs_check = dm_pool_metadata_needs_check(pool->pmd); enum pool_mode old_mode = get_pool_mode(pool); + unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ; /* * Never allow the pool to transition to PM_WRITE mode if user @@ -1668,6 +1734,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) pool->process_discard = process_discard; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard_passdown; + + if (!pool->pf.error_if_no_space && no_space_timeout) + queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout); break; case PM_WRITE: @@ -2053,6 +2122,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_WORK(&pool->worker, do_worker); INIT_DELAYED_WORK(&pool->waker, do_waker); + INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); spin_lock_init(&pool->lock); bio_list_init(&pool->deferred_flush_bios); INIT_LIST_HEAD(&pool->prepared_mappings); @@ -2615,6 +2685,7 @@ static void pool_postsuspend(struct dm_target *ti) struct pool *pool = pt->pool; cancel_delayed_work(&pool->waker); + cancel_delayed_work(&pool->no_space_timeout); flush_workqueue(pool->wq); (void) commit(pool); } @@ -3061,11 +3132,25 @@ static struct target_type pool_target = { /*---------------------------------------------------------------- * Thin target methods *--------------------------------------------------------------*/ +static void thin_get(struct thin_c *tc) +{ + atomic_inc(&tc->refcount); +} + +static void thin_put(struct thin_c *tc) +{ + if (atomic_dec_and_test(&tc->refcount)) + complete(&tc->can_destroy); +} + static void thin_dtr(struct dm_target *ti) { struct thin_c *tc = ti->private; unsigned long flags; + thin_put(tc); + wait_for_completion(&tc->can_destroy); + spin_lock_irqsave(&tc->pool->lock, flags); list_del_rcu(&tc->list); spin_unlock_irqrestore(&tc->pool->lock, flags); @@ -3101,6 +3186,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) struct thin_c *tc; struct dm_dev *pool_dev, *origin_dev; struct mapped_device *pool_md; + unsigned long flags; mutex_lock(&dm_thin_pool_table.mutex); @@ -3191,9 +3277,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) mutex_unlock(&dm_thin_pool_table.mutex); - spin_lock(&tc->pool->lock); + atomic_set(&tc->refcount, 1); + init_completion(&tc->can_destroy); + + spin_lock_irqsave(&tc->pool->lock, flags); list_add_tail_rcu(&tc->list, &tc->pool->active_thins); - spin_unlock(&tc->pool->lock); + spin_unlock_irqrestore(&tc->pool->lock, flags); /* * This synchronize_rcu() call is needed here otherwise we risk a * wake_worker() call finding no bios to process (because the newly @@ -3422,6 +3511,9 @@ static void dm_thin_exit(void) module_init(dm_thin_init); module_exit(dm_thin_exit); +module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds"); + MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 796007a5e0e..7a7bab8947a 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -330,15 +330,17 @@ test_block_hash: return r; } } - todo = 1 << v->data_dev_block_bits; - while (io->iter.bi_size) { + do { u8 *page; + unsigned len; struct bio_vec bv = bio_iter_iovec(bio, io->iter); page = kmap_atomic(bv.bv_page); - r = crypto_shash_update(desc, page + bv.bv_offset, - bv.bv_len); + len = bv.bv_len; + if (likely(len >= todo)) + len = todo; + r = crypto_shash_update(desc, page + bv.bv_offset, len); kunmap_atomic(page); if (r < 0) { @@ -346,8 +348,9 @@ test_block_hash: return r; } - bio_advance_iter(bio, &io->iter, bv.bv_len); - } + bio_advance_iter(bio, &io->iter, len); + todo -= len; + } while (todo); if (!v->version) { r = crypto_shash_update(desc, v->salt, v->salt_size); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 455e6491649..aa9e093343d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1544,7 +1544,6 @@ static int setup_clone(struct request *clone, struct request *rq, clone->cmd = rq->cmd; clone->cmd_len = rq->cmd_len; clone->sense = rq->sense; - clone->buffer = rq->buffer; clone->end_io = end_clone_request; clone->end_io_data = tio; @@ -2447,7 +2446,7 @@ static void dm_wq_work(struct work_struct *work) static void dm_queue_flush(struct mapped_device *md) { clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); queue_work(md->wq, &md->work); } diff --git a/drivers/md/md.c b/drivers/md/md.c index 8fda38d23e3..34846856dbc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3448,6 +3448,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->level = LEVEL_NONE; return rv; } + if (mddev->ro) + return -EROFS; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape @@ -3634,6 +3636,8 @@ layout_store(struct mddev *mddev, const char *buf, size_t len) int err; if (mddev->pers->check_reshape == NULL) return -EBUSY; + if (mddev->ro) + return -EROFS; mddev->new_layout = n; err = mddev->pers->check_reshape(mddev); if (err) { @@ -3723,6 +3727,8 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len) int err; if (mddev->pers->check_reshape == NULL) return -EBUSY; + if (mddev->ro) + return -EROFS; mddev->new_chunk_sectors = n >> 9; err = mddev->pers->check_reshape(mddev); if (err) { @@ -6135,6 +6141,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) */ if (mddev->sync_thread) return -EBUSY; + if (mddev->ro) + return -EROFS; rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; @@ -6157,6 +6165,8 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; + if (mddev->ro) + return -EROFS; if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; @@ -7381,8 +7391,10 @@ void md_do_sync(struct md_thread *thread) /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; - if (mddev->ro) /* never try to sync a read-only array */ + if (mddev->ro) {/* never try to sync a read-only array */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); return; + } if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { @@ -7824,6 +7836,7 @@ void md_check_recovery(struct mddev *mddev) /* There is no thread, but we need to call * ->spare_active and clear saved_raid_disk */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; @@ -8330,7 +8343,7 @@ static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) if (a < s) { /* we need to split this range */ if (bb->count >= MD_MAX_BADBLOCKS) { - rv = 0; + rv = -ENOSPC; goto out; } memmove(p+lo+1, p+lo, (bb->count - lo) * 8); @@ -8516,7 +8529,8 @@ static int md_notify_reboot(struct notifier_block *this, if (mddev_trylock(mddev)) { if (mddev->pers) __md_stop_writes(mddev); - mddev->safemode = 2; + if (mddev->persistent) + mddev->safemode = 2; mddev_unlock(mddev); } need_delay = 1; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 33fc408e5ea..cb882aae9e2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1172,6 +1172,13 @@ static void __make_request(struct mddev *mddev, struct bio *bio) int max_sectors; int sectors; + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ + wait_barrier(conf); + sectors = bio_sectors(bio); while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && @@ -1552,12 +1559,6 @@ static void make_request(struct mddev *mddev, struct bio *bio) md_write_start(mddev, bio); - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. - */ - wait_barrier(conf); do { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 25247a85291..6234b2e8458 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -292,9 +292,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { list_add_tail(&sh->lru, &conf->delayed_list); - else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + if (atomic_read(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { @@ -413,6 +416,11 @@ static void release_stripe(struct stripe_head *sh) int hash; bool wakeup; + /* Avoid release_list until the last reference. + */ + if (atomic_add_unless(&sh->count, -1, 1)) + return; + if (unlikely(!conf->mddev->thread) || test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) goto slow_path; @@ -479,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh) int num = sh->raid_conf->pool_size; for (i = 0; i < num ; i++) { + WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); p = sh->dev[i].page; if (!p) continue; @@ -499,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh) return 1; } sh->dev[i].page = page; + sh->dev[i].orig_page = page; } return 0; } @@ -855,6 +865,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) bi->bi_rw |= REQ_NOMERGE; + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; @@ -899,6 +912,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) else rbi->bi_iter.bi_sector = (sh->sector + rrdev->data_offset); + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].rvec.bv_page = sh->dev[i].page; rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; @@ -927,8 +943,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) } static struct dma_async_tx_descriptor * -async_copy_data(int frombio, struct bio *bio, struct page *page, - sector_t sector, struct dma_async_tx_descriptor *tx) +async_copy_data(int frombio, struct bio *bio, struct page **page, + sector_t sector, struct dma_async_tx_descriptor *tx, + struct stripe_head *sh) { struct bio_vec bvl; struct bvec_iter iter; @@ -965,11 +982,16 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, if (clen > 0) { b_offset += bvl.bv_offset; bio_page = bvl.bv_page; - if (frombio) - tx = async_memcpy(page, bio_page, page_offset, + if (frombio) { + if (sh->raid_conf->skip_copy && + b_offset == 0 && page_offset == 0 && + clen == STRIPE_SIZE) + *page = bio_page; + else + tx = async_memcpy(*page, bio_page, page_offset, b_offset, clen, &submit); - else - tx = async_memcpy(bio_page, page, b_offset, + } else + tx = async_memcpy(bio_page, *page, b_offset, page_offset, clen, &submit); } /* chain the operations */ @@ -1045,8 +1067,8 @@ static void ops_run_biofill(struct stripe_head *sh) spin_unlock_irq(&sh->stripe_lock); while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { - tx = async_copy_data(0, rbi, dev->page, - dev->sector, tx); + tx = async_copy_data(0, rbi, &dev->page, + dev->sector, tx, sh); rbi = r5_next_bio(rbi, dev->sector); } } @@ -1384,6 +1406,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) BUG_ON(dev->written); wbi = dev->written = chosen; spin_unlock_irq(&sh->stripe_lock); + WARN_ON(dev->page != dev->orig_page); while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { @@ -1393,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_SyncIO, &dev->flags); if (wbi->bi_rw & REQ_DISCARD) set_bit(R5_Discard, &dev->flags); - else - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); + else { + tx = async_copy_data(1, wbi, &dev->page, + dev->sector, tx, sh); + if (dev->page != dev->orig_page) { + set_bit(R5_SkipCopy, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + clear_bit(R5_OVERWRITE, &dev->flags); + } + } wbi = r5_next_bio(wbi, dev->sector); } } @@ -1426,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - if (!discard) + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); @@ -1839,8 +1868,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) osh = get_free_stripe(conf, hash); unlock_device_hash_lock(conf, hash); atomic_set(&nsh->count, 1); - for(i=0; i<conf->pool_size; i++) + for(i=0; i<conf->pool_size; i++) { nsh->dev[i].page = osh->dev[i].page; + nsh->dev[i].orig_page = osh->dev[i].page; + } for( ; i<newsize; i++) nsh->dev[i].page = NULL; nsh->hash_lock_index = hash; @@ -1896,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (nsh->dev[i].page == NULL) { struct page *p = alloc_page(GFP_NOIO); nsh->dev[i].page = p; + nsh->dev[i].orig_page = p; if (!p) err = -ENOMEM; } @@ -2133,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error) } static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); - + static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; bio_init(&dev->req); dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; - dev->req.bi_max_vecs++; + dev->req.bi_max_vecs = 1; dev->req.bi_private = sh; - dev->vec.bv_page = dev->page; bio_init(&dev->rreq); dev->rreq.bi_io_vec = &dev->rvec; - dev->rreq.bi_vcnt++; - dev->rreq.bi_max_vecs++; + dev->rreq.bi_max_vecs = 1; dev->rreq.bi_private = sh; - dev->rvec.bv_page = dev->page; dev->flags = 0; dev->sector = compute_blocknr(sh, i, previous); @@ -2750,6 +2778,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].page = sh->dev[i].orig_page; + } + if (bi) bitmap_end = 1; while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { @@ -2886,8 +2919,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, (s->failed >= 1 && fdev[0]->toread) || (s->failed >= 2 && fdev[1]->toread) || (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - (sh->raid_conf->level == 6 && s->failed && s->to_write))) { + (sh->raid_conf->level == 6 && s->failed && s->to_write && + s->to_write < sh->raid_conf->raid_disks - 2 && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -2991,12 +3027,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Discard, &dev->flags))) { + test_bit(R5_Discard, &dev->flags) || + test_bit(R5_SkipCopy, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); if (test_and_clear_bit(R5_Discard, &dev->flags)) clear_bit(R5_UPTODATE, &dev->flags); + if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { + WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); + dev->page = dev->orig_page; + } wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < @@ -3015,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, 0); } else if (test_bit(R5_Discard, &dev->flags)) discard_pending = 1; + WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); + WARN_ON(dev->page != dev->orig_page); } if (!discard_pending && test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { @@ -3086,7 +3129,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; + if (test_bit(R5_Insync, &dev->flags)) + rcw++; else rcw += 2*disks; } @@ -3107,10 +3151,10 @@ static void handle_stripe_dirtying(struct r5conf *conf, !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old block " - "%d for r-m-w\n", i); + if (test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { + pr_debug("Read_old block %d for r-m-w\n", + i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; @@ -3133,10 +3177,9 @@ static void handle_stripe_dirtying(struct r5conf *conf, !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { rcw++; - if (!test_bit(R5_Insync, &dev->flags)) - continue; /* it's a failed drive */ - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + if (test_bit(R5_Insync, &dev->flags) && + test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { pr_debug("Read_old block " "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); @@ -4370,8 +4413,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) sh->group = NULL; } list_del_init(&sh->lru); - atomic_inc(&sh->count); - BUG_ON(atomic_read(&sh->count) != 1); + BUG_ON(atomic_inc_return(&sh->count) != 1); return sh; } @@ -4401,7 +4443,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) * STRIPE_ON_UNPLUG_LIST clear but the stripe * is still in our list */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); /* * STRIPE_ON_RELEASE_LIST could be set here. In that @@ -5032,8 +5074,8 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); - handle_stripe(sh); release_stripe(sh); return STRIPE_SECTORS; @@ -5073,7 +5115,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, 0, 1, 0); + sh = get_active_stripe(conf, sector, 0, 1, 1); if (!sh) { /* failed to get a stripe - must wait */ @@ -5356,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, raid5_store_preread_threshold); static ssize_t +raid5_show_skip_copy(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->skip_copy); + else + return 0; +} + +static ssize_t +raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + new = !!new; + if (new == conf->skip_copy) + return len; + + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + mddev->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + mddev->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; + mddev_resume(mddev); + return len; +} + +static struct md_sysfs_entry +raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, + raid5_show_skip_copy, + raid5_store_skip_copy); + + +static ssize_t stripe_cache_active_show(struct mddev *mddev, char *page) { struct r5conf *conf = mddev->private; @@ -5440,6 +5526,7 @@ static struct attribute *raid5_attrs[] = { &raid5_stripecache_active.attr, &raid5_preread_bypass_threshold.attr, &raid5_group_thread_cnt.attr, + &raid5_skip_copy.attr, NULL, }; static struct attribute_group raid5_attrs_group = { diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 01ad8ae8f57..bc72cd4be5f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -232,7 +232,7 @@ struct stripe_head { */ struct bio req, rreq; struct bio_vec vec, rvec; - struct page *page; + struct page *page, *orig_page; struct bio *toread, *read, *towrite, *written; sector_t sector; /* sector of this page */ unsigned long flags; @@ -299,6 +299,7 @@ enum r5dev_flags { * data in, and now is a good time to write it out. */ R5_Discard, /* Discard the stripe */ + R5_SkipCopy, /* Don't copy data from bio to stripe cache */ }; /* @@ -436,6 +437,7 @@ struct r5conf { atomic_t pending_full_writes; /* full write backlog */ int bypass_count; /* bypassed prereads */ int bypass_threshold; /* preread nice */ + int skip_copy; /* Don't copy data from bio to stripe cache */ struct list_head *last_hold; /* detect hold_list promotions */ atomic_t reshape_stripes; /* stripes with pending writes for reshape */ |