summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/bcache.h2
-rw-r--r--drivers/md/bcache/closure.h2
-rw-r--r--drivers/md/bitmap.c6
-rw-r--r--drivers/md/dm-bufio.c8
-rw-r--r--drivers/md/dm-cache-target.c3
-rw-r--r--drivers/md/dm-crypt.c61
-rw-r--r--drivers/md/dm-mpath.c12
-rw-r--r--drivers/md/dm-snap.c4
-rw-r--r--drivers/md/dm-thin.c106
-rw-r--r--drivers/md/dm-verity.c15
-rw-r--r--drivers/md/dm.c3
-rw-r--r--drivers/md/md.c20
-rw-r--r--drivers/md/raid10.c13
-rw-r--r--drivers/md/raid5.c163
-rw-r--r--drivers/md/raid5.h4
15 files changed, 292 insertions, 130 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 82c9c5d3525..d2ebcf32309 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -828,7 +828,7 @@ static inline bool cached_dev_get(struct cached_dev *dc)
return false;
/* Paired with the mb in cached_dev_attach */
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
return true;
}
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 7ef7461912b..a08e3eeac3c 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -243,7 +243,7 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
cl->fn = fn;
cl->wq = wq;
/* between atomic_dec() in closure_put() */
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
}
static inline void closure_queue(struct closure *cl)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9a8e66ae04f..67f8b31e205 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -669,17 +669,13 @@ static inline unsigned long file_page_offset(struct bitmap_storage *store,
/*
* return a pointer to the page in the filemap that contains the given bit
*
- * this lookup is complicated by the fact that the bitmap sb might be exactly
- * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
- * 0 or page 1
*/
static inline struct page *filemap_get_page(struct bitmap_storage *store,
unsigned long chunk)
{
if (file_page_index(store, chunk) >= store->file_pages)
return NULL;
- return store->filemap[file_page_index(store, chunk)
- - file_page_index(store, 0)];
+ return store->filemap[file_page_index(store, chunk)];
}
static int bitmap_storage_alloc(struct bitmap_storage *store,
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 66c5d130c8c..4e84095833d 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -607,9 +607,9 @@ static void write_endio(struct bio *bio, int error)
BUG_ON(!test_bit(B_WRITING, &b->state));
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(B_WRITING, &b->state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&b->state, B_WRITING);
}
@@ -997,9 +997,9 @@ static void read_endio(struct bio *bio, int error)
BUG_ON(!test_bit(B_READING, &b->state));
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(B_READING, &b->state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&b->state, B_READING);
}
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1bf4a71919e..5f054c44b48 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2178,6 +2178,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->num_discard_bios = 1;
ti->discards_supported = true;
ti->discard_zeroes_data_unsupported = true;
+ /* Discard bios must be split on a block boundary */
+ ti->split_discard_bios = true;
cache->features = ca->features;
ti->per_bio_data_size = get_per_bio_data_size(cache);
@@ -2488,6 +2490,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
} else {
inc_hit_counter(cache, bio);
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
!is_dirty(cache, lookup_result.cblock))
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 784695d22fd..53b213226c0 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -19,7 +19,6 @@
#include <linux/crypto.h>
#include <linux/workqueue.h>
#include <linux/backing-dev.h>
-#include <linux/percpu.h>
#include <linux/atomic.h>
#include <linux/scatterlist.h>
#include <asm/page.h>
@@ -43,6 +42,7 @@ struct convert_context {
struct bvec_iter iter_out;
sector_t cc_sector;
atomic_t cc_pending;
+ struct ablkcipher_request *req;
};
/*
@@ -111,15 +111,7 @@ struct iv_tcw_private {
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
/*
- * Duplicated per-CPU state for cipher.
- */
-struct crypt_cpu {
- struct ablkcipher_request *req;
-};
-
-/*
- * The fields in here must be read only after initialization,
- * changing state should be in crypt_cpu.
+ * The fields in here must be read only after initialization.
*/
struct crypt_config {
struct dm_dev *dev;
@@ -150,12 +142,6 @@ struct crypt_config {
sector_t iv_offset;
unsigned int iv_size;
- /*
- * Duplicated per cpu state. Access through
- * per_cpu_ptr() only.
- */
- struct crypt_cpu __percpu *cpu;
-
/* ESSIV: struct crypto_cipher *essiv_tfm */
void *iv_private;
struct crypto_ablkcipher **tfms;
@@ -192,11 +178,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
-static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
-{
- return this_cpu_ptr(cc->cpu);
-}
-
/*
* Use this to access cipher attributes that are the same for each CPU.
*/
@@ -903,16 +884,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
static void crypt_alloc_req(struct crypt_config *cc,
struct convert_context *ctx)
{
- struct crypt_cpu *this_cc = this_crypt_config(cc);
unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
- if (!this_cc->req)
- this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+ if (!ctx->req)
+ ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
- ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
- ablkcipher_request_set_callback(this_cc->req,
+ ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+ ablkcipher_request_set_callback(ctx->req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
+ kcryptd_async_done, dmreq_of_req(cc, ctx->req));
}
/*
@@ -921,7 +901,6 @@ static void crypt_alloc_req(struct crypt_config *cc,
static int crypt_convert(struct crypt_config *cc,
struct convert_context *ctx)
{
- struct crypt_cpu *this_cc = this_crypt_config(cc);
int r;
atomic_set(&ctx->cc_pending, 1);
@@ -932,7 +911,7 @@ static int crypt_convert(struct crypt_config *cc,
atomic_inc(&ctx->cc_pending);
- r = crypt_convert_block(cc, ctx, this_cc->req);
+ r = crypt_convert_block(cc, ctx, ctx->req);
switch (r) {
/* async */
@@ -941,7 +920,7 @@ static int crypt_convert(struct crypt_config *cc,
reinit_completion(&ctx->restart);
/* fall through*/
case -EINPROGRESS:
- this_cc->req = NULL;
+ ctx->req = NULL;
ctx->cc_sector++;
continue;
@@ -1040,6 +1019,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
io->sector = sector;
io->error = 0;
io->base_io = NULL;
+ io->ctx.req = NULL;
atomic_set(&io->io_pending, 0);
return io;
@@ -1065,6 +1045,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
if (!atomic_dec_and_test(&io->io_pending))
return;
+ if (io->ctx.req)
+ mempool_free(io->ctx.req, cc->req_pool);
mempool_free(io, cc->io_pool);
if (likely(!base_io))
@@ -1492,8 +1474,6 @@ static int crypt_wipe_key(struct crypt_config *cc)
static void crypt_dtr(struct dm_target *ti)
{
struct crypt_config *cc = ti->private;
- struct crypt_cpu *cpu_cc;
- int cpu;
ti->private = NULL;
@@ -1505,13 +1485,6 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->crypt_queue)
destroy_workqueue(cc->crypt_queue);
- if (cc->cpu)
- for_each_possible_cpu(cpu) {
- cpu_cc = per_cpu_ptr(cc->cpu, cpu);
- if (cpu_cc->req)
- mempool_free(cpu_cc->req, cc->req_pool);
- }
-
crypt_free_tfms(cc);
if (cc->bs)
@@ -1530,9 +1503,6 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->dev)
dm_put_device(ti, cc->dev);
- if (cc->cpu)
- free_percpu(cc->cpu);
-
kzfree(cc->cipher);
kzfree(cc->cipher_string);
@@ -1588,13 +1558,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
if (tmp)
DMWARN("Ignoring unexpected additional cipher options");
- cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
- __alignof__(struct crypt_cpu));
- if (!cc->cpu) {
- ti->error = "Cannot allocate per cpu state";
- goto bad_mem;
- }
-
/*
* For compatibility with the original dm-crypt mapping format, if
* only the cipher name is supplied, use cbc-plain.
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index aa009e86587..ebfa411d1a7 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -445,11 +445,11 @@ static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
else
m->saved_queue_if_no_path = queue_if_no_path;
m->queue_if_no_path = queue_if_no_path;
- if (!m->queue_if_no_path)
- dm_table_run_md_queue_async(m->ti->table);
-
spin_unlock_irqrestore(&m->lock, flags);
+ if (!queue_if_no_path)
+ dm_table_run_md_queue_async(m->ti->table);
+
return 0;
}
@@ -954,7 +954,7 @@ out:
*/
static int reinstate_path(struct pgpath *pgpath)
{
- int r = 0;
+ int r = 0, run_queue = 0;
unsigned long flags;
struct multipath *m = pgpath->pg->m;
@@ -978,7 +978,7 @@ static int reinstate_path(struct pgpath *pgpath)
if (!m->nr_valid_paths++) {
m->current_pgpath = NULL;
- dm_table_run_md_queue_async(m->ti->table);
+ run_queue = 1;
} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
m->pg_init_in_progress++;
@@ -991,6 +991,8 @@ static int reinstate_path(struct pgpath *pgpath)
out:
spin_unlock_irqrestore(&m->lock, flags);
+ if (run_queue)
+ dm_table_run_md_queue_async(m->ti->table);
return r;
}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index ebddef5237e..8e0caed0bf7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -642,7 +642,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
struct dm_snapshot *s = pe->snap;
mempool_free(pe, s->pending_pool);
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&s->pending_exceptions_count);
}
@@ -783,7 +783,7 @@ static int init_hash_tables(struct dm_snapshot *s)
static void merge_shutdown(struct dm_snapshot *s)
{
clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&s->state_bits, RUNNING_MERGE);
}
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 53728be84de..242ac2ea5f2 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -27,6 +27,9 @@
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ
+#define NO_SPACE_TIMEOUT_SECS 60
+
+static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
"A percentage of time allocated for copy on write");
@@ -175,6 +178,7 @@ struct pool {
struct workqueue_struct *wq;
struct work_struct worker;
struct delayed_work waker;
+ struct delayed_work no_space_timeout;
unsigned long last_commit_jiffies;
unsigned ref_count;
@@ -232,6 +236,13 @@ struct thin_c {
struct bio_list deferred_bio_list;
struct bio_list retry_on_resume_list;
struct rb_root sort_bio_list; /* sorted list of deferred bios */
+
+ /*
+ * Ensures the thin is not destroyed until the worker has finished
+ * iterating the active_thins list.
+ */
+ atomic_t refcount;
+ struct completion can_destroy;
};
/*----------------------------------------------------------------*/
@@ -928,7 +939,7 @@ static int commit(struct pool *pool)
{
int r;
- if (get_pool_mode(pool) != PM_WRITE)
+ if (get_pool_mode(pool) >= PM_READ_ONLY)
return -EINVAL;
r = dm_pool_commit_metadata(pool->pmd);
@@ -1486,6 +1497,45 @@ static void process_thin_deferred_bios(struct thin_c *tc)
blk_finish_plug(&plug);
}
+static void thin_get(struct thin_c *tc);
+static void thin_put(struct thin_c *tc);
+
+/*
+ * We can't hold rcu_read_lock() around code that can block. So we
+ * find a thin with the rcu lock held; bump a refcount; then drop
+ * the lock.
+ */
+static struct thin_c *get_first_thin(struct pool *pool)
+{
+ struct thin_c *tc = NULL;
+
+ rcu_read_lock();
+ if (!list_empty(&pool->active_thins)) {
+ tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
+ thin_get(tc);
+ }
+ rcu_read_unlock();
+
+ return tc;
+}
+
+static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
+{
+ struct thin_c *old_tc = tc;
+
+ rcu_read_lock();
+ list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
+ thin_get(tc);
+ thin_put(old_tc);
+ rcu_read_unlock();
+ return tc;
+ }
+ thin_put(old_tc);
+ rcu_read_unlock();
+
+ return NULL;
+}
+
static void process_deferred_bios(struct pool *pool)
{
unsigned long flags;
@@ -1493,10 +1543,11 @@ static void process_deferred_bios(struct pool *pool)
struct bio_list bios;
struct thin_c *tc;
- rcu_read_lock();
- list_for_each_entry_rcu(tc, &pool->active_thins, list)
+ tc = get_first_thin(pool);
+ while (tc) {
process_thin_deferred_bios(tc);
- rcu_read_unlock();
+ tc = get_next_thin(pool, tc);
+ }
/*
* If there are any deferred flush bios, we must commit
@@ -1543,6 +1594,20 @@ static void do_waker(struct work_struct *ws)
queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}
+/*
+ * We're holding onto IO to allow userland time to react. After the
+ * timeout either the pool will have been resized (and thus back in
+ * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ */
+static void do_no_space_timeout(struct work_struct *ws)
+{
+ struct pool *pool = container_of(to_delayed_work(ws), struct pool,
+ no_space_timeout);
+
+ if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
+ set_pool_mode(pool, PM_READ_ONLY);
+}
+
/*----------------------------------------------------------------*/
struct noflush_work {
@@ -1578,7 +1643,7 @@ static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
{
struct noflush_work w;
- INIT_WORK(&w.worker, fn);
+ INIT_WORK_ONSTACK(&w.worker, fn);
w.tc = tc;
atomic_set(&w.complete, 0);
init_waitqueue_head(&w.wait);
@@ -1607,6 +1672,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
struct pool_c *pt = pool->ti->private;
bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
enum pool_mode old_mode = get_pool_mode(pool);
+ unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
/*
* Never allow the pool to transition to PM_WRITE mode if user
@@ -1668,6 +1734,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->process_discard = process_discard;
pool->process_prepared_mapping = process_prepared_mapping;
pool->process_prepared_discard = process_prepared_discard_passdown;
+
+ if (!pool->pf.error_if_no_space && no_space_timeout)
+ queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
break;
case PM_WRITE:
@@ -2053,6 +2122,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
INIT_WORK(&pool->worker, do_worker);
INIT_DELAYED_WORK(&pool->waker, do_waker);
+ INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
spin_lock_init(&pool->lock);
bio_list_init(&pool->deferred_flush_bios);
INIT_LIST_HEAD(&pool->prepared_mappings);
@@ -2615,6 +2685,7 @@ static void pool_postsuspend(struct dm_target *ti)
struct pool *pool = pt->pool;
cancel_delayed_work(&pool->waker);
+ cancel_delayed_work(&pool->no_space_timeout);
flush_workqueue(pool->wq);
(void) commit(pool);
}
@@ -3061,11 +3132,25 @@ static struct target_type pool_target = {
/*----------------------------------------------------------------
* Thin target methods
*--------------------------------------------------------------*/
+static void thin_get(struct thin_c *tc)
+{
+ atomic_inc(&tc->refcount);
+}
+
+static void thin_put(struct thin_c *tc)
+{
+ if (atomic_dec_and_test(&tc->refcount))
+ complete(&tc->can_destroy);
+}
+
static void thin_dtr(struct dm_target *ti)
{
struct thin_c *tc = ti->private;
unsigned long flags;
+ thin_put(tc);
+ wait_for_completion(&tc->can_destroy);
+
spin_lock_irqsave(&tc->pool->lock, flags);
list_del_rcu(&tc->list);
spin_unlock_irqrestore(&tc->pool->lock, flags);
@@ -3101,6 +3186,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct thin_c *tc;
struct dm_dev *pool_dev, *origin_dev;
struct mapped_device *pool_md;
+ unsigned long flags;
mutex_lock(&dm_thin_pool_table.mutex);
@@ -3191,9 +3277,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
mutex_unlock(&dm_thin_pool_table.mutex);
- spin_lock(&tc->pool->lock);
+ atomic_set(&tc->refcount, 1);
+ init_completion(&tc->can_destroy);
+
+ spin_lock_irqsave(&tc->pool->lock, flags);
list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
- spin_unlock(&tc->pool->lock);
+ spin_unlock_irqrestore(&tc->pool->lock, flags);
/*
* This synchronize_rcu() call is needed here otherwise we risk a
* wake_worker() call finding no bios to process (because the newly
@@ -3422,6 +3511,9 @@ static void dm_thin_exit(void)
module_init(dm_thin_init);
module_exit(dm_thin_exit);
+module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
+
MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 796007a5e0e..7a7bab8947a 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -330,15 +330,17 @@ test_block_hash:
return r;
}
}
-
todo = 1 << v->data_dev_block_bits;
- while (io->iter.bi_size) {
+ do {
u8 *page;
+ unsigned len;
struct bio_vec bv = bio_iter_iovec(bio, io->iter);
page = kmap_atomic(bv.bv_page);
- r = crypto_shash_update(desc, page + bv.bv_offset,
- bv.bv_len);
+ len = bv.bv_len;
+ if (likely(len >= todo))
+ len = todo;
+ r = crypto_shash_update(desc, page + bv.bv_offset, len);
kunmap_atomic(page);
if (r < 0) {
@@ -346,8 +348,9 @@ test_block_hash:
return r;
}
- bio_advance_iter(bio, &io->iter, bv.bv_len);
- }
+ bio_advance_iter(bio, &io->iter, len);
+ todo -= len;
+ } while (todo);
if (!v->version) {
r = crypto_shash_update(desc, v->salt, v->salt_size);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 455e6491649..aa9e093343d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1544,7 +1544,6 @@ static int setup_clone(struct request *clone, struct request *rq,
clone->cmd = rq->cmd;
clone->cmd_len = rq->cmd_len;
clone->sense = rq->sense;
- clone->buffer = rq->buffer;
clone->end_io = end_clone_request;
clone->end_io_data = tio;
@@ -2447,7 +2446,7 @@ static void dm_wq_work(struct work_struct *work)
static void dm_queue_flush(struct mapped_device *md)
{
clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
queue_work(md->wq, &md->work);
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8fda38d23e3..34846856dbc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3448,6 +3448,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->level = LEVEL_NONE;
return rv;
}
+ if (mddev->ro)
+ return -EROFS;
/* request to change the personality. Need to ensure:
* - array is not engaged in resync/recovery/reshape
@@ -3634,6 +3636,8 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
int err;
if (mddev->pers->check_reshape == NULL)
return -EBUSY;
+ if (mddev->ro)
+ return -EROFS;
mddev->new_layout = n;
err = mddev->pers->check_reshape(mddev);
if (err) {
@@ -3723,6 +3727,8 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
int err;
if (mddev->pers->check_reshape == NULL)
return -EBUSY;
+ if (mddev->ro)
+ return -EROFS;
mddev->new_chunk_sectors = n >> 9;
err = mddev->pers->check_reshape(mddev);
if (err) {
@@ -6135,6 +6141,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
*/
if (mddev->sync_thread)
return -EBUSY;
+ if (mddev->ro)
+ return -EROFS;
rdev_for_each(rdev, mddev) {
sector_t avail = rdev->sectors;
@@ -6157,6 +6165,8 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
/* change the number of raid disks */
if (mddev->pers->check_reshape == NULL)
return -EINVAL;
+ if (mddev->ro)
+ return -EROFS;
if (raid_disks <= 0 ||
(mddev->max_disks && raid_disks >= mddev->max_disks))
return -EINVAL;
@@ -7381,8 +7391,10 @@ void md_do_sync(struct md_thread *thread)
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
return;
- if (mddev->ro) /* never try to sync a read-only array */
+ if (mddev->ro) {/* never try to sync a read-only array */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return;
+ }
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
@@ -7824,6 +7836,7 @@ void md_check_recovery(struct mddev *mddev)
/* There is no thread, but we need to call
* ->spare_active and clear saved_raid_disk
*/
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
@@ -8330,7 +8343,7 @@ static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
if (a < s) {
/* we need to split this range */
if (bb->count >= MD_MAX_BADBLOCKS) {
- rv = 0;
+ rv = -ENOSPC;
goto out;
}
memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
@@ -8516,7 +8529,8 @@ static int md_notify_reboot(struct notifier_block *this,
if (mddev_trylock(mddev)) {
if (mddev->pers)
__md_stop_writes(mddev);
- mddev->safemode = 2;
+ if (mddev->persistent)
+ mddev->safemode = 2;
mddev_unlock(mddev);
}
need_delay = 1;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 33fc408e5ea..cb882aae9e2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1172,6 +1172,13 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
int max_sectors;
int sectors;
+ /*
+ * Register the new request and wait if the reconstruction
+ * thread has put up a bar for new requests.
+ * Continue immediately if no resync is active currently.
+ */
+ wait_barrier(conf);
+
sectors = bio_sectors(bio);
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
@@ -1552,12 +1559,6 @@ static void make_request(struct mddev *mddev, struct bio *bio)
md_write_start(mddev, bio);
- /*
- * Register the new request and wait if the reconstruction
- * thread has put up a bar for new requests.
- * Continue immediately if no resync is active currently.
- */
- wait_barrier(conf);
do {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 25247a85291..6234b2e8458 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -292,9 +292,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
- !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
list_add_tail(&sh->lru, &conf->delayed_list);
- else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ if (atomic_read(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
@@ -413,6 +416,11 @@ static void release_stripe(struct stripe_head *sh)
int hash;
bool wakeup;
+ /* Avoid release_list until the last reference.
+ */
+ if (atomic_add_unless(&sh->count, -1, 1))
+ return;
+
if (unlikely(!conf->mddev->thread) ||
test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
goto slow_path;
@@ -479,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh)
int num = sh->raid_conf->pool_size;
for (i = 0; i < num ; i++) {
+ WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
p = sh->dev[i].page;
if (!p)
continue;
@@ -499,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh)
return 1;
}
sh->dev[i].page = page;
+ sh->dev[i].orig_page = page;
}
return 0;
}
@@ -855,6 +865,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
bi->bi_rw |= REQ_NOMERGE;
+ if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].vec.bv_page = sh->dev[i].page;
bi->bi_vcnt = 1;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
@@ -899,6 +912,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
else
rbi->bi_iter.bi_sector = (sh->sector
+ rrdev->data_offset);
+ if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].rvec.bv_page = sh->dev[i].page;
rbi->bi_vcnt = 1;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
rbi->bi_io_vec[0].bv_offset = 0;
@@ -927,8 +943,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
}
static struct dma_async_tx_descriptor *
-async_copy_data(int frombio, struct bio *bio, struct page *page,
- sector_t sector, struct dma_async_tx_descriptor *tx)
+async_copy_data(int frombio, struct bio *bio, struct page **page,
+ sector_t sector, struct dma_async_tx_descriptor *tx,
+ struct stripe_head *sh)
{
struct bio_vec bvl;
struct bvec_iter iter;
@@ -965,11 +982,16 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
if (clen > 0) {
b_offset += bvl.bv_offset;
bio_page = bvl.bv_page;
- if (frombio)
- tx = async_memcpy(page, bio_page, page_offset,
+ if (frombio) {
+ if (sh->raid_conf->skip_copy &&
+ b_offset == 0 && page_offset == 0 &&
+ clen == STRIPE_SIZE)
+ *page = bio_page;
+ else
+ tx = async_memcpy(*page, bio_page, page_offset,
b_offset, clen, &submit);
- else
- tx = async_memcpy(bio_page, page, b_offset,
+ } else
+ tx = async_memcpy(bio_page, *page, b_offset,
page_offset, clen, &submit);
}
/* chain the operations */
@@ -1045,8 +1067,8 @@ static void ops_run_biofill(struct stripe_head *sh)
spin_unlock_irq(&sh->stripe_lock);
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
- tx = async_copy_data(0, rbi, dev->page,
- dev->sector, tx);
+ tx = async_copy_data(0, rbi, &dev->page,
+ dev->sector, tx, sh);
rbi = r5_next_bio(rbi, dev->sector);
}
}
@@ -1384,6 +1406,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
BUG_ON(dev->written);
wbi = dev->written = chosen;
spin_unlock_irq(&sh->stripe_lock);
+ WARN_ON(dev->page != dev->orig_page);
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
@@ -1393,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
set_bit(R5_SyncIO, &dev->flags);
if (wbi->bi_rw & REQ_DISCARD)
set_bit(R5_Discard, &dev->flags);
- else
- tx = async_copy_data(1, wbi, dev->page,
- dev->sector, tx);
+ else {
+ tx = async_copy_data(1, wbi, &dev->page,
+ dev->sector, tx, sh);
+ if (dev->page != dev->orig_page) {
+ set_bit(R5_SkipCopy, &dev->flags);
+ clear_bit(R5_UPTODATE, &dev->flags);
+ clear_bit(R5_OVERWRITE, &dev->flags);
+ }
+ }
wbi = r5_next_bio(wbi, dev->sector);
}
}
@@ -1426,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
struct r5dev *dev = &sh->dev[i];
if (dev->written || i == pd_idx || i == qd_idx) {
- if (!discard)
+ if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
set_bit(R5_UPTODATE, &dev->flags);
if (fua)
set_bit(R5_WantFUA, &dev->flags);
@@ -1839,8 +1868,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
osh = get_free_stripe(conf, hash);
unlock_device_hash_lock(conf, hash);
atomic_set(&nsh->count, 1);
- for(i=0; i<conf->pool_size; i++)
+ for(i=0; i<conf->pool_size; i++) {
nsh->dev[i].page = osh->dev[i].page;
+ nsh->dev[i].orig_page = osh->dev[i].page;
+ }
for( ; i<newsize; i++)
nsh->dev[i].page = NULL;
nsh->hash_lock_index = hash;
@@ -1896,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (nsh->dev[i].page == NULL) {
struct page *p = alloc_page(GFP_NOIO);
nsh->dev[i].page = p;
+ nsh->dev[i].orig_page = p;
if (!p)
err = -ENOMEM;
}
@@ -2133,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error)
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-
+
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
struct r5dev *dev = &sh->dev[i];
bio_init(&dev->req);
dev->req.bi_io_vec = &dev->vec;
- dev->req.bi_vcnt++;
- dev->req.bi_max_vecs++;
+ dev->req.bi_max_vecs = 1;
dev->req.bi_private = sh;
- dev->vec.bv_page = dev->page;
bio_init(&dev->rreq);
dev->rreq.bi_io_vec = &dev->rvec;
- dev->rreq.bi_vcnt++;
- dev->rreq.bi_max_vecs++;
+ dev->rreq.bi_max_vecs = 1;
dev->rreq.bi_private = sh;
- dev->rvec.bv_page = dev->page;
dev->flags = 0;
dev->sector = compute_blocknr(sh, i, previous);
@@ -2750,6 +2778,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
+ if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].page = sh->dev[i].orig_page;
+ }
+
if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
@@ -2886,8 +2919,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
(s->failed >= 1 && fdev[0]->toread) ||
(s->failed >= 2 && fdev[1]->toread) ||
(sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
+ (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
!test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
- (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
+ (sh->raid_conf->level == 6 && s->failed && s->to_write &&
+ s->to_write < sh->raid_conf->raid_disks - 2 &&
+ (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
*/
@@ -2991,12 +3027,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_Discard, &dev->flags))) {
+ test_bit(R5_Discard, &dev->flags) ||
+ test_bit(R5_SkipCopy, &dev->flags))) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
pr_debug("Return write for disc %d\n", i);
if (test_and_clear_bit(R5_Discard, &dev->flags))
clear_bit(R5_UPTODATE, &dev->flags);
+ if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
+ WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
+ dev->page = dev->orig_page;
+ }
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
@@ -3015,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
0);
} else if (test_bit(R5_Discard, &dev->flags))
discard_pending = 1;
+ WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
+ WARN_ON(dev->page != dev->orig_page);
}
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -3086,7 +3129,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
- if (test_bit(R5_Insync, &dev->flags)) rcw++;
+ if (test_bit(R5_Insync, &dev->flags))
+ rcw++;
else
rcw += 2*disks;
}
@@ -3107,10 +3151,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags)) &&
test_bit(R5_Insync, &dev->flags)) {
- if (
- test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- pr_debug("Read_old block "
- "%d for r-m-w\n", i);
+ if (test_bit(STRIPE_PREREAD_ACTIVE,
+ &sh->state)) {
+ pr_debug("Read_old block %d for r-m-w\n",
+ i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
@@ -3133,10 +3177,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
rcw++;
- if (!test_bit(R5_Insync, &dev->flags))
- continue; /* it's a failed drive */
- if (
- test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ if (test_bit(R5_Insync, &dev->flags) &&
+ test_bit(STRIPE_PREREAD_ACTIVE,
+ &sh->state)) {
pr_debug("Read_old block "
"%d for Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags);
@@ -4370,8 +4413,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
sh->group = NULL;
}
list_del_init(&sh->lru);
- atomic_inc(&sh->count);
- BUG_ON(atomic_read(&sh->count) != 1);
+ BUG_ON(atomic_inc_return(&sh->count) != 1);
return sh;
}
@@ -4401,7 +4443,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
* STRIPE_ON_UNPLUG_LIST clear but the stripe
* is still in our list
*/
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
/*
* STRIPE_ON_RELEASE_LIST could be set here. In that
@@ -5032,8 +5074,8 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
- handle_stripe(sh);
release_stripe(sh);
return STRIPE_SECTORS;
@@ -5073,7 +5115,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
/* already done this stripe */
continue;
- sh = get_active_stripe(conf, sector, 0, 1, 0);
+ sh = get_active_stripe(conf, sector, 0, 1, 1);
if (!sh) {
/* failed to get a stripe - must wait */
@@ -5356,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
raid5_store_preread_threshold);
static ssize_t
+raid5_show_skip_copy(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf = mddev->private;
+ if (conf)
+ return sprintf(page, "%d\n", conf->skip_copy);
+ else
+ return 0;
+}
+
+static ssize_t
+raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf = mddev->private;
+ unsigned long new;
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (!conf)
+ return -ENODEV;
+
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+ new = !!new;
+ if (new == conf->skip_copy)
+ return len;
+
+ mddev_suspend(mddev);
+ conf->skip_copy = new;
+ if (new)
+ mddev->queue->backing_dev_info.capabilities |=
+ BDI_CAP_STABLE_WRITES;
+ else
+ mddev->queue->backing_dev_info.capabilities &=
+ ~BDI_CAP_STABLE_WRITES;
+ mddev_resume(mddev);
+ return len;
+}
+
+static struct md_sysfs_entry
+raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
+ raid5_show_skip_copy,
+ raid5_store_skip_copy);
+
+
+static ssize_t
stripe_cache_active_show(struct mddev *mddev, char *page)
{
struct r5conf *conf = mddev->private;
@@ -5440,6 +5526,7 @@ static struct attribute *raid5_attrs[] = {
&raid5_stripecache_active.attr,
&raid5_preread_bypass_threshold.attr,
&raid5_group_thread_cnt.attr,
+ &raid5_skip_copy.attr,
NULL,
};
static struct attribute_group raid5_attrs_group = {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 01ad8ae8f57..bc72cd4be5f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -232,7 +232,7 @@ struct stripe_head {
*/
struct bio req, rreq;
struct bio_vec vec, rvec;
- struct page *page;
+ struct page *page, *orig_page;
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags;
@@ -299,6 +299,7 @@ enum r5dev_flags {
* data in, and now is a good time to write it out.
*/
R5_Discard, /* Discard the stripe */
+ R5_SkipCopy, /* Don't copy data from bio to stripe cache */
};
/*
@@ -436,6 +437,7 @@ struct r5conf {
atomic_t pending_full_writes; /* full write backlog */
int bypass_count; /* bypassed prereads */
int bypass_threshold; /* preread nice */
+ int skip_copy; /* Don't copy data from bio to stripe cache */
struct list_head *last_hold; /* detect hold_list promotions */
atomic_t reshape_stripes; /* stripes with pending writes for reshape */