Diffstat (limited to 'drivers/md')
31 files changed, 1090 insertions, 1063 deletions
diff --git a/drivers/md/.gitignore b/drivers/md/.gitignore deleted file mode 100644 index a7afec6b19c..00000000000 --- a/drivers/md/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -mktables -raid6altivec*.c -raid6int*.c -raid6tables.c diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1ba1e122e94..e4fb58db545 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1000,10 +1000,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) page = bitmap->sb_page; offset = sizeof(bitmap_super_t); if (!file) - read_sb_page(bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - page, - index, count); + page = read_sb_page( + bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + page, + index, count); } else if (file) { page = read_page(file, index, bitmap, count); offset = 0; @@ -1542,8 +1543,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) atomic_read(&bitmap->mddev->recovery_active) == 0); bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; - if (bitmap->mddev->persistent) - set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); + set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3bdbb611570..d5b0e4c0e70 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -107,11 +107,10 @@ struct crypt_config { struct workqueue_struct *io_queue; struct workqueue_struct *crypt_queue; - /* - * crypto related data - */ + char *cipher; + char *cipher_mode; + struct crypt_iv_operations *iv_gen_ops; - char *iv_mode; union { struct iv_essiv_private essiv; struct iv_benbi_private benbi; @@ -135,8 +134,6 @@ struct crypt_config { unsigned int dmreq_start; struct ablkcipher_request *req; - char cipher[CRYPTO_MAX_ALG_NAME]; - char chainmode[CRYPTO_MAX_ALG_NAME]; struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; @@ -999,82 +996,135 @@ static int crypt_wipe_key(struct crypt_config *cc) return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); } -/* - * Construct an encryption mapping: - * <cipher> <key> <iv_offset> <dev_path> <start> - */ -static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +static void crypt_dtr(struct dm_target *ti) { - struct crypt_config *cc; - struct crypto_ablkcipher *tfm; - char *tmp; - char *cipher; - char *chainmode; - char *ivmode; - char *ivopts; - unsigned int key_size; - unsigned long long tmpll; + struct crypt_config *cc = ti->private; - if (argc != 5) { - ti->error = "Not enough arguments"; + ti->private = NULL; + + if (!cc) + return; + + if (cc->io_queue) + destroy_workqueue(cc->io_queue); + if (cc->crypt_queue) + destroy_workqueue(cc->crypt_queue); + + if (cc->bs) + bioset_free(cc->bs); + + if (cc->page_pool) + mempool_destroy(cc->page_pool); + if (cc->req_pool) + mempool_destroy(cc->req_pool); + if (cc->io_pool) + mempool_destroy(cc->io_pool); + + if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) + cc->iv_gen_ops->dtr(cc); + + if (cc->tfm && !IS_ERR(cc->tfm)) + crypto_free_ablkcipher(cc->tfm); + + if (cc->dev) + dm_put_device(ti, cc->dev); + + kzfree(cc->cipher); + kzfree(cc->cipher_mode); + + /* Must zero key material before freeing */ + kzfree(cc); +} + +static int crypt_ctr_cipher(struct dm_target *ti, + char *cipher_in, char *key) +{ + struct crypt_config *cc = ti->private; + char *tmp, *cipher, *chainmode, *ivmode, *ivopts; + char *cipher_api = NULL; + int ret = -EINVAL; + + /* 
Convert to crypto api definition? */ + if (strchr(cipher_in, '(')) { + ti->error = "Bad cipher specification"; return -EINVAL; } - tmp = argv[0]; + /* + * Legacy dm-crypt cipher specification + * cipher-mode-iv:ivopts + */ + tmp = cipher_in; cipher = strsep(&tmp, "-"); + + cc->cipher = kstrdup(cipher, GFP_KERNEL); + if (!cc->cipher) + goto bad_mem; + + if (tmp) { + cc->cipher_mode = kstrdup(tmp, GFP_KERNEL); + if (!cc->cipher_mode) + goto bad_mem; + } + chainmode = strsep(&tmp, "-"); ivopts = strsep(&tmp, "-"); ivmode = strsep(&ivopts, ":"); if (tmp) - DMWARN("Unexpected additional cipher options"); - - key_size = strlen(argv[1]) >> 1; - - cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); - if (cc == NULL) { - ti->error = - "Cannot allocate transparent encryption context"; - return -ENOMEM; - } + DMWARN("Ignoring unexpected additional cipher options"); - /* Compatibility mode for old dm-crypt cipher strings */ - if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { + /* Compatibility mode for old dm-crypt mappings */ + if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { + kfree(cc->cipher_mode); + cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL); chainmode = "cbc"; ivmode = "plain"; } if (strcmp(chainmode, "ecb") && !ivmode) { - ti->error = "This chaining mode requires an IV mechanism"; - goto bad_cipher; + ti->error = "IV mechanism required"; + return -EINVAL; } - if (snprintf(cc->cipher, CRYPTO_MAX_ALG_NAME, "%s(%s)", - chainmode, cipher) >= CRYPTO_MAX_ALG_NAME) { - ti->error = "Chain mode + cipher name is too long"; - goto bad_cipher; + cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); + if (!cipher_api) + goto bad_mem; + + ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, + "%s(%s)", chainmode, cipher); + if (ret < 0) { + kfree(cipher_api); + goto bad_mem; } - tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0); - if (IS_ERR(tfm)) { + /* Allocate cipher */ + cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); + if (IS_ERR(cc->tfm)) { + ret = PTR_ERR(cc->tfm); ti->error = "Error allocating crypto tfm"; - goto bad_cipher; + goto bad; } - strcpy(cc->cipher, cipher); - strcpy(cc->chainmode, chainmode); - cc->tfm = tfm; - - if (crypt_set_key(cc, argv[1]) < 0) { + /* Initialize and set key */ + ret = crypt_set_key(cc, key); + if (ret < 0) { ti->error = "Error decoding and setting key"; - goto bad_ivmode; + goto bad; } - /* - * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". - * See comments at iv code - */ + /* Initialize IV */ + cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); + if (cc->iv_size) + /* at least a 64 bit sector number should fit in our buffer */ + cc->iv_size = max(cc->iv_size, + (unsigned int)(sizeof(u64) / sizeof(u8))); + else if (ivmode) { + DMWARN("Selected cipher does not support IVs"); + ivmode = NULL; + } + /* Choose ivmode, see comments at iv code. 
*/ if (ivmode == NULL) cc->iv_gen_ops = NULL; else if (strcmp(ivmode, "plain") == 0) @@ -1088,159 +1138,138 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) else if (strcmp(ivmode, "null") == 0) cc->iv_gen_ops = &crypt_iv_null_ops; else { + ret = -EINVAL; ti->error = "Invalid IV mode"; - goto bad_ivmode; + goto bad; } - if (cc->iv_gen_ops && cc->iv_gen_ops->ctr && - cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) - goto bad_ivmode; - - if (cc->iv_gen_ops && cc->iv_gen_ops->init && - cc->iv_gen_ops->init(cc) < 0) { - ti->error = "Error initialising IV"; - goto bad_slab_pool; + /* Allocate IV */ + if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { + ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); + if (ret < 0) { + ti->error = "Error creating IV"; + goto bad; + } } - cc->iv_size = crypto_ablkcipher_ivsize(tfm); - if (cc->iv_size) - /* at least a 64 bit sector number should fit in our buffer */ - cc->iv_size = max(cc->iv_size, - (unsigned int)(sizeof(u64) / sizeof(u8))); - else { - if (cc->iv_gen_ops) { - DMWARN("Selected cipher does not support IVs"); - if (cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); - cc->iv_gen_ops = NULL; + /* Initialize IV (set keys for ESSIV etc) */ + if (cc->iv_gen_ops && cc->iv_gen_ops->init) { + ret = cc->iv_gen_ops->init(cc); + if (ret < 0) { + ti->error = "Error initialising IV"; + goto bad; } } + ret = 0; +bad: + kfree(cipher_api); + return ret; + +bad_mem: + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; +} + +/* + * Construct an encryption mapping: + * <cipher> <key> <iv_offset> <dev_path> <start> + */ +static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct crypt_config *cc; + unsigned int key_size; + unsigned long long tmpll; + int ret; + + if (argc != 5) { + ti->error = "Not enough arguments"; + return -EINVAL; + } + + key_size = strlen(argv[1]) >> 1; + + cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); + if (!cc) { + ti->error = "Cannot allocate encryption context"; + return -ENOMEM; + } + + ti->private = cc; + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; + + ret = -ENOMEM; cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); if (!cc->io_pool) { ti->error = "Cannot allocate crypt io mempool"; - goto bad_slab_pool; + goto bad; } cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(tfm); + cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(tfm) & + cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & ~(crypto_tfm_ctx_alignment() - 1); cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + sizeof(struct dm_crypt_request) + cc->iv_size); if (!cc->req_pool) { ti->error = "Cannot allocate crypt request mempool"; - goto bad_req_pool; + goto bad; } cc->req = NULL; cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; - goto bad_page_pool; + goto bad; } cc->bs = bioset_create(MIN_IOS, 0); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; - goto bad_bs; + goto bad; } + ret = -EINVAL; if (sscanf(argv[2], "%llu", &tmpll) != 1) { ti->error = "Invalid iv_offset sector"; - goto bad_device; + goto bad; } cc->iv_offset = tmpll; - if (sscanf(argv[4], "%llu", &tmpll) != 1) { - ti->error = "Invalid device sector"; - goto bad_device; - } - cc->start = tmpll; - if (dm_get_device(ti, argv[3], 
dm_table_get_mode(ti->table), &cc->dev)) { ti->error = "Device lookup failed"; - goto bad_device; + goto bad; } - if (ivmode && cc->iv_gen_ops) { - if (ivopts) - *(ivopts - 1) = ':'; - cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL); - if (!cc->iv_mode) { - ti->error = "Error kmallocing iv_mode string"; - goto bad_ivmode_string; - } - strcpy(cc->iv_mode, ivmode); - } else - cc->iv_mode = NULL; + if (sscanf(argv[4], "%llu", &tmpll) != 1) { + ti->error = "Invalid device sector"; + goto bad; + } + cc->start = tmpll; + ret = -ENOMEM; cc->io_queue = create_singlethread_workqueue("kcryptd_io"); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; - goto bad_io_queue; + goto bad; } cc->crypt_queue = create_singlethread_workqueue("kcryptd"); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; - goto bad_crypt_queue; + goto bad; } ti->num_flush_requests = 1; - ti->private = cc; return 0; -bad_crypt_queue: - destroy_workqueue(cc->io_queue); -bad_io_queue: - kfree(cc->iv_mode); -bad_ivmode_string: - dm_put_device(ti, cc->dev); -bad_device: - bioset_free(cc->bs); -bad_bs: - mempool_destroy(cc->page_pool); -bad_page_pool: - mempool_destroy(cc->req_pool); -bad_req_pool: - mempool_destroy(cc->io_pool); -bad_slab_pool: - if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); -bad_ivmode: - crypto_free_ablkcipher(tfm); -bad_cipher: - /* Must zero key material before freeing */ - kzfree(cc); - return -EINVAL; -} - -static void crypt_dtr(struct dm_target *ti) -{ - struct crypt_config *cc = (struct crypt_config *) ti->private; - - destroy_workqueue(cc->io_queue); - destroy_workqueue(cc->crypt_queue); - - if (cc->req) - mempool_free(cc->req, cc->req_pool); - - bioset_free(cc->bs); - mempool_destroy(cc->page_pool); - mempool_destroy(cc->req_pool); - mempool_destroy(cc->io_pool); - - kfree(cc->iv_mode); - if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); - crypto_free_ablkcipher(cc->tfm); - dm_put_device(ti, cc->dev); - - /* Must zero key material before freeing */ - kzfree(cc); +bad: + crypt_dtr(ti); + return ret; } static int crypt_map(struct dm_target *ti, struct bio *bio, @@ -1249,13 +1278,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, struct dm_crypt_io *io; struct crypt_config *cc; - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { cc = ti->private; bio->bi_bdev = cc->dev->bdev; return DM_MAPIO_REMAPPED; } - io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); + io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); if (bio_data_dir(io->base_bio) == READ) kcryptd_queue_io(io); @@ -1268,7 +1297,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, static int crypt_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { - struct crypt_config *cc = (struct crypt_config *) ti->private; + struct crypt_config *cc = ti->private; unsigned int sz = 0; switch (type) { @@ -1277,11 +1306,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - if (cc->iv_mode) - DMEMIT("%s-%s-%s ", cc->cipher, cc->chainmode, - cc->iv_mode); + if (cc->cipher_mode) + DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode); else - DMEMIT("%s-%s ", cc->cipher, cc->chainmode); + DMEMIT("%s ", cc->cipher); if (cc->key_size > 0) { if ((maxlen - sz) < ((cc->key_size << 1) + 1)) @@ -1378,7 +1406,7 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return max_size; bvm->bi_bdev = cc->dev->bdev; - bvm->bi_sector 
= cc->start + bvm->bi_sector - ti->begin; + bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector); return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 852052880d7..baa11912cc9 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -198,6 +198,7 @@ out: atomic_set(&dc->may_delay, 1); ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ti->private = dc; return 0; @@ -281,14 +282,13 @@ static int delay_map(struct dm_target *ti, struct bio *bio, bio->bi_bdev = dc->dev_write->bdev; if (bio_sectors(bio)) bio->bi_sector = dc->start_write + - (bio->bi_sector - ti->begin); + dm_target_offset(ti, bio->bi_sector); return delay_bio(dc, dc->write_delay, bio); } bio->bi_bdev = dc->dev_read->bdev; - bio->bi_sector = dc->start_read + - (bio->bi_sector - ti->begin); + bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector); return delay_bio(dc, dc->read_delay, bio); } diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 2b7907b6dd0..0bdb201c2c2 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -173,7 +173,9 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, /* Validate the chunk size against the device block size */ if (chunk_size % - (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) { + (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9) || + chunk_size % + (bdev_logical_block_size(dm_snap_origin(store->snap)->bdev) >> 9)) { *error = "Chunk size is not a multiple of device blocksize"; return -EINVAL; } diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index e8dfa06af3b..0b2536247cf 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -126,8 +126,9 @@ struct dm_exception_store { }; /* - * Obtain the cow device used by a given snapshot. + * Obtain the origin or cow device used by a given snapshot. */ +struct dm_dev *dm_snap_origin(struct dm_snapshot *snap); struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); /* diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 0590c75b0ab..136d4f71a11 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -31,7 +31,6 @@ struct dm_io_client { */ struct io { unsigned long error_bits; - unsigned long eopnotsupp_bits; atomic_t count; struct task_struct *sleeper; struct dm_io_client *client; @@ -130,11 +129,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, *---------------------------------------------------------------*/ static void dec_count(struct io *io, unsigned int region, int error) { - if (error) { + if (error) set_bit(region, &io->error_bits); - if (error == -EOPNOTSUPP) - set_bit(region, &io->eopnotsupp_bits); - } if (atomic_dec_and_test(&io->count)) { if (io->sleeper) @@ -310,8 +306,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, sector_t remaining = where->count; /* - * where->count may be zero if rw holds a write barrier and we - * need to send a zero-sized barrier. + * where->count may be zero if rw holds a flush and we need to + * send a zero-sized flush. 
*/ do { /* @@ -364,7 +360,7 @@ static void dispatch_io(int rw, unsigned int num_regions, */ for (i = 0; i < num_regions; i++) { *dp = old_pages; - if (where[i].count || (rw & REQ_HARDBARRIER)) + if (where[i].count || (rw & REQ_FLUSH)) do_region(rw, i, where + i, dp, io); } @@ -393,9 +389,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, return -EIO; } -retry: io->error_bits = 0; - io->eopnotsupp_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->sleeper = current; io->client = client; @@ -412,11 +406,6 @@ retry: } set_current_state(TASK_RUNNING); - if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) { - rw &= ~REQ_HARDBARRIER; - goto retry; - } - if (error_bits) *error_bits = io->error_bits; @@ -437,7 +426,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; - io->eopnotsupp_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->sleeper = NULL; io->client = client; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index d7500e1c26f..4b54618b415 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -249,55 +249,66 @@ static void __hash_remove(struct hash_cell *hc) static void dm_hash_remove_all(int keep_open_devices) { - int i, dev_skipped, dev_removed; + int i, dev_skipped; struct hash_cell *hc; - struct list_head *tmp, *n; + struct mapped_device *md; + +retry: + dev_skipped = 0; down_write(&_hash_lock); -retry: - dev_skipped = dev_removed = 0; for (i = 0; i < NUM_BUCKETS; i++) { - list_for_each_safe (tmp, n, _name_buckets + i) { - hc = list_entry(tmp, struct hash_cell, name_list); + list_for_each_entry(hc, _name_buckets + i, name_list) { + md = hc->md; + dm_get(md); - if (keep_open_devices && - dm_lock_for_deletion(hc->md)) { + if (keep_open_devices && dm_lock_for_deletion(md)) { + dm_put(md); dev_skipped++; continue; } + __hash_remove(hc); - dev_removed = 1; - } - } - /* - * Some mapped devices may be using other mapped devices, so if any - * still exist, repeat until we make no further progress. - */ - if (dev_skipped) { - if (dev_removed) - goto retry; + up_write(&_hash_lock); - DMWARN("remove_all left %d open device(s)", dev_skipped); + dm_put(md); + if (likely(keep_open_devices)) + dm_destroy(md); + else + dm_destroy_immediate(md); + + /* + * Some mapped devices may be using other mapped + * devices, so repeat until we make no further + * progress. If a new mapped device is created + * here it will also get removed. + */ + goto retry; + } } up_write(&_hash_lock); + + if (dev_skipped) + DMWARN("remove_all left %d open device(s)", dev_skipped); } -static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old, - const char *new) +static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, + const char *new) { char *new_name, *old_name; struct hash_cell *hc; struct dm_table *table; + struct mapped_device *md; /* * duplicate new. */ new_name = kstrdup(new, GFP_KERNEL); if (!new_name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); down_write(&_hash_lock); @@ -306,24 +317,24 @@ static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old, */ hc = __get_name_cell(new); if (hc) { - DMWARN("asked to rename to an already existing name %s -> %s", - old, new); + DMWARN("asked to rename to an already-existing name %s -> %s", + param->name, new); dm_put(hc->md); up_write(&_hash_lock); kfree(new_name); - return -EBUSY; + return ERR_PTR(-EBUSY); } /* * Is there such a device as 'old' ? 
*/ - hc = __get_name_cell(old); + hc = __get_name_cell(param->name); if (!hc) { - DMWARN("asked to rename a non existent device %s -> %s", - old, new); + DMWARN("asked to rename a non-existent device %s -> %s", + param->name, new); up_write(&_hash_lock); kfree(new_name); - return -ENXIO; + return ERR_PTR(-ENXIO); } /* @@ -345,13 +356,14 @@ static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old, dm_table_put(table); } - if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie)) - *flags |= DM_UEVENT_GENERATED_FLAG; + if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; - dm_put(hc->md); + md = hc->md; up_write(&_hash_lock); kfree(old_name); - return 0; + + return md; } /*----------------------------------------------------------------- @@ -573,7 +585,7 @@ static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, * Fills in a dm_ioctl structure, ready for sending back to * userland. */ -static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) +static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) { struct gendisk *disk = dm_disk(md); struct dm_table *table; @@ -617,8 +629,6 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) dm_table_put(table); } } - - return 0; } static int dev_create(struct dm_ioctl *param, size_t param_size) @@ -640,15 +650,17 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); if (r) { dm_put(md); + dm_destroy(md); return r; } param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(md, param); + __dev_status(md, param); + dm_put(md); - return r; + return 0; } /* @@ -742,6 +754,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) param->flags |= DM_UEVENT_GENERATED_FLAG; dm_put(md); + dm_destroy(md); return 0; } @@ -762,6 +775,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) { int r; char *new_name = (char *) param + param->data_start; + struct mapped_device *md; if (new_name < param->data || invalid_str(new_name, (void *) param + param_size) || @@ -774,10 +788,14 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) if (r) return r; - param->data_size = 0; + md = dm_hash_rename(param, new_name); + if (IS_ERR(md)) + return PTR_ERR(md); + + __dev_status(md, param); + dm_put(md); - return dm_hash_rename(param->event_nr, ¶m->flags, param->name, - new_name); + return 0; } static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) @@ -818,8 +836,6 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) geometry.start = indata[3]; r = dm_set_geometry(md, &geometry); - if (!r) - r = __dev_status(md, param); param->data_size = 0; @@ -843,13 +859,17 @@ static int do_suspend(struct dm_ioctl *param) if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - if (!dm_suspended_md(md)) + if (!dm_suspended_md(md)) { r = dm_suspend(md, suspend_flags); + if (r) + goto out; + } - if (!r) - r = __dev_status(md, param); + __dev_status(md, param); +out: dm_put(md); + return r; } @@ -911,7 +931,7 @@ static int do_resume(struct dm_ioctl *param) dm_table_destroy(old_map); if (!r) - r = __dev_status(md, param); + __dev_status(md, param); dm_put(md); return r; @@ -935,16 +955,16 @@ static int dev_suspend(struct dm_ioctl *param, size_t param_size) */ static int dev_status(struct dm_ioctl *param, size_t param_size) { - int r; struct mapped_device *md; md 
= find_device(param); if (!md) return -ENXIO; - r = __dev_status(md, param); + __dev_status(md, param); dm_put(md); - return r; + + return 0; } /* @@ -1019,7 +1039,7 @@ static void retrieve_status(struct dm_table *table, */ static int dev_wait(struct dm_ioctl *param, size_t param_size) { - int r; + int r = 0; struct mapped_device *md; struct dm_table *table; @@ -1040,9 +1060,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) * changed to trigger the event, so we may as well tell * him and save an ioctl. */ - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); table = dm_get_live_or_inactive_table(md, param); if (table) { @@ -1050,8 +1068,9 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) dm_table_put(table); } - out: +out: dm_put(md); + return r; } @@ -1112,28 +1131,9 @@ static int populate_table(struct dm_table *table, next = spec->next; } - r = dm_table_set_type(table); - if (r) { - DMWARN("unable to set table type"); - return r; - } - return dm_table_complete(table); } -static int table_prealloc_integrity(struct dm_table *t, - struct mapped_device *md) -{ - struct list_head *devices = dm_table_get_devices(t); - struct dm_dev_internal *dd; - - list_for_each_entry(dd, devices, list) - if (bdev_get_integrity(dd->dm_dev.bdev)) - return blk_integrity_register(dm_disk(md), NULL); - - return 0; -} - static int table_load(struct dm_ioctl *param, size_t param_size) { int r; @@ -1155,21 +1155,30 @@ static int table_load(struct dm_ioctl *param, size_t param_size) goto out; } - r = table_prealloc_integrity(t, md); - if (r) { - DMERR("%s: could not register integrity profile.", - dm_device_name(md)); + /* Protect md->type and md->queue against concurrent table loads. */ + dm_lock_md_type(md); + if (dm_get_md_type(md) == DM_TYPE_NONE) + /* Initial table load: acquire type of table. 
*/ + dm_set_md_type(md, dm_table_get_type(t)); + else if (dm_get_md_type(md) != dm_table_get_type(t)) { + DMWARN("can't change device type after initial table load."); dm_table_destroy(t); + dm_unlock_md_type(md); + r = -EINVAL; goto out; } - r = dm_table_alloc_md_mempools(t); + /* setup md->queue to reflect md's type (may block) */ + r = dm_setup_md_queue(md); if (r) { - DMWARN("unable to allocate mempools for this table"); + DMWARN("unable to set up device queue for new table."); dm_table_destroy(t); + dm_unlock_md_type(md); goto out; } + dm_unlock_md_type(md); + /* stage inactive table */ down_write(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { @@ -1186,7 +1195,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) up_write(&_hash_lock); param->flags |= DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(md, param); + __dev_status(md, param); out: dm_put(md); @@ -1196,7 +1205,6 @@ out: static int table_clear(struct dm_ioctl *param, size_t param_size) { - int r; struct hash_cell *hc; struct mapped_device *md; @@ -1216,11 +1224,12 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(hc->md, param); + __dev_status(hc->md, param); md = hc->md; up_write(&_hash_lock); dm_put(md); - return r; + + return 0; } /* @@ -1265,7 +1274,6 @@ static void retrieve_deps(struct dm_table *table, static int table_deps(struct dm_ioctl *param, size_t param_size) { - int r = 0; struct mapped_device *md; struct dm_table *table; @@ -1273,9 +1281,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); table = dm_get_live_or_inactive_table(md, param); if (table) { @@ -1283,9 +1289,9 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) dm_table_put(table); } - out: dm_put(md); - return r; + + return 0; } /* @@ -1294,7 +1300,6 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) */ static int table_status(struct dm_ioctl *param, size_t param_size) { - int r; struct mapped_device *md; struct dm_table *table; @@ -1302,9 +1307,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); table = dm_get_live_or_inactive_table(md, param); if (table) { @@ -1312,9 +1315,9 @@ static int table_status(struct dm_ioctl *param, size_t param_size) dm_table_put(table); } -out: dm_put(md); - return r; + + return 0; } /* @@ -1333,10 +1336,6 @@ static int target_message(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; - if (tmsg < (struct dm_target_msg *) param->data || invalid_str(tmsg->message, (void *) param + param_size)) { DMWARN("Invalid target message parameters."); @@ -1593,18 +1592,23 @@ static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u) #endif static const struct file_operations _ctl_fops = { + .open = nonseekable_open, .unlocked_ioctl = dm_ctl_ioctl, .compat_ioctl = dm_compat_ctl_ioctl, .owner = THIS_MODULE, + .llseek = noop_llseek, }; static struct miscdevice _dm_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = MAPPER_CTRL_MINOR, .name = DM_NAME, - .nodename = "mapper/control", + .nodename = DM_DIR "/" DM_CONTROL_NODE, .fops = &_ctl_fops }; +MODULE_ALIAS_MISCDEV(MAPPER_CTRL_MINOR); +MODULE_ALIAS("devname:" DM_DIR "/" DM_CONTROL_NODE); + /* * Create misc character device 
and link to DM_DIR/control. */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 9200dbf2391..3921e3bb43c 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ti->private = lc; return 0; @@ -73,7 +74,7 @@ static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) { struct linear_c *lc = ti->private; - return lc->start + (bi_sector - ti->begin); + return lc->start + dm_target_offset(ti, bi_sector); } static void linear_map_bio(struct dm_target *ti, struct bio *bio) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 5a08be0222d..33420e68d15 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc) .count = 0, }; - lc->io_req.bi_rw = WRITE_BARRIER; + lc->io_req.bi_rw = WRITE_FLUSH; return dm_io(&lc->io_req, 1, &null_location, NULL); } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 826bce7343b..487ecda90ad 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -706,6 +706,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, if (as->argc < nr_params) { ti->error = "not enough path parameters"; + r = -EINVAL; goto bad; } @@ -892,6 +893,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, } ti->num_flush_requests = 1; + ti->num_discard_requests = 1; return 0; @@ -1271,6 +1273,15 @@ static int do_end_io(struct multipath *m, struct request *clone, if (error == -EOPNOTSUPP) return error; + if (clone->cmd_flags & REQ_DISCARD) + /* + * Pass all discard request failures up. + * FIXME: only fail_path if the discard failed due to a + * transport problem. This requires precise understanding + * of the underlying failure (e.g. the SCSI sense). + */ + return error; + if (mpio->pgpath) fail_path(mpio->pgpath); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 74136262d65..19a59b041c2 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -259,7 +259,7 @@ static int mirror_flush(struct dm_target *ti) struct dm_io_region io[ms->nr_mirrors]; struct mirror *m; struct dm_io_request io_req = { - .bi_rw = WRITE_BARRIER, + .bi_rw = WRITE_FLUSH, .mem.type = DM_IO_KMEM, .mem.ptr.bvec = NULL, .client = ms->io_client, @@ -445,7 +445,7 @@ static sector_t map_sector(struct mirror *m, struct bio *bio) { if (unlikely(!bio->bi_size)) return 0; - return m->offset + (bio->bi_sector - m->ms->ti->begin); + return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector); } static void map_bio(struct mirror *m, struct bio *bio) @@ -629,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct dm_io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; struct dm_io_request io_req = { - .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), + .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), .mem.type = DM_IO_BVEC, .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, .notify.fn = write_callback, @@ -670,7 +670,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { bio_list_add(&sync, bio); continue; } @@ -1203,7 +1203,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, * We need to dec pending if this was a write. 
*/ if (rw == WRITE) { - if (likely(!bio_empty_barrier(bio))) + if (!(bio->bi_rw & REQ_FLUSH)) dm_rh_dec(ms->rh, map_context->ll); return error; } diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index bd5c58b2886..dad011aed0c 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -81,9 +81,9 @@ struct dm_region_hash { struct list_head failed_recovered_regions; /* - * If there was a barrier failure no regions can be marked clean. + * If there was a flush failure no regions can be marked clean. */ - int barrier_failure; + int flush_failure; void *context; sector_t target_begin; @@ -217,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create( INIT_LIST_HEAD(&rh->quiesced_regions); INIT_LIST_HEAD(&rh->recovered_regions); INIT_LIST_HEAD(&rh->failed_recovered_regions); - rh->barrier_failure = 0; + rh->flush_failure = 0; rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, sizeof(struct dm_region)); @@ -399,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) region_t region = dm_rh_bio_to_region(rh, bio); int recovering = 0; - if (bio_empty_barrier(bio)) { - rh->barrier_failure = 1; + if (bio->bi_rw & REQ_FLUSH) { + rh->flush_failure = 1; return; } @@ -524,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) { - if (bio_empty_barrier(bio)) + if (bio->bi_rw & REQ_FLUSH) continue; rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } @@ -555,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region) */ /* do nothing for DM_RH_NOSYNC */ - if (unlikely(rh->barrier_failure)) { + if (unlikely(rh->flush_failure)) { /* - * If a write barrier failed some time ago, we + * If a write flush failed some time ago, we * don't know whether or not this write made it * to the disk, so we must resync the device. */ diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index c097d8a4823..0b61792a278 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -266,7 +266,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, */ static chunk_t area_location(struct pstore *ps, chunk_t area) { - return 1 + ((ps->exceptions_per_area + 1) * area); + return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area); } /* @@ -687,7 +687,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, /* * Commit exceptions to disk. */ - if (ps->valid && area_io(ps, WRITE_BARRIER)) + if (ps->valid && area_io(ps, WRITE_FLUSH_FUA)) ps->valid = 0; /* @@ -780,8 +780,8 @@ static int persistent_commit_merge(struct dm_exception_store *store, * ps->current_area does not get reduced by prepare_merge() until * after commit_merge() has removed the nr_merged previous exceptions. 
*/ - ps->next_free = (area_location(ps, ps->current_area) - 1) + - (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS; + ps->next_free = area_location(ps, ps->current_area) + + ps->current_committed + 1; return 0; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 54853773510..53cf79d8bcb 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -148,6 +148,12 @@ struct dm_snapshot { #define RUNNING_MERGE 0 #define SHUTDOWN_MERGE 1 +struct dm_dev *dm_snap_origin(struct dm_snapshot *s) +{ + return s->origin; +} +EXPORT_SYMBOL(dm_snap_origin); + struct dm_dev *dm_snap_cow(struct dm_snapshot *s) { return s->cow; @@ -700,8 +706,6 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) return 0; } -#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) - /* * Return a minimum chunk size of all snapshots that have the specified origin. * Return zero if the origin has no snapshots. @@ -1065,10 +1069,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) origin_mode = FMODE_WRITE; } - origin_path = argv[0]; - argv++; - argc--; - s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) { ti->error = "Cannot allocate snapshot context private " @@ -1077,6 +1077,16 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + origin_path = argv[0]; + argv++; + argc--; + + r = dm_get_device(ti, origin_path, origin_mode, &s->origin); + if (r) { + ti->error = "Cannot get origin device"; + goto bad_origin; + } + cow_path = argv[0]; argv++; argc--; @@ -1097,12 +1107,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) argv += args_used; argc -= args_used; - r = dm_get_device(ti, origin_path, origin_mode, &s->origin); - if (r) { - ti->error = "Cannot get origin device"; - goto bad_origin; - } - s->ti = ti; s->valid = 1; s->active = 0; @@ -1212,15 +1216,15 @@ bad_kcopyd: dm_exception_table_exit(&s->complete, exception_cache); bad_hash_tables: - dm_put_device(ti, s->origin); - -bad_origin: dm_exception_store_destroy(s->store); bad_store: dm_put_device(ti, s->cow); bad_cow: + dm_put_device(ti, s->origin); + +bad_origin: kfree(s); bad: @@ -1314,12 +1318,12 @@ static void snapshot_dtr(struct dm_target *ti) mempool_destroy(s->pending_pool); - dm_put_device(ti, s->origin); - dm_exception_store_destroy(s->store); dm_put_device(ti, s->cow); + dm_put_device(ti, s->origin); + kfree(s); } @@ -1581,7 +1585,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { bio->bi_bdev = s->cow->bdev; return DM_MAPIO_REMAPPED; } @@ -1685,8 +1689,8 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, int r = DM_MAPIO_REMAPPED; chunk_t chunk; - if (unlikely(bio_empty_barrier(bio))) { - if (!map_context->flush_request) + if (bio->bi_rw & REQ_FLUSH) { + if (!map_context->target_request_nr) bio->bi_bdev = s->origin->bdev; else bio->bi_bdev = s->cow->bdev; @@ -1899,8 +1903,14 @@ static int snapshot_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dm_snapshot *snap = ti->private; + int r; + + r = fn(ti, snap->origin, 0, ti->len, data); + + if (!r) + r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data); - return fn(ti, snap->origin, 0, ti->len, data); + return r; } @@ -2123,7 +2133,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio, struct dm_dev *dev = ti->private; 
bio->bi_bdev = dev->bdev; - if (unlikely(bio_empty_barrier(bio))) + if (bio->bi_rw & REQ_FLUSH) return DM_MAPIO_REMAPPED; /* Only tell snapshots if this is a write */ @@ -2159,6 +2169,21 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, return 0; } +static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct dm_dev *dev = ti->private; + struct request_queue *q = bdev_get_queue(dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = dev->bdev; + bvm->bi_sector = bvm->bi_sector; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static int origin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -2176,6 +2201,7 @@ static struct target_type origin_target = { .map = origin_map, .resume = origin_resume, .status = origin_status, + .merge = origin_merge, .iterate_devices = origin_iterate_devices, }; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index d6e28d732b4..f0371b4c4fb 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -25,6 +25,8 @@ struct stripe { struct stripe_c { uint32_t stripes; + int stripes_shift; + sector_t stripes_mask; /* The size of this target / num. stripes */ sector_t stripe_width; @@ -162,16 +164,22 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Set pointer to dm target; used in trigger_event */ sc->ti = ti; - sc->stripes = stripes; sc->stripe_width = width; + + if (stripes & (stripes - 1)) + sc->stripes_shift = -1; + else { + sc->stripes_shift = ffs(stripes) - 1; + sc->stripes_mask = ((sector_t) stripes) - 1; + } + ti->split_io = chunk_size; ti->num_flush_requests = stripes; + ti->num_discard_requests = stripes; + sc->chunk_shift = ffs(chunk_size) - 1; sc->chunk_mask = ((sector_t) chunk_size) - 1; - for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) - chunk_size >>= 1; - sc->chunk_shift--; /* * Get the stripe destinations. 
@@ -207,26 +215,79 @@ static void stripe_dtr(struct dm_target *ti) kfree(sc); } +static void stripe_map_sector(struct stripe_c *sc, sector_t sector, + uint32_t *stripe, sector_t *result) +{ + sector_t offset = dm_target_offset(sc->ti, sector); + sector_t chunk = offset >> sc->chunk_shift; + + if (sc->stripes_shift < 0) + *stripe = sector_div(chunk, sc->stripes); + else { + *stripe = chunk & sc->stripes_mask; + chunk >>= sc->stripes_shift; + } + + *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); +} + +static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, + uint32_t target_stripe, sector_t *result) +{ + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, result); + if (stripe == target_stripe) + return; + *result &= ~sc->chunk_mask; /* round down */ + if (target_stripe < stripe) + *result += sc->chunk_mask + 1; /* next chunk */ +} + +static int stripe_map_discard(struct stripe_c *sc, struct bio *bio, + uint32_t target_stripe) +{ + sector_t begin, end; + + stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); + stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio), + target_stripe, &end); + if (begin < end) { + bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; + bio->bi_sector = begin + sc->stripe[target_stripe].physical_start; + bio->bi_size = to_bytes(end - begin); + return DM_MAPIO_REMAPPED; + } else { + /* The range doesn't map to the target stripe */ + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } +} + static int stripe_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { - struct stripe_c *sc = (struct stripe_c *) ti->private; - sector_t offset, chunk; + struct stripe_c *sc = ti->private; uint32_t stripe; + unsigned target_request_nr; - if (unlikely(bio_empty_barrier(bio))) { - BUG_ON(map_context->flush_request >= sc->stripes); - bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev; + if (bio->bi_rw & REQ_FLUSH) { + target_request_nr = map_context->target_request_nr; + BUG_ON(target_request_nr >= sc->stripes); + bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; return DM_MAPIO_REMAPPED; } + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + target_request_nr = map_context->target_request_nr; + BUG_ON(target_request_nr >= sc->stripes); + return stripe_map_discard(sc, bio, target_request_nr); + } - offset = bio->bi_sector - ti->begin; - chunk = offset >> sc->chunk_shift; - stripe = sector_div(chunk, sc->stripes); + stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); + bio->bi_sector += sc->stripe[stripe].physical_start; bio->bi_bdev = sc->stripe[stripe].dev->bdev; - bio->bi_sector = sc->stripe[stripe].physical_start + - (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); + return DM_MAPIO_REMAPPED; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9924ea23032..90267f8d64e 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -54,6 +54,8 @@ struct dm_table { sector_t *highs; struct dm_target *targets; + unsigned discards_supported:1; + /* * Indicates the rw permissions for the new logical * device. 
This should be a combination of FMODE_READ @@ -203,6 +205,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); atomic_set(&t->holders, 0); + t->discards_supported = 1; if (!num_targets) num_targets = KEYS_PER_NODE; @@ -245,7 +248,7 @@ void dm_table_destroy(struct dm_table *t) msleep(1); smp_mb(); - /* free the indexes (see dm_table_complete) */ + /* free the indexes */ if (t->depth >= 2) vfree(t->index[t->depth - 2]); @@ -483,11 +486,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, return 0; } -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -770,6 +768,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + if (!tgt->num_discard_requests) + t->discards_supported = 0; + return 0; bad: @@ -778,7 +779,7 @@ int dm_table_add_target(struct dm_table *t, const char *type, return r; } -int dm_table_set_type(struct dm_table *t) +static int dm_table_set_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0; @@ -900,7 +901,7 @@ static int setup_indexes(struct dm_table *t) /* * Builds the btree to index the map. */ -int dm_table_complete(struct dm_table *t) +static int dm_table_build_index(struct dm_table *t) { int r = 0; unsigned int leaf_nodes; @@ -919,6 +920,55 @@ int dm_table_complete(struct dm_table *t) return r; } +/* + * Register the mapped device for blk_integrity support if + * the underlying devices support it. + */ +static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *dd; + + list_for_each_entry(dd, devices, list) + if (bdev_get_integrity(dd->dm_dev.bdev)) + return blk_integrity_register(dm_disk(md), NULL); + + return 0; +} + +/* + * Prepares the table for use by building the indices, + * setting the type, and allocating mempools. 
+ */ +int dm_table_complete(struct dm_table *t) +{ + int r; + + r = dm_table_set_type(t); + if (r) { + DMERR("unable to set table type"); + return r; + } + + r = dm_table_build_index(t); + if (r) { + DMERR("unable to build btrees"); + return r; + } + + r = dm_table_prealloc_integrity(t, t->md); + if (r) { + DMERR("could not register integrity profile."); + return r; + } + + r = dm_table_alloc_md_mempools(t); + if (r) + DMERR("unable to allocate mempools"); + + return r; +} + static DEFINE_MUTEX(_event_lock); void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context) @@ -1086,6 +1136,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); + if (!dm_table_supports_discards(t)) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + dm_table_set_integrity(t); /* @@ -1232,6 +1287,39 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) return t->md; } +static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && blk_queue_discard(q); +} + +bool dm_table_supports_discards(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + if (!t->discards_supported) + return 0; + + /* + * Ensure that at least one underlying device supports discards. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting discard must provide. + */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_discard_capable, NULL)) + return 1; + } + + return 0; +} + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 11dea11dc0b..8da366cf381 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -113,6 +113,11 @@ void dm_unregister_target(struct target_type *tt) */ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) { + /* + * Return error for discards instead of -EOPNOTSUPP + */ + tt->num_discard_requests = 1; + return 0; } diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index bbc97030c0c..cc2b3cb8194 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -22,6 +22,11 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } + /* + * Silently drop discards, avoiding -EOPNOTSUPP. + */ + ti->num_discard_requests = 1; + return 0; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a3f21dc02bd..7cb1352f7e7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -15,11 +15,11 @@ #include <linux/blkpg.h> #include <linux/bio.h> #include <linux/buffer_head.h> -#include <linux/smp_lock.h> #include <linux/mempool.h> #include <linux/slab.h> #include <linux/idr.h> #include <linux/hdreg.h> +#include <linux/delay.h> #include <trace/events/block.h> @@ -32,6 +32,7 @@ #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" #define DM_COOKIE_LENGTH 24 +static DEFINE_MUTEX(dm_mutex); static const char *_name = DM_NAME; static unsigned int major = 0; @@ -109,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 -#define DMF_QUEUE_IO_TO_THREAD 6 /* * Work processed by per-device workqueue. 
@@ -124,6 +124,10 @@ struct mapped_device { unsigned long flags; struct request_queue *queue; + unsigned type; + /* Protect queue and type against concurrent access. */ + struct mutex type_lock; + struct gendisk *disk; char name[16]; @@ -139,24 +143,9 @@ struct mapped_device { spinlock_t deferred_lock; /* - * An error from the barrier request currently being processed. - */ - int barrier_error; - - /* - * Protect barrier_error from concurrent endio processing - * in request-based dm. - */ - spinlock_t barrier_error_lock; - - /* - * Processing queue (flush/barriers) + * Processing queue (flush) */ struct workqueue_struct *wq; - struct work_struct barrier_work; - - /* A pointer to the currently processing pre/post flush request */ - struct request *flush_request; /* * The current mapping. @@ -195,8 +184,8 @@ struct mapped_device { /* sysfs handle */ struct kobject kobj; - /* zero-length barrier that will be cloned and submitted to targets */ - struct bio barrier_bio; + /* zero-length flush that will be cloned and submitted to targets */ + struct bio flush_bio; }; /* @@ -339,7 +328,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) { struct mapped_device *md; - lock_kernel(); + mutex_lock(&dm_mutex); spin_lock(&_minor_lock); md = bdev->bd_disk->private_data; @@ -357,7 +346,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) out: spin_unlock(&_minor_lock); - unlock_kernel(); + mutex_unlock(&dm_mutex); return md ? 0 : -ENXIO; } @@ -366,10 +355,10 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode) { struct mapped_device *md = disk->private_data; - lock_kernel(); + mutex_lock(&dm_mutex); atomic_dec(&md->open_count); dm_put(md); - unlock_kernel(); + mutex_unlock(&dm_mutex); return 0; } @@ -507,7 +496,7 @@ static void end_io_acct(struct dm_io *io) /* * After this is decremented the bio must not be touched if it is - * a barrier. + * a flush. */ dm_disk(md)->part0.in_flight[rw] = pending = atomic_dec_return(&md->pending[rw]); @@ -523,16 +512,12 @@ static void end_io_acct(struct dm_io *io) */ static void queue_io(struct mapped_device *md, struct bio *bio) { - down_write(&md->io_lock); + unsigned long flags; - spin_lock_irq(&md->deferred_lock); + spin_lock_irqsave(&md->deferred_lock, flags); bio_list_add(&md->deferred, bio); - spin_unlock_irq(&md->deferred_lock); - - if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) - queue_work(md->wq, &md->work); - - up_write(&md->io_lock); + spin_unlock_irqrestore(&md->deferred_lock, flags); + queue_work(md->wq, &md->work); } /* @@ -620,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error) * Target requested pushing back the I/O. */ spin_lock_irqsave(&md->deferred_lock, flags); - if (__noflush_suspending(md)) { - if (!(io->bio->bi_rw & REQ_HARDBARRIER)) - bio_list_add_head(&md->deferred, - io->bio); - } else + if (__noflush_suspending(md)) + bio_list_add_head(&md->deferred, io->bio); + else /* noflush suspend was interrupted. */ io->error = -EIO; spin_unlock_irqrestore(&md->deferred_lock, flags); @@ -632,26 +615,23 @@ static void dec_pending(struct dm_io *io, int error) io_error = io->error; bio = io->bio; + end_io_acct(io); + free_io(md, io); + + if (io_error == DM_ENDIO_REQUEUE) + return; - if (bio->bi_rw & REQ_HARDBARRIER) { + if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { /* - * There can be just one barrier request so we use - * a per-device variable for error reporting. 
- * Note that you can't touch the bio after end_io_acct + * Preflush done for flush with data, reissue + * without REQ_FLUSH. */ - if (!md->barrier_error && io_error != -EOPNOTSUPP) - md->barrier_error = io_error; - end_io_acct(io); - free_io(md, io); + bio->bi_rw &= ~REQ_FLUSH; + queue_io(md, bio); } else { - end_io_acct(io); - free_io(md, io); - - if (io_error != DM_ENDIO_REQUEUE) { - trace_block_bio_complete(md->queue, bio); - - bio_endio(bio, io_error); - } + /* done with normal IO or empty flush */ + trace_block_bio_complete(md->queue, bio); + bio_endio(bio, io_error); } } } @@ -744,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error) blk_update_request(tio->orig, 0, nr_bytes); } -static void store_barrier_error(struct mapped_device *md, int error) -{ - unsigned long flags; - - spin_lock_irqsave(&md->barrier_error_lock, flags); - /* - * Basically, the first error is taken, but: - * -EOPNOTSUPP supersedes any I/O error. - * Requeue request supersedes any I/O error but -EOPNOTSUPP. - */ - if (!md->barrier_error || error == -EOPNOTSUPP || - (md->barrier_error != -EOPNOTSUPP && - error == DM_ENDIO_REQUEUE)) - md->barrier_error = error; - spin_unlock_irqrestore(&md->barrier_error_lock, flags); -} - /* * Don't touch any member of the md after calling this function because * the md may be freed in dm_put() at the end of this function. @@ -798,13 +761,11 @@ static void free_rq_clone(struct request *clone) static void dm_end_request(struct request *clone, int error) { int rw = rq_data_dir(clone); - int run_queue = 1; - bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER; struct dm_rq_target_io *tio = clone->end_io_data; struct mapped_device *md = tio->md; struct request *rq = tio->orig; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) { + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { rq->errors = clone->errors; rq->resid_len = clone->resid_len; @@ -818,15 +779,8 @@ static void dm_end_request(struct request *clone, int error) } free_rq_clone(clone); - - if (unlikely(is_barrier)) { - if (unlikely(error)) - store_barrier_error(md, error); - run_queue = 0; - } else - blk_end_request_all(rq, error); - - rq_completed(md, rw, run_queue); + blk_end_request_all(rq, error); + rq_completed(md, rw, true); } static void dm_unprep_request(struct request *rq) @@ -851,16 +805,6 @@ void dm_requeue_unmapped_request(struct request *clone) struct request_queue *q = rq->q; unsigned long flags; - if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { - /* - * Barrier clones share an original request. - * Leave it to dm_end_request(), which handles this special - * case. - */ - dm_end_request(clone, DM_ENDIO_REQUEUE); - return; - } - dm_unprep_request(rq); spin_lock_irqsave(q->queue_lock, flags); @@ -950,19 +894,6 @@ static void dm_complete_request(struct request *clone, int error) struct dm_rq_target_io *tio = clone->end_io_data; struct request *rq = tio->orig; - if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { - /* - * Barrier clones share an original request. So can't use - * softirq_done with the original. - * Pass the clone to dm_done() directly in this special case. - * It is safe (even if clone->q->queue_lock is held here) - * because there is no I/O dispatching during the completion - * of barrier clone. 
- */ - dm_done(clone, error, true); - return; - } - tio->error = error; rq->completion_data = clone; blk_complete_request(rq); @@ -979,17 +910,6 @@ void dm_kill_unmapped_request(struct request *clone, int error) struct dm_rq_target_io *tio = clone->end_io_data; struct request *rq = tio->orig; - if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { - /* - * Barrier clones share an original request. - * Leave it to dm_end_request(), which handles this special - * case. - */ - BUG_ON(error > 0); - dm_end_request(clone, error); - return; - } - rq->cmd_flags |= REQ_FAILED; dm_complete_request(clone, error); } @@ -1019,17 +939,27 @@ static void end_clone_request(struct request *clone, int error) dm_complete_request(clone, error); } -static sector_t max_io_len(struct mapped_device *md, - sector_t sector, struct dm_target *ti) +/* + * Return maximum size of I/O possible at the supplied sector up to the current + * target boundary. + */ +static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) { - sector_t offset = sector - ti->begin; - sector_t len = ti->len - offset; + sector_t target_offset = dm_target_offset(ti, sector); + + return ti->len - target_offset; +} + +static sector_t max_io_len(sector_t sector, struct dm_target *ti) +{ + sector_t len = max_io_len_target_boundary(sector, ti); /* * Does the target need to split even further ? */ if (ti->split_io) { sector_t boundary; + sector_t offset = dm_target_offset(ti, sector); boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) - offset; if (len > boundary) @@ -1098,7 +1028,7 @@ static void dm_bio_destructor(struct bio *bio) } /* - * Creates a little bio that is just does part of a bvec. + * Creates a little bio that just does part of a bvec. */ static struct bio *split_bvec(struct bio *bio, sector_t sector, unsigned short idx, unsigned int offset, @@ -1113,7 +1043,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, clone->bi_sector = sector; clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; + clone->bi_rw = bio->bi_rw; clone->bi_vcnt = 1; clone->bi_size = to_bytes(len); clone->bi_io_vec->bv_offset = offset; @@ -1140,7 +1070,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); __bio_clone(clone, bio); - clone->bi_rw &= ~REQ_HARDBARRIER; clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; @@ -1171,32 +1100,91 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, return tio; } -static void __flush_target(struct clone_info *ci, struct dm_target *ti, - unsigned flush_nr) +static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, + unsigned request_nr, sector_t len) { struct dm_target_io *tio = alloc_tio(ci, ti); struct bio *clone; - tio->info.flush_request = flush_nr; + tio->info.target_request_nr = request_nr; - clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); + /* + * Discard requests require the bio's inline iovecs be initialized. + * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush + * and discard, so no need for concern about wasted bvec allocations. 
+ */ + clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); __bio_clone(clone, ci->bio); clone->bi_destructor = dm_bio_destructor; + if (len) { + clone->bi_sector = ci->sector; + clone->bi_size = to_bytes(len); + } __map_bio(ti, clone, tio); } -static int __clone_and_map_empty_barrier(struct clone_info *ci) +static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, + unsigned num_requests, sector_t len) { - unsigned target_nr = 0, flush_nr; + unsigned request_nr; + + for (request_nr = 0; request_nr < num_requests; request_nr++) + __issue_target_request(ci, ti, request_nr, len); +} + +static int __clone_and_map_empty_flush(struct clone_info *ci) +{ + unsigned target_nr = 0; struct dm_target *ti; + BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) - for (flush_nr = 0; flush_nr < ti->num_flush_requests; - flush_nr++) - __flush_target(ci, ti, flush_nr); + __issue_target_requests(ci, ti, ti->num_flush_requests, 0); + return 0; +} + +/* + * Perform all io with a single clone. + */ +static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) +{ + struct bio *clone, *bio = ci->bio; + struct dm_target_io *tio; + + tio = alloc_tio(ci, ti); + clone = clone_bio(bio, ci->sector, ci->idx, + bio->bi_vcnt - ci->idx, ci->sector_count, + ci->md->bs); + __map_bio(ti, clone, tio); ci->sector_count = 0; +} + +static int __clone_and_map_discard(struct clone_info *ci) +{ + struct dm_target *ti; + sector_t len; + + do { + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; + + /* + * Even though the device advertised discard support, + * reconfiguration might have changed that since the + * check was performed. + */ + if (!ti->num_discard_requests) + return -EOPNOTSUPP; + + len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + + __issue_target_requests(ci, ti, ti->num_discard_requests, len); + + ci->sector += len; + } while (ci->sector_count -= len); return 0; } @@ -1208,30 +1196,21 @@ static int __clone_and_map(struct clone_info *ci) sector_t len = 0, max; struct dm_target_io *tio; - if (unlikely(bio_empty_barrier(bio))) - return __clone_and_map_empty_barrier(ci); + if (unlikely(bio->bi_rw & REQ_DISCARD)) + return __clone_and_map_discard(ci); ti = dm_table_find_target(ci->map, ci->sector); if (!dm_target_is_valid(ti)) return -EIO; - max = max_io_len(ci->md, ci->sector, ti); - - /* - * Allocate a target io object. - */ - tio = alloc_tio(ci, ti); + max = max_io_len(ci->sector, ti); if (ci->sector_count <= max) { /* * Optimise for the simple case where we can do all of * the remaining io with a single clone. 
*/ - clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count, - ci->md->bs); - __map_bio(ti, clone, tio); - ci->sector_count = 0; + __clone_and_map_simple(ci, ti); } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { /* @@ -1252,6 +1231,7 @@ static int __clone_and_map(struct clone_info *ci) len += bv_len; } + tio = alloc_tio(ci, ti); clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, ci->md->bs); __map_bio(ti, clone, tio); @@ -1274,13 +1254,12 @@ static int __clone_and_map(struct clone_info *ci) if (!dm_target_is_valid(ti)) return -EIO; - max = max_io_len(ci->md, ci->sector, ti); - - tio = alloc_tio(ci, ti); + max = max_io_len(ci->sector, ti); } len = min(remaining, max); + tio = alloc_tio(ci, ti); clone = split_bvec(bio, ci->sector, ci->idx, bv->bv_offset + offset, len, ci->md->bs); @@ -1308,16 +1287,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.map = dm_get_live_table(md); if (unlikely(!ci.map)) { - if (!(bio->bi_rw & REQ_HARDBARRIER)) - bio_io_error(bio); - else - if (!md->barrier_error) - md->barrier_error = -EIO; + bio_io_error(bio); return; } ci.md = md; - ci.bio = bio; ci.io = alloc_io(md); ci.io->error = 0; atomic_set(&ci.io->io_count, 1); @@ -1325,14 +1299,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.io->md = md; spin_lock_init(&ci.io->endio_lock); ci.sector = bio->bi_sector; - ci.sector_count = bio_sectors(bio); - if (unlikely(bio_empty_barrier(bio))) - ci.sector_count = 1; ci.idx = bio->bi_idx; start_io_acct(ci.io); - while (ci.sector_count && !error) - error = __clone_and_map(&ci); + if (bio->bi_rw & REQ_FLUSH) { + ci.bio = &ci.md->flush_bio; + ci.sector_count = 0; + error = __clone_and_map_empty_flush(&ci); + /* dec_pending submits any data associated with flush */ + } else { + ci.bio = bio; + ci.sector_count = bio_sectors(bio); + while (ci.sector_count && !error) + error = __clone_and_map(&ci); + } /* drop the extra reference count */ dec_pending(ci.io, error); @@ -1362,7 +1342,7 @@ static int dm_merge_bvec(struct request_queue *q, /* * Find maximum amount of I/O that won't need splitting */ - max_sectors = min(max_io_len(md, bvm->bi_sector, ti), + max_sectors = min(max_io_len(bvm->bi_sector, ti), (sector_t) BIO_MAX_SECTORS); max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; if (max_size < 0) @@ -1416,22 +1396,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio) part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); part_stat_unlock(); - /* - * If we're suspended or the thread is processing barriers - * we have to queue this io for later. 
- */ - if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || - unlikely(bio->bi_rw & REQ_HARDBARRIER)) { + /* if we're suspended, we have to queue this io for later */ + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { up_read(&md->io_lock); - if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && - bio_rw(bio) == READA) { + if (bio_rw(bio) != READA) + queue_io(md, bio); + else bio_io_error(bio); - return 0; - } - - queue_io(md, bio); - return 0; } @@ -1462,14 +1434,6 @@ static int dm_request(struct request_queue *q, struct bio *bio) return _dm_request(q, bio); } -static bool dm_rq_is_flush_request(struct request *rq) -{ - if (rq->cmd_flags & REQ_FLUSH) - return true; - else - return false; -} - void dm_dispatch_request(struct request *rq) { int r; @@ -1517,22 +1481,15 @@ static int setup_clone(struct request *clone, struct request *rq, { int r; - if (dm_rq_is_flush_request(rq)) { - blk_rq_init(NULL, clone); - clone->cmd_type = REQ_TYPE_FS; - clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); - } else { - r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, - dm_rq_bio_constructor, tio); - if (r) - return r; - - clone->cmd = rq->cmd; - clone->cmd_len = rq->cmd_len; - clone->sense = rq->sense; - clone->buffer = rq->buffer; - } + r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + dm_rq_bio_constructor, tio); + if (r) + return r; + clone->cmd = rq->cmd; + clone->cmd_len = rq->cmd_len; + clone->sense = rq->sense; + clone->buffer = rq->buffer; clone->end_io = end_clone_request; clone->end_io_data = tio; @@ -1573,9 +1530,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) struct mapped_device *md = q->queuedata; struct request *clone; - if (unlikely(dm_rq_is_flush_request(rq))) - return BLKPREP_OK; - if (unlikely(rq->special)) { DMWARN("Already has something in rq->special."); return BLKPREP_KILL; @@ -1652,6 +1606,7 @@ static void dm_request_fn(struct request_queue *q) struct dm_table *map = dm_get_live_table(md); struct dm_target *ti; struct request *rq, *clone; + sector_t pos; /* * For suspend, check blk_queue_stopped() and increment @@ -1664,15 +1619,14 @@ static void dm_request_fn(struct request_queue *q) if (!rq) goto plug_and_out; - if (unlikely(dm_rq_is_flush_request(rq))) { - BUG_ON(md->flush_request); - md->flush_request = rq; - blk_start_request(rq); - queue_work(md->wq, &md->barrier_work); - goto out; - } + /* always use block 0 to find the target for flushes for now */ + pos = 0; + if (!(rq->cmd_flags & REQ_FLUSH)) + pos = blk_rq_pos(rq); + + ti = dm_table_find_target(map, pos); + BUG_ON(!dm_target_is_valid(ti)); - ti = dm_table_find_target(map, blk_rq_pos(rq)); if (ti->type->busy && ti->type->busy(ti)) goto plug_and_out; @@ -1843,7 +1797,29 @@ out: static const struct block_device_operations dm_blk_dops; static void dm_wq_work(struct work_struct *work); -static void dm_rq_barrier_work(struct work_struct *work); + +static void dm_init_md_queue(struct mapped_device *md) +{ + /* + * Request-based dm devices cannot be stacked on top of bio-based dm + * devices. The type of this dm device has not been decided yet. + * The type is decided at the first table loading time. + * To prevent problematic device stacking, clear the queue flag + * for request stacking support until then. + * + * This queue is new, so no concurrency on the queue_flags. 
+ */ + queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); + + md->queue->queuedata = md; + md->queue->backing_dev_info.congested_fn = dm_any_congested; + md->queue->backing_dev_info.congested_data = md; + blk_queue_make_request(md->queue, dm_request); + blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); + md->queue->unplug_fn = dm_unplug_all; + blk_queue_merge_bvec(md->queue, dm_merge_bvec); + blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); +} /* * Allocate and initialise a blank device with a given minor. @@ -1870,10 +1846,11 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_minor; + md->type = DM_TYPE_NONE; init_rwsem(&md->io_lock); mutex_init(&md->suspend_lock); + mutex_init(&md->type_lock); spin_lock_init(&md->deferred_lock); - spin_lock_init(&md->barrier_error_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); @@ -1882,33 +1859,11 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->uevent_list); spin_lock_init(&md->uevent_lock); - md->queue = blk_init_queue(dm_request_fn, NULL); + md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) goto bad_queue; - /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device has not been decided yet, - * although we initialized the queue using blk_init_queue(). - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. - */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - md->saved_make_request_fn = md->queue->make_request_fn; - md->queue->queuedata = md; - md->queue->backing_dev_info.congested_fn = dm_any_congested; - md->queue->backing_dev_info.congested_data = md; - blk_queue_make_request(md->queue, dm_request); - blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); - md->queue->unplug_fn = dm_unplug_all; - blk_queue_merge_bvec(md->queue, dm_merge_bvec); - blk_queue_softirq_done(md->queue, dm_softirq_done); - blk_queue_prep_rq(md->queue, dm_prep_fn); - blk_queue_lld_busy(md->queue, dm_lld_busy); - blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH); + dm_init_md_queue(md); md->disk = alloc_disk(1); if (!md->disk) @@ -1918,7 +1873,6 @@ static struct mapped_device *alloc_dev(int minor) atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); - INIT_WORK(&md->barrier_work, dm_rq_barrier_work); init_waitqueue_head(&md->eventq); md->disk->major = _major; @@ -1938,6 +1892,10 @@ static struct mapped_device *alloc_dev(int minor) if (!md->bdev) goto bad_bdev; + bio_init(&md->flush_bio); + md->flush_bio.bi_bdev = md->bdev; + md->flush_bio.bi_rw = WRITE_FLUSH; + /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); @@ -2123,6 +2081,71 @@ int dm_create(int minor, struct mapped_device **result) return 0; } +/* + * Functions to manage md->type. + * All are required to hold md->type_lock. + */ +void dm_lock_md_type(struct mapped_device *md) +{ + mutex_lock(&md->type_lock); +} + +void dm_unlock_md_type(struct mapped_device *md) +{ + mutex_unlock(&md->type_lock); +} + +void dm_set_md_type(struct mapped_device *md, unsigned type) +{ + md->type = type; +} + +unsigned dm_get_md_type(struct mapped_device *md) +{ + return md->type; +} + +/* + * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
+ */ +static int dm_init_request_based_queue(struct mapped_device *md) +{ + struct request_queue *q = NULL; + + if (md->queue->elevator) + return 1; + + /* Fully initialize the queue */ + q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); + if (!q) + return 0; + + md->queue = q; + md->saved_make_request_fn = md->queue->make_request_fn; + dm_init_md_queue(md); + blk_queue_softirq_done(md->queue, dm_softirq_done); + blk_queue_prep_rq(md->queue, dm_prep_fn); + blk_queue_lld_busy(md->queue, dm_lld_busy); + + elv_register_queue(md->queue); + + return 1; +} + +/* + * Setup the DM device's queue based on md's type + */ +int dm_setup_md_queue(struct mapped_device *md) +{ + if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && + !dm_init_request_based_queue(md)) { + DMWARN("Cannot initialize queue for request-based mapped device"); + return -EINVAL; + } + + return 0; +} + static struct mapped_device *dm_find_md(dev_t dev) { struct mapped_device *md; @@ -2136,6 +2159,7 @@ static struct mapped_device *dm_find_md(dev_t dev) md = idr_find(&_minor_idr, minor); if (md && (md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || + dm_deleting_md(md) || test_bit(DMF_FREEING, &md->flags))) { md = NULL; goto out; @@ -2170,6 +2194,7 @@ void dm_set_mdptr(struct mapped_device *md, void *ptr) void dm_get(struct mapped_device *md) { atomic_inc(&md->holders); + BUG_ON(test_bit(DMF_FREEING, &md->flags)); } const char *dm_device_name(struct mapped_device *md) @@ -2178,27 +2203,55 @@ const char *dm_device_name(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_device_name); -void dm_put(struct mapped_device *md) +static void __dm_destroy(struct mapped_device *md, bool wait) { struct dm_table *map; - BUG_ON(test_bit(DMF_FREEING, &md->flags)); + might_sleep(); - if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { - map = dm_get_live_table(md); - idr_replace(&_minor_idr, MINOR_ALLOCED, - MINOR(disk_devt(dm_disk(md)))); - set_bit(DMF_FREEING, &md->flags); - spin_unlock(&_minor_lock); - if (!dm_suspended_md(md)) { - dm_table_presuspend_targets(map); - dm_table_postsuspend_targets(map); - } - dm_sysfs_exit(md); - dm_table_put(map); - dm_table_destroy(__unbind(md)); - free_dev(md); + spin_lock(&_minor_lock); + map = dm_get_live_table(md); + idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); + set_bit(DMF_FREEING, &md->flags); + spin_unlock(&_minor_lock); + + if (!dm_suspended_md(md)) { + dm_table_presuspend_targets(map); + dm_table_postsuspend_targets(map); } + + /* + * Rare, but there may be I/O requests still going to complete, + * for example. Wait for all references to disappear. + * No one should increment the reference count of the mapped_device, + * after the mapped_device state becomes DMF_FREEING. + */ + if (wait) + while (atomic_read(&md->holders)) + msleep(1); + else if (atomic_read(&md->holders)) + DMWARN("%s: Forcibly removing mapped_device still in use! 
(%d users)", + dm_device_name(md), atomic_read(&md->holders)); + + dm_sysfs_exit(md); + dm_table_put(map); + dm_table_destroy(__unbind(md)); + free_dev(md); +} + +void dm_destroy(struct mapped_device *md) +{ + __dm_destroy(md, true); +} + +void dm_destroy_immediate(struct mapped_device *md) +{ + __dm_destroy(md, false); +} + +void dm_put(struct mapped_device *md) +{ + atomic_dec(&md->holders); } EXPORT_SYMBOL_GPL(dm_put); @@ -2233,38 +2286,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) return r; } -static void dm_flush(struct mapped_device *md) -{ - dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); - - bio_init(&md->barrier_bio); - md->barrier_bio.bi_bdev = md->bdev; - md->barrier_bio.bi_rw = WRITE_BARRIER; - __split_and_process_bio(md, &md->barrier_bio); - - dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); -} - -static void process_barrier(struct mapped_device *md, struct bio *bio) -{ - md->barrier_error = 0; - - dm_flush(md); - - if (!bio_empty_barrier(bio)) { - __split_and_process_bio(md, bio); - dm_flush(md); - } - - if (md->barrier_error != DM_ENDIO_REQUEUE) - bio_endio(bio, md->barrier_error); - else { - spin_lock_irq(&md->deferred_lock); - bio_list_add_head(&md->deferred, bio); - spin_unlock_irq(&md->deferred_lock); - } -} - /* * Process the deferred bios */ @@ -2274,33 +2295,27 @@ static void dm_wq_work(struct work_struct *work) work); struct bio *c; - down_write(&md->io_lock); + down_read(&md->io_lock); while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { spin_lock_irq(&md->deferred_lock); c = bio_list_pop(&md->deferred); spin_unlock_irq(&md->deferred_lock); - if (!c) { - clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); + if (!c) break; - } - up_write(&md->io_lock); + up_read(&md->io_lock); if (dm_request_based(md)) generic_make_request(c); - else { - if (c->bi_rw & REQ_HARDBARRIER) - process_barrier(md, c); - else - __split_and_process_bio(md, c); - } + else + __split_and_process_bio(md, c); - down_write(&md->io_lock); + down_read(&md->io_lock); } - up_write(&md->io_lock); + up_read(&md->io_lock); } static void dm_queue_flush(struct mapped_device *md) @@ -2310,73 +2325,6 @@ static void dm_queue_flush(struct mapped_device *md) queue_work(md->wq, &md->work); } -static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr) -{ - struct dm_rq_target_io *tio = clone->end_io_data; - - tio->info.flush_request = flush_nr; -} - -/* Issue barrier requests to targets and wait for their completion. */ -static int dm_rq_barrier(struct mapped_device *md) -{ - int i, j; - struct dm_table *map = dm_get_live_table(md); - unsigned num_targets = dm_table_get_num_targets(map); - struct dm_target *ti; - struct request *clone; - - md->barrier_error = 0; - - for (i = 0; i < num_targets; i++) { - ti = dm_table_get_target(map, i); - for (j = 0; j < ti->num_flush_requests; j++) { - clone = clone_rq(md->flush_request, md, GFP_NOIO); - dm_rq_set_flush_nr(clone, j); - atomic_inc(&md->pending[rq_data_dir(clone)]); - map_request(ti, clone, md); - } - } - - dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); - dm_table_put(map); - - return md->barrier_error; -} - -static void dm_rq_barrier_work(struct work_struct *work) -{ - int error; - struct mapped_device *md = container_of(work, struct mapped_device, - barrier_work); - struct request_queue *q = md->queue; - struct request *rq; - unsigned long flags; - - /* - * Hold the md reference here and leave it at the last part so that - * the md can't be deleted by device opener when the barrier request - * completes. 
- */ - dm_get(md); - - error = dm_rq_barrier(md); - - rq = md->flush_request; - md->flush_request = NULL; - - if (error == DM_ENDIO_REQUEUE) { - spin_lock_irqsave(q->queue_lock, flags); - blk_requeue_request(q, rq); - spin_unlock_irqrestore(q->queue_lock, flags); - } else - blk_end_request_all(rq, error); - - blk_run_queue(q); - - dm_put(md); -} - /* * Swap in a new table, returning the old one for the caller to destroy. */ @@ -2398,13 +2346,6 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) goto out; } - /* cannot change the device type, once a table is bound */ - if (md->map && - (dm_table_get_type(md->map) != dm_table_get_type(table))) { - DMWARN("can't change the device type after a table is bound"); - goto out; - } - map = __bind(md, table, &limits); out: @@ -2506,23 +2447,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * * To get all processes out of __split_and_process_bio in dm_request, * we take the write lock. To prevent any process from reentering - * __split_and_process_bio from dm_request, we set - * DMF_QUEUE_IO_TO_THREAD. - * - * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND - * and call flush_workqueue(md->wq). flush_workqueue will wait until - * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any - * further calls to __split_and_process_bio from dm_wq_work. + * __split_and_process_bio from dm_request and quiesce the thread + * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call + * flush_workqueue(md->wq). */ down_write(&md->io_lock); set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); up_write(&md->io_lock); /* - * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which - * can be kicked until md->queue is stopped. So stop md->queue before - * flushing md->wq. + * Stop md->queue before flushing md->wq in case request-based + * dm defers requests to md->wq from md->queue. */ if (dm_request_based(md)) stop_queue(md->queue); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index bad1724d486..0c2dd5f4af7 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -59,13 +59,20 @@ void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); int dm_table_any_busy_target(struct dm_table *t); -int dm_table_set_type(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); +bool dm_table_supports_discards(struct dm_table *t); int dm_table_alloc_md_mempools(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); +void dm_lock_md_type(struct mapped_device *md); +void dm_unlock_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, unsigned type); +unsigned dm_get_md_type(struct mapped_device *md); + +int dm_setup_md_queue(struct mapped_device *md); + /* * To check the return value from dm_table_find_target(). 
*/ @@ -122,6 +129,11 @@ void dm_linear_exit(void); int dm_stripe_init(void); void dm_stripe_exit(void); +/* + * mapped_device operations + */ +void dm_destroy(struct mapped_device *md); +void dm_destroy_immediate(struct mapped_device *md); int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index ba19060bcf3..8a2f767f26d 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio) dev_info_t *tmp_dev; sector_t start_sector; - if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 11567c7999a..225815197a3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -36,7 +36,7 @@ #include <linux/blkdev.h> #include <linux/sysctl.h> #include <linux/seq_file.h> -#include <linux/smp_lock.h> +#include <linux/mutex.h> #include <linux/buffer_head.h> /* for invalidate_bdev */ #include <linux/poll.h> #include <linux/ctype.h> @@ -57,6 +57,7 @@ #define DEBUG 0 #define dprintk(x...) ((void)(DEBUG && printk(x))) +static DEFINE_MUTEX(md_mutex); #ifndef MODULE static void autostart_arrays(int part); @@ -226,12 +227,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) return 0; } rcu_read_lock(); - if (mddev->suspended || mddev->barrier) { + if (mddev->suspended) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended && !mddev->barrier) + if (!mddev->suspended) break; rcu_read_unlock(); schedule(); @@ -282,40 +283,29 @@ EXPORT_SYMBOL_GPL(mddev_resume); int mddev_congested(mddev_t *mddev, int bits) { - if (mddev->barrier) - return 1; return mddev->suspended; } EXPORT_SYMBOL(mddev_congested); /* - * Generic barrier handling for md + * Generic flush handling for md */ -#define POST_REQUEST_BARRIER ((void*)1) - -static void md_end_barrier(struct bio *bio, int err) +static void md_end_flush(struct bio *bio, int err) { mdk_rdev_t *rdev = bio->bi_private; mddev_t *mddev = rdev->mddev; - if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) - set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->flush_pending)) { - if (mddev->barrier == POST_REQUEST_BARRIER) { - /* This was a post-request barrier */ - mddev->barrier = NULL; - wake_up(&mddev->sb_wait); - } else - /* The pre-request barrier has finished */ - schedule_work(&mddev->barrier_work); + /* The pre-request flush has finished */ + schedule_work(&mddev->flush_work); } bio_put(bio); } -static void submit_barriers(mddev_t *mddev) +static void submit_flushes(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -332,60 +322,56 @@ static void submit_barriers(mddev_t *mddev) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); bi = bio_alloc(GFP_KERNEL, 0); - bi->bi_end_io = md_end_barrier; + bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bi->bi_bdev = rdev->bdev; atomic_inc(&mddev->flush_pending); - submit_bio(WRITE_BARRIER, bi); + submit_bio(WRITE_FLUSH, bi); rcu_read_lock(); rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); } -static void md_submit_barrier(struct work_struct *ws) +static void md_submit_flush_data(struct work_struct *ws) { - mddev_t *mddev = container_of(ws, mddev_t, barrier_work); - struct bio *bio = mddev->barrier; + mddev_t *mddev = container_of(ws, mddev_t, 
flush_work); + struct bio *bio = mddev->flush_bio; atomic_set(&mddev->flush_pending, 1); - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - bio_endio(bio, -EOPNOTSUPP); - else if (bio->bi_size == 0) + if (bio->bi_size == 0) /* an empty barrier - all done */ bio_endio(bio, 0); else { - bio->bi_rw &= ~REQ_HARDBARRIER; + bio->bi_rw &= ~REQ_FLUSH; if (mddev->pers->make_request(mddev, bio)) generic_make_request(bio); - mddev->barrier = POST_REQUEST_BARRIER; - submit_barriers(mddev); } if (atomic_dec_and_test(&mddev->flush_pending)) { - mddev->barrier = NULL; + mddev->flush_bio = NULL; wake_up(&mddev->sb_wait); } } -void md_barrier_request(mddev_t *mddev, struct bio *bio) +void md_flush_request(mddev_t *mddev, struct bio *bio) { spin_lock_irq(&mddev->write_lock); wait_event_lock_irq(mddev->sb_wait, - !mddev->barrier, + !mddev->flush_bio, mddev->write_lock, /*nothing*/); - mddev->barrier = bio; + mddev->flush_bio = bio; spin_unlock_irq(&mddev->write_lock); atomic_set(&mddev->flush_pending, 1); - INIT_WORK(&mddev->barrier_work, md_submit_barrier); + INIT_WORK(&mddev->flush_work, md_submit_flush_data); - submit_barriers(mddev); + submit_flushes(mddev); if (atomic_dec_and_test(&mddev->flush_pending)) - schedule_work(&mddev->barrier_work); + schedule_work(&mddev->flush_work); } -EXPORT_SYMBOL(md_barrier_request); +EXPORT_SYMBOL(md_flush_request); /* Support for plugging. * This mirrors the plugging support in request_queue, but does not @@ -696,31 +682,6 @@ static void super_written(struct bio *bio, int error) bio_put(bio); } -static void super_written_barrier(struct bio *bio, int error) -{ - struct bio *bio2 = bio->bi_private; - mdk_rdev_t *rdev = bio2->bi_private; - mddev_t *mddev = rdev->mddev; - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && - error == -EOPNOTSUPP) { - unsigned long flags; - /* barriers don't appear to be supported :-( */ - set_bit(BarriersNotsupp, &rdev->flags); - mddev->barriers_work = 0; - spin_lock_irqsave(&mddev->write_lock, flags); - bio2->bi_next = mddev->biolist; - mddev->biolist = bio2; - spin_unlock_irqrestore(&mddev->write_lock, flags); - wake_up(&mddev->sb_wait); - bio_put(bio); - } else { - bio_put(bio2); - bio->bi_private = rdev; - super_written(bio, error); - } -} - void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page) { @@ -729,51 +690,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error - * - * As we might need to resubmit the request if REQ_HARDBARRIER - * causes ENOTSUPP, we allocate a spare bio... */ struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG; bio->bi_bdev = rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; - bio->bi_rw = rw; atomic_inc(&mddev->pending_writes); - if (!test_bit(BarriersNotsupp, &rdev->flags)) { - struct bio *rbio; - rw |= REQ_HARDBARRIER; - rbio = bio_clone(bio, GFP_NOIO); - rbio->bi_private = bio; - rbio->bi_end_io = super_written_barrier; - submit_bio(rw, rbio); - } else - submit_bio(rw, bio); + submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA, + bio); } void md_super_wait(mddev_t *mddev) { - /* wait for all superblock writes that were scheduled to complete. 
- * if any had to be retried (due to BARRIER problems), retry them - */ + /* wait for all superblock writes that were scheduled to complete */ DEFINE_WAIT(wq); for(;;) { prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); if (atomic_read(&mddev->pending_writes)==0) break; - while (mddev->biolist) { - struct bio *bio; - spin_lock_irq(&mddev->write_lock); - bio = mddev->biolist; - mddev->biolist = bio->bi_next ; - bio->bi_next = NULL; - spin_unlock_irq(&mddev->write_lock); - submit_bio(bio->bi_rw, bio); - } schedule(); } finish_wait(&mddev->sb_wait, &wq); @@ -1070,7 +1008,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 0; @@ -1485,7 +1422,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 1; @@ -1643,7 +1579,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; - } + } else + max_dev = le32_to_cpu(sb->max_dev); + for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -2136,16 +2074,6 @@ static void sync_sbs(mddev_t * mddev, int nospares) * with the rest of the array) */ mdk_rdev_t *rdev; - - /* First make sure individual recovery_offsets are correct */ - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(In_sync, &rdev->flags) && - mddev->curr_resync_completed > rdev->recovery_offset) - rdev->recovery_offset = mddev->curr_resync_completed; - - } list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->sb_events == mddev->events || (nospares && @@ -2167,13 +2095,27 @@ static void md_update_sb(mddev_t * mddev, int force_change) int sync_req; int nospares = 0; - mddev->utime = get_seconds(); - if (mddev->external) - return; repeat: + /* First make sure individual recovery_offsets are correct */ + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(In_sync, &rdev->flags) && + mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + + } + if (!mddev->persistent) { + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_DEVS, &mddev->flags); + wake_up(&mddev->sb_wait); + return; + } + spin_lock_irq(&mddev->write_lock); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + mddev->utime = get_seconds(); + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) @@ -2221,19 +2163,6 @@ repeat: MD_BUG(); mddev->events --; } - - /* - * do not write anything to disk if using - * nonpersistent superblocks - */ - if (!mddev->persistent) { - if (!mddev->external) - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - - spin_unlock_irq(&mddev->write_lock); - wake_up(&mddev->sb_wait); - return; - } sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); @@ -3379,7 +3308,7 @@ array_state_show(mddev_t *mddev, char *page) case 0: if (mddev->in_sync) st = clean; - else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + else if 
(test_bit(MD_CHANGE_PENDING, &mddev->flags)) st = write_pending; else if (mddev->safemode) st = active_idle; @@ -3460,9 +3389,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) mddev->in_sync = 1; if (mddev->safemode == 1) mddev->safemode = 0; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, - &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } err = 0; } else @@ -3474,8 +3401,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case active: if (mddev->pers) { restart_array(mddev); - if (mddev->external) - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -4514,7 +4440,6 @@ int md_run(mddev_t *mddev) /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->dev_sectors; - mddev->barriers_work = 1; mddev->ok_start_degraded = start_dirty_degraded; if (start_readonly && mddev->ro == 0) @@ -4693,7 +4618,6 @@ static void md_clean(mddev_t *mddev) mddev->recovery = 0; mddev->in_sync = 0; mddev->degraded = 0; - mddev->barriers_work = 0; mddev->safemode = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; @@ -5961,7 +5885,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) mddev_t *mddev = mddev_find(bdev->bd_dev); int err; - lock_kernel(); + mutex_lock(&md_mutex); if (mddev->gendisk != bdev->bd_disk) { /* we are racing with mddev_put which is discarding this * bd_disk. @@ -5970,7 +5894,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) /* Wait until bdev->bd_disk is definitely gone */ flush_scheduled_work(); /* Then retry the open from the top */ - unlock_kernel(); + mutex_unlock(&md_mutex); return -ERESTARTSYS; } BUG_ON(mddev != bdev->bd_disk->private_data); @@ -5984,7 +5908,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) check_disk_size_change(mddev->gendisk, bdev); out: - unlock_kernel(); + mutex_unlock(&md_mutex); return err; } @@ -5993,10 +5917,10 @@ static int md_release(struct gendisk *disk, fmode_t mode) mddev_t *mddev = disk->private_data; BUG_ON(!mddev); - lock_kernel(); + mutex_lock(&md_mutex); atomic_dec(&mddev->openers); mddev_put(mddev); - unlock_kernel(); + mutex_unlock(&md_mutex); return 0; } @@ -6580,6 +6504,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); md_wakeup_thread(mddev->thread); did_change = 1; } @@ -6588,7 +6513,6 @@ void md_write_start(mddev_t *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } @@ -6624,6 +6548,7 @@ int md_allow_write(mddev_t *mddev) if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; @@ -6633,7 +6558,7 @@ int md_allow_write(mddev_t *mddev) } else spin_unlock_irq(&mddev->write_lock); - if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) return -EAGAIN; else return 0; @@ -6831,8 +6756,7 @@ void md_do_sync(mddev_t *mddev) atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = mddev->curr_resync; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, 
"sync_completed"); } @@ -7081,7 +7005,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( - (mddev->flags && !mddev->external) || + (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || @@ -7111,8 +7035,7 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; did_change = 1; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index a953fe2808a..112a2c32db0 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -87,7 +87,6 @@ struct mdk_rdev_s #define Faulty 1 /* device is known to have a fault */ #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ -#define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */ #define AllReserved 6 /* If whole device is reserved for * one array */ #define AutoDetected 7 /* added by auto-detect */ @@ -140,7 +139,7 @@ struct mddev_s unsigned long flags; #define MD_CHANGE_DEVS 0 /* Some device status has changed */ #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ -#define MD_CHANGE_PENDING 2 /* superblock update in progress */ +#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ int suspended; atomic_t active_io; @@ -273,13 +272,6 @@ struct mddev_s int degraded; /* whether md should consider * adding a spare */ - int barriers_work; /* initialised to true, cleared as soon - * as a barrier request to slave - * fails. Only supported - */ - struct bio *biolist; /* bios that need to be retried - * because REQ_HARDBARRIER is not supported - */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; @@ -339,16 +331,13 @@ struct mddev_s struct attribute_group *to_remove; struct plug_handle *plug; /* if used by personality */ - /* Generic barrier handling. - * If there is a pending barrier request, all other - * writes are blocked while the devices are flushed. - * The last to finish a flush schedules a worker to - * submit the barrier request (without the barrier flag), - * then submit more flush requests. + /* Generic flush handling. + * The last to finish preflush schedules a worker to submit + * the rest of the request (without the REQ_FLUSH flag). 
*/ - struct bio *barrier; + struct bio *flush_bio; atomic_t flush_pending; - struct work_struct barrier_work; + struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ }; @@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); extern int mddev_congested(mddev_t *mddev, int bits); -extern void md_barrier_request(mddev_t *mddev, struct bio *bio); +extern void md_flush_request(mddev_t *mddev, struct bio *bio); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 0307d217e7a..6d7ddf32ef2 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f7af46d623..a39f4c355e5 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio) struct strip_zone *zone; mdk_rdev_t *tmp_dev; - if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 73cc74ffc26..378a25894c5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error) if (r1_bio->bios[mirror] == bio) break; - if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { - set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); - set_bit(R1BIO_BarrierRetry, &r1_bio->state); - r1_bio->mddev->barriers_work = 0; - /* Don't rdev_dec_pending in this branch - keep it for the retry */ - } else { + /* + * 'one mirror IO has finished' event handler: + */ + r1_bio->bios[mirror] = NULL; + to_put = bio; + if (!uptodate) { + md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } else /* - * this branch is our 'one mirror IO has finished' event handler: + * Set R1BIO_Uptodate in our master bio, so that we + * will return a good error code for to the higher + * levels even if IO on some other mirrored buffer + * fails. + * + * The 'master' represents the composite IO operation + * to user-side. So if something waits for IO, then it + * will wait for the 'master' bio. */ - r1_bio->bios[mirror] = NULL; - to_put = bio; - if (!uptodate) { - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } else - /* - * Set R1BIO_Uptodate in our master bio, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. 
- */ - set_bit(R1BIO_Uptodate, &r1_bio->state); - - update_head_pos(mirror, r1_bio); - - if (behind) { - if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) - atomic_dec(&r1_bio->behind_remaining); - - /* In behind mode, we ACK the master bio once the I/O has safely - * reached all non-writemostly disks. Setting the Returned bit - * ensures that this gets done only once -- we don't ever want to - * return -EIO here, instead we'll wait */ - - if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && - test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* Maybe we can return now */ - if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { - struct bio *mbio = r1_bio->master_bio; - PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", - (unsigned long long) mbio->bi_sector, - (unsigned long long) mbio->bi_sector + - (mbio->bi_size >> 9) - 1); - bio_endio(mbio, 0); - } + set_bit(R1BIO_Uptodate, &r1_bio->state); + + update_head_pos(mirror, r1_bio); + + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* + * In behind mode, we ACK the master bio once the I/O + * has safely reached all non-writemostly + * disks. Setting the Returned bit ensures that this + * gets done only once -- we don't ever want to return + * -EIO here, instead we'll wait + */ + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + bio_endio(mbio, 0); } } - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + /* - * * Let's see if all mirrored write operations have finished * already. 
*/ if (atomic_dec_and_test(&r1_bio->remaining)) { - if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - /* it really is the end of this request */ - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = bio->bi_vcnt; - while (i--) - safe_put_page(bio->bi_io_vec[i].bv_page); - } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); - md_write_end(r1_bio->mddev); - raid_end_bio_io(r1_bio); + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + safe_put_page(bio->bi_io_vec[i].bv_page); } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); + md_write_end(r1_bio->mddev); + raid_end_bio_io(r1_bio); } if (to_put) @@ -787,17 +778,14 @@ static int make_request(mddev_t *mddev, struct bio * bio) struct bio_list bl; struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); - const bool do_sync = (bio->bi_rw & REQ_SYNC); - bool do_barriers; + const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); + const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); mdk_rdev_t *blocked_rdev; /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. - * We test barriers_work *after* md_write_start as md_write_start - * may cause the first superblock write, and that will check out - * if barriers work. */ md_write_start(mddev, bio); /* wait on superblock update early */ @@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) } finish_wait(&conf->wait_barrier, &w); } - if (unlikely(!mddev->barriers_work && - (bio->bi_rw & REQ_HARDBARRIER))) { - if (rw == WRITE) - md_write_end(mddev); - bio_endio(bio, -EOPNOTSUPP); - return 0; - } wait_barrier(conf); @@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) atomic_set(&r1_bio->remaining, 0); atomic_set(&r1_bio->behind_remaining, 0); - do_barriers = bio->bi_rw & REQ_HARDBARRIER; - if (do_barriers) - set_bit(R1BIO_Barrier, &r1_bio->state); - bio_list_init(&bl); for (i = 0; i < disks; i++) { struct bio *mbio; @@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_barriers | do_sync; + mbio->bi_rw = WRITE | do_flush_fua | do_sync; mbio->bi_private = r1_bio; if (behind_pages) { @@ -1120,6 +1097,8 @@ static int raid1_spare_active(mddev_t *mddev) { int i; conf_t *conf = mddev->private; + int count = 0; + unsigned long flags; /* * Find all failed disks within the RAID1 configuration @@ -1131,15 +1110,16 @@ static int raid1_spare_active(mddev_t *mddev) if (rdev && !test_bit(Faulty, &rdev->flags) && !test_and_set_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded--; - spin_unlock_irqrestore(&conf->device_lock, flags); + count++; + sysfs_notify_dirent(rdev->sysfs_state); } } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); 
print_conf(conf); - return 0; + return count; } @@ -1631,41 +1611,6 @@ static void raid1d(mddev_t *mddev) if (test_bit(R1BIO_IsSync, &r1_bio->state)) { sync_request_write(mddev, r1_bio); unplug = 1; - } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { - /* some requests in the r1bio were REQ_HARDBARRIER - * requests which failed with -EOPNOTSUPP. Hohumm.. - * Better resubmit without the barrier. - * We know which devices to resubmit for, because - * all others have had their bios[] entry cleared. - * We already have a nr_pending reference on these rdevs. - */ - int i; - const bool do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC); - clear_bit(R1BIO_BarrierRetry, &r1_bio->state); - clear_bit(R1BIO_Barrier, &r1_bio->state); - for (i=0; i < conf->raid_disks; i++) - if (r1_bio->bios[i]) - atomic_inc(&r1_bio->remaining); - for (i=0; i < conf->raid_disks; i++) - if (r1_bio->bios[i]) { - struct bio_vec *bvec; - int j; - - bio = bio_clone(r1_bio->master_bio, GFP_NOIO); - /* copy pages from the failed bio, as - * this might be a write-behind device */ - __bio_for_each_segment(bvec, bio, j, 0) - bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; - bio_put(r1_bio->bios[i]); - bio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - bio->bi_bdev = conf->mirrors[i].rdev->bdev; - bio->bi_end_io = raid1_end_write_request; - bio->bi_rw = WRITE | do_sync; - bio->bi_private = r1_bio; - r1_bio->bios[i] = bio; - generic_make_request(bio); - } } else { int disk; @@ -1696,7 +1641,7 @@ static void raid1d(mddev_t *mddev) (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); } else { - const bool do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC; + const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC; r1_bio->bios[r1_bio->read_disk] = mddev->ro ? IO_BLOCKED : NULL; r1_bio->read_disk = disk; @@ -1836,7 +1781,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* take from bio_init */ bio->bi_next = NULL; + bio->bi_flags &= ~(BIO_POOL_MASK-1); bio->bi_flags |= 1 << BIO_UPTODATE; + bio->bi_comp_cpu = -1; bio->bi_rw = READ; bio->bi_vcnt = 0; bio->bi_idx = 0; @@ -1909,7 +1856,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; BUG_ON(sync_blocks < (PAGE_SIZE>>9)); - if (len > (sync_blocks<<9)) + if ((len >> 9) > sync_blocks) len = sync_blocks<<9; } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 5f2d443ae28..adf8cfd7331 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -117,8 +117,6 @@ struct r1bio_s { #define R1BIO_IsSync 1 #define R1BIO_Degraded 2 #define R1BIO_BehindIO 3 -#define R1BIO_Barrier 4 -#define R1BIO_BarrierRetry 5 /* For write-behind requests, we call bi_end_io when * the last non-write-behind device completes, providing * any write was successful. 
Otherwise we call when diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a88aeb5198c..f0d082f749b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -799,13 +799,14 @@ static int make_request(mddev_t *mddev, struct bio * bio) int i; int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); - const bool do_sync = (bio->bi_rw & REQ_SYNC); + const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); + const unsigned long do_fua = (bio->bi_rw & REQ_FUA); struct bio_list bl; unsigned long flags; mdk_rdev_t *blocked_rdev; - if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } @@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) conf->mirrors[d].rdev->data_offset; mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync; + mbio->bi_rw = WRITE | do_sync | do_fua; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1116,6 +1117,8 @@ static int raid10_spare_active(mddev_t *mddev) int i; conf_t *conf = mddev->private; mirror_info_t *tmp; + int count = 0; + unsigned long flags; /* * Find all non-in_sync disks within the RAID10 configuration @@ -1126,15 +1129,16 @@ static int raid10_spare_active(mddev_t *mddev) if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded--; - spin_unlock_irqrestore(&conf->device_lock, flags); + count++; + sysfs_notify_dirent(tmp->rdev->sysfs_state); } } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); print_conf(conf); - return 0; + return count; } @@ -1734,7 +1738,7 @@ static void raid10d(mddev_t *mddev) raid_end_bio_io(r10_bio); bio_put(bio); } else { - const bool do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); + const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); bio_put(bio); rdev = conf->mirrors[mirror].rdev; if (printk_ratelimit()) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 866d4b5a144..31140d1259d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) int rw; struct bio *bi; mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { + if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) + rw = WRITE_FUA; + else + rw = WRITE; + } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; else continue; @@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { + if (wbi->bi_rw & REQ_FUA) + set_bit(R5_WantFUA, &dev->flags); tx = async_copy_data(1, wbi, dev->page, dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); @@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; int i; + bool fua = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = disks; i--; ) + fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); + for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->written || i == 
pd_idx || i == qd_idx) + if (dev->written || i == pd_idx || i == qd_idx) { set_bit(R5_UPTODATE, &dev->flags); + if (fua) + set_bit(R5_WantFUA, &dev->flags); + } } if (sh->reconstruct_state == reconstruct_state_drain_run) @@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh) if (dec_preread_active) { /* We delay this until after ops_run_io so that if make_request - * is waiting on a barrier, it won't continue until the writes + * is waiting on a flush, it won't continue until the writes * have actually been submitted. */ atomic_dec(&conf->preread_active_stripes); @@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh) if (dec_preread_active) { /* We delay this until after ops_run_io so that if make_request - * is waiting on a barrier, it won't continue until the writes + * is waiting on a flush, it won't continue until the writes * have actually been submitted. */ atomic_dec(&conf->preread_active_stripes); @@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi) const int rw = bio_data_dir(bi); int remaining; - if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { - /* Drain all pending writes. We only really need - * to ensure they have been submitted, but this is - * easier. - */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - md_barrier_request(mddev, bi); + if (unlikely(bi->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bi); return 0; } @@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); - if (mddev->barrier && + if ((bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); release_stripe(sh); @@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi) bio_endio(bi, 0); } - if (mddev->barrier) { - /* We need to wait for the stripes to all be handled. - * So: wait for preread_active_stripes to drop to 0. - */ - wait_event(mddev->thread->wqueue, - atomic_read(&conf->preread_active_stripes) == 0); - } return 0; } @@ -5330,6 +5329,8 @@ static int raid5_spare_active(mddev_t *mddev) int i; raid5_conf_t *conf = mddev->private; struct disk_info *tmp; + int count = 0; + unsigned long flags; for (i = 0; i < conf->raid_disks; i++) { tmp = conf->disks + i; @@ -5337,14 +5338,15 @@ static int raid5_spare_active(mddev_t *mddev) && tmp->rdev->recovery_offset == MaxSector && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded--; - spin_unlock_irqrestore(&conf->device_lock, flags); + count++; + sysfs_notify_dirent(tmp->rdev->sysfs_state); } } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); print_raid5_conf(conf); - return 0; + return count; } static int raid5_remove_disk(mddev_t *mddev, int number) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 36eaed5dfd6..2ace0582b40 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -275,6 +275,7 @@ struct r6_state { * filling */ #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ +#define R5_WantFUA 14 /* Write should be FUA */ /* * Write method */ |
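
Note (editor's addition, not part of the patch): the dm.c hunks above replace the old barrier machinery with a two-step flush flow — __split_and_process_bio first sends an empty preflush to every target (via md->flush_bio and __clone_and_map_empty_flush), and once that completes dec_pending requeues the original bio with REQ_FLUSH cleared so only the data portion is processed, REQ_FUA being left intact for the targets. The following is a minimal userspace sketch of that decomposition, assuming invented flag values and helper names (EX_REQ_FLUSH, ex_issue, ex_handle); it is an illustration of the control flow, not kernel code.

/*
 * Illustrative sketch only -- mimics the preflush-then-reissue handling
 * added in this patch. All identifiers and flag bits are made up here.
 */
#include <stdio.h>

#define EX_REQ_FLUSH  (1u << 0)   /* stand-in for REQ_FLUSH */
#define EX_REQ_FUA    (1u << 1)   /* stand-in for REQ_FUA */

struct ex_bio {
	unsigned int rw;        /* request flags */
	unsigned int sectors;   /* payload size; 0 means an empty flush */
};

static void ex_issue(const char *what, const struct ex_bio *bio)
{
	printf("%-16s flags=%s%s sectors=%u\n", what,
	       (bio->rw & EX_REQ_FLUSH) ? "FLUSH " : "",
	       (bio->rw & EX_REQ_FUA) ? "FUA " : "",
	       bio->sectors);
}

/* Two-step handling: issue an empty preflush, then reissue the data
 * with the flush flag cleared (FUA, if set, is preserved). */
static void ex_handle(struct ex_bio *bio)
{
	if (bio->rw & EX_REQ_FLUSH) {
		struct ex_bio preflush = { .rw = EX_REQ_FLUSH, .sectors = 0 };

		ex_issue("empty preflush", &preflush);
		bio->rw &= ~EX_REQ_FLUSH;   /* reissue without REQ_FLUSH */
	}
	if (bio->sectors)
		ex_issue("data (reissued)", bio);
}

int main(void)
{
	struct ex_bio bio = { .rw = EX_REQ_FLUSH | EX_REQ_FUA, .sectors = 8 };

	ex_handle(&bio);
	return 0;
}

The practical upside, visible throughout the removed hunks, is that no ordering or draining bookkeeping survives: the -EOPNOTSUPP retry paths, barrier_error plumbing, and per-device BarriersNotsupp tracking all disappear because a flush either succeeds or fails like any other write.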