diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-08-02 20:49:21 -1000 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-08-02 20:49:21 -1000 |
commit | f3406816bb2486fc44558bec77179cd9bcbd4450 (patch) | |
tree | 718db1ef45e55314b5e7290f77e70e6328d855a4 | |
parent | 4400478ba3d939b680810aa004f1e954b4f8ba16 (diff) | |
parent | ed8b752bccf2560e305e25125721d2f0ac759e88 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (34 commits)
dm table: set flush capability based on underlying devices
dm crypt: optionally support discard requests
dm raid: add md raid1 support
dm raid: support metadata devices
dm raid: add write_mostly parameter
dm raid: add region_size parameter
dm raid: improve table parameters documentation
dm ioctl: forbid multiple device specifiers
dm ioctl: introduce __get_dev_cell
dm ioctl: fill in device parameters in more ioctls
dm flakey: add corrupt_bio_byte feature
dm flakey: add drop_writes
dm flakey: support feature args
dm flakey: use dm_target_offset and support discards
dm table: share target argument parsing functions
dm snapshot: skip reading origin when overwriting complete chunk
dm: ignore merge_bvec for snapshots when safe
dm table: clean dm_get_device and move exports
dm raid: tidy includes
dm ioctl: prevent empty message
...
-rw-r--r-- | Documentation/device-mapper/dm-crypt.txt | 21 | ||||
-rw-r--r-- | Documentation/device-mapper/dm-flakey.txt | 48 | ||||
-rw-r--r-- | Documentation/device-mapper/dm-raid.txt | 138 | ||||
-rw-r--r-- | drivers/md/Kconfig | 5 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 62 | ||||
-rw-r--r-- | drivers/md/dm-flakey.c | 270 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 29 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 89 | ||||
-rw-r--r-- | drivers/md/dm-kcopyd.c | 42 | ||||
-rw-r--r-- | drivers/md/dm-log-userspace-base.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-log.c | 32 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 147 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 621 | ||||
-rw-r--r-- | drivers/md/dm-snap-persistent.c | 80 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 84 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 155 | ||||
-rw-r--r-- | drivers/md/dm.c | 75 | ||||
-rw-r--r-- | drivers/md/dm.h | 2 | ||||
-rw-r--r-- | include/linux/device-mapper.h | 43 | ||||
-rw-r--r-- | include/linux/dm-ioctl.h | 4 | ||||
-rw-r--r-- | include/linux/dm-kcopyd.h | 15 |
21 files changed, 1561 insertions, 404 deletions
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index 6b5c42dbbe8..2c656ae43ba 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -4,7 +4,8 @@ dm-crypt Device-Mapper's "crypt" target provides transparent encryption of block devices using the kernel crypto API. -Parameters: <cipher> <key> <iv_offset> <device path> <offset> +Parameters: <cipher> <key> <iv_offset> <device path> \ + <offset> [<#opt_params> <opt_params>] <cipher> Encryption cipher and an optional IV generation mode. @@ -37,6 +38,24 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset> <offset> Starting sector within the device where the encrypted data begins. +<#opt_params> + Number of optional parameters. If there are no optional parameters, + the optional paramaters section can be skipped or #opt_params can be zero. + Otherwise #opt_params is the number of following arguments. + + Example of optional parameters section: + 1 allow_discards + +allow_discards + Block discard requests (a.k.a. TRIM) are passed through the crypt device. + The default is to ignore discard requests. + + WARNING: Assess the specific security risks carefully before enabling this + option. For example, allowing discards on encrypted devices may lead to + the leak of information about the ciphertext device (filesystem type, + used space etc.) if the discarded blocks can be located easily on the + device later. + Example scripts =============== LUKS (Linux Unified Key Setup) is now the preferred way to set up disk diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt index c8efdfd19a6..6ff5c232722 100644 --- a/Documentation/device-mapper/dm-flakey.txt +++ b/Documentation/device-mapper/dm-flakey.txt @@ -1,17 +1,53 @@ dm-flakey ========= -This target is the same as the linear target except that it returns I/O -errors periodically. It's been found useful in simulating failing -devices for testing purposes. +This target is the same as the linear target except that it exhibits +unreliable behaviour periodically. It's been found useful in simulating +failing devices for testing purposes. Starting from the time the table is loaded, the device is available for -<up interval> seconds, then returns errors for <down interval> seconds, -and then this cycle repeats. +<up interval> seconds, then exhibits unreliable behaviour for <down +interval> seconds, and then this cycle repeats. -Parameters: <dev path> <offset> <up interval> <down interval> +Also, consider using this in combination with the dm-delay target too, +which can delay reads and writes and/or send them to different +underlying devices. + +Table parameters +---------------- + <dev path> <offset> <up interval> <down interval> \ + [<num_features> [<feature arguments>]] + +Mandatory parameters: <dev path>: Full pathname to the underlying block-device, or a "major:minor" device-number. <offset>: Starting sector within the device. <up interval>: Number of seconds device is available. <down interval>: Number of seconds device returns errors. + +Optional feature parameters: + If no feature parameters are present, during the periods of + unreliability, all I/O returns errors. + + drop_writes: + All write I/O is silently ignored. + Read I/O is handled correctly. + + corrupt_bio_byte <Nth_byte> <direction> <value> <flags>: + During <down interval>, replace <Nth_byte> of the data of + each matching bio with <value>. + + <Nth_byte>: The offset of the byte to replace. + Counting starts at 1, to replace the first byte. + <direction>: Either 'r' to corrupt reads or 'w' to corrupt writes. + 'w' is incompatible with drop_writes. + <value>: The value (from 0-255) to write. + <flags>: Perform the replacement only if bio->bi_rw has all the + selected flags set. + +Examples: + corrupt_bio_byte 32 r 1 0 + - replaces the 32nd byte of READ bios with the value 1 + + corrupt_bio_byte 224 w 0 32 + - replaces the 224th byte of REQ_META (=32) bios with the value 0 diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 33b6b7071ac..2a8c11331d2 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -1,70 +1,108 @@ -Device-mapper RAID (dm-raid) is a bridge from DM to MD. It -provides a way to use device-mapper interfaces to access the MD RAID -drivers. +dm-raid +------- -As with all device-mapper targets, the nominal public interfaces are the -constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO -and STATUSTYPE_TABLE). The CTR table looks like the following: +The device-mapper RAID (dm-raid) target provides a bridge from DM to MD. +It allows the MD RAID drivers to be accessed using a device-mapper +interface. -1: <s> <l> raid \ -2: <raid_type> <#raid_params> <raid_params> \ -3: <#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN> - -Line 1 contains the standard first three arguments to any device-mapper -target - the start, length, and target type fields. The target type in -this case is "raid". - -Line 2 contains the arguments that define the particular raid -type/personality/level, the required arguments for that raid type, and -any optional arguments. Possible raid types include: raid4, raid5_la, -raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (raid1 is -planned for the future.) The list of required and optional parameters -is the same for all the current raid types. The required parameters are -positional, while the optional parameters are given as key/value pairs. -The possible parameters are as follows: - <chunk_size> Chunk size in sectors. - [[no]sync] Force/Prevent RAID initialization - [rebuild <idx>] Rebuild the drive indicated by the index - [daemon_sleep <ms>] Time between bitmap daemon work to clear bits - [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization - [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization - [max_write_behind <sectors>] See '-write-behind=' (man mdadm) - [stripe_cache <sectors>] Stripe cache size for higher RAIDs - -Line 3 contains the list of devices that compose the array in -metadata/data device pairs. If the metadata is stored separately, a '-' -is given for the metadata device position. If a drive has failed or is -missing at creation time, a '-' can be given for both the metadata and -data drives for a given position. - -NB. Currently all metadata devices must be specified as '-'. - -Examples: -# RAID4 - 4 data drives, 1 parity +The target is named "raid" and it accepts the following parameters: + + <raid_type> <#raid_params> <raid_params> \ + <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>] + +<raid_type>: + raid1 RAID1 mirroring + raid4 RAID4 dedicated parity disk + raid5_la RAID5 left asymmetric + - rotating parity 0 with data continuation + raid5_ra RAID5 right asymmetric + - rotating parity N with data continuation + raid5_ls RAID5 left symmetric + - rotating parity 0 with data restart + raid5_rs RAID5 right symmetric + - rotating parity N with data restart + raid6_zr RAID6 zero restart + - rotating parity zero (left-to-right) with data restart + raid6_nr RAID6 N restart + - rotating parity N (right-to-left) with data restart + raid6_nc RAID6 N continue + - rotating parity N (right-to-left) with data continuation + + Refererence: Chapter 4 of + http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf + +<#raid_params>: The number of parameters that follow. + +<raid_params> consists of + Mandatory parameters: + <chunk_size>: Chunk size in sectors. This parameter is often known as + "stripe size". It is the only mandatory parameter and + is placed first. + + followed by optional parameters (in any order): + [sync|nosync] Force or prevent RAID initialization. + + [rebuild <idx>] Rebuild drive number idx (first drive is 0). + + [daemon_sleep <ms>] + Interval between runs of the bitmap daemon that + clear bits. A longer interval means less bitmap I/O but + resyncing after a failure is likely to take longer. + + [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization + [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization + [write_mostly <idx>] Drive index is write-mostly + [max_write_behind <sectors>] See '-write-behind=' (man mdadm) + [stripe_cache <sectors>] Stripe cache size (higher RAIDs only) + [region_size <sectors>] + The region_size multiplied by the number of regions is the + logical size of the array. The bitmap records the device + synchronisation state for each region. + +<#raid_devs>: The number of devices composing the array. + Each device consists of two entries. The first is the device + containing the metadata (if any); the second is the one containing the + data. + + If a drive has failed or is missing at creation time, a '-' can be + given for both the metadata and data drives for a given position. + + +Example tables +-------------- +# RAID4 - 4 data drives, 1 parity (no metadata devices) # No metadata devices specified to hold superblock/bitmap info # Chunk size of 1MiB # (Lines separated for easy reading) + 0 1960893648 raid \ raid4 1 2048 \ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 -# RAID4 - 4 data drives, 1 parity (no metadata devices) +# RAID4 - 4 data drives, 1 parity (with metadata devices) # Chunk size of 1MiB, force RAID initialization, # min recovery rate at 20 kiB/sec/disk + 0 1960893648 raid \ - raid4 4 2048 min_recovery_rate 20 sync\ - 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + raid4 4 2048 sync min_recovery_rate 20 \ + 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 -Performing a 'dmsetup table' should display the CTR table used to -construct the mapping (with possible reordering of optional -parameters). +'dmsetup table' displays the table used to construct the mapping. +The optional parameters are always printed in the order listed +above with "sync" or "nosync" always output ahead of the other +arguments, regardless of the order used when originally loading the table. +Arguments that can be repeated are ordered by value. -Performing a 'dmsetup status' will yield information on the state and -health of the array. The output is as follows: +'dmsetup status' yields information on the state and health of the +array. +The output is as follows: 1: <s> <l> raid \ 2: <raid_type> <#devices> <1 health char for each dev> <resync_ratio> -Line 1 is standard DM output. Line 2 is best shown by example: +Line 1 is the standard output produced by device-mapper. +Line 2 is produced by the raid target, and best explained by example: 0 1960893648 raid raid4 5 AAAAA 2/490221568 Here we can see the RAID type is raid4, there are 5 devices - all of which are 'A'live, and the array is 2/490221568 complete with recovery. +Faulty or missing devices are marked 'D'. Devices that are out-of-sync +are marked 'a'. diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 8420129fc5e..f75a66e7d31 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -241,12 +241,13 @@ config DM_MIRROR needed for live data migration tools such as 'pvmove'. config DM_RAID - tristate "RAID 4/5/6 target (EXPERIMENTAL)" + tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL + select MD_RAID1 select MD_RAID456 select BLK_DEV_MD ---help--- - A dm target that supports RAID4, RAID5 and RAID6 mappings + A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index bae6c4e23d3..49da55c1528 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -30,7 +30,6 @@ #include <linux/device-mapper.h> #define DM_MSG_PREFIX "crypt" -#define MESG_STR(x) x, sizeof(x) /* * context holding the current state of a multi-part conversion @@ -239,7 +238,7 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); + *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); return 0; } @@ -248,7 +247,7 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); return 0; } @@ -415,7 +414,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); crypto_cipher_encrypt_one(essiv_tfm, iv, iv); return 0; @@ -1575,11 +1574,17 @@ bad_mem: static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; - unsigned int key_size; + unsigned int key_size, opt_params; unsigned long long tmpll; int ret; + struct dm_arg_set as; + const char *opt_string; + + static struct dm_arg _args[] = { + {0, 1, "Invalid number of feature args"}, + }; - if (argc != 5) { + if (argc < 5) { ti->error = "Not enough arguments"; return -EINVAL; } @@ -1648,6 +1653,30 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; + argv += 5; + argc -= 5; + + /* Optional parameters */ + if (argc) { + as.argc = argc; + as.argv = argv; + + ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (ret) + goto bad; + + opt_string = dm_shift_arg(&as); + + if (opt_params == 1 && opt_string && + !strcasecmp(opt_string, "allow_discards")) + ti->num_discard_requests = 1; + else if (opt_params) { + ret = -EINVAL; + ti->error = "Invalid feature arguments"; + goto bad; + } + } + ret = -ENOMEM; cc->io_queue = alloc_workqueue("kcryptd_io", WQ_NON_REENTRANT| @@ -1682,9 +1711,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, struct dm_crypt_io *io; struct crypt_config *cc; - if (bio->bi_rw & REQ_FLUSH) { + /* + * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. + * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight + * - for REQ_DISCARD caller must use flush if IO ordering matters + */ + if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { cc = ti->private; bio->bi_bdev = cc->dev->bdev; + if (bio_sectors(bio)) + bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); return DM_MAPIO_REMAPPED; } @@ -1727,6 +1763,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, cc->dev->name, (unsigned long long)cc->start); + + if (ti->num_discard_requests) + DMEMIT(" 1 allow_discards"); + break; } return 0; @@ -1770,12 +1810,12 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) if (argc < 2) goto error; - if (!strnicmp(argv[0], MESG_STR("key"))) { + if (!strcasecmp(argv[0], "key")) { if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { DMWARN("not suspended during key manipulation."); return -EINVAL; } - if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) { + if (argc == 3 && !strcasecmp(argv[1], "set")) { ret = crypt_set_key(cc, argv[2]); if (ret) return ret; @@ -1783,7 +1823,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) ret = cc->iv_gen_ops->init(cc); return ret; } - if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) { + if (argc == 2 && !strcasecmp(argv[1], "wipe")) { if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { ret = cc->iv_gen_ops->wipe(cc); if (ret) @@ -1823,7 +1863,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 10, 0}, + .version = {1, 11, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index ea790623c30..89f73ca22cf 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2003 Sistina Software (UK) Limited. - * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -15,6 +15,9 @@ #define DM_MSG_PREFIX "flakey" +#define all_corrupt_bio_flags_match(bio, fc) \ + (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) + /* * Flakey: Used for testing only, simulates intermittent, * catastrophic device failure. @@ -25,60 +28,189 @@ struct flakey_c { sector_t start; unsigned up_interval; unsigned down_interval; + unsigned long flags; + unsigned corrupt_bio_byte; + unsigned corrupt_bio_rw; + unsigned corrupt_bio_value; + unsigned corrupt_bio_flags; +}; + +enum feature_flag_bits { + DROP_WRITES }; +static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, + struct dm_target *ti) +{ + int r; + unsigned argc; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, 6, "Invalid number of feature args"}, + {1, UINT_MAX, "Invalid corrupt bio byte"}, + {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, + {0, UINT_MAX, "Invalid corrupt bio flags mask"}, + }; + + /* No feature arguments supplied. */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return r; + + while (argc) { + arg_name = dm_shift_arg(as); + argc--; + + /* + * drop_writes + */ + if (!strcasecmp(arg_name, "drop_writes")) { + if (test_and_set_bit(DROP_WRITES, &fc->flags)) { + ti->error = "Feature drop_writes duplicated"; + return -EINVAL; + } + + continue; + } + + /* + * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> + */ + if (!strcasecmp(arg_name, "corrupt_bio_byte")) { + if (!argc) + ti->error = "Feature corrupt_bio_byte requires parameters"; + + r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error); + if (r) + return r; + argc--; + + /* + * Direction r or w? + */ + arg_name = dm_shift_arg(as); + if (!strcasecmp(arg_name, "w")) + fc->corrupt_bio_rw = WRITE; + else if (!strcasecmp(arg_name, "r")) + fc->corrupt_bio_rw = READ; + else { + ti->error = "Invalid corrupt bio direction (r or w)"; + return -EINVAL; + } + argc--; + + /* + * Value of byte (0-255) to write in place of correct one. + */ + r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error); + if (r) + return r; + argc--; + + /* + * Only corrupt bios with these flags set. + */ + r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error); + if (r) + return r; + argc--; + + continue; + } + + ti->error = "Unrecognised flakey feature requested"; + return -EINVAL; + } + + if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { + ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; + } + + return 0; +} + /* - * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval> + * Construct a flakey mapping: + * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*] + * + * Feature args: + * [drop_writes] + * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>] + * + * Nth_byte starts from 1 for the first byte. + * Direction is r for READ or w for WRITE. + * bio_flags is ignored if 0. */ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) { + static struct dm_arg _args[] = { + {0, UINT_MAX, "Invalid up interval"}, + {0, UINT_MAX, "Invalid down interval"}, + }; + + int r; struct flakey_c *fc; - unsigned long long tmp; + unsigned long long tmpll; + struct dm_arg_set as; + const char *devname; - if (argc != 4) { - ti->error = "dm-flakey: Invalid argument count"; + as.argc = argc; + as.argv = argv; + + if (argc < 4) { + ti->error = "Invalid argument count"; return -EINVAL; } - fc = kmalloc(sizeof(*fc), GFP_KERNEL); + fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (!fc) { - ti->error = "dm-flakey: Cannot allocate linear context"; + ti->error = "Cannot allocate linear context"; return -ENOMEM; } fc->start_time = jiffies; - if (sscanf(argv[1], "%llu", &tmp) != 1) { - ti->error = "dm-flakey: Invalid device sector"; + devname = dm_shift_arg(&as); + + if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) { + ti->error = "Invalid device sector"; goto bad; } - fc->start = tmp; + fc->start = tmpll; - if (sscanf(argv[2], "%u", &fc->up_interval) != 1) { - ti->error = "dm-flakey: Invalid up interval"; + r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error); + if (r) goto bad; - } - if (sscanf(argv[3], "%u", &fc->down_interval) != 1) { - ti->error = "dm-flakey: Invalid down interval"; + r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); + if (r) goto bad; - } if (!(fc->up_interval + fc->down_interval)) { - ti->error = "dm-flakey: Total (up + down) interval is zero"; + ti->error = "Total (up + down) interval is zero"; goto bad; } if (fc->up_interval + fc->down_interval < fc->up_interval) { - ti->error = "dm-flakey: Interval overflow"; + ti->error = "Interval overflow"; goto bad; } - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) { - ti->error = "dm-flakey: Device lookup failed"; + r = parse_features(&as, fc, ti); + if (r) + goto bad; + + if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) { + ti->error = "Device lookup failed"; goto bad; } ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ti->private = fc; return 0; @@ -99,7 +231,7 @@ static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) { struct flakey_c *fc = ti->private; - return fc->start + (bi_sector - ti->begin); + return fc->start + dm_target_offset(ti, bi_sector); } static void flakey_map_bio(struct dm_target *ti, struct bio *bio) @@ -111,6 +243,25 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); } +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +{ + unsigned bio_bytes = bio_cur_bytes(bio); + char *data = bio_data(bio); + + /* + * Overwrite the Nth byte of the data returned. + */ + if (data && bio_bytes >= fc->corrupt_bio_byte) { + data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value; + + DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " + "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", + bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, + (bio_data_dir(bio) == WRITE) ? 'w' : 'r', + bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes); + } +} + static int flakey_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { @@ -119,18 +270,71 @@ static int flakey_map(struct dm_target *ti, struct bio *bio, /* Are we alive ? */ elapsed = (jiffies - fc->start_time) / HZ; - if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) + if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { + /* + * Flag this bio as submitted while down. + */ + map_context->ll = 1; + + /* + * Map reads as normal. + */ + if (bio_data_dir(bio) == READ) + goto map_bio; + + /* + * Drop writes? + */ + if (test_bit(DROP_WRITES, &fc->flags)) { + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } + + /* + * Corrupt matching writes. + */ + if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) { + if (all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + goto map_bio; + } + + /* + * By default, error all I/O. + */ return -EIO; + } +map_bio: flakey_map_bio(ti, bio); return DM_MAPIO_REMAPPED; } +static int flakey_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + struct flakey_c *fc = ti->private; + unsigned bio_submitted_while_down = map_context->ll; + + /* + * Corrupt successful READs while in down state. + * If flags were specified, only corrupt those that match. + */ + if (!error && bio_submitted_while_down && + (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && + all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + + return error; +} + static int flakey_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { + unsigned sz = 0; struct flakey_c *fc = ti->private; + unsigned drop_writes; switch (type) { case STATUSTYPE_INFO: @@ -138,9 +342,22 @@ static int flakey_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name, - (unsigned long long)fc->start, fc->up_interval, - fc->down_interval); + DMEMIT("%s %llu %u %u ", fc->dev->name, + (unsigned long long)fc->start, fc->up_interval, + fc->down_interval); + + drop_writes = test_bit(DROP_WRITES, &fc->flags); + DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5); + + if (drop_writes) + DMEMIT("drop_writes "); + + if (fc->corrupt_bio_byte) + DMEMIT("corrupt_bio_byte %u %c %u %u ", + fc->corrupt_bio_byte, + (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r', + fc->corrupt_bio_value, fc->corrupt_bio_flags); + break; } return 0; @@ -177,11 +394,12 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_ static struct target_type flakey_target = { .name = "flakey", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = flakey_ctr, .dtr = flakey_dtr, .map = flakey_map, + .end_io = flakey_end_io, .status = flakey_status, .ioctl = flakey_ioctl, .merge = flakey_merge, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 2067288f61f..ad2eba40e31 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -38,6 +38,8 @@ struct io { struct dm_io_client *client; io_notify_fn callback; void *context; + void *vma_invalidate_address; + unsigned long vma_invalidate_size; } __attribute__((aligned(DM_IO_MAX_REGIONS))); static struct kmem_cache *_dm_io_cache; @@ -116,6 +118,10 @@ static void dec_count(struct io *io, unsigned int region, int error) set_bit(region, &io->error_bits); if (atomic_dec_and_test(&io->count)) { + if (io->vma_invalidate_size) + invalidate_kernel_vmap_range(io->vma_invalidate_address, + io->vma_invalidate_size); + if (io->sleeper) wake_up_process(io->sleeper); @@ -159,6 +165,9 @@ struct dpages { unsigned context_u; void *context_ptr; + + void *vma_invalidate_address; + unsigned long vma_invalidate_size; }; /* @@ -377,6 +386,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, io->sleeper = current; io->client = client; + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + dispatch_io(rw, num_regions, where, dp, io, 1); while (1) { @@ -415,13 +427,21 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io->callback = fn; io->context = context; + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + dispatch_io(rw, num_regions, where, dp, io, 0); return 0; } -static int dp_init(struct dm_io_request *io_req, struct dpages *dp) +static int dp_init(struct dm_io_request *io_req, struct dpages *dp, + unsigned long size) { /* Set up dpages based on memory type */ + + dp->vma_invalidate_address = NULL; + dp->vma_invalidate_size = 0; + switch (io_req->mem.type) { case DM_IO_PAGE_LIST: list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); @@ -432,6 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp) break; case DM_IO_VMA: + flush_kernel_vmap_range(io_req->mem.ptr.vma, size); + if ((io_req->bi_rw & RW_MASK) == READ) { + dp->vma_invalidate_address = io_req->mem.ptr.vma; + dp->vma_invalidate_size = size; + } vm_dp_init(dp, io_req->mem.ptr.vma); break; @@ -460,7 +485,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions, int r; struct dpages dp; - r = dp_init(io_req, &dp); + r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT); if (r) return r; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4cacdad2270..2e9a3ca37bd 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -128,6 +128,24 @@ static struct hash_cell *__get_uuid_cell(const char *str) return NULL; } +static struct hash_cell *__get_dev_cell(uint64_t dev) +{ + struct mapped_device *md; + struct hash_cell *hc; + + md = dm_get_md(huge_decode_dev(dev)); + if (!md) + return NULL; + + hc = dm_get_mdptr(md); + if (!hc) { + dm_put(md); + return NULL; + } + + return hc; +} + /*----------------------------------------------------------------- * Inserting, removing and renaming a device. *---------------------------------------------------------------*/ @@ -718,25 +736,45 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) */ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) { - struct mapped_device *md; - void *mdptr = NULL; + struct hash_cell *hc = NULL; - if (*param->uuid) - return __get_uuid_cell(param->uuid); + if (*param->uuid) { + if (*param->name || param->dev) + return NULL; - if (*param->name) - return __get_name_cell(param->name); + hc = __get_uuid_cell(param->uuid); + if (!hc) + return NULL; + } else if (*param->name) { + if (param->dev) + return NULL; - md = dm_get_md(huge_decode_dev(param->dev)); - if (!md) - goto out; + hc = __get_name_cell(param->name); + if (!hc) + return NULL; + } else if (param->dev) { + hc = __get_dev_cell(param->dev); + if (!hc) + return NULL; + } else + return NULL; - mdptr = dm_get_mdptr(md); - if (!mdptr) - dm_put(md); + /* + * Sneakily write in both the name and the uuid + * while we have the cell. + */ + strlcpy(param->name, hc->name, sizeof(param->name)); + if (hc->uuid) + strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); + else + param->uuid[0] = '\0'; -out: - return mdptr; + if (hc->new_map) + param->flags |= DM_INACTIVE_PRESENT_FLAG; + else + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + return hc; } static struct mapped_device *find_device(struct dm_ioctl *param) @@ -746,24 +784,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param) down_read(&_hash_lock); hc = __find_device_hash_cell(param); - if (hc) { + if (hc) md = hc->md; - - /* - * Sneakily write in both the name and the uuid - * while we have the cell. - */ - strlcpy(param->name, hc->name, sizeof(param->name)); - if (hc->uuid) - strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); - else - param->uuid[0] = '\0'; - - if (hc->new_map) - param->flags |= DM_INACTIVE_PRESENT_FLAG; - else - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - } up_read(&_hash_lock); return md; @@ -1402,6 +1424,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size) goto out; } + if (!argc) { + DMWARN("Empty message received."); + goto out; + } + table = dm_get_live_table(md); if (!table) goto out_argv; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 320401dec10..f8214702963 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -224,8 +224,6 @@ struct kcopyd_job { unsigned int num_dests; struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; - sector_t offset; - unsigned int nr_pages; struct page_list *pages; /* @@ -380,7 +378,7 @@ static int run_io_job(struct kcopyd_job *job) .bi_rw = job->rw, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, - .mem.offset = job->offset, + .mem.offset = 0, .notify.fn = complete_io, .notify.context = job, .client = job->kc->io_client, @@ -397,10 +395,9 @@ static int run_io_job(struct kcopyd_job *job) static int run_pages_job(struct kcopyd_job *job) { int r; + unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9); - job->nr_pages = dm_div_up(job->dests[0].count + job->offset, - PAGE_SIZE >> 9); - r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); + r = kcopyd_get_pages(job->kc, nr_pages, &job->pages); if (!r) { /* this job is ready for io */ push(&job->kc->io_jobs, job); @@ -602,8 +599,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, job->num_dests = num_dests; memcpy(&job->dests, dests, sizeof(*dests) * num_dests); - job->offset = 0; - job->nr_pages = 0; job->pages = NULL; job->fn = fn; @@ -622,6 +617,37 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, } EXPORT_SYMBOL(dm_kcopyd_copy); +void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, + dm_kcopyd_notify_fn fn, void *context) +{ + struct kcopyd_job *job; + + job = mempool_alloc(kc->job_pool, GFP_NOIO); + + memset(job, 0, sizeof(struct kcopyd_job)); + job->kc = kc; + job->fn = fn; + job->context = context; + + atomic_inc(&kc->nr_jobs); + + return job; +} +EXPORT_SYMBOL(dm_kcopyd_prepare_callback); + +void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) +{ + struct kcopyd_job *job = j; + struct dm_kcopyd_client *kc = job->kc; + + job->read_err = read_err; + job->write_err = write_err; + + push(&kc->complete_jobs, job); + wake(kc); +} +EXPORT_SYMBOL(dm_kcopyd_do_callback); + /* * Cancels a kcopyd job, eg. someone might be deactivating a * mirror. diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index aa2e0c374ab..1021c898601 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -394,8 +394,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list) group[count] = fe->region; count++; - list_del(&fe->list); - list_add(&fe->list, &tmp_list); + list_move(&fe->list, &tmp_list); type = fe->type; if (count >= MAX_FLUSH_GROUP_COUNT) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 948e3f4925b..3b52bb72bd1 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -197,15 +197,21 @@ EXPORT_SYMBOL(dm_dirty_log_destroy); #define MIRROR_DISK_VERSION 2 #define LOG_OFFSET 2 -struct log_header { - uint32_t magic; +struct log_header_disk { + __le32 magic; /* * Simple, incrementing version. no backward * compatibility. */ + __le32 version; + __le64 nr_regions; +} __packed; + +struct log_header_core { + uint32_t magic; uint32_t version; - sector_t nr_regions; + uint64_t nr_regions; }; struct log_c { @@ -239,10 +245,10 @@ struct log_c { int log_dev_failed; int log_dev_flush_failed; struct dm_dev *log_dev; - struct log_header header; + struct log_header_core header; struct dm_io_region header_location; - struct log_header *disk_header; + struct log_header_disk *disk_header; }; /* @@ -251,34 +257,34 @@ struct log_c { */ static inline int log_test_bit(uint32_t *bs, unsigned bit) { - return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0; + return test_bit_le(bit, bs) ? 1 : 0; } static inline void log_set_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - __test_and_set_bit_le(bit, (unsigned long *) bs); + __set_bit_le(bit, bs); l->touched_cleaned = 1; } static inline void log_clear_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - __test_and_clear_bit_le(bit, (unsigned long *) bs); + __clear_bit_le(bit, bs); l->touched_dirtied = 1; } /*---------------------------------------------------------------- * Header IO *--------------------------------------------------------------*/ -static void header_to_disk(struct log_header *core, struct log_header *disk) +static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk) { disk->magic = cpu_to_le32(core->magic); disk->version = cpu_to_le32(core->version); disk->nr_regions = cpu_to_le64(core->nr_regions); } -static void header_from_disk(struct log_header *core, struct log_header *disk) +static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk) { core->magic = le32_to_cpu(disk->magic); core->version = le32_to_cpu(disk->version); @@ -486,7 +492,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); lc->sync_count = (sync == NOSYNC) ? region_count : 0; - lc->recovering_bits = vmalloc(bitset_size); + lc->recovering_bits = vzalloc(bitset_size); if (!lc->recovering_bits) { DMWARN("couldn't allocate sync bitset"); vfree(lc->sync_bits); @@ -498,7 +504,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, kfree(lc); return -ENOMEM; } - memset(lc->recovering_bits, 0, bitset_size); lc->sync_search = 0; log->context = lc; @@ -739,8 +744,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) return 0; do { - *region = find_next_zero_bit_le( - (unsigned long *) lc->sync_bits, + *region = find_next_zero_bit_le(lc->sync_bits, lc->region_count, lc->sync_search); lc->sync_search = *region + 1; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index c3547016f0f..5e0090ef418 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -22,7 +22,6 @@ #include <linux/atomic.h> #define DM_MSG_PREFIX "multipath" -#define MESG_STR(x) x, sizeof(x) #define DM_PG_INIT_DELAY_MSECS 2000 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) @@ -505,80 +504,29 @@ static void trigger_event(struct work_struct *work) * <#paths> <#per-path selector args> * [<path> [<arg>]* ]+ ]+ *---------------------------------------------------------------*/ -struct param { - unsigned min; - unsigned max; - char *error; -}; - -static int read_param(struct param *param, char *str, unsigned *v, char **error) -{ - if (!str || - (sscanf(str, "%u", v) != 1) || - (*v < param->min) || - (*v > param->max)) { - *error = param->error; - return -EINVAL; - } - - return 0; -} - -struct arg_set { - unsigned argc; - char **argv; -}; - -static char *shift(struct arg_set *as) -{ - char *r; - - if (as->argc) { - as->argc--; - r = *as->argv; - as->argv++; - return r; - } - - return NULL; -} - -static void consume(struct arg_set *as, unsigned n) -{ - BUG_ON (as->argc < n); - as->argc -= n; - as->argv += n; -} - -static int parse_path_selector(struct arg_set *as, struct priority_group *pg, +static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, struct dm_target *ti) { int r; struct path_selector_type *pst; unsigned ps_argc; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 1024, "invalid number of path selector args"}, }; - pst = dm_get_path_selector(shift(as)); + pst = dm_get_path_selector(dm_shift_arg(as)); if (!pst) { ti->error = "unknown path selector type"; return -EINVAL; } - r = read_param(_params, shift(as), &ps_argc, &ti->error); + r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); if (r) { dm_put_path_selector(pst); return -EINVAL; } - if (ps_argc > as->argc) { - dm_put_path_selector(pst); - ti->error = "not enough arguments for path selector"; - return -EINVAL; - } - r = pst->create(&pg->ps, ps_argc, as->argv); if (r) { dm_put_path_selector(pst); @@ -587,12 +535,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, } pg->ps.type = pst; - consume(as, ps_argc); + dm_consume_args(as, ps_argc); return 0; } -static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, +static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, struct dm_target *ti) { int r; @@ -609,7 +557,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, if (!p) return ERR_PTR(-ENOMEM); - r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table), + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), &p->path.dev); if (r) { ti->error = "error getting device"; @@ -660,16 +608,16 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, return ERR_PTR(r); } -static struct priority_group *parse_priority_group(struct arg_set *as, +static struct priority_group *parse_priority_group(struct dm_arg_set *as, struct multipath *m) { - static struct param _params[] = { + static struct dm_arg _args[] = { {1, 1024, "invalid number of paths"}, {0, 1024, "invalid number of selector args"} }; int r; - unsigned i, nr_selector_args, nr_params; + unsigned i, nr_selector_args, nr_args; struct priority_group *pg; struct dm_target *ti = m->ti; @@ -693,26 +641,26 @@ static struct priority_group *parse_priority_group(struct arg_set *as, /* * read the paths */ - r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); + r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); if (r) goto bad; - r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); + r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); if (r) goto bad; - nr_params = 1 + nr_selector_args; + nr_args = 1 + nr_selector_args; for (i = 0; i < pg->nr_pgpaths; i++) { struct pgpath *pgpath; - struct arg_set path_args; + struct dm_arg_set path_args; - if (as->argc < nr_params) { + if (as->argc < nr_args) { ti->error = "not enough path parameters"; r = -EINVAL; goto bad; } - path_args.argc = nr_params; + path_args.argc = nr_args; path_args.argv = as->argv; pgpath = parse_path(&path_args, &pg->ps, ti); @@ -723,7 +671,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, pgpath->pg = pg; list_add_tail(&pgpath->list, &pg->pgpaths); - consume(as, nr_params); + dm_consume_args(as, nr_args); } return pg; @@ -733,28 +681,23 @@ static struct priority_group *parse_priority_group(struct arg_set *as, return ERR_PTR(r); } -static int parse_hw_handler(struct arg_set *as, struct multipath *m) +static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) { unsigned hw_argc; int ret; struct dm_target *ti = m->ti; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 1024, "invalid number of hardware handler args"}, }; - if (read_param(_params, shift(as), &hw_argc, &ti->error)) + if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) return -EINVAL; if (!hw_argc) return 0; - if (hw_argc > as->argc) { - ti->error = "not enough arguments for hardware handler"; - return -EINVAL; - } - - m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); + m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); request_module("scsi_dh_%s", m->hw_handler_name); if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { ti->error = "unknown hardware handler type"; @@ -778,7 +721,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) j = sprintf(p, "%s", as->argv[i]); } - consume(as, hw_argc - 1); + dm_consume_args(as, hw_argc - 1); return 0; fail: @@ -787,20 +730,20 @@ fail: return ret; } -static int parse_features(struct arg_set *as, struct multipath *m) +static int parse_features(struct dm_arg_set *as, struct multipath *m) { int r; unsigned argc; struct dm_target *ti = m->ti; - const char *param_name; + const char *arg_name; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 5, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; - r = read_param(_params, shift(as), &argc, &ti->error); + r = dm_read_arg_group(_args, as, &argc, &ti->error); if (r) return -EINVAL; @@ -808,26 +751,24 @@ static int parse_features(struct arg_set *as, struct multipath *m) return 0; do { - param_name = shift(as); + arg_name = dm_shift_arg(as); argc--; - if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) { + if (!strcasecmp(arg_name, "queue_if_no_path")) { r = queue_if_no_path(m, 1, 0); continue; } - if (!strnicmp(param_name, MESG_STR("pg_init_retries")) && + if (!strcasecmp(arg_name, "pg_init_retries") && (argc >= 1)) { - r = read_param(_params + 1, shift(as), - &m->pg_init_retries, &ti->error); + r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); argc--; continue; } - if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && + if (!strcasecmp(arg_name, "pg_init_delay_msecs") && (argc >= 1)) { - r = read_param(_params + 2, shift(as), - &m->pg_init_delay_msecs, &ti->error); + r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error); argc--; continue; } @@ -842,15 +783,15 @@ static int parse_features(struct arg_set *as, struct multipath *m) static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) { - /* target parameters */ - static struct param _params[] = { + /* target arguments */ + static struct dm_arg _args[] = { {0, 1024, "invalid number of priority groups"}, {0, 1024, "invalid initial priority group number"}, }; int r; struct multipath *m; - struct arg_set as; + struct dm_arg_set as; unsigned pg_count = 0; unsigned next_pg_num; @@ -871,11 +812,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, if (r) goto bad; - r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); + r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error); if (r) goto bad; - r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); + r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error); if (r) goto bad; @@ -1505,10 +1446,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) } if (argc == 1) { - if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { + if (!strcasecmp(argv[0], "queue_if_no_path")) { r = queue_if_no_path(m, 1, 0); goto out; - } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { + } else if (!strcasecmp(argv[0], "fail_if_no_path")) { r = queue_if_no_path(m, 0, 0); goto out; } @@ -1519,18 +1460,18 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) goto out; } - if (!strnicmp(argv[0], MESG_STR("disable_group"))) { + if (!strcasecmp(argv[0], "disable_group")) { r = bypass_pg_num(m, argv[1], 1); goto out; - } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { + } else if (!strcasecmp(argv[0], "enable_group")) { r = bypass_pg_num(m, argv[1], 0); goto out; - } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { + } else if (!strcasecmp(argv[0], "switch_group")) { r = switch_pg_num(m, argv[1]); goto out; - } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) + } else if (!strcasecmp(argv[0], "reinstate_path")) action = reinstate_path; - else if (!strnicmp(argv[0], MESG_STR("fail_path"))) + else if (!strcasecmp(argv[0], "fail_path")) action = fail_path; else { DMWARN("Unrecognised multipath message received."); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e5d8904fc8f..a002dd85db1 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -8,19 +8,19 @@ #include <linux/slab.h> #include "md.h" +#include "raid1.h" #include "raid5.h" -#include "dm.h" #include "bitmap.h" +#include <linux/device-mapper.h> + #define DM_MSG_PREFIX "raid" /* - * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then - * make it so the flag doesn't set anything. + * The following flags are used by dm-raid.c to set up the array state. + * They must be cleared before md_run is called. */ -#ifndef MD_SYNC_STATE_FORCED -#define MD_SYNC_STATE_FORCED 0 -#endif +#define FirstUse 10 /* rdev flag */ struct raid_dev { /* @@ -43,14 +43,15 @@ struct raid_dev { /* * Flags for rs->print_flags field. */ -#define DMPF_DAEMON_SLEEP 0x1 -#define DMPF_MAX_WRITE_BEHIND 0x2 -#define DMPF_SYNC 0x4 -#define DMPF_NOSYNC 0x8 -#define DMPF_STRIPE_CACHE 0x10 -#define DMPF_MIN_RECOVERY_RATE 0x20 -#define DMPF_MAX_RECOVERY_RATE 0x40 - +#define DMPF_SYNC 0x1 +#define DMPF_NOSYNC 0x2 +#define DMPF_REBUILD 0x4 +#define DMPF_DAEMON_SLEEP 0x8 +#define DMPF_MIN_RECOVERY_RATE 0x10 +#define DMPF_MAX_RECOVERY_RATE 0x20 +#define DMPF_MAX_WRITE_BEHIND 0x40 +#define DMPF_STRIPE_CACHE 0x80 +#define DMPF_REGION_SIZE 0X100 struct raid_set { struct dm_target *ti; @@ -72,6 +73,7 @@ static struct raid_type { const unsigned level; /* RAID level. */ const unsigned algorithm; /* RAID algorithm. */ } raid_types[] = { + {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, @@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra } sectors_per_dev = ti->len; - if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { + if ((raid_type->level > 1) && + sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { ti->error = "Target length not divisible by number of data devices"; return ERR_PTR(-EINVAL); } @@ -147,9 +150,16 @@ static void context_free(struct raid_set *rs) { int i; - for (i = 0; i < rs->md.raid_disks; i++) + for (i = 0; i < rs->md.raid_disks; i++) { + if (rs->dev[i].meta_dev) + dm_put_device(rs->ti, rs->dev[i].meta_dev); + if (rs->dev[i].rdev.sb_page) + put_page(rs->dev[i].rdev.sb_page); + rs->dev[i].rdev.sb_page = NULL; + rs->dev[i].rdev.sb_loaded = 0; if (rs->dev[i].data_dev) dm_put_device(rs->ti, rs->dev[i].data_dev); + } kfree(rs); } @@ -159,7 +169,16 @@ static void context_free(struct raid_set *rs) * <meta_dev>: meta device name or '-' if missing * <data_dev>: data device name or '-' if missing * - * This code parses those words. + * The following are permitted: + * - - + * - <data_dev> + * <meta_dev> <data_dev> + * + * The following is not allowed: + * <meta_dev> - + * + * This code parses those words. If there is a failure, + * the caller must use context_free to unwind the operations. */ static int dev_parms(struct raid_set *rs, char **argv) { @@ -182,8 +201,16 @@ static int dev_parms(struct raid_set *rs, char **argv) rs->dev[i].rdev.mddev = &rs->md; if (strcmp(argv[0], "-")) { - rs->ti->error = "Metadata devices not supported"; - return -EINVAL; + ret = dm_get_device(rs->ti, argv[0], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].meta_dev); + rs->ti->error = "RAID metadata device lookup failure"; + if (ret) + return ret; + + rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); + if (!rs->dev[i].rdev.sb_page) + return -ENOMEM; } if (!strcmp(argv[1], "-")) { @@ -193,6 +220,10 @@ static int dev_parms(struct raid_set *rs, char **argv) return -EINVAL; } + rs->ti->error = "No data device supplied with metadata device"; + if (rs->dev[i].meta_dev) + return -EINVAL; + continue; } @@ -204,6 +235,10 @@ static int dev_parms(struct raid_set *rs, char **argv) return ret; } + if (rs->dev[i].meta_dev) { + metadata_available = 1; + rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; + } rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) @@ -235,33 +270,109 @@ static int dev_parms(struct raid_set *rs, char **argv) } /* + * validate_region_size + * @rs + * @region_size: region size in sectors. If 0, pick a size (4MiB default). + * + * Set rs->md.bitmap_info.chunksize (which really refers to 'region size'). + * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_region_size(struct raid_set *rs, unsigned long region_size) +{ + unsigned long min_region_size = rs->ti->len / (1 << 21); + + if (!region_size) { + /* + * Choose a reasonable default. All figures in sectors. + */ + if (min_region_size > (1 << 13)) { + DMINFO("Choosing default region size of %lu sectors", + region_size); + region_size = min_region_size; + } else { + DMINFO("Choosing default region size of 4MiB"); + region_size = 1 << 13; /* sectors */ + } + } else { + /* + * Validate user-supplied value. + */ + if (region_size > rs->ti->len) { + rs->ti->error = "Supplied region size is too large"; + return -EINVAL; + } + + if (region_size < min_region_size) { + DMERR("Supplied region_size (%lu sectors) below minimum (%lu)", + region_size, min_region_size); + rs->ti->error = "Supplied region size is too small"; + return -EINVAL; + } + + if (!is_power_of_2(region_size)) { + rs->ti->error = "Region size is not a power of 2"; + return -EINVAL; + } + + if (region_size < rs->md.chunk_sectors) { + rs->ti->error = "Region size is smaller than the chunk size"; + return -EINVAL; + } + } + + /* + * Convert sectors to bytes. + */ + rs->md.bitmap_info.chunksize = (region_size << 9); + + return 0; +} + +/* * Possible arguments are... - * RAID456: * <chunk_size> [optional_args] * - * Optional args: - * [[no]sync] Force or prevent recovery of the entire array + * Argument definitions + * <chunk_size> The number of sectors per disk that + * will form the "stripe" + * [[no]sync] Force or prevent recovery of the + * entire array * [rebuild <idx>] Rebuild the drive indicated by the index - * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits + * [daemon_sleep <ms>] Time between bitmap daemon work to + * clear bits * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [write_mostly <idx>] Indicate a write mostly drive via index * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) * [stripe_cache <sectors>] Stripe cache size for higher RAIDs + * [region_size <sectors>] Defines granularity of bitmap */ static int parse_raid_params(struct raid_set *rs, char **argv, unsigned num_raid_params) { unsigned i, rebuild_cnt = 0; - unsigned long value; + unsigned long value, region_size = 0; char *key; /* * First, parse the in-order required arguments + * "chunk_size" is the only argument of this type. */ - if ((strict_strtoul(argv[0], 10, &value) < 0) || - !is_power_of_2(value) || (value < 8)) { + if ((strict_strtoul(argv[0], 10, &value) < 0)) { rs->ti->error = "Bad chunk size"; return -EINVAL; + } else if (rs->raid_type->level == 1) { + if (value) + DMERR("Ignoring chunk size parameter for RAID 1"); + value = 0; + } else if (!is_power_of_2(value)) { + rs->ti->error = "Chunk size must be a power of 2"; + return -EINVAL; + } else if (value < 8) { + rs->ti->error = "Chunk size value is too small"; + return -EINVAL; } rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; @@ -269,22 +380,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv, num_raid_params--; /* - * Second, parse the unordered optional arguments + * We set each individual device as In_sync with a completed + * 'recovery_offset'. If there has been a device failure or + * replacement then one of the following cases applies: + * + * 1) User specifies 'rebuild'. + * - Device is reset when param is read. + * 2) A new device is supplied. + * - No matching superblock found, resets device. + * 3) Device failure was transient and returns on reload. + * - Failure noticed, resets device for bitmap replay. + * 4) Device hadn't completed recovery after previous failure. + * - Superblock is read and overrides recovery_offset. + * + * What is found in the superblocks of the devices is always + * authoritative, unless 'rebuild' or '[no]sync' was specified. */ - for (i = 0; i < rs->md.raid_disks; i++) + for (i = 0; i < rs->md.raid_disks; i++) { set_bit(In_sync, &rs->dev[i].rdev.flags); + rs->dev[i].rdev.recovery_offset = MaxSector; + } + /* + * Second, parse the unordered optional arguments + */ for (i = 0; i < num_raid_params; i++) { - if (!strcmp(argv[i], "nosync")) { + if (!strcasecmp(argv[i], "nosync")) { rs->md.recovery_cp = MaxSector; rs->print_flags |= DMPF_NOSYNC; - rs->md.flags |= MD_SYNC_STATE_FORCED; continue; } - if (!strcmp(argv[i], "sync")) { + if (!strcasecmp(argv[i], "sync")) { rs->md.recovery_cp = 0; rs->print_flags |= DMPF_SYNC; - rs->md.flags |= MD_SYNC_STATE_FORCED; continue; } @@ -300,9 +428,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } - if (!strcmp(key, "rebuild")) { - if (++rebuild_cnt > rs->raid_type->parity_devs) { - rs->ti->error = "Too many rebuild drives given"; + if (!strcasecmp(key, "rebuild")) { + rebuild_cnt++; + if (((rs->raid_type->level != 1) && + (rebuild_cnt > rs->raid_type->parity_devs)) || + ((rs->raid_type->level == 1) && + (rebuild_cnt > (rs->md.raid_disks - 1)))) { + rs->ti->error = "Too many rebuild devices specified for given RAID type"; return -EINVAL; } if (value > rs->md.raid_disks) { @@ -311,7 +443,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } clear_bit(In_sync, &rs->dev[value].rdev.flags); rs->dev[value].rdev.recovery_offset = 0; - } else if (!strcmp(key, "max_write_behind")) { + rs->print_flags |= DMPF_REBUILD; + } else if (!strcasecmp(key, "write_mostly")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "write_mostly option is only valid for RAID1"; + return -EINVAL; + } + if (value > rs->md.raid_disks) { + rs->ti->error = "Invalid write_mostly drive index given"; + return -EINVAL; + } + set_bit(WriteMostly, &rs->dev[value].rdev.flags); + } else if (!strcasecmp(key, "max_write_behind")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "max_write_behind option is only valid for RAID1"; + return -EINVAL; + } rs->print_flags |= DMPF_MAX_WRITE_BEHIND; /* @@ -324,14 +471,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } rs->md.bitmap_info.max_write_behind = value; - } else if (!strcmp(key, "daemon_sleep")) { + } else if (!strcasecmp(key, "daemon_sleep")) { rs->print_flags |= DMPF_DAEMON_SLEEP; if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { rs->ti->error = "daemon sleep period out of range"; return -EINVAL; } rs->md.bitmap_info.daemon_sleep = value; - } else if (!strcmp(key, "stripe_cache")) { + } else if (!strcasecmp(key, "stripe_cache")) { rs->print_flags |= DMPF_STRIPE_CACHE; /* @@ -348,20 +495,23 @@ static int parse_raid_params(struct raid_set *rs, char **argv, rs->ti->error = "Bad stripe_cache size"; return -EINVAL; } - } else if (!strcmp(key, "min_recovery_rate")) { + } else if (!strcasecmp(key, "min_recovery_rate")) { rs->print_flags |= DMPF_MIN_RECOVERY_RATE; if (value > INT_MAX) { rs->ti->error = "min_recovery_rate out of range"; return -EINVAL; } rs->md.sync_speed_min = (int)value; - } else if (!strcmp(key, "max_recovery_rate")) { + } else if (!strcasecmp(key, "max_recovery_rate")) { rs->print_flags |= DMPF_MAX_RECOVERY_RATE; if (value > INT_MAX) { rs->ti->error = "max_recovery_rate out of range"; return -EINVAL; } rs->md.sync_speed_max = (int)value; + } else if (!strcasecmp(key, "region_size")) { + rs->print_flags |= DMPF_REGION_SIZE; + region_size = value; } else { DMERR("Unable to parse RAID parameter: %s", key); rs->ti->error = "Unable to parse RAID parameters"; @@ -369,6 +519,19 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } } + if (validate_region_size(rs, region_size)) + return -EINVAL; + + if (rs->md.chunk_sectors) + rs->ti->split_io = rs->md.chunk_sectors; + else + rs->ti->split_io = region_size; + + if (rs->md.chunk_sectors) + rs->ti->split_io = rs->md.chunk_sectors; + else + rs->ti->split_io = region_size; + /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; rs->md.external = 1; @@ -387,17 +550,351 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) { struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + if (rs->raid_type->level == 1) + return md_raid1_congested(&rs->md, bits); + return md_raid5_congested(&rs->md, bits); } /* + * This structure is never routinely used by userspace, unlike md superblocks. + * Devices with this superblock should only ever be accessed via device-mapper. + */ +#define DM_RAID_MAGIC 0x64526D44 +struct dm_raid_superblock { + __le32 magic; /* "DmRd" */ + __le32 features; /* Used to indicate possible future changes */ + + __le32 num_devices; /* Number of devices in this array. (Max 64) */ + __le32 array_position; /* The position of this drive in the array */ + + __le64 events; /* Incremented by md when superblock updated */ + __le64 failed_devices; /* Bit field of devices to indicate failures */ + + /* + * This offset tracks the progress of the repair or replacement of + * an individual drive. + */ + __le64 disk_recovery_offset; + + /* + * This offset tracks the progress of the initial array + * synchronisation/parity calculation. + */ + __le64 array_resync_offset; + + /* + * RAID characteristics + */ + __le32 level; + __le32 layout; + __le32 stripe_sectors; + + __u8 pad[452]; /* Round struct to 512 bytes. */ + /* Always set to 0 when writing. */ +} __packed; + +static int read_disk_sb(mdk_rdev_t *rdev, int size) +{ + BUG_ON(!rdev->sb_page); + + if (rdev->sb_loaded) + return 0; + + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { + DMERR("Failed to read device superblock"); + return -EINVAL; + } + + rdev->sb_loaded = 1; + + return 0; +} + +static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdk_rdev_t *r, *t; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + + sb = page_address(rdev->sb_page); + failed_devices = le64_to_cpu(sb->failed_devices); + + rdev_for_each(r, t, mddev) + if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) + failed_devices |= (1ULL << r->raid_disk); + + memset(sb, 0, sizeof(*sb)); + + sb->magic = cpu_to_le32(DM_RAID_MAGIC); + sb->features = cpu_to_le32(0); /* No features yet */ + + sb->num_devices = cpu_to_le32(mddev->raid_disks); + sb->array_position = cpu_to_le32(rdev->raid_disk); + + sb->events = cpu_to_le64(mddev->events); + sb->failed_devices = cpu_to_le64(failed_devices); + + sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); + sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); +} + +/* + * super_load + * + * This function creates a superblock if one is not found on the device + * and will decide which superblock to use if there's a choice. + * + * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise + */ +static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev) +{ + int ret; + struct dm_raid_superblock *sb; + struct dm_raid_superblock *refsb; + uint64_t events_sb, events_refsb; + + rdev->sb_start = 0; + rdev->sb_size = sizeof(*sb); + + ret = read_disk_sb(rdev, rdev->sb_size); + if (ret) + return ret; + + sb = page_address(rdev->sb_page); + if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) { + super_sync(rdev->mddev, rdev); + + set_bit(FirstUse, &rdev->flags); + + /* Force writing of superblocks to disk */ + set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); + + /* Any superblock is better than none, choose that if given */ + return refdev ? 0 : 1; + } + + if (!refdev) + return 1; + + events_sb = le64_to_cpu(sb->events); + + refsb = page_address(refdev->sb_page); + events_refsb = le64_to_cpu(refsb->events); + + return (events_sb > events_refsb) ? 1 : 0; +} + +static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int role; + struct raid_set *rs = container_of(mddev, struct raid_set, md); + uint64_t events_sb; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + uint32_t new_devs = 0; + uint32_t rebuilds = 0; + mdk_rdev_t *r, *t; + struct dm_raid_superblock *sb2; + + sb = page_address(rdev->sb_page); + events_sb = le64_to_cpu(sb->events); + failed_devices = le64_to_cpu(sb->failed_devices); + + /* + * Initialise to 1 if this is a new superblock. + */ + mddev->events = events_sb ? : 1; + + /* + * Reshaping is not currently allowed + */ + if ((le32_to_cpu(sb->level) != mddev->level) || + (le32_to_cpu(sb->layout) != mddev->layout) || + (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { + DMERR("Reshaping arrays not yet supported."); + return -EINVAL; + } + + /* We can only change the number of devices in RAID1 right now */ + if ((rs->raid_type->level != 1) && + (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { + DMERR("Reshaping arrays not yet supported."); + return -EINVAL; + } + + if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))) + mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + + /* + * During load, we set FirstUse if a new superblock was written. + * There are two reasons we might not have a superblock: + * 1) The array is brand new - in which case, all of the + * devices must have their In_sync bit set. Also, + * recovery_cp must be 0, unless forced. + * 2) This is a new device being added to an old array + * and the new device needs to be rebuilt - in which + * case the In_sync bit will /not/ be set and + * recovery_cp must be MaxSector. + */ + rdev_for_each(r, t, mddev) { + if (!test_bit(In_sync, &r->flags)) { + if (!test_bit(FirstUse, &r->flags)) + DMERR("Superblock area of " + "rebuild device %d should have been " + "cleared.", r->raid_disk); + set_bit(FirstUse, &r->flags); + rebuilds++; + } else if (test_bit(FirstUse, &r->flags)) + new_devs++; + } + + if (!rebuilds) { + if (new_devs == mddev->raid_disks) { + DMINFO("Superblocks created for new array"); + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + } else if (new_devs) { + DMERR("New device injected " + "into existing array without 'rebuild' " + "parameter specified"); + return -EINVAL; + } + } else if (new_devs) { + DMERR("'rebuild' devices cannot be " + "injected into an array with other first-time devices"); + return -EINVAL; + } else if (mddev->recovery_cp != MaxSector) { + DMERR("'rebuild' specified while array is not in-sync"); + return -EINVAL; + } + + /* + * Now we set the Faulty bit for those devices that are + * recorded in the superblock as failed. + */ + rdev_for_each(r, t, mddev) { + if (!r->sb_page) + continue; + sb2 = page_address(r->sb_page); + sb2->failed_devices = 0; + + /* + * Check for any device re-ordering. + */ + if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { + role = le32_to_cpu(sb2->array_position); + if (role != r->raid_disk) { + if (rs->raid_type->level != 1) { + rs->ti->error = "Cannot change device " + "positions in RAID array"; + return -EINVAL; + } + DMINFO("RAID1 device #%d now at position #%d", + role, r->raid_disk); + } + + /* + * Partial recovery is performed on + * returning failed devices. + */ + if (failed_devices & (1 << role)) + set_bit(Faulty, &r->flags); + } + } + + return 0; +} + +static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct dm_raid_superblock *sb = page_address(rdev->sb_page); + + /* + * If mddev->events is not set, we know we have not yet initialized + * the array. + */ + if (!mddev->events && super_init_validation(mddev, rdev)) + return -EINVAL; + + mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */ + rdev->mddev->bitmap_info.default_offset = 4096 >> 9; + if (!test_bit(FirstUse, &rdev->flags)) { + rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); + if (rdev->recovery_offset != MaxSector) + clear_bit(In_sync, &rdev->flags); + } + + /* + * If a device comes back, set it as not In_sync and no longer faulty. + */ + if (test_bit(Faulty, &rdev->flags)) { + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->recovery_offset = 0; + } + + clear_bit(FirstUse, &rdev->flags); + + return 0; +} + +/* + * Analyse superblocks and select the freshest. + */ +static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) +{ + int ret; + mdk_rdev_t *rdev, *freshest, *tmp; + mddev_t *mddev = &rs->md; + + freshest = NULL; + rdev_for_each(rdev, tmp, mddev) { + if (!rdev->meta_bdev) + continue; + + ret = super_load(rdev, freshest); + + switch (ret) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + ti->error = "Failed to load superblock"; + return ret; + } + } + + if (!freshest) + return 0; + + /* + * Validation of the freshest device provides the source of + * validation for the remaining devices. + */ + ti->error = "Unable to assemble array: Invalid superblocks"; + if (super_validate(mddev, freshest)) + return -EINVAL; + + rdev_for_each(rdev, tmp, mddev) + if ((rdev != freshest) && super_validate(mddev, rdev)) + return -EINVAL; + + return 0; +} + +/* * Construct a RAID4/5/6 mapping: * Args: * <raid_type> <#raid_params> <raid_params> \ * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } * - * ** metadata devices are not supported yet, use '-' instead ** - * * <raid_params> varies by <raid_type>. See 'parse_raid_params' for * details on possible <raid_params>. */ @@ -465,8 +962,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) if (ret) goto bad; + rs->md.sync_super = super_sync; + ret = analyse_superblocks(ti, rs); + if (ret) + goto bad; + INIT_WORK(&rs->md.event_work, do_table_event); - ti->split_io = rs->md.chunk_sectors; ti->private = rs; mutex_lock(&rs->md.reconfig_mutex); @@ -482,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) rs->callbacks.congested_fn = raid_is_congested; dm_table_add_target_callbacks(ti->table, &rs->callbacks); + mddev_suspend(&rs->md); return 0; bad: @@ -546,12 +1048,17 @@ static int raid_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: /* The string you would use to construct this array */ - for (i = 0; i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev && + for (i = 0; i < rs->md.raid_disks; i++) { + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && !test_bit(In_sync, &rs->dev[i].rdev.flags)) - raid_param_cnt++; /* for rebuilds */ + raid_param_cnt += 2; /* for rebuilds */ + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + raid_param_cnt += 2; + } - raid_param_cnt += (hweight64(rs->print_flags) * 2); + raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2); if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) raid_param_cnt--; @@ -565,7 +1072,8 @@ static int raid_status(struct dm_target *ti, status_type_t type, DMEMIT(" nosync"); for (i = 0; i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev && + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && !test_bit(In_sync, &rs->dev[i].rdev.flags)) DMEMIT(" rebuild %u", i); @@ -579,6 +1087,11 @@ static int raid_status(struct dm_target *ti, status_type_t type, if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + DMEMIT(" write_mostly %u", i); + if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) DMEMIT(" max_write_behind %lu", rs->md.bitmap_info.max_write_behind); @@ -591,9 +1104,16 @@ static int raid_status(struct dm_target *ti, status_type_t type, conf ? conf->max_nr_stripes * 2 : 0); } + if (rs->print_flags & DMPF_REGION_SIZE) + DMEMIT(" region_size %lu", + rs->md.bitmap_info.chunksize >> 9); + DMEMIT(" %d", rs->md.raid_disks); for (i = 0; i < rs->md.raid_disks; i++) { - DMEMIT(" -"); /* metadata device */ + if (rs->dev[i].meta_dev) + DMEMIT(" %s", rs->dev[i].meta_dev->name); + else + DMEMIT(" -"); if (rs->dev[i].data_dev) DMEMIT(" %s", rs->dev[i].data_dev->name); @@ -650,12 +1170,13 @@ static void raid_resume(struct dm_target *ti) { struct raid_set *rs = ti->private; + bitmap_load(&rs->md); mddev_resume(&rs->md); } static struct target_type raid_target = { .name = "raid", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 135c2f1fdbf..d1f1d701710 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -58,25 +58,30 @@ #define NUM_SNAPSHOT_HDR_CHUNKS 1 struct disk_header { - uint32_t magic; + __le32 magic; /* * Is this snapshot valid. There is no way of recovering * an invalid snapshot. */ - uint32_t valid; + __le32 valid; /* * Simple, incrementing version. no backward * compatibility. */ - uint32_t version; + __le32 version; /* In sectors */ - uint32_t chunk_size; -}; + __le32 chunk_size; +} __packed; struct disk_exception { + __le64 old_chunk; + __le64 new_chunk; +} __packed; + +struct core_exception { uint64_t old_chunk; uint64_t new_chunk; }; @@ -169,10 +174,9 @@ static int alloc_area(struct pstore *ps) if (!ps->area) goto err_area; - ps->zero_area = vmalloc(len); + ps->zero_area = vzalloc(len); if (!ps->zero_area) goto err_zero_area; - memset(ps->zero_area, 0, len); ps->header_area = vmalloc(len); if (!ps->header_area) @@ -396,32 +400,32 @@ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) } static void read_exception(struct pstore *ps, - uint32_t index, struct disk_exception *result) + uint32_t index, struct core_exception *result) { - struct disk_exception *e = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, index); /* copy it */ - result->old_chunk = le64_to_cpu(e->old_chunk); - result->new_chunk = le64_to_cpu(e->new_chunk); + result->old_chunk = le64_to_cpu(de->old_chunk); + result->new_chunk = le64_to_cpu(de->new_chunk); } static void write_exception(struct pstore *ps, - uint32_t index, struct disk_exception *de) + uint32_t index, struct core_exception *e) { - struct disk_exception *e = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, index); /* copy it */ - e->old_chunk = cpu_to_le64(de->old_chunk); - e->new_chunk = cpu_to_le64(de->new_chunk); + de->old_chunk = cpu_to_le64(e->old_chunk); + de->new_chunk = cpu_to_le64(e->new_chunk); } static void clear_exception(struct pstore *ps, uint32_t index) { - struct disk_exception *e = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, index); /* clear it */ - e->old_chunk = 0; - e->new_chunk = 0; + de->old_chunk = 0; + de->new_chunk = 0; } /* @@ -437,13 +441,13 @@ static int insert_exceptions(struct pstore *ps, { int r; unsigned int i; - struct disk_exception de; + struct core_exception e; /* presume the area is full */ *full = 1; for (i = 0; i < ps->exceptions_per_area; i++) { - read_exception(ps, i, &de); + read_exception(ps, i, &e); /* * If the new_chunk is pointing at the start of @@ -451,7 +455,7 @@ static int insert_exceptions(struct pstore *ps, * is we know that we've hit the end of the * exceptions. Therefore the area is not full. */ - if (de.new_chunk == 0LL) { + if (e.new_chunk == 0LL) { ps->current_committed = i; *full = 0; break; @@ -460,13 +464,13 @@ static int insert_exceptions(struct pstore *ps, /* * Keep track of the start of the free chunks. */ - if (ps->next_free <= de.new_chunk) - ps->next_free = de.new_chunk + 1; + if (ps->next_free <= e.new_chunk) + ps->next_free = e.new_chunk + 1; /* * Otherwise we add the exception to the snapshot. */ - r = callback(callback_context, de.old_chunk, de.new_chunk); + r = callback(callback_context, e.old_chunk, e.new_chunk); if (r) return r; } @@ -563,7 +567,7 @@ static int persistent_read_metadata(struct dm_exception_store *store, ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / sizeof(struct disk_exception); ps->callbacks = dm_vcalloc(ps->exceptions_per_area, - sizeof(*ps->callbacks)); + sizeof(*ps->callbacks)); if (!ps->callbacks) return -ENOMEM; @@ -641,12 +645,12 @@ static void persistent_commit_exception(struct dm_exception_store *store, { unsigned int i; struct pstore *ps = get_info(store); - struct disk_exception de; + struct core_exception ce; struct commit_callback *cb; - de.old_chunk = e->old_chunk; - de.new_chunk = e->new_chunk; - write_exception(ps, ps->current_committed++, &de); + ce.old_chunk = e->old_chunk; + ce.new_chunk = e->new_chunk; + write_exception(ps, ps->current_committed++, &ce); /* * Add the callback to the back of the array. This code @@ -670,7 +674,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, * If we completely filled the current area, then wipe the next one. */ if ((ps->current_committed == ps->exceptions_per_area) && - zero_disk_area(ps, ps->current_area + 1)) + zero_disk_area(ps, ps->current_area + 1)) ps->valid = 0; /* @@ -701,7 +705,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store, chunk_t *last_new_chunk) { struct pstore *ps = get_info(store); - struct disk_exception de; + struct core_exception ce; int nr_consecutive; int r; @@ -722,9 +726,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store, ps->current_committed = ps->exceptions_per_area; } - read_exception(ps, ps->current_committed - 1, &de); - *last_old_chunk = de.old_chunk; - *last_new_chunk = de.new_chunk; + read_exception(ps, ps->current_committed - 1, &ce); + *last_old_chunk = ce.old_chunk; + *last_new_chunk = ce.new_chunk; /* * Find number of consecutive chunks within the current area, @@ -733,9 +737,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store, for (nr_consecutive = 1; nr_consecutive < ps->current_committed; nr_consecutive++) { read_exception(ps, ps->current_committed - 1 - nr_consecutive, - &de); - if (de.old_chunk != *last_old_chunk - nr_consecutive || - de.new_chunk != *last_new_chunk - nr_consecutive) + &ce); + if (ce.old_chunk != *last_old_chunk - nr_consecutive || + ce.new_chunk != *last_new_chunk - nr_consecutive) break; } @@ -753,7 +757,7 @@ static int persistent_commit_merge(struct dm_exception_store *store, for (i = 0; i < nr_merged; i++) clear_exception(ps, ps->current_committed - 1 - i); - r = area_io(ps, WRITE); + r = area_io(ps, WRITE_FLUSH_FUA); if (r < 0) return r; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 9ecff5f3023..6f758870fc1 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -30,16 +30,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; ((ti)->type->name == dm_snapshot_merge_target_name) /* - * The percentage increment we will wake up users at - */ -#define WAKE_UP_PERCENT 5 - -/* - * kcopyd priority of snapshot operations - */ -#define SNAPSHOT_COPY_PRIORITY 2 - -/* * The size of the mempool used to track chunks in use. */ #define MIN_IOS 256 @@ -180,6 +170,13 @@ struct dm_snap_pending_exception { * kcopyd. */ int started; + + /* + * For writing a complete chunk, bypassing the copy. + */ + struct bio *full_bio; + bio_end_io_t *full_bio_end_io; + void *full_bio_private; }; /* @@ -1055,8 +1052,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) { - ti->error = "Cannot allocate snapshot context private " - "structure"; + ti->error = "Cannot allocate private snapshot structure"; r = -ENOMEM; goto bad; } @@ -1380,6 +1376,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) struct dm_snapshot *s = pe->snap; struct bio *origin_bios = NULL; struct bio *snapshot_bios = NULL; + struct bio *full_bio = NULL; int error = 0; if (!success) { @@ -1415,10 +1412,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) */ dm_insert_exception(&s->complete, e); - out: +out: dm_remove_exception(&pe->e); snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); + full_bio = pe->full_bio; + if (full_bio) { + full_bio->bi_end_io = pe->full_bio_end_io; + full_bio->bi_private = pe->full_bio_private; + } free_pending_exception(pe); increment_pending_exceptions_done_count(); @@ -1426,10 +1428,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) up_write(&s->lock); /* Submit any pending write bios */ - if (error) + if (error) { + if (full_bio) + bio_io_error(full_bio); error_bios(snapshot_bios); - else + } else { + if (full_bio) + bio_endio(full_bio, 0); flush_bios(snapshot_bios); + } retry_origin_bios(s, origin_bios); } @@ -1480,8 +1487,33 @@ static void start_copy(struct dm_snap_pending_exception *pe) dest.count = src.count; /* Hand over to kcopyd */ - dm_kcopyd_copy(s->kcopyd_client, - &src, 1, &dest, 0, copy_callback, pe); + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); +} + +static void full_bio_end_io(struct bio *bio, int error) +{ + void *callback_data = bio->bi_private; + + dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0); +} + +static void start_full_bio(struct dm_snap_pending_exception *pe, + struct bio *bio) +{ + struct dm_snapshot *s = pe->snap; + void *callback_data; + + pe->full_bio = bio; + pe->full_bio_end_io = bio->bi_end_io; + pe->full_bio_private = bio->bi_private; + + callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, + copy_callback, pe); + + bio->bi_end_io = full_bio_end_io; + bio->bi_private = callback_data; + + generic_make_request(bio); } static struct dm_snap_pending_exception * @@ -1519,6 +1551,7 @@ __find_pending_exception(struct dm_snapshot *s, bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); pe->started = 0; + pe->full_bio = NULL; if (s->store->type->prepare_exception(s->store, &pe->e)) { free_pending_exception(pe); @@ -1612,10 +1645,19 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, } remap_exception(s, &pe->e, bio, chunk); - bio_list_add(&pe->snapshot_bios, bio); r = DM_MAPIO_SUBMITTED; + if (!pe->started && + bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { + pe->started = 1; + up_write(&s->lock); + start_full_bio(pe, bio); + goto out; + } + + bio_list_add(&pe->snapshot_bios, bio); + if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; @@ -1628,9 +1670,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, map_context->ptr = track_chunk(s, chunk); } - out_unlock: +out_unlock: up_write(&s->lock); - out: +out: return r; } @@ -1974,7 +2016,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, pe_to_start_now = pe; } - next_snapshot: +next_snapshot: up_write(&snap->lock); if (pe_to_start_now) { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index bfe9c2333ce..986b8754bb0 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -54,7 +54,6 @@ struct dm_table { sector_t *highs; struct dm_target *targets; - unsigned discards_supported:1; unsigned integrity_supported:1; /* @@ -154,12 +153,11 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) return NULL; size = nmemb * elem_size; - addr = vmalloc(size); - if (addr) - memset(addr, 0, size); + addr = vzalloc(size); return addr; } +EXPORT_SYMBOL(dm_vcalloc); /* * highs, and targets are managed as dynamic arrays during a @@ -209,7 +207,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); INIT_LIST_HEAD(&t->target_callbacks); atomic_set(&t->holders, 0); - t->discards_supported = 1; if (!num_targets) num_targets = KEYS_PER_NODE; @@ -281,6 +278,7 @@ void dm_table_get(struct dm_table *t) { atomic_inc(&t->holders); } +EXPORT_SYMBOL(dm_table_get); void dm_table_put(struct dm_table *t) { @@ -290,6 +288,7 @@ void dm_table_put(struct dm_table *t) smp_mb__before_atomic_dec(); atomic_dec(&t->holders); } +EXPORT_SYMBOL(dm_table_put); /* * Checks to see if we need to extend highs or targets. @@ -455,13 +454,14 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, * Add a device to the list, or just increment the usage count if * it's already present. */ -static int __table_get_device(struct dm_table *t, struct dm_target *ti, - const char *path, fmode_t mode, struct dm_dev **result) +int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result) { int r; dev_t uninitialized_var(dev); struct dm_dev_internal *dd; unsigned int major, minor; + struct dm_table *t = ti->table; BUG_ON(!t); @@ -509,6 +509,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, *result = &dd->dm_dev; return 0; } +EXPORT_SYMBOL(dm_get_device); int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) @@ -539,23 +540,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, * If not we'll force DM to use PAGE_SIZE or * smaller I/O, just to be safe. */ - - if (q->merge_bvec_fn && !ti->type->merge) + if (dm_queue_merge_is_compulsory(q) && !ti->type->merge) blk_limits_max_hw_sectors(limits, (unsigned int) (PAGE_SIZE >> 9)); return 0; } EXPORT_SYMBOL_GPL(dm_set_device_limits); -int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, - struct dm_dev **result) -{ - return __table_get_device(ti->table, ti, path, mode, result); -} - - /* - * Decrement a devices use count and remove it if necessary. + * Decrement a device's use count and remove it if necessary. */ void dm_put_device(struct dm_target *ti, struct dm_dev *d) { @@ -568,6 +561,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d) kfree(dd); } } +EXPORT_SYMBOL(dm_put_device); /* * Checks to see if the target joins onto the end of the table. @@ -791,8 +785,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; - if (!tgt->num_discard_requests) - t->discards_supported = 0; + if (!tgt->num_discard_requests && tgt->discards_supported) + DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.", + dm_device_name(t->md), type); return 0; @@ -802,6 +797,63 @@ int dm_table_add_target(struct dm_table *t, const char *type, return r; } +/* + * Target argument parsing helpers. + */ +static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error, unsigned grouped) +{ + const char *arg_str = dm_shift_arg(arg_set); + + if (!arg_str || + (sscanf(arg_str, "%u", value) != 1) || + (*value < arg->min) || + (*value > arg->max) || + (grouped && arg_set->argc < *value)) { + *error = arg->error; + return -EINVAL; + } + + return 0; +} + +int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 0); +} +EXPORT_SYMBOL(dm_read_arg); + +int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 1); +} +EXPORT_SYMBOL(dm_read_arg_group); + +const char *dm_shift_arg(struct dm_arg_set *as) +{ + char *r; + + if (as->argc) { + as->argc--; + r = *as->argv; + as->argv++; + return r; + } + + return NULL; +} +EXPORT_SYMBOL(dm_shift_arg); + +void dm_consume_args(struct dm_arg_set *as, unsigned num_args) +{ + BUG_ON(as->argc < num_args); + as->argc -= num_args; + as->argv += num_args; +} +EXPORT_SYMBOL(dm_consume_args); + static int dm_table_set_type(struct dm_table *t) { unsigned i; @@ -1077,11 +1129,13 @@ void dm_table_event(struct dm_table *t) t->event_fn(t->event_context); mutex_unlock(&_event_lock); } +EXPORT_SYMBOL(dm_table_event); sector_t dm_table_get_size(struct dm_table *t) { return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; } +EXPORT_SYMBOL(dm_table_get_size); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) { @@ -1194,9 +1248,45 @@ static void dm_table_set_integrity(struct dm_table *t) blk_get_integrity(template_disk)); } +static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned flush = (*(unsigned *)data); + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && (q->flush_flags & flush); +} + +static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) +{ + struct dm_target *ti; + unsigned i = 0; + + /* + * Require at least one underlying device to support flushes. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting flushes must provide. + */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_flush_requests) + continue; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_flush_capable, &flush)) + return 1; + } + + return 0; +} + void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { + unsigned flush = 0; + /* * Copy table's limits to the DM device's request_queue */ @@ -1207,6 +1297,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + if (dm_table_supports_flush(t, REQ_FLUSH)) { + flush |= REQ_FLUSH; + if (dm_table_supports_flush(t, REQ_FUA)) + flush |= REQ_FUA; + } + blk_queue_flush(q, flush); + dm_table_set_integrity(t); /* @@ -1237,6 +1334,7 @@ fmode_t dm_table_get_mode(struct dm_table *t) { return t->mode; } +EXPORT_SYMBOL(dm_table_get_mode); static void suspend_targets(struct dm_table *t, unsigned postsuspend) { @@ -1345,6 +1443,7 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) { return t->md; } +EXPORT_SYMBOL(dm_table_get_md); static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) @@ -1359,19 +1458,19 @@ bool dm_table_supports_discards(struct dm_table *t) struct dm_target *ti; unsigned i = 0; - if (!t->discards_supported) - return 0; - /* * Unless any target used by the table set discards_supported, * require at least one underlying device to support discards. * t->devices includes internal dm devices such as mirror logs * so we need to use iterate_devices here, which targets - * supporting discard must provide. + * supporting discard selectively must provide. */ while (i < dm_table_get_num_targets(t)) { ti = dm_table_get_target(t, i++); + if (!ti->num_discard_requests) + continue; + if (ti->discards_supported) return 1; @@ -1382,13 +1481,3 @@ bool dm_table_supports_discards(struct dm_table *t) return 0; } - -EXPORT_SYMBOL(dm_vcalloc); -EXPORT_SYMBOL(dm_get_device); -EXPORT_SYMBOL(dm_put_device); -EXPORT_SYMBOL(dm_table_event); -EXPORT_SYMBOL(dm_table_get_size); -EXPORT_SYMBOL(dm_table_get_mode); -EXPORT_SYMBOL(dm_table_get_md); -EXPORT_SYMBOL(dm_table_put); -EXPORT_SYMBOL(dm_table_get); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0cf68b47887..52b39f335bb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -37,6 +37,8 @@ static const char *_name = DM_NAME; static unsigned int major = 0; static unsigned int _major = 0; +static DEFINE_IDR(_minor_idr); + static DEFINE_SPINLOCK(_minor_lock); /* * For bio-based dm. @@ -109,6 +111,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_MERGE_IS_OPTIONAL 6 /* * Work processed by per-device workqueue. @@ -313,6 +316,12 @@ static void __exit dm_exit(void) while (i--) _exits[i](); + + /* + * Should be empty by this point. + */ + idr_remove_all(&_minor_idr); + idr_destroy(&_minor_idr); } /* @@ -1171,7 +1180,8 @@ static int __clone_and_map_discard(struct clone_info *ci) /* * Even though the device advertised discard support, - * reconfiguration might have changed that since the + * that does not mean every target supports it, and + * reconfiguration might also have changed that since the * check was performed. */ if (!ti->num_discard_requests) @@ -1705,8 +1715,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits) /*----------------------------------------------------------------- * An IDR is used to keep track of allocated minor numbers. *---------------------------------------------------------------*/ -static DEFINE_IDR(_minor_idr); - static void free_minor(int minor) { spin_lock(&_minor_lock); @@ -1800,7 +1808,6 @@ static void dm_init_md_queue(struct mapped_device *md) blk_queue_make_request(md->queue, dm_request); blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); blk_queue_merge_bvec(md->queue, dm_merge_bvec); - blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); } /* @@ -1986,6 +1993,59 @@ static void __set_size(struct mapped_device *md, sector_t size) } /* + * Return 1 if the queue has a compulsory merge_bvec_fn function. + * + * If this function returns 0, then the device is either a non-dm + * device without a merge_bvec_fn, or it is a dm device that is + * able to split any bios it receives that are too big. + */ +int dm_queue_merge_is_compulsory(struct request_queue *q) +{ + struct mapped_device *dev_md; + + if (!q->merge_bvec_fn) + return 0; + + if (q->make_request_fn == dm_request) { + dev_md = q->queuedata; + if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) + return 0; + } + + return 1; +} + +static int dm_device_merge_is_compulsory(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + + return dm_queue_merge_is_compulsory(q); +} + +/* + * Return 1 if it is acceptable to ignore merge_bvec_fn based + * on the properties of the underlying devices. + */ +static int dm_table_merge_is_optional(struct dm_table *table) +{ + unsigned i = 0; + struct dm_target *ti; + + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) + return 0; + } + + return 1; +} + +/* * Returns old map, which caller must destroy. */ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, @@ -1995,6 +2055,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct request_queue *q = md->queue; sector_t size; unsigned long flags; + int merge_is_optional; size = dm_table_get_size(t); @@ -2020,10 +2081,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, __bind_mempools(md, t); + merge_is_optional = dm_table_merge_is_optional(t); + write_lock_irqsave(&md->map_lock, flags); old_map = md->map; md->map = t; dm_table_set_restrictions(t, q, limits); + if (merge_is_optional) + set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); + else + clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); write_unlock_irqrestore(&md->map_lock, flags); return old_map; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1aaf16746da..6745dbd278a 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -66,6 +66,8 @@ int dm_table_alloc_md_mempools(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); +int dm_queue_merge_is_compulsory(struct request_queue *q); + void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); void dm_set_md_type(struct mapped_device *md, unsigned type); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 4427e045405..3fa1f3d90ce 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -208,6 +208,49 @@ struct dm_target_callbacks { int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); +/* + * Target argument parsing. + */ +struct dm_arg_set { + unsigned argc; + char **argv; +}; + +/* + * The minimum and maximum value of a numeric argument, together with + * the error message to use if the number is found to be outside that range. + */ +struct dm_arg { + unsigned min; + unsigned max; + char *error; +}; + +/* + * Validate the next argument, either returning it as *value or, if invalid, + * returning -EINVAL and setting *error. + */ +int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error); + +/* + * Process the next argument as the start of a group containing between + * arg->min and arg->max further arguments. Either return the size as + * *num_args or, if invalid, return -EINVAL and set *error. + */ +int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *num_args, char **error); + +/* + * Return the current argument and shift to the next. + */ +const char *dm_shift_arg(struct dm_arg_set *as); + +/* + * Move through num_args arguments. + */ +void dm_consume_args(struct dm_arg_set *as, unsigned num_args); + /*----------------------------------------------------------------- * Functions for creating and manipulating mapped devices. * Drop the reference with dm_put when you finish with the object. diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 3708455ee6c..0cb8eff76bd 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 20 +#define DM_VERSION_MINOR 21 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2011-02-02)" +#define DM_VERSION_EXTRA "-ioctl (2011-07-06)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ diff --git a/include/linux/dm-kcopyd.h b/include/linux/dm-kcopyd.h index 298d587e349..5e54458e920 100644 --- a/include/linux/dm-kcopyd.h +++ b/include/linux/dm-kcopyd.h @@ -42,5 +42,20 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, unsigned num_dests, struct dm_io_region *dests, unsigned flags, dm_kcopyd_notify_fn fn, void *context); +/* + * Prepare a callback and submit it via the kcopyd thread. + * + * dm_kcopyd_prepare_callback allocates a callback structure and returns it. + * It must not be called from interrupt context. + * The returned value should be passed into dm_kcopyd_do_callback. + * + * dm_kcopyd_do_callback submits the callback. + * It may be called from interrupt context. + * The callback is issued from the kcopyd thread. + */ +void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, + dm_kcopyd_notify_fn fn, void *context); +void dm_kcopyd_do_callback(void *job, int read_err, unsigned long write_err); + #endif /* __KERNEL__ */ #endif /* _LINUX_DM_KCOPYD_H */ |