From 13c87583ea4e867211fc3e7edab750c353c47c95 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Tue, 2 Aug 2011 12:32:03 +0100 Subject: dm raid: cleanup parameter handling Re-order the parameters so they are handled consistently in the same order where defined, parsed and output. Only include rebuild parameters in the STATUSTYPE_TABLE output if they were supplied in the original table line. Correct the parameter count when outputting rebuild: there are two words, not one. Use case-independent checks for keywords (as in other device-mapper targets). Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e5d8904fc8f..efa960ff5ba 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -43,13 +43,14 @@ struct raid_dev { /* * Flags for rs->print_flags field. */ -#define DMPF_DAEMON_SLEEP 0x1 -#define DMPF_MAX_WRITE_BEHIND 0x2 -#define DMPF_SYNC 0x4 -#define DMPF_NOSYNC 0x8 -#define DMPF_STRIPE_CACHE 0x10 -#define DMPF_MIN_RECOVERY_RATE 0x20 -#define DMPF_MAX_RECOVERY_RATE 0x40 +#define DMPF_SYNC 0x1 +#define DMPF_NOSYNC 0x2 +#define DMPF_REBUILD 0x4 +#define DMPF_DAEMON_SLEEP 0x8 +#define DMPF_MIN_RECOVERY_RATE 0x10 +#define DMPF_MAX_RECOVERY_RATE 0x20 +#define DMPF_MAX_WRITE_BEHIND 0x40 +#define DMPF_STRIPE_CACHE 0x80 struct raid_set { struct dm_target *ti; @@ -275,13 +276,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv, set_bit(In_sync, &rs->dev[i].rdev.flags); for (i = 0; i < num_raid_params; i++) { - if (!strcmp(argv[i], "nosync")) { + if (!strcasecmp(argv[i], "nosync")) { rs->md.recovery_cp = MaxSector; rs->print_flags |= DMPF_NOSYNC; rs->md.flags |= MD_SYNC_STATE_FORCED; continue; } - if (!strcmp(argv[i], "sync")) { + if (!strcasecmp(argv[i], "sync")) { rs->md.recovery_cp = 0; rs->print_flags |= DMPF_SYNC; rs->md.flags |= MD_SYNC_STATE_FORCED; @@ -300,7 +301,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } - if (!strcmp(key, "rebuild")) { + if (!strcasecmp(key, "rebuild")) { if (++rebuild_cnt > rs->raid_type->parity_devs) { rs->ti->error = "Too many rebuild drives given"; return -EINVAL; @@ -311,7 +312,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } clear_bit(In_sync, &rs->dev[value].rdev.flags); rs->dev[value].rdev.recovery_offset = 0; - } else if (!strcmp(key, "max_write_behind")) { + rs->print_flags |= DMPF_REBUILD; + } else if (!strcasecmp(key, "max_write_behind")) { rs->print_flags |= DMPF_MAX_WRITE_BEHIND; /* @@ -324,14 +326,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } rs->md.bitmap_info.max_write_behind = value; - } else if (!strcmp(key, "daemon_sleep")) { + } else if (!strcasecmp(key, "daemon_sleep")) { rs->print_flags |= DMPF_DAEMON_SLEEP; if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { rs->ti->error = "daemon sleep period out of range"; return -EINVAL; } rs->md.bitmap_info.daemon_sleep = value; - } else if (!strcmp(key, "stripe_cache")) { + } else if (!strcasecmp(key, "stripe_cache")) { rs->print_flags |= DMPF_STRIPE_CACHE; /* @@ -348,14 +350,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv, rs->ti->error = "Bad stripe_cache size"; return -EINVAL; } - } else if (!strcmp(key, "min_recovery_rate")) { + } else if (!strcasecmp(key, "min_recovery_rate")) { rs->print_flags |= DMPF_MIN_RECOVERY_RATE; if (value > INT_MAX) { rs->ti->error = "min_recovery_rate out of range"; return -EINVAL; } rs->md.sync_speed_min = (int)value; - } else if (!strcmp(key, "max_recovery_rate")) { + } else if (!strcasecmp(key, "max_recovery_rate")) { rs->print_flags |= DMPF_MAX_RECOVERY_RATE; if (value > INT_MAX) { rs->ti->error = "max_recovery_rate out of range"; @@ -547,11 +549,12 @@ static int raid_status(struct dm_target *ti, status_type_t type, case STATUSTYPE_TABLE: /* The string you would use to construct this array */ for (i = 0; i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev && + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && !test_bit(In_sync, &rs->dev[i].rdev.flags)) - raid_param_cnt++; /* for rebuilds */ + raid_param_cnt += 2; /* for rebuilds */ - raid_param_cnt += (hweight64(rs->print_flags) * 2); + raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2); if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) raid_param_cnt--; @@ -565,7 +568,8 @@ static int raid_status(struct dm_target *ti, status_type_t type, DMEMIT(" nosync"); for (i = 0; i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev && + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && !test_bit(In_sync, &rs->dev[i].rdev.flags)) DMEMIT(" rebuild %u", i); -- cgit v1.2.3-70-g09d2 From 3e8dbb7f3966c80d77272f8b4f430babc99f6595 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Tue, 2 Aug 2011 12:32:03 +0100 Subject: dm raid: tidy includes A dm target only needs to use include/linux dm headers. Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index efa960ff5ba..d4e95b2e39f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -9,9 +9,10 @@ #include "md.h" #include "raid5.h" -#include "dm.h" #include "bitmap.h" +#include + #define DM_MSG_PREFIX "raid" /* -- cgit v1.2.3-70-g09d2 From c1084561bb85da3630540ebe951749a8cd8fc714 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Tue, 2 Aug 2011 12:32:07 +0100 Subject: dm raid: add region_size parameter Allow the user to specify the region_size. Ensures that the supplied value meets md's constraints, viz. the number of regions does not exceed 2^21. Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/dm-raid.txt | 4 ++ drivers/md/dm-raid.c | 82 +++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 3 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 4f9dd3cecc1..be4419a3078 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -52,6 +52,10 @@ The target is named "raid" and it accepts the following parameters: [max_recovery_rate ] Throttle RAID initialization [max_write_behind ] See '-write-behind=' (man mdadm) [stripe_cache ] Stripe cache size (higher RAIDs only) + [region_size ] + The region_size multiplied by the number of regions is the + logical size of the array. The bitmap records the device + synchronisation state for each region. <#raid_devs>: The number of devices composing the array. Each device consists of two entries. The first is the device diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index d4e95b2e39f..a8a1915a450 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -52,7 +52,7 @@ struct raid_dev { #define DMPF_MAX_RECOVERY_RATE 0x20 #define DMPF_MAX_WRITE_BEHIND 0x40 #define DMPF_STRIPE_CACHE 0x80 - +#define DMPF_REGION_SIZE 0X100 struct raid_set { struct dm_target *ti; @@ -236,6 +236,67 @@ static int dev_parms(struct raid_set *rs, char **argv) return 0; } +/* + * validate_region_size + * @rs + * @region_size: region size in sectors. If 0, pick a size (4MiB default). + * + * Set rs->md.bitmap_info.chunksize (which really refers to 'region size'). + * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_region_size(struct raid_set *rs, unsigned long region_size) +{ + unsigned long min_region_size = rs->ti->len / (1 << 21); + + if (!region_size) { + /* + * Choose a reasonable default. All figures in sectors. + */ + if (min_region_size > (1 << 13)) { + DMINFO("Choosing default region size of %lu sectors", + region_size); + region_size = min_region_size; + } else { + DMINFO("Choosing default region size of 4MiB"); + region_size = 1 << 13; /* sectors */ + } + } else { + /* + * Validate user-supplied value. + */ + if (region_size > rs->ti->len) { + rs->ti->error = "Supplied region size is too large"; + return -EINVAL; + } + + if (region_size < min_region_size) { + DMERR("Supplied region_size (%lu sectors) below minimum (%lu)", + region_size, min_region_size); + rs->ti->error = "Supplied region size is too small"; + return -EINVAL; + } + + if (!is_power_of_2(region_size)) { + rs->ti->error = "Region size is not a power of 2"; + return -EINVAL; + } + + if (region_size < rs->md.chunk_sectors) { + rs->ti->error = "Region size is smaller than the chunk size"; + return -EINVAL; + } + } + + /* + * Convert sectors to bytes. + */ + rs->md.bitmap_info.chunksize = (region_size << 9); + + return 0; +} + /* * Possible arguments are... * RAID456: @@ -249,12 +310,13 @@ static int dev_parms(struct raid_set *rs, char **argv) * [max_recovery_rate ] Throttle RAID initialization * [max_write_behind ] See '-write-behind=' (man mdadm) * [stripe_cache ] Stripe cache size for higher RAIDs + * [region_size ] Defines granularity of bitmap */ static int parse_raid_params(struct raid_set *rs, char **argv, unsigned num_raid_params) { unsigned i, rebuild_cnt = 0; - unsigned long value; + unsigned long value, region_size = 0; char *key; /* @@ -365,6 +427,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } rs->md.sync_speed_max = (int)value; + } else if (!strcasecmp(key, "region_size")) { + rs->print_flags |= DMPF_REGION_SIZE; + region_size = value; } else { DMERR("Unable to parse RAID parameter: %s", key); rs->ti->error = "Unable to parse RAID parameters"; @@ -372,6 +437,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } } + if (validate_region_size(rs, region_size)) + return -EINVAL; + + if (rs->md.chunk_sectors) + rs->ti->split_io = rs->md.chunk_sectors; + else + rs->ti->split_io = region_size; + /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; rs->md.external = 1; @@ -469,7 +542,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; INIT_WORK(&rs->md.event_work, do_table_event); - ti->split_io = rs->md.chunk_sectors; ti->private = rs; mutex_lock(&rs->md.reconfig_mutex); @@ -596,6 +668,10 @@ static int raid_status(struct dm_target *ti, status_type_t type, conf ? conf->max_nr_stripes * 2 : 0); } + if (rs->print_flags & DMPF_REGION_SIZE) + DMEMIT(" region_size %lu", + rs->md.bitmap_info.chunksize >> 9); + DMEMIT(" %d", rs->md.raid_disks); for (i = 0; i < rs->md.raid_disks; i++) { DMEMIT(" -"); /* metadata device */ -- cgit v1.2.3-70-g09d2 From 46bed2b5c16bb7c82e1088d7ae75fb958c8a8c4e Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Tue, 2 Aug 2011 12:32:07 +0100 Subject: dm raid: add write_mostly parameter Add the write_mostly parameter to RAID1 dm-raid tables. This allows the user to set the WriteMostly flag on a RAID1 device that should normally be avoided for read I/O. Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/dm-raid.txt | 4 +++- drivers/md/dm-raid.c | 26 +++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index be4419a3078..a7d1c4abc92 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -50,6 +50,7 @@ The target is named "raid" and it accepts the following parameters: [min_recovery_rate ] Throttle RAID initialization [max_recovery_rate ] Throttle RAID initialization + [write_mostly ] Drive index is write-mostly [max_write_behind ] See '-write-behind=' (man mdadm) [stripe_cache ] Stripe cache size (higher RAIDs only) [region_size ] @@ -87,9 +88,10 @@ Example tables 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 'dmsetup table' displays the table used to construct the mapping. -The optional parameters will always be printed in the order listed +The optional parameters are always printed in the order listed above with "sync" or "nosync" always output ahead of the other arguments, regardless of the order used when originally loading the table. +Arguments that can be repeated are ordered by value. 'dmsetup status' yields information on the state and health of the array. diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index a8a1915a450..88143a0303d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -308,6 +308,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) * [daemon_sleep ] Time between bitmap daemon work to clear bits * [min_recovery_rate ] Throttle RAID initialization * [max_recovery_rate ] Throttle RAID initialization + * [write_mostly ] Indicate a write mostly drive via index * [max_write_behind ] See '-write-behind=' (man mdadm) * [stripe_cache ] Stripe cache size for higher RAIDs * [region_size ] Defines granularity of bitmap @@ -376,7 +377,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv, clear_bit(In_sync, &rs->dev[value].rdev.flags); rs->dev[value].rdev.recovery_offset = 0; rs->print_flags |= DMPF_REBUILD; + } else if (!strcasecmp(key, "write_mostly")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "write_mostly option is only valid for RAID1"; + return -EINVAL; + } + if (value > rs->md.raid_disks) { + rs->ti->error = "Invalid write_mostly drive index given"; + return -EINVAL; + } + set_bit(WriteMostly, &rs->dev[value].rdev.flags); } else if (!strcasecmp(key, "max_write_behind")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "max_write_behind option is only valid for RAID1"; + return -EINVAL; + } rs->print_flags |= DMPF_MAX_WRITE_BEHIND; /* @@ -621,11 +636,15 @@ static int raid_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: /* The string you would use to construct this array */ - for (i = 0; i < rs->md.raid_disks; i++) + for (i = 0; i < rs->md.raid_disks; i++) { if ((rs->print_flags & DMPF_REBUILD) && rs->dev[i].data_dev && !test_bit(In_sync, &rs->dev[i].rdev.flags)) raid_param_cnt += 2; /* for rebuilds */ + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + raid_param_cnt += 2; + } raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2); if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) @@ -656,6 +675,11 @@ static int raid_status(struct dm_target *ti, status_type_t type, if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + DMEMIT(" write_mostly %u", i); + if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) DMEMIT(" max_write_behind %lu", rs->md.bitmap_info.max_write_behind); -- cgit v1.2.3-70-g09d2 From b12d437b73d32203a41fde0d407e91812c866844 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Tue, 2 Aug 2011 12:32:07 +0100 Subject: dm raid: support metadata devices Add the ability to parse and use metadata devices to dm-raid. Although not strictly required, without the metadata devices, many features of RAID are unavailable. They are used to store a superblock and bitmap. The role, or position in the array, of each device must be recorded in its superblock. This is to help with fault handling, array reshaping, and sanity checks. RAID 4/5/6 devices must be loaded in a specific order: in this way, the 'array_position' field helps validate the correctness of the mapping when it is loaded. It can be used during reshaping to identify which devices are added/removed. Fault handling is impossible without this field. For example, when a device fails it is recorded in the superblock. If this is a RAID1 device and the offending device is removed from the array, there must be a way during subsequent array assembly to determine that the failed device was the one removed. This is done by correlating the 'array_position' field and the bit-field variable 'failed_devices'. Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/dm-raid.txt | 12 +- drivers/md/Kconfig | 5 +- drivers/md/dm-raid.c | 419 ++++++++++++++++++++++++++++++-- 3 files changed, 412 insertions(+), 24 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index a7d1c4abc92..2a8c11331d2 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -11,6 +11,7 @@ The target is named "raid" and it accepts the following parameters: <#raid_devs> [.. ] : + raid1 RAID1 mirroring raid4 RAID4 dedicated parity disk raid5_la RAID5 left asymmetric - rotating parity 0 with data continuation @@ -61,8 +62,7 @@ The target is named "raid" and it accepts the following parameters: <#raid_devs>: The number of devices composing the array. Each device consists of two entries. The first is the device containing the metadata (if any); the second is the one containing the - data. Currently, separate metadata devices are not supported and '-' - is required in place of the metadata device. + data. If a drive has failed or is missing at creation time, a '-' can be given for both the metadata and data drives for a given position. @@ -70,7 +70,7 @@ The target is named "raid" and it accepts the following parameters: Example tables -------------- -# RAID4 - 4 data drives, 1 parity +# RAID4 - 4 data drives, 1 parity (no metadata devices) # No metadata devices specified to hold superblock/bitmap info # Chunk size of 1MiB # (Lines separated for easy reading) @@ -79,13 +79,13 @@ Example tables raid4 1 2048 \ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 -# RAID4 - 4 data drives, 1 parity (no metadata devices) +# RAID4 - 4 data drives, 1 parity (with metadata devices) # Chunk size of 1MiB, force RAID initialization, # min recovery rate at 20 kiB/sec/disk 0 1960893648 raid \ - raid4 4 2048 min_recovery_rate 20 sync\ - 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + raid4 4 2048 sync min_recovery_rate 20 \ + 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 'dmsetup table' displays the table used to construct the mapping. The optional parameters are always printed in the order listed diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 8420129fc5e..f75a66e7d31 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -241,12 +241,13 @@ config DM_MIRROR needed for live data migration tools such as 'pvmove'. config DM_RAID - tristate "RAID 4/5/6 target (EXPERIMENTAL)" + tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL + select MD_RAID1 select MD_RAID456 select BLK_DEV_MD ---help--- - A dm target that supports RAID4, RAID5 and RAID6 mappings + A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 88143a0303d..69873806c50 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -16,12 +16,10 @@ #define DM_MSG_PREFIX "raid" /* - * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then - * make it so the flag doesn't set anything. + * The following flags are used by dm-raid.c to set up the array state. + * They must be cleared before md_run is called. */ -#ifndef MD_SYNC_STATE_FORCED -#define MD_SYNC_STATE_FORCED 0 -#endif +#define FirstUse 10 /* rdev flag */ struct raid_dev { /* @@ -149,9 +147,16 @@ static void context_free(struct raid_set *rs) { int i; - for (i = 0; i < rs->md.raid_disks; i++) + for (i = 0; i < rs->md.raid_disks; i++) { + if (rs->dev[i].meta_dev) + dm_put_device(rs->ti, rs->dev[i].meta_dev); + if (rs->dev[i].rdev.sb_page) + put_page(rs->dev[i].rdev.sb_page); + rs->dev[i].rdev.sb_page = NULL; + rs->dev[i].rdev.sb_loaded = 0; if (rs->dev[i].data_dev) dm_put_device(rs->ti, rs->dev[i].data_dev); + } kfree(rs); } @@ -161,7 +166,16 @@ static void context_free(struct raid_set *rs) * : meta device name or '-' if missing * : data device name or '-' if missing * - * This code parses those words. + * The following are permitted: + * - - + * - + * + * + * The following is not allowed: + * - + * + * This code parses those words. If there is a failure, + * the caller must use context_free to unwind the operations. */ static int dev_parms(struct raid_set *rs, char **argv) { @@ -184,8 +198,16 @@ static int dev_parms(struct raid_set *rs, char **argv) rs->dev[i].rdev.mddev = &rs->md; if (strcmp(argv[0], "-")) { - rs->ti->error = "Metadata devices not supported"; - return -EINVAL; + ret = dm_get_device(rs->ti, argv[0], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].meta_dev); + rs->ti->error = "RAID metadata device lookup failure"; + if (ret) + return ret; + + rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); + if (!rs->dev[i].rdev.sb_page) + return -ENOMEM; } if (!strcmp(argv[1], "-")) { @@ -195,6 +217,10 @@ static int dev_parms(struct raid_set *rs, char **argv) return -EINVAL; } + rs->ti->error = "No data device supplied with metadata device"; + if (rs->dev[i].meta_dev) + return -EINVAL; + continue; } @@ -206,6 +232,10 @@ static int dev_parms(struct raid_set *rs, char **argv) return ret; } + if (rs->dev[i].meta_dev) { + metadata_available = 1; + rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; + } rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) @@ -334,22 +364,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv, num_raid_params--; /* - * Second, parse the unordered optional arguments + * We set each individual device as In_sync with a completed + * 'recovery_offset'. If there has been a device failure or + * replacement then one of the following cases applies: + * + * 1) User specifies 'rebuild'. + * - Device is reset when param is read. + * 2) A new device is supplied. + * - No matching superblock found, resets device. + * 3) Device failure was transient and returns on reload. + * - Failure noticed, resets device for bitmap replay. + * 4) Device hadn't completed recovery after previous failure. + * - Superblock is read and overrides recovery_offset. + * + * What is found in the superblocks of the devices is always + * authoritative, unless 'rebuild' or '[no]sync' was specified. */ - for (i = 0; i < rs->md.raid_disks; i++) + for (i = 0; i < rs->md.raid_disks; i++) { set_bit(In_sync, &rs->dev[i].rdev.flags); + rs->dev[i].rdev.recovery_offset = MaxSector; + } + /* + * Second, parse the unordered optional arguments + */ for (i = 0; i < num_raid_params; i++) { if (!strcasecmp(argv[i], "nosync")) { rs->md.recovery_cp = MaxSector; rs->print_flags |= DMPF_NOSYNC; - rs->md.flags |= MD_SYNC_STATE_FORCED; continue; } if (!strcasecmp(argv[i], "sync")) { rs->md.recovery_cp = 0; rs->print_flags |= DMPF_SYNC; - rs->md.flags |= MD_SYNC_STATE_FORCED; continue; } @@ -481,14 +528,345 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) return md_raid5_congested(&rs->md, bits); } +/* + * This structure is never routinely used by userspace, unlike md superblocks. + * Devices with this superblock should only ever be accessed via device-mapper. + */ +#define DM_RAID_MAGIC 0x64526D44 +struct dm_raid_superblock { + __le32 magic; /* "DmRd" */ + __le32 features; /* Used to indicate possible future changes */ + + __le32 num_devices; /* Number of devices in this array. (Max 64) */ + __le32 array_position; /* The position of this drive in the array */ + + __le64 events; /* Incremented by md when superblock updated */ + __le64 failed_devices; /* Bit field of devices to indicate failures */ + + /* + * This offset tracks the progress of the repair or replacement of + * an individual drive. + */ + __le64 disk_recovery_offset; + + /* + * This offset tracks the progress of the initial array + * synchronisation/parity calculation. + */ + __le64 array_resync_offset; + + /* + * RAID characteristics + */ + __le32 level; + __le32 layout; + __le32 stripe_sectors; + + __u8 pad[452]; /* Round struct to 512 bytes. */ + /* Always set to 0 when writing. */ +} __packed; + +static int read_disk_sb(mdk_rdev_t *rdev, int size) +{ + BUG_ON(!rdev->sb_page); + + if (rdev->sb_loaded) + return 0; + + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { + DMERR("Failed to read device superblock"); + return -EINVAL; + } + + rdev->sb_loaded = 1; + + return 0; +} + +static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdk_rdev_t *r, *t; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + + sb = page_address(rdev->sb_page); + failed_devices = le64_to_cpu(sb->failed_devices); + + rdev_for_each(r, t, mddev) + if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) + failed_devices |= (1ULL << r->raid_disk); + + memset(sb, 0, sizeof(*sb)); + + sb->magic = cpu_to_le32(DM_RAID_MAGIC); + sb->features = cpu_to_le32(0); /* No features yet */ + + sb->num_devices = cpu_to_le32(mddev->raid_disks); + sb->array_position = cpu_to_le32(rdev->raid_disk); + + sb->events = cpu_to_le64(mddev->events); + sb->failed_devices = cpu_to_le64(failed_devices); + + sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); + sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); +} + +/* + * super_load + * + * This function creates a superblock if one is not found on the device + * and will decide which superblock to use if there's a choice. + * + * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise + */ +static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev) +{ + int ret; + struct dm_raid_superblock *sb; + struct dm_raid_superblock *refsb; + uint64_t events_sb, events_refsb; + + rdev->sb_start = 0; + rdev->sb_size = sizeof(*sb); + + ret = read_disk_sb(rdev, rdev->sb_size); + if (ret) + return ret; + + sb = page_address(rdev->sb_page); + if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) { + super_sync(rdev->mddev, rdev); + + set_bit(FirstUse, &rdev->flags); + + /* Force writing of superblocks to disk */ + set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); + + /* Any superblock is better than none, choose that if given */ + return refdev ? 0 : 1; + } + + if (!refdev) + return 1; + + events_sb = le64_to_cpu(sb->events); + + refsb = page_address(refdev->sb_page); + events_refsb = le64_to_cpu(refsb->events); + + return (events_sb > events_refsb) ? 1 : 0; +} + +static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int role; + struct raid_set *rs = container_of(mddev, struct raid_set, md); + uint64_t events_sb; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + uint32_t new_devs = 0; + uint32_t rebuilds = 0; + mdk_rdev_t *r, *t; + struct dm_raid_superblock *sb2; + + sb = page_address(rdev->sb_page); + events_sb = le64_to_cpu(sb->events); + failed_devices = le64_to_cpu(sb->failed_devices); + + /* + * Initialise to 1 if this is a new superblock. + */ + mddev->events = events_sb ? : 1; + + /* + * Reshaping is not currently allowed + */ + if ((le32_to_cpu(sb->level) != mddev->level) || + (le32_to_cpu(sb->layout) != mddev->layout) || + (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { + DMERR("Reshaping arrays not yet supported."); + return -EINVAL; + } + + /* We can only change the number of devices in RAID1 right now */ + if ((rs->raid_type->level != 1) && + (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { + DMERR("Reshaping arrays not yet supported."); + return -EINVAL; + } + + if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))) + mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + + /* + * During load, we set FirstUse if a new superblock was written. + * There are two reasons we might not have a superblock: + * 1) The array is brand new - in which case, all of the + * devices must have their In_sync bit set. Also, + * recovery_cp must be 0, unless forced. + * 2) This is a new device being added to an old array + * and the new device needs to be rebuilt - in which + * case the In_sync bit will /not/ be set and + * recovery_cp must be MaxSector. + */ + rdev_for_each(r, t, mddev) { + if (!test_bit(In_sync, &r->flags)) { + if (!test_bit(FirstUse, &r->flags)) + DMERR("Superblock area of " + "rebuild device %d should have been " + "cleared.", r->raid_disk); + set_bit(FirstUse, &r->flags); + rebuilds++; + } else if (test_bit(FirstUse, &r->flags)) + new_devs++; + } + + if (!rebuilds) { + if (new_devs == mddev->raid_disks) { + DMINFO("Superblocks created for new array"); + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + } else if (new_devs) { + DMERR("New device injected " + "into existing array without 'rebuild' " + "parameter specified"); + return -EINVAL; + } + } else if (new_devs) { + DMERR("'rebuild' devices cannot be " + "injected into an array with other first-time devices"); + return -EINVAL; + } else if (mddev->recovery_cp != MaxSector) { + DMERR("'rebuild' specified while array is not in-sync"); + return -EINVAL; + } + + /* + * Now we set the Faulty bit for those devices that are + * recorded in the superblock as failed. + */ + rdev_for_each(r, t, mddev) { + if (!r->sb_page) + continue; + sb2 = page_address(r->sb_page); + sb2->failed_devices = 0; + + /* + * Check for any device re-ordering. + */ + if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { + role = le32_to_cpu(sb2->array_position); + if (role != r->raid_disk) { + if (rs->raid_type->level != 1) { + rs->ti->error = "Cannot change device " + "positions in RAID array"; + return -EINVAL; + } + DMINFO("RAID1 device #%d now at position #%d", + role, r->raid_disk); + } + + /* + * Partial recovery is performed on + * returning failed devices. + */ + if (failed_devices & (1 << role)) + set_bit(Faulty, &r->flags); + } + } + + return 0; +} + +static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct dm_raid_superblock *sb = page_address(rdev->sb_page); + + /* + * If mddev->events is not set, we know we have not yet initialized + * the array. + */ + if (!mddev->events && super_init_validation(mddev, rdev)) + return -EINVAL; + + mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */ + rdev->mddev->bitmap_info.default_offset = 4096 >> 9; + if (!test_bit(FirstUse, &rdev->flags)) { + rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); + if (rdev->recovery_offset != MaxSector) + clear_bit(In_sync, &rdev->flags); + } + + /* + * If a device comes back, set it as not In_sync and no longer faulty. + */ + if (test_bit(Faulty, &rdev->flags)) { + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->recovery_offset = 0; + } + + clear_bit(FirstUse, &rdev->flags); + + return 0; +} + +/* + * Analyse superblocks and select the freshest. + */ +static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) +{ + int ret; + mdk_rdev_t *rdev, *freshest, *tmp; + mddev_t *mddev = &rs->md; + + freshest = NULL; + rdev_for_each(rdev, tmp, mddev) { + if (!rdev->meta_bdev) + continue; + + ret = super_load(rdev, freshest); + + switch (ret) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + ti->error = "Failed to load superblock"; + return ret; + } + } + + if (!freshest) + return 0; + + /* + * Validation of the freshest device provides the source of + * validation for the remaining devices. + */ + ti->error = "Unable to assemble array: Invalid superblocks"; + if (super_validate(mddev, freshest)) + return -EINVAL; + + rdev_for_each(rdev, tmp, mddev) + if ((rdev != freshest) && super_validate(mddev, rdev)) + return -EINVAL; + + return 0; +} + /* * Construct a RAID4/5/6 mapping: * Args: * <#raid_params> \ * <#raid_devs> { .. } * - * ** metadata devices are not supported yet, use '-' instead ** - * * varies by . See 'parse_raid_params' for * details on possible . */ @@ -556,6 +934,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) if (ret) goto bad; + rs->md.sync_super = super_sync; + ret = analyse_superblocks(ti, rs); + if (ret) + goto bad; + INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -698,7 +1081,10 @@ static int raid_status(struct dm_target *ti, status_type_t type, DMEMIT(" %d", rs->md.raid_disks); for (i = 0; i < rs->md.raid_disks; i++) { - DMEMIT(" -"); /* metadata device */ + if (rs->dev[i].meta_dev) + DMEMIT(" %s", rs->dev[i].meta_dev->name); + else + DMEMIT(" -"); if (rs->dev[i].data_dev) DMEMIT(" %s", rs->dev[i].data_dev->name); @@ -755,6 +1141,7 @@ static void raid_resume(struct dm_target *ti) { struct raid_set *rs = ti->private; + bitmap_load(&rs->md); mddev_resume(&rs->md); } -- cgit v1.2.3-70-g09d2 From 327372797c88b24953f454cd51a3734c02697bdd Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Tue, 2 Aug 2011 12:32:07 +0100 Subject: dm raid: add md raid1 support Support the MD RAID1 personality through dm-raid. Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid.c | 49 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 10 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 69873806c50..a002dd85db1 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -8,6 +8,7 @@ #include #include "md.h" +#include "raid1.h" #include "raid5.h" #include "bitmap.h" @@ -72,6 +73,7 @@ static struct raid_type { const unsigned level; /* RAID level. */ const unsigned algorithm; /* RAID algorithm. */ } raid_types[] = { + {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, @@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra } sectors_per_dev = ti->len; - if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { + if ((raid_type->level > 1) && + sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { ti->error = "Target length not divisible by number of data devices"; return ERR_PTR(-EINVAL); } @@ -329,13 +332,16 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) /* * Possible arguments are... - * RAID456: * [optional_args] * - * Optional args: - * [[no]sync] Force or prevent recovery of the entire array + * Argument definitions + * The number of sectors per disk that + * will form the "stripe" + * [[no]sync] Force or prevent recovery of the + * entire array * [rebuild ] Rebuild the drive indicated by the index - * [daemon_sleep ] Time between bitmap daemon work to clear bits + * [daemon_sleep ] Time between bitmap daemon work to + * clear bits * [min_recovery_rate ] Throttle RAID initialization * [max_recovery_rate ] Throttle RAID initialization * [write_mostly ] Indicate a write mostly drive via index @@ -352,11 +358,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv, /* * First, parse the in-order required arguments + * "chunk_size" is the only argument of this type. */ - if ((strict_strtoul(argv[0], 10, &value) < 0) || - !is_power_of_2(value) || (value < 8)) { + if ((strict_strtoul(argv[0], 10, &value) < 0)) { rs->ti->error = "Bad chunk size"; return -EINVAL; + } else if (rs->raid_type->level == 1) { + if (value) + DMERR("Ignoring chunk size parameter for RAID 1"); + value = 0; + } else if (!is_power_of_2(value)) { + rs->ti->error = "Chunk size must be a power of 2"; + return -EINVAL; + } else if (value < 8) { + rs->ti->error = "Chunk size value is too small"; + return -EINVAL; } rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; @@ -413,8 +429,12 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } if (!strcasecmp(key, "rebuild")) { - if (++rebuild_cnt > rs->raid_type->parity_devs) { - rs->ti->error = "Too many rebuild drives given"; + rebuild_cnt++; + if (((rs->raid_type->level != 1) && + (rebuild_cnt > rs->raid_type->parity_devs)) || + ((rs->raid_type->level == 1) && + (rebuild_cnt > (rs->md.raid_disks - 1)))) { + rs->ti->error = "Too many rebuild devices specified for given RAID type"; return -EINVAL; } if (value > rs->md.raid_disks) { @@ -507,6 +527,11 @@ static int parse_raid_params(struct raid_set *rs, char **argv, else rs->ti->split_io = region_size; + if (rs->md.chunk_sectors) + rs->ti->split_io = rs->md.chunk_sectors; + else + rs->ti->split_io = region_size; + /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; rs->md.external = 1; @@ -525,6 +550,9 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) { struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + if (rs->raid_type->level == 1) + return md_raid1_congested(&rs->md, bits); + return md_raid5_congested(&rs->md, bits); } @@ -955,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) rs->callbacks.congested_fn = raid_is_congested; dm_table_add_target_callbacks(ti->table, &rs->callbacks); + mddev_suspend(&rs->md); return 0; bad: @@ -1147,7 +1176,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit v1.2.3-70-g09d2