summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig5
-rw-r--r--drivers/md/bitmap.c137
-rw-r--r--drivers/md/bitmap.h5
-rw-r--r--drivers/md/dm-crypt.c64
-rw-r--r--drivers/md/dm-flakey.c270
-rw-r--r--drivers/md/dm-io.c29
-rw-r--r--drivers/md/dm-ioctl.c89
-rw-r--r--drivers/md/dm-kcopyd.c44
-rw-r--r--drivers/md/dm-log-userspace-base.c3
-rw-r--r--drivers/md/dm-log.c32
-rw-r--r--drivers/md/dm-mpath.c149
-rw-r--r--drivers/md/dm-queue-length.c2
-rw-r--r--drivers/md/dm-raid.c621
-rw-r--r--drivers/md/dm-snap-persistent.c80
-rw-r--r--drivers/md/dm-snap.c84
-rw-r--r--drivers/md/dm-table.c157
-rw-r--r--drivers/md/dm.c75
-rw-r--r--drivers/md/dm.h2
-rw-r--r--drivers/md/md.c871
-rw-r--r--drivers/md/md.h110
-rw-r--r--drivers/md/raid1.c962
-rw-r--r--drivers/md/raid1.h26
-rw-r--r--drivers/md/raid10.c1183
-rw-r--r--drivers/md/raid10.h21
-rw-r--r--drivers/md/raid5.c1015
-rw-r--r--drivers/md/raid5.h99
26 files changed, 4416 insertions, 1719 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8420129fc5e..f75a66e7d31 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -241,12 +241,13 @@ config DM_MIRROR
needed for live data migration tools such as 'pvmove'.
config DM_RAID
- tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+ tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
depends on BLK_DEV_DM && EXPERIMENTAL
+ select MD_RAID1
select MD_RAID456
select BLK_DEV_MD
---help---
- A dm target that supports RAID4, RAID5 and RAID6 mappings
+ A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
A RAID-5 set of N drives with a capacity of C MB per drive provides
the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 574b09afedd..0dc6546b77a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -29,7 +29,6 @@
#include "md.h"
#include "bitmap.h"
-#include <linux/dm-dirty-log.h>
/* debug macros */
#define DEBUG 0
@@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
* 0 or page 1
*/
static inline struct page *filemap_get_page(struct bitmap *bitmap,
- unsigned long chunk)
+ unsigned long chunk)
{
- if (bitmap->filemap == NULL)
- return NULL;
if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
return NULL;
return bitmap->filemap[file_page_index(bitmap, chunk)
@@ -878,28 +875,19 @@ enum bitmap_page_attr {
static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
enum bitmap_page_attr attr)
{
- if (page)
- __set_bit((page->index<<2) + attr, bitmap->filemap_attr);
- else
- __set_bit(attr, &bitmap->logattrs);
+ __set_bit((page->index<<2) + attr, bitmap->filemap_attr);
}
static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
enum bitmap_page_attr attr)
{
- if (page)
- __clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
- else
- __clear_bit(attr, &bitmap->logattrs);
+ __clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
}
static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
enum bitmap_page_attr attr)
{
- if (page)
- return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
- else
- return test_bit(attr, &bitmap->logattrs);
+ return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
}
/*
@@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p
static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
{
unsigned long bit;
- struct page *page = NULL;
+ struct page *page;
void *kaddr;
unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
- if (!bitmap->filemap) {
- struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
- if (log)
- log->type->mark_region(log, chunk);
- } else {
+ if (!bitmap->filemap)
+ return;
- page = filemap_get_page(bitmap, chunk);
- if (!page)
- return;
- bit = file_page_offset(bitmap, chunk);
+ page = filemap_get_page(bitmap, chunk);
+ if (!page)
+ return;
+ bit = file_page_offset(bitmap, chunk);
- /* set the bit */
- kaddr = kmap_atomic(page, KM_USER0);
- if (bitmap->flags & BITMAP_HOSTENDIAN)
- set_bit(bit, kaddr);
- else
- __test_and_set_bit_le(bit, kaddr);
- kunmap_atomic(kaddr, KM_USER0);
- PRINTK("set file bit %lu page %lu\n", bit, page->index);
- }
+ /* set the bit */
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (bitmap->flags & BITMAP_HOSTENDIAN)
+ set_bit(bit, kaddr);
+ else
+ __set_bit_le(bit, kaddr);
+ kunmap_atomic(kaddr, KM_USER0);
+ PRINTK("set file bit %lu page %lu\n", bit, page->index);
/* record page number so it gets flushed to disk when unplug occurs */
set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
}
@@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap)
if (!bitmap)
return;
- if (!bitmap->filemap) {
- /* Must be using a dirty_log */
- struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
- dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs);
- need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs);
- if (dirty || need_write)
- if (log->type->flush(log))
- bitmap->flags |= BITMAP_WRITE_ERROR;
- goto out;
- }
/* look at each page to see if there are any set bits that need to be
* flushed out to disk */
@@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap)
else
md_super_wait(bitmap->mddev);
}
-out:
if (bitmap->flags & BITMAP_WRITE_ERROR)
bitmap_file_kick(bitmap);
}
@@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev)
struct page *page = NULL, *lastpage = NULL;
sector_t blocks;
void *paddr;
- struct dm_dirty_log *log = mddev->bitmap_info.log;
/* Use a mutex to guard daemon_work against
* bitmap_destroy.
@@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev)
spin_lock_irqsave(&bitmap->lock, flags);
for (j = 0; j < bitmap->chunks; j++) {
bitmap_counter_t *bmc;
- if (!bitmap->filemap) {
- if (!log)
- /* error or shutdown */
- break;
- } else
- page = filemap_get_page(bitmap, j);
+ if (!bitmap->filemap)
+ /* error or shutdown */
+ break;
+
+ page = filemap_get_page(bitmap, j);
if (page != lastpage) {
/* skip this page unless it's marked as needing cleaning */
@@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev)
-1);
/* clear the bit */
- if (page) {
- paddr = kmap_atomic(page, KM_USER0);
- if (bitmap->flags & BITMAP_HOSTENDIAN)
- clear_bit(file_page_offset(bitmap, j),
- paddr);
- else
- __test_and_clear_bit_le(file_page_offset(bitmap, j),
- paddr);
- kunmap_atomic(paddr, KM_USER0);
- } else
- log->type->clear_region(log, j);
+ paddr = kmap_atomic(page, KM_USER0);
+ if (bitmap->flags & BITMAP_HOSTENDIAN)
+ clear_bit(file_page_offset(bitmap, j),
+ paddr);
+ else
+ __clear_bit_le(
+ file_page_offset(bitmap,
+ j),
+ paddr);
+ kunmap_atomic(paddr, KM_USER0);
}
} else
j |= PAGE_COUNTER_MASK;
@@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev)
spin_unlock_irqrestore(&bitmap->lock, flags);
/* now sync the final page */
- if (lastpage != NULL || log != NULL) {
+ if (lastpage != NULL) {
spin_lock_irqsave(&bitmap->lock, flags);
if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
- if (lastpage)
- write_page(bitmap, lastpage, 0);
- else
- if (log->type->flush(log))
- bitmap->flags |= BITMAP_WRITE_ERROR;
+ write_page(bitmap, lastpage, 0);
} else {
set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev)
BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
if (!file
- && !mddev->bitmap_info.offset
- && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */
+ && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
return 0;
BUG_ON(file && mddev->bitmap_info.offset);
- BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log);
bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
if (!bitmap)
@@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev)
int bitmap_load(mddev_t *mddev)
{
int err = 0;
+ sector_t start = 0;
sector_t sector = 0;
struct bitmap *bitmap = mddev->bitmap;
@@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev)
}
bitmap_close_sync(bitmap);
- if (mddev->bitmap_info.log) {
- unsigned long i;
- struct dm_dirty_log *log = mddev->bitmap_info.log;
- for (i = 0; i < bitmap->chunks; i++)
- if (!log->type->in_sync(log, i, 1))
- bitmap_set_memory_bits(bitmap,
- (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
- 1);
- } else {
- sector_t start = 0;
- if (mddev->degraded == 0
- || bitmap->events_cleared == mddev->events)
- /* no need to keep dirty bits to optimise a
- * re-add of a missing device */
- start = mddev->recovery_cp;
-
- err = bitmap_init_from_disk(bitmap, start);
- }
+ if (mddev->degraded == 0
+ || bitmap->events_cleared == mddev->events)
+ /* no need to keep dirty bits to optimise a
+ * re-add of a missing device */
+ start = mddev->recovery_cp;
+
+ err = bitmap_init_from_disk(bitmap, start);
+
if (err)
goto out;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index b2a127e891a..a28f2e5588c 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -212,10 +212,6 @@ struct bitmap {
unsigned long file_pages; /* number of pages in the file */
int last_page_size; /* bytes in the last page */
- unsigned long logattrs; /* used when filemap_attr doesn't exist
- * because we are working with a dirty_log
- */
-
unsigned long flags;
int allclean;
@@ -237,7 +233,6 @@ struct bitmap {
wait_queue_head_t behind_wait;
struct sysfs_dirent *sysfs_can_clear;
-
};
/* the bitmap API */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index c8827ffd85b..49da55c1528 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -19,7 +19,7 @@
#include <linux/workqueue.h>
#include <linux/backing-dev.h>
#include <linux/percpu.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/scatterlist.h>
#include <asm/page.h>
#include <asm/unaligned.h>
@@ -30,7 +30,6 @@
#include <linux/device-mapper.h>
#define DM_MSG_PREFIX "crypt"
-#define MESG_STR(x) x, sizeof(x)
/*
* context holding the current state of a multi-part conversion
@@ -239,7 +238,7 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
memset(iv, 0, cc->iv_size);
- *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
+ *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
return 0;
}
@@ -248,7 +247,7 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
memset(iv, 0, cc->iv_size);
- *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+ *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
return 0;
}
@@ -415,7 +414,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
memset(iv, 0, cc->iv_size);
- *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+ *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
return 0;
@@ -1575,11 +1574,17 @@ bad_mem:
static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct crypt_config *cc;
- unsigned int key_size;
+ unsigned int key_size, opt_params;
unsigned long long tmpll;
int ret;
+ struct dm_arg_set as;
+ const char *opt_string;
+
+ static struct dm_arg _args[] = {
+ {0, 1, "Invalid number of feature args"},
+ };
- if (argc != 5) {
+ if (argc < 5) {
ti->error = "Not enough arguments";
return -EINVAL;
}
@@ -1648,6 +1653,30 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
cc->start = tmpll;
+ argv += 5;
+ argc -= 5;
+
+ /* Optional parameters */
+ if (argc) {
+ as.argc = argc;
+ as.argv = argv;
+
+ ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+ if (ret)
+ goto bad;
+
+ opt_string = dm_shift_arg(&as);
+
+ if (opt_params == 1 && opt_string &&
+ !strcasecmp(opt_string, "allow_discards"))
+ ti->num_discard_requests = 1;
+ else if (opt_params) {
+ ret = -EINVAL;
+ ti->error = "Invalid feature arguments";
+ goto bad;
+ }
+ }
+
ret = -ENOMEM;
cc->io_queue = alloc_workqueue("kcryptd_io",
WQ_NON_REENTRANT|
@@ -1682,9 +1711,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
struct dm_crypt_io *io;
struct crypt_config *cc;
- if (bio->bi_rw & REQ_FLUSH) {
+ /*
+ * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
+ * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight
+ * - for REQ_DISCARD caller must use flush if IO ordering matters
+ */
+ if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
cc = ti->private;
bio->bi_bdev = cc->dev->bdev;
+ if (bio_sectors(bio))
+ bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
return DM_MAPIO_REMAPPED;
}
@@ -1727,6 +1763,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
cc->dev->name, (unsigned long long)cc->start);
+
+ if (ti->num_discard_requests)
+ DMEMIT(" 1 allow_discards");
+
break;
}
return 0;
@@ -1770,12 +1810,12 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
if (argc < 2)
goto error;
- if (!strnicmp(argv[0], MESG_STR("key"))) {
+ if (!strcasecmp(argv[0], "key")) {
if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
DMWARN("not suspended during key manipulation.");
return -EINVAL;
}
- if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
+ if (argc == 3 && !strcasecmp(argv[1], "set")) {
ret = crypt_set_key(cc, argv[2]);
if (ret)
return ret;
@@ -1783,7 +1823,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
ret = cc->iv_gen_ops->init(cc);
return ret;
}
- if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
+ if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
ret = cc->iv_gen_ops->wipe(cc);
if (ret)
@@ -1823,7 +1863,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 10, 0},
+ .version = {1, 11, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index ea790623c30..89f73ca22cf 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2003 Sistina Software (UK) Limited.
- * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@@ -15,6 +15,9 @@
#define DM_MSG_PREFIX "flakey"
+#define all_corrupt_bio_flags_match(bio, fc) \
+ (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
+
/*
* Flakey: Used for testing only, simulates intermittent,
* catastrophic device failure.
@@ -25,60 +28,189 @@ struct flakey_c {
sector_t start;
unsigned up_interval;
unsigned down_interval;
+ unsigned long flags;
+ unsigned corrupt_bio_byte;
+ unsigned corrupt_bio_rw;
+ unsigned corrupt_bio_value;
+ unsigned corrupt_bio_flags;
+};
+
+enum feature_flag_bits {
+ DROP_WRITES
};
+static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
+ struct dm_target *ti)
+{
+ int r;
+ unsigned argc;
+ const char *arg_name;
+
+ static struct dm_arg _args[] = {
+ {0, 6, "Invalid number of feature args"},
+ {1, UINT_MAX, "Invalid corrupt bio byte"},
+ {0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
+ {0, UINT_MAX, "Invalid corrupt bio flags mask"},
+ };
+
+ /* No feature arguments supplied. */
+ if (!as->argc)
+ return 0;
+
+ r = dm_read_arg_group(_args, as, &argc, &ti->error);
+ if (r)
+ return r;
+
+ while (argc) {
+ arg_name = dm_shift_arg(as);
+ argc--;
+
+ /*
+ * drop_writes
+ */
+ if (!strcasecmp(arg_name, "drop_writes")) {
+ if (test_and_set_bit(DROP_WRITES, &fc->flags)) {
+ ti->error = "Feature drop_writes duplicated";
+ return -EINVAL;
+ }
+
+ continue;
+ }
+
+ /*
+ * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
+ */
+ if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
+ if (!argc)
+ ti->error = "Feature corrupt_bio_byte requires parameters";
+
+ r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error);
+ if (r)
+ return r;
+ argc--;
+
+ /*
+ * Direction r or w?
+ */
+ arg_name = dm_shift_arg(as);
+ if (!strcasecmp(arg_name, "w"))
+ fc->corrupt_bio_rw = WRITE;
+ else if (!strcasecmp(arg_name, "r"))
+ fc->corrupt_bio_rw = READ;
+ else {
+ ti->error = "Invalid corrupt bio direction (r or w)";
+ return -EINVAL;
+ }
+ argc--;
+
+ /*
+ * Value of byte (0-255) to write in place of correct one.
+ */
+ r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error);
+ if (r)
+ return r;
+ argc--;
+
+ /*
+ * Only corrupt bios with these flags set.
+ */
+ r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error);
+ if (r)
+ return r;
+ argc--;
+
+ continue;
+ }
+
+ ti->error = "Unrecognised flakey feature requested";
+ return -EINVAL;
+ }
+
+ if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
+ ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/*
- * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval>
+ * Construct a flakey mapping:
+ * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*]
+ *
+ * Feature args:
+ * [drop_writes]
+ * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>]
+ *
+ * Nth_byte starts from 1 for the first byte.
+ * Direction is r for READ or w for WRITE.
+ * bio_flags is ignored if 0.
*/
static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
+ static struct dm_arg _args[] = {
+ {0, UINT_MAX, "Invalid up interval"},
+ {0, UINT_MAX, "Invalid down interval"},
+ };
+
+ int r;
struct flakey_c *fc;
- unsigned long long tmp;
+ unsigned long long tmpll;
+ struct dm_arg_set as;
+ const char *devname;
- if (argc != 4) {
- ti->error = "dm-flakey: Invalid argument count";
+ as.argc = argc;
+ as.argv = argv;
+
+ if (argc < 4) {
+ ti->error = "Invalid argument count";
return -EINVAL;
}
- fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+ fc = kzalloc(sizeof(*fc), GFP_KERNEL);
if (!fc) {
- ti->error = "dm-flakey: Cannot allocate linear context";
+ ti->error = "Cannot allocate linear context";
return -ENOMEM;
}
fc->start_time = jiffies;
- if (sscanf(argv[1], "%llu", &tmp) != 1) {
- ti->error = "dm-flakey: Invalid device sector";
+ devname = dm_shift_arg(&as);
+
+ if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
+ ti->error = "Invalid device sector";
goto bad;
}
- fc->start = tmp;
+ fc->start = tmpll;
- if (sscanf(argv[2], "%u", &fc->up_interval) != 1) {
- ti->error = "dm-flakey: Invalid up interval";
+ r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error);
+ if (r)
goto bad;
- }
- if (sscanf(argv[3], "%u", &fc->down_interval) != 1) {
- ti->error = "dm-flakey: Invalid down interval";
+ r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error);
+ if (r)
goto bad;
- }
if (!(fc->up_interval + fc->down_interval)) {
- ti->error = "dm-flakey: Total (up + down) interval is zero";
+ ti->error = "Total (up + down) interval is zero";
goto bad;
}
if (fc->up_interval + fc->down_interval < fc->up_interval) {
- ti->error = "dm-flakey: Interval overflow";
+ ti->error = "Interval overflow";
goto bad;
}
- if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) {
- ti->error = "dm-flakey: Device lookup failed";
+ r = parse_features(&as, fc, ti);
+ if (r)
+ goto bad;
+
+ if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) {
+ ti->error = "Device lookup failed";
goto bad;
}
ti->num_flush_requests = 1;
+ ti->num_discard_requests = 1;
ti->private = fc;
return 0;
@@ -99,7 +231,7 @@ static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
{
struct flakey_c *fc = ti->private;
- return fc->start + (bi_sector - ti->begin);
+ return fc->start + dm_target_offset(ti, bi_sector);
}
static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
@@ -111,6 +243,25 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
}
+static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
+{
+ unsigned bio_bytes = bio_cur_bytes(bio);
+ char *data = bio_data(bio);
+
+ /*
+ * Overwrite the Nth byte of the data returned.
+ */
+ if (data && bio_bytes >= fc->corrupt_bio_byte) {
+ data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
+
+ DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
+ "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
+ bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
+ (bio_data_dir(bio) == WRITE) ? 'w' : 'r',
+ bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes);
+ }
+}
+
static int flakey_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
@@ -119,18 +270,71 @@ static int flakey_map(struct dm_target *ti, struct bio *bio,
/* Are we alive ? */
elapsed = (jiffies - fc->start_time) / HZ;
- if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval)
+ if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
+ /*
+ * Flag this bio as submitted while down.
+ */
+ map_context->ll = 1;
+
+ /*
+ * Map reads as normal.
+ */
+ if (bio_data_dir(bio) == READ)
+ goto map_bio;
+
+ /*
+ * Drop writes?
+ */
+ if (test_bit(DROP_WRITES, &fc->flags)) {
+ bio_endio(bio, 0);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /*
+ * Corrupt matching writes.
+ */
+ if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) {
+ if (all_corrupt_bio_flags_match(bio, fc))
+ corrupt_bio_data(bio, fc);
+ goto map_bio;
+ }
+
+ /*
+ * By default, error all I/O.
+ */
return -EIO;
+ }
+map_bio:
flakey_map_bio(ti, bio);
return DM_MAPIO_REMAPPED;
}
+static int flakey_end_io(struct dm_target *ti, struct bio *bio,
+ int error, union map_info *map_context)
+{
+ struct flakey_c *fc = ti->private;
+ unsigned bio_submitted_while_down = map_context->ll;
+
+ /*
+ * Corrupt successful READs while in down state.
+ * If flags were specified, only corrupt those that match.
+ */
+ if (!error && bio_submitted_while_down &&
+ (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
+ all_corrupt_bio_flags_match(bio, fc))
+ corrupt_bio_data(bio, fc);
+
+ return error;
+}
+
static int flakey_status(struct dm_target *ti, status_type_t type,
char *result, unsigned int maxlen)
{
+ unsigned sz = 0;
struct flakey_c *fc = ti->private;
+ unsigned drop_writes;
switch (type) {
case STATUSTYPE_INFO:
@@ -138,9 +342,22 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
- snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name,
- (unsigned long long)fc->start, fc->up_interval,
- fc->down_interval);
+ DMEMIT("%s %llu %u %u ", fc->dev->name,
+ (unsigned long long)fc->start, fc->up_interval,
+ fc->down_interval);
+
+ drop_writes = test_bit(DROP_WRITES, &fc->flags);
+ DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5);
+
+ if (drop_writes)
+ DMEMIT("drop_writes ");
+
+ if (fc->corrupt_bio_byte)
+ DMEMIT("corrupt_bio_byte %u %c %u %u ",
+ fc->corrupt_bio_byte,
+ (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r',
+ fc->corrupt_bio_value, fc->corrupt_bio_flags);
+
break;
}
return 0;
@@ -177,11 +394,12 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
static struct target_type flakey_target = {
.name = "flakey",
- .version = {1, 1, 0},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = flakey_ctr,
.dtr = flakey_dtr,
.map = flakey_map,
+ .end_io = flakey_end_io,
.status = flakey_status,
.ioctl = flakey_ioctl,
.merge = flakey_merge,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2067288f61f..ad2eba40e31 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -38,6 +38,8 @@ struct io {
struct dm_io_client *client;
io_notify_fn callback;
void *context;
+ void *vma_invalidate_address;
+ unsigned long vma_invalidate_size;
} __attribute__((aligned(DM_IO_MAX_REGIONS)));
static struct kmem_cache *_dm_io_cache;
@@ -116,6 +118,10 @@ static void dec_count(struct io *io, unsigned int region, int error)
set_bit(region, &io->error_bits);
if (atomic_dec_and_test(&io->count)) {
+ if (io->vma_invalidate_size)
+ invalidate_kernel_vmap_range(io->vma_invalidate_address,
+ io->vma_invalidate_size);
+
if (io->sleeper)
wake_up_process(io->sleeper);
@@ -159,6 +165,9 @@ struct dpages {
unsigned context_u;
void *context_ptr;
+
+ void *vma_invalidate_address;
+ unsigned long vma_invalidate_size;
};
/*
@@ -377,6 +386,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
io->sleeper = current;
io->client = client;
+ io->vma_invalidate_address = dp->vma_invalidate_address;
+ io->vma_invalidate_size = dp->vma_invalidate_size;
+
dispatch_io(rw, num_regions, where, dp, io, 1);
while (1) {
@@ -415,13 +427,21 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
io->callback = fn;
io->context = context;
+ io->vma_invalidate_address = dp->vma_invalidate_address;
+ io->vma_invalidate_size = dp->vma_invalidate_size;
+
dispatch_io(rw, num_regions, where, dp, io, 0);
return 0;
}
-static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
+static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
+ unsigned long size)
{
/* Set up dpages based on memory type */
+
+ dp->vma_invalidate_address = NULL;
+ dp->vma_invalidate_size = 0;
+
switch (io_req->mem.type) {
case DM_IO_PAGE_LIST:
list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
@@ -432,6 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
break;
case DM_IO_VMA:
+ flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
+ if ((io_req->bi_rw & RW_MASK) == READ) {
+ dp->vma_invalidate_address = io_req->mem.ptr.vma;
+ dp->vma_invalidate_size = size;
+ }
vm_dp_init(dp, io_req->mem.ptr.vma);
break;
@@ -460,7 +485,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
int r;
struct dpages dp;
- r = dp_init(io_req, &dp);
+ r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
if (r)
return r;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4cacdad2270..2e9a3ca37bd 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -128,6 +128,24 @@ static struct hash_cell *__get_uuid_cell(const char *str)
return NULL;
}
+static struct hash_cell *__get_dev_cell(uint64_t dev)
+{
+ struct mapped_device *md;
+ struct hash_cell *hc;
+
+ md = dm_get_md(huge_decode_dev(dev));
+ if (!md)
+ return NULL;
+
+ hc = dm_get_mdptr(md);
+ if (!hc) {
+ dm_put(md);
+ return NULL;
+ }
+
+ return hc;
+}
+
/*-----------------------------------------------------------------
* Inserting, removing and renaming a device.
*---------------------------------------------------------------*/
@@ -718,25 +736,45 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
*/
static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
{
- struct mapped_device *md;
- void *mdptr = NULL;
+ struct hash_cell *hc = NULL;
- if (*param->uuid)
- return __get_uuid_cell(param->uuid);
+ if (*param->uuid) {
+ if (*param->name || param->dev)
+ return NULL;
- if (*param->name)
- return __get_name_cell(param->name);
+ hc = __get_uuid_cell(param->uuid);
+ if (!hc)
+ return NULL;
+ } else if (*param->name) {
+ if (param->dev)
+ return NULL;
- md = dm_get_md(huge_decode_dev(param->dev));
- if (!md)
- goto out;
+ hc = __get_name_cell(param->name);
+ if (!hc)
+ return NULL;
+ } else if (param->dev) {
+ hc = __get_dev_cell(param->dev);
+ if (!hc)
+ return NULL;
+ } else
+ return NULL;
- mdptr = dm_get_mdptr(md);
- if (!mdptr)
- dm_put(md);
+ /*
+ * Sneakily write in both the name and the uuid
+ * while we have the cell.
+ */
+ strlcpy(param->name, hc->name, sizeof(param->name));
+ if (hc->uuid)
+ strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
+ else
+ param->uuid[0] = '\0';
-out:
- return mdptr;
+ if (hc->new_map)
+ param->flags |= DM_INACTIVE_PRESENT_FLAG;
+ else
+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+
+ return hc;
}
static struct mapped_device *find_device(struct dm_ioctl *param)
@@ -746,24 +784,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
down_read(&_hash_lock);
hc = __find_device_hash_cell(param);
- if (hc) {
+ if (hc)
md = hc->md;
-
- /*
- * Sneakily write in both the name and the uuid
- * while we have the cell.
- */
- strlcpy(param->name, hc->name, sizeof(param->name));
- if (hc->uuid)
- strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
- else
- param->uuid[0] = '\0';
-
- if (hc->new_map)
- param->flags |= DM_INACTIVE_PRESENT_FLAG;
- else
- param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
- }
up_read(&_hash_lock);
return md;
@@ -1402,6 +1424,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
goto out;
}
+ if (!argc) {
+ DMWARN("Empty message received.");
+ goto out;
+ }
+
table = dm_get_live_table(md);
if (!table)
goto out_argv;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 819e37eaaeb..f8214702963 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -10,7 +10,7 @@
*/
#include <linux/types.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -224,8 +224,6 @@ struct kcopyd_job {
unsigned int num_dests;
struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];
- sector_t offset;
- unsigned int nr_pages;
struct page_list *pages;
/*
@@ -380,7 +378,7 @@ static int run_io_job(struct kcopyd_job *job)
.bi_rw = job->rw,
.mem.type = DM_IO_PAGE_LIST,
.mem.ptr.pl = job->pages,
- .mem.offset = job->offset,
+ .mem.offset = 0,
.notify.fn = complete_io,
.notify.context = job,
.client = job->kc->io_client,
@@ -397,10 +395,9 @@ static int run_io_job(struct kcopyd_job *job)
static int run_pages_job(struct kcopyd_job *job)
{
int r;
+ unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);
- job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
- PAGE_SIZE >> 9);
- r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
+ r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
if (!r) {
/* this job is ready for io */
push(&job->kc->io_jobs, job);
@@ -602,8 +599,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->num_dests = num_dests;
memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
- job->offset = 0;
- job->nr_pages = 0;
job->pages = NULL;
job->fn = fn;
@@ -622,6 +617,37 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
}
EXPORT_SYMBOL(dm_kcopyd_copy);
+void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
+ dm_kcopyd_notify_fn fn, void *context)
+{
+ struct kcopyd_job *job;
+
+ job = mempool_alloc(kc->job_pool, GFP_NOIO);
+
+ memset(job, 0, sizeof(struct kcopyd_job));
+ job->kc = kc;
+ job->fn = fn;
+ job->context = context;
+
+ atomic_inc(&kc->nr_jobs);
+
+ return job;
+}
+EXPORT_SYMBOL(dm_kcopyd_prepare_callback);
+
+void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
+{
+ struct kcopyd_job *job = j;
+ struct dm_kcopyd_client *kc = job->kc;
+
+ job->read_err = read_err;
+ job->write_err = write_err;
+
+ push(&kc->complete_jobs, job);
+ wake(kc);
+}
+EXPORT_SYMBOL(dm_kcopyd_do_callback);
+
/*
* Cancels a kcopyd job, eg. someone might be deactivating a
* mirror.
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index aa2e0c374ab..1021c898601 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -394,8 +394,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
group[count] = fe->region;
count++;
- list_del(&fe->list);
- list_add(&fe->list, &tmp_list);
+ list_move(&fe->list, &tmp_list);
type = fe->type;
if (count >= MAX_FLUSH_GROUP_COUNT)
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 948e3f4925b..3b52bb72bd1 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -197,15 +197,21 @@ EXPORT_SYMBOL(dm_dirty_log_destroy);
#define MIRROR_DISK_VERSION 2
#define LOG_OFFSET 2
-struct log_header {
- uint32_t magic;
+struct log_header_disk {
+ __le32 magic;
/*
* Simple, incrementing version. no backward
* compatibility.
*/
+ __le32 version;
+ __le64 nr_regions;
+} __packed;
+
+struct log_header_core {
+ uint32_t magic;
uint32_t version;
- sector_t nr_regions;
+ uint64_t nr_regions;
};
struct log_c {
@@ -239,10 +245,10 @@ struct log_c {
int log_dev_failed;
int log_dev_flush_failed;
struct dm_dev *log_dev;
- struct log_header header;
+ struct log_header_core header;
struct dm_io_region header_location;
- struct log_header *disk_header;
+ struct log_header_disk *disk_header;
};
/*
@@ -251,34 +257,34 @@ struct log_c {
*/
static inline int log_test_bit(uint32_t *bs, unsigned bit)
{
- return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0;
+ return test_bit_le(bit, bs) ? 1 : 0;
}
static inline void log_set_bit(struct log_c *l,
uint32_t *bs, unsigned bit)
{
- __test_and_set_bit_le(bit, (unsigned long *) bs);
+ __set_bit_le(bit, bs);
l->touched_cleaned = 1;
}
static inline void log_clear_bit(struct log_c *l,
uint32_t *bs, unsigned bit)
{
- __test_and_clear_bit_le(bit, (unsigned long *) bs);
+ __clear_bit_le(bit, bs);
l->touched_dirtied = 1;
}
/*----------------------------------------------------------------
* Header IO
*--------------------------------------------------------------*/
-static void header_to_disk(struct log_header *core, struct log_header *disk)
+static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk)
{
disk->magic = cpu_to_le32(core->magic);
disk->version = cpu_to_le32(core->version);
disk->nr_regions = cpu_to_le64(core->nr_regions);
}
-static void header_from_disk(struct log_header *core, struct log_header *disk)
+static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk)
{
core->magic = le32_to_cpu(disk->magic);
core->version = le32_to_cpu(disk->version);
@@ -486,7 +492,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
lc->sync_count = (sync == NOSYNC) ? region_count : 0;
- lc->recovering_bits = vmalloc(bitset_size);
+ lc->recovering_bits = vzalloc(bitset_size);
if (!lc->recovering_bits) {
DMWARN("couldn't allocate sync bitset");
vfree(lc->sync_bits);
@@ -498,7 +504,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
kfree(lc);
return -ENOMEM;
}
- memset(lc->recovering_bits, 0, bitset_size);
lc->sync_search = 0;
log->context = lc;
@@ -739,8 +744,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
return 0;
do {
- *region = find_next_zero_bit_le(
- (unsigned long *) lc->sync_bits,
+ *region = find_next_zero_bit_le(lc->sync_bits,
lc->region_count,
lc->sync_search);
lc->sync_search = *region + 1;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index aa4e570c2cb..5e0090ef418 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -19,10 +19,9 @@
#include <linux/time.h>
#include <linux/workqueue.h>
#include <scsi/scsi_dh.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#define DM_MSG_PREFIX "multipath"
-#define MESG_STR(x) x, sizeof(x)
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
@@ -505,80 +504,29 @@ static void trigger_event(struct work_struct *work)
* <#paths> <#per-path selector args>
* [<path> [<arg>]* ]+ ]+
*---------------------------------------------------------------*/
-struct param {
- unsigned min;
- unsigned max;
- char *error;
-};
-
-static int read_param(struct param *param, char *str, unsigned *v, char **error)
-{
- if (!str ||
- (sscanf(str, "%u", v) != 1) ||
- (*v < param->min) ||
- (*v > param->max)) {
- *error = param->error;
- return -EINVAL;
- }
-
- return 0;
-}
-
-struct arg_set {
- unsigned argc;
- char **argv;
-};
-
-static char *shift(struct arg_set *as)
-{
- char *r;
-
- if (as->argc) {
- as->argc--;
- r = *as->argv;
- as->argv++;
- return r;
- }
-
- return NULL;
-}
-
-static void consume(struct arg_set *as, unsigned n)
-{
- BUG_ON (as->argc < n);
- as->argc -= n;
- as->argv += n;
-}
-
-static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
+static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
struct dm_target *ti)
{
int r;
struct path_selector_type *pst;
unsigned ps_argc;
- static struct param _params[] = {
+ static struct dm_arg _args[] = {
{0, 1024, "invalid number of path selector args"},
};
- pst = dm_get_path_selector(shift(as));
+ pst = dm_get_path_selector(dm_shift_arg(as));
if (!pst) {
ti->error = "unknown path selector type";
return -EINVAL;
}
- r = read_param(_params, shift(as), &ps_argc, &ti->error);
+ r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
if (r) {
dm_put_path_selector(pst);
return -EINVAL;
}
- if (ps_argc > as->argc) {
- dm_put_path_selector(pst);
- ti->error = "not enough arguments for path selector";
- return -EINVAL;
- }
-
r = pst->create(&pg->ps, ps_argc, as->argv);
if (r) {
dm_put_path_selector(pst);
@@ -587,12 +535,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
}
pg->ps.type = pst;
- consume(as, ps_argc);
+ dm_consume_args(as, ps_argc);
return 0;
}
-static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
+static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
struct dm_target *ti)
{
int r;
@@ -609,7 +557,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
if (!p)
return ERR_PTR(-ENOMEM);
- r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table),
+ r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
&p->path.dev);
if (r) {
ti->error = "error getting device";
@@ -660,16 +608,16 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
return ERR_PTR(r);
}
-static struct priority_group *parse_priority_group(struct arg_set *as,
+static struct priority_group *parse_priority_group(struct dm_arg_set *as,
struct multipath *m)
{
- static struct param _params[] = {
+ static struct dm_arg _args[] = {
{1, 1024, "invalid number of paths"},
{0, 1024, "invalid number of selector args"}
};
int r;
- unsigned i, nr_selector_args, nr_params;
+ unsigned i, nr_selector_args, nr_args;
struct priority_group *pg;
struct dm_target *ti = m->ti;
@@ -693,26 +641,26 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
/*
* read the paths
*/
- r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
+ r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
if (r)
goto bad;
- r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
+ r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
if (r)
goto bad;
- nr_params = 1 + nr_selector_args;
+ nr_args = 1 + nr_selector_args;
for (i = 0; i < pg->nr_pgpaths; i++) {
struct pgpath *pgpath;
- struct arg_set path_args;
+ struct dm_arg_set path_args;
- if (as->argc < nr_params) {
+ if (as->argc < nr_args) {
ti->error = "not enough path parameters";
r = -EINVAL;
goto bad;
}
- path_args.argc = nr_params;
+ path_args.argc = nr_args;
path_args.argv = as->argv;
pgpath = parse_path(&path_args, &pg->ps, ti);
@@ -723,7 +671,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
pgpath->pg = pg;
list_add_tail(&pgpath->list, &pg->pgpaths);
- consume(as, nr_params);
+ dm_consume_args(as, nr_args);
}
return pg;
@@ -733,28 +681,23 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
return ERR_PTR(r);
}
-static int parse_hw_handler(struct arg_set *as, struct multipath *m)
+static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
{
unsigned hw_argc;
int ret;
struct dm_target *ti = m->ti;
- static struct param _params[] = {
+ static struct dm_arg _args[] = {
{0, 1024, "invalid number of hardware handler args"},
};
- if (read_param(_params, shift(as), &hw_argc, &ti->error))
+ if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
return -EINVAL;
if (!hw_argc)
return 0;
- if (hw_argc > as->argc) {
- ti->error = "not enough arguments for hardware handler";
- return -EINVAL;
- }
-
- m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
+ m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
request_module("scsi_dh_%s", m->hw_handler_name);
if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
ti->error = "unknown hardware handler type";
@@ -778,7 +721,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
j = sprintf(p, "%s", as->argv[i]);
}
- consume(as, hw_argc - 1);
+ dm_consume_args(as, hw_argc - 1);
return 0;
fail:
@@ -787,20 +730,20 @@ fail:
return ret;
}
-static int parse_features(struct arg_set *as, struct multipath *m)
+static int parse_features(struct dm_arg_set *as, struct multipath *m)
{
int r;
unsigned argc;
struct dm_target *ti = m->ti;
- const char *param_name;
+ const char *arg_name;
- static struct param _params[] = {
+ static struct dm_arg _args[] = {
{0, 5, "invalid number of feature args"},
{1, 50, "pg_init_retries must be between 1 and 50"},
{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
};
- r = read_param(_params, shift(as), &argc, &ti->error);
+ r = dm_read_arg_group(_args, as, &argc, &ti->error);
if (r)
return -EINVAL;
@@ -808,26 +751,24 @@ static int parse_features(struct arg_set *as, struct multipath *m)
return 0;
do {
- param_name = shift(as);
+ arg_name = dm_shift_arg(as);
argc--;
- if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) {
+ if (!strcasecmp(arg_name, "queue_if_no_path")) {
r = queue_if_no_path(m, 1, 0);
continue;
}
- if (!strnicmp(param_name, MESG_STR("pg_init_retries")) &&
+ if (!strcasecmp(arg_name, "pg_init_retries") &&
(argc >= 1)) {
- r = read_param(_params + 1, shift(as),
- &m->pg_init_retries, &ti->error);
+ r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
argc--;
continue;
}
- if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
+ if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
(argc >= 1)) {
- r = read_param(_params + 2, shift(as),
- &m->pg_init_delay_msecs, &ti->error);
+ r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
argc--;
continue;
}
@@ -842,15 +783,15 @@ static int parse_features(struct arg_set *as, struct multipath *m)
static int multipath_ctr(struct dm_target *ti, unsigned int argc,
char **argv)
{
- /* target parameters */
- static struct param _params[] = {
+ /* target arguments */
+ static struct dm_arg _args[] = {
{0, 1024, "invalid number of priority groups"},
{0, 1024, "invalid initial priority group number"},
};
int r;
struct multipath *m;
- struct arg_set as;
+ struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
@@ -871,11 +812,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
if (r)
goto bad;
- r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error);
+ r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
if (r)
goto bad;
- r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error);
+ r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
if (r)
goto bad;
@@ -1505,10 +1446,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
}
if (argc == 1) {
- if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
+ if (!strcasecmp(argv[0], "queue_if_no_path")) {
r = queue_if_no_path(m, 1, 0);
goto out;
- } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
+ } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
r = queue_if_no_path(m, 0, 0);
goto out;
}
@@ -1519,18 +1460,18 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
goto out;
}
- if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
+ if (!strcasecmp(argv[0], "disable_group")) {
r = bypass_pg_num(m, argv[1], 1);
goto out;
- } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
+ } else if (!strcasecmp(argv[0], "enable_group")) {
r = bypass_pg_num(m, argv[1], 0);
goto out;
- } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
+ } else if (!strcasecmp(argv[0], "switch_group")) {
r = switch_pg_num(m, argv[1]);
goto out;
- } else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
+ } else if (!strcasecmp(argv[0], "reinstate_path"))
action = reinstate_path;
- else if (!strnicmp(argv[0], MESG_STR("fail_path")))
+ else if (!strcasecmp(argv[0], "fail_path"))
action = fail_path;
else {
DMWARN("Unrecognised multipath message received.");
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index f92b6cea9d9..03a837aa5ce 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -20,7 +20,7 @@
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/module.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#define DM_MSG_PREFIX "multipath queue-length"
#define QL_MIN_IO 128
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e5d8904fc8f..a002dd85db1 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -8,19 +8,19 @@
#include <linux/slab.h>
#include "md.h"
+#include "raid1.h"
#include "raid5.h"
-#include "dm.h"
#include "bitmap.h"
+#include <linux/device-mapper.h>
+
#define DM_MSG_PREFIX "raid"
/*
- * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
- * make it so the flag doesn't set anything.
+ * The following flags are used by dm-raid.c to set up the array state.
+ * They must be cleared before md_run is called.
*/
-#ifndef MD_SYNC_STATE_FORCED
-#define MD_SYNC_STATE_FORCED 0
-#endif
+#define FirstUse 10 /* rdev flag */
struct raid_dev {
/*
@@ -43,14 +43,15 @@ struct raid_dev {
/*
* Flags for rs->print_flags field.
*/
-#define DMPF_DAEMON_SLEEP 0x1
-#define DMPF_MAX_WRITE_BEHIND 0x2
-#define DMPF_SYNC 0x4
-#define DMPF_NOSYNC 0x8
-#define DMPF_STRIPE_CACHE 0x10
-#define DMPF_MIN_RECOVERY_RATE 0x20
-#define DMPF_MAX_RECOVERY_RATE 0x40
-
+#define DMPF_SYNC 0x1
+#define DMPF_NOSYNC 0x2
+#define DMPF_REBUILD 0x4
+#define DMPF_DAEMON_SLEEP 0x8
+#define DMPF_MIN_RECOVERY_RATE 0x10
+#define DMPF_MAX_RECOVERY_RATE 0x20
+#define DMPF_MAX_WRITE_BEHIND 0x40
+#define DMPF_STRIPE_CACHE 0x80
+#define DMPF_REGION_SIZE 0X100
struct raid_set {
struct dm_target *ti;
@@ -72,6 +73,7 @@ static struct raid_type {
const unsigned level; /* RAID level. */
const unsigned algorithm; /* RAID algorithm. */
} raid_types[] = {
+ {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
{"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
{"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
{"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
}
sectors_per_dev = ti->len;
- if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
+ if ((raid_type->level > 1) &&
+ sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
ti->error = "Target length not divisible by number of data devices";
return ERR_PTR(-EINVAL);
}
@@ -147,9 +150,16 @@ static void context_free(struct raid_set *rs)
{
int i;
- for (i = 0; i < rs->md.raid_disks; i++)
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if (rs->dev[i].meta_dev)
+ dm_put_device(rs->ti, rs->dev[i].meta_dev);
+ if (rs->dev[i].rdev.sb_page)
+ put_page(rs->dev[i].rdev.sb_page);
+ rs->dev[i].rdev.sb_page = NULL;
+ rs->dev[i].rdev.sb_loaded = 0;
if (rs->dev[i].data_dev)
dm_put_device(rs->ti, rs->dev[i].data_dev);
+ }
kfree(rs);
}
@@ -159,7 +169,16 @@ static void context_free(struct raid_set *rs)
* <meta_dev>: meta device name or '-' if missing
* <data_dev>: data device name or '-' if missing
*
- * This code parses those words.
+ * The following are permitted:
+ * - -
+ * - <data_dev>
+ * <meta_dev> <data_dev>
+ *
+ * The following is not allowed:
+ * <meta_dev> -
+ *
+ * This code parses those words. If there is a failure,
+ * the caller must use context_free to unwind the operations.
*/
static int dev_parms(struct raid_set *rs, char **argv)
{
@@ -182,8 +201,16 @@ static int dev_parms(struct raid_set *rs, char **argv)
rs->dev[i].rdev.mddev = &rs->md;
if (strcmp(argv[0], "-")) {
- rs->ti->error = "Metadata devices not supported";
- return -EINVAL;
+ ret = dm_get_device(rs->ti, argv[0],
+ dm_table_get_mode(rs->ti->table),
+ &rs->dev[i].meta_dev);
+ rs->ti->error = "RAID metadata device lookup failure";
+ if (ret)
+ return ret;
+
+ rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
+ if (!rs->dev[i].rdev.sb_page)
+ return -ENOMEM;
}
if (!strcmp(argv[1], "-")) {
@@ -193,6 +220,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
return -EINVAL;
}
+ rs->ti->error = "No data device supplied with metadata device";
+ if (rs->dev[i].meta_dev)
+ return -EINVAL;
+
continue;
}
@@ -204,6 +235,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
return ret;
}
+ if (rs->dev[i].meta_dev) {
+ metadata_available = 1;
+ rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
+ }
rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
@@ -235,33 +270,109 @@ static int dev_parms(struct raid_set *rs, char **argv)
}
/*
+ * validate_region_size
+ * @rs
+ * @region_size: region size in sectors. If 0, pick a size (4MiB default).
+ *
+ * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
+ * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
+ *
+ * Returns: 0 on success, -EINVAL on failure.
+ */
+static int validate_region_size(struct raid_set *rs, unsigned long region_size)
+{
+ unsigned long min_region_size = rs->ti->len / (1 << 21);
+
+ if (!region_size) {
+ /*
+ * Choose a reasonable default. All figures in sectors.
+ */
+ if (min_region_size > (1 << 13)) {
+ DMINFO("Choosing default region size of %lu sectors",
+ region_size);
+ region_size = min_region_size;
+ } else {
+ DMINFO("Choosing default region size of 4MiB");
+ region_size = 1 << 13; /* sectors */
+ }
+ } else {
+ /*
+ * Validate user-supplied value.
+ */
+ if (region_size > rs->ti->len) {
+ rs->ti->error = "Supplied region size is too large";
+ return -EINVAL;
+ }
+
+ if (region_size < min_region_size) {
+ DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
+ region_size, min_region_size);
+ rs->ti->error = "Supplied region size is too small";
+ return -EINVAL;
+ }
+
+ if (!is_power_of_2(region_size)) {
+ rs->ti->error = "Region size is not a power of 2";
+ return -EINVAL;
+ }
+
+ if (region_size < rs->md.chunk_sectors) {
+ rs->ti->error = "Region size is smaller than the chunk size";
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Convert sectors to bytes.
+ */
+ rs->md.bitmap_info.chunksize = (region_size << 9);
+
+ return 0;
+}
+
+/*
* Possible arguments are...
- * RAID456:
* <chunk_size> [optional_args]
*
- * Optional args:
- * [[no]sync] Force or prevent recovery of the entire array
+ * Argument definitions
+ * <chunk_size> The number of sectors per disk that
+ * will form the "stripe"
+ * [[no]sync] Force or prevent recovery of the
+ * entire array
* [rebuild <idx>] Rebuild the drive indicated by the index
- * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
+ * [daemon_sleep <ms>] Time between bitmap daemon work to
+ * clear bits
* [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
* [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
+ * [write_mostly <idx>] Indicate a write mostly drive via index
* [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
* [stripe_cache <sectors>] Stripe cache size for higher RAIDs
+ * [region_size <sectors>] Defines granularity of bitmap
*/
static int parse_raid_params(struct raid_set *rs, char **argv,
unsigned num_raid_params)
{
unsigned i, rebuild_cnt = 0;
- unsigned long value;
+ unsigned long value, region_size = 0;
char *key;
/*
* First, parse the in-order required arguments
+ * "chunk_size" is the only argument of this type.
*/
- if ((strict_strtoul(argv[0], 10, &value) < 0) ||
- !is_power_of_2(value) || (value < 8)) {
+ if ((strict_strtoul(argv[0], 10, &value) < 0)) {
rs->ti->error = "Bad chunk size";
return -EINVAL;
+ } else if (rs->raid_type->level == 1) {
+ if (value)
+ DMERR("Ignoring chunk size parameter for RAID 1");
+ value = 0;
+ } else if (!is_power_of_2(value)) {
+ rs->ti->error = "Chunk size must be a power of 2";
+ return -EINVAL;
+ } else if (value < 8) {
+ rs->ti->error = "Chunk size value is too small";
+ return -EINVAL;
}
rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
@@ -269,22 +380,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
num_raid_params--;
/*
- * Second, parse the unordered optional arguments
+ * We set each individual device as In_sync with a completed
+ * 'recovery_offset'. If there has been a device failure or
+ * replacement then one of the following cases applies:
+ *
+ * 1) User specifies 'rebuild'.
+ * - Device is reset when param is read.
+ * 2) A new device is supplied.
+ * - No matching superblock found, resets device.
+ * 3) Device failure was transient and returns on reload.
+ * - Failure noticed, resets device for bitmap replay.
+ * 4) Device hadn't completed recovery after previous failure.
+ * - Superblock is read and overrides recovery_offset.
+ *
+ * What is found in the superblocks of the devices is always
+ * authoritative, unless 'rebuild' or '[no]sync' was specified.
*/
- for (i = 0; i < rs->md.raid_disks; i++)
+ for (i = 0; i < rs->md.raid_disks; i++) {
set_bit(In_sync, &rs->dev[i].rdev.flags);
+ rs->dev[i].rdev.recovery_offset = MaxSector;
+ }
+ /*
+ * Second, parse the unordered optional arguments
+ */
for (i = 0; i < num_raid_params; i++) {
- if (!strcmp(argv[i], "nosync")) {
+ if (!strcasecmp(argv[i], "nosync")) {
rs->md.recovery_cp = MaxSector;
rs->print_flags |= DMPF_NOSYNC;
- rs->md.flags |= MD_SYNC_STATE_FORCED;
continue;
}
- if (!strcmp(argv[i], "sync")) {
+ if (!strcasecmp(argv[i], "sync")) {
rs->md.recovery_cp = 0;
rs->print_flags |= DMPF_SYNC;
- rs->md.flags |= MD_SYNC_STATE_FORCED;
continue;
}
@@ -300,9 +428,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
- if (!strcmp(key, "rebuild")) {
- if (++rebuild_cnt > rs->raid_type->parity_devs) {
- rs->ti->error = "Too many rebuild drives given";
+ if (!strcasecmp(key, "rebuild")) {
+ rebuild_cnt++;
+ if (((rs->raid_type->level != 1) &&
+ (rebuild_cnt > rs->raid_type->parity_devs)) ||
+ ((rs->raid_type->level == 1) &&
+ (rebuild_cnt > (rs->md.raid_disks - 1)))) {
+ rs->ti->error = "Too many rebuild devices specified for given RAID type";
return -EINVAL;
}
if (value > rs->md.raid_disks) {
@@ -311,7 +443,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
}
clear_bit(In_sync, &rs->dev[value].rdev.flags);
rs->dev[value].rdev.recovery_offset = 0;
- } else if (!strcmp(key, "max_write_behind")) {
+ rs->print_flags |= DMPF_REBUILD;
+ } else if (!strcasecmp(key, "write_mostly")) {
+ if (rs->raid_type->level != 1) {
+ rs->ti->error = "write_mostly option is only valid for RAID1";
+ return -EINVAL;
+ }
+ if (value > rs->md.raid_disks) {
+ rs->ti->error = "Invalid write_mostly drive index given";
+ return -EINVAL;
+ }
+ set_bit(WriteMostly, &rs->dev[value].rdev.flags);
+ } else if (!strcasecmp(key, "max_write_behind")) {
+ if (rs->raid_type->level != 1) {
+ rs->ti->error = "max_write_behind option is only valid for RAID1";
+ return -EINVAL;
+ }
rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
/*
@@ -324,14 +471,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
rs->md.bitmap_info.max_write_behind = value;
- } else if (!strcmp(key, "daemon_sleep")) {
+ } else if (!strcasecmp(key, "daemon_sleep")) {
rs->print_flags |= DMPF_DAEMON_SLEEP;
if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
rs->ti->error = "daemon sleep period out of range";
return -EINVAL;
}
rs->md.bitmap_info.daemon_sleep = value;
- } else if (!strcmp(key, "stripe_cache")) {
+ } else if (!strcasecmp(key, "stripe_cache")) {
rs->print_flags |= DMPF_STRIPE_CACHE;
/*
@@ -348,20 +495,23 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->ti->error = "Bad stripe_cache size";
return -EINVAL;
}
- } else if (!strcmp(key, "min_recovery_rate")) {
+ } else if (!strcasecmp(key, "min_recovery_rate")) {
rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
if (value > INT_MAX) {
rs->ti->error = "min_recovery_rate out of range";
return -EINVAL;
}
rs->md.sync_speed_min = (int)value;
- } else if (!strcmp(key, "max_recovery_rate")) {
+ } else if (!strcasecmp(key, "max_recovery_rate")) {
rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
if (value > INT_MAX) {
rs->ti->error = "max_recovery_rate out of range";
return -EINVAL;
}
rs->md.sync_speed_max = (int)value;
+ } else if (!strcasecmp(key, "region_size")) {
+ rs->print_flags |= DMPF_REGION_SIZE;
+ region_size = value;
} else {
DMERR("Unable to parse RAID parameter: %s", key);
rs->ti->error = "Unable to parse RAID parameters";
@@ -369,6 +519,19 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
}
}
+ if (validate_region_size(rs, region_size))
+ return -EINVAL;
+
+ if (rs->md.chunk_sectors)
+ rs->ti->split_io = rs->md.chunk_sectors;
+ else
+ rs->ti->split_io = region_size;
+
+ if (rs->md.chunk_sectors)
+ rs->ti->split_io = rs->md.chunk_sectors;
+ else
+ rs->ti->split_io = region_size;
+
/* Assume there are no metadata devices until the drives are parsed */
rs->md.persistent = 0;
rs->md.external = 1;
@@ -387,17 +550,351 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
{
struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
+ if (rs->raid_type->level == 1)
+ return md_raid1_congested(&rs->md, bits);
+
return md_raid5_congested(&rs->md, bits);
}
/*
+ * This structure is never routinely used by userspace, unlike md superblocks.
+ * Devices with this superblock should only ever be accessed via device-mapper.
+ */
+#define DM_RAID_MAGIC 0x64526D44
+struct dm_raid_superblock {
+ __le32 magic; /* "DmRd" */
+ __le32 features; /* Used to indicate possible future changes */
+
+ __le32 num_devices; /* Number of devices in this array. (Max 64) */
+ __le32 array_position; /* The position of this drive in the array */
+
+ __le64 events; /* Incremented by md when superblock updated */
+ __le64 failed_devices; /* Bit field of devices to indicate failures */
+
+ /*
+ * This offset tracks the progress of the repair or replacement of
+ * an individual drive.
+ */
+ __le64 disk_recovery_offset;
+
+ /*
+ * This offset tracks the progress of the initial array
+ * synchronisation/parity calculation.
+ */
+ __le64 array_resync_offset;
+
+ /*
+ * RAID characteristics
+ */
+ __le32 level;
+ __le32 layout;
+ __le32 stripe_sectors;
+
+ __u8 pad[452]; /* Round struct to 512 bytes. */
+ /* Always set to 0 when writing. */
+} __packed;
+
+static int read_disk_sb(mdk_rdev_t *rdev, int size)
+{
+ BUG_ON(!rdev->sb_page);
+
+ if (rdev->sb_loaded)
+ return 0;
+
+ if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
+ DMERR("Failed to read device superblock");
+ return -EINVAL;
+ }
+
+ rdev->sb_loaded = 1;
+
+ return 0;
+}
+
+static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdk_rdev_t *r, *t;
+ uint64_t failed_devices;
+ struct dm_raid_superblock *sb;
+
+ sb = page_address(rdev->sb_page);
+ failed_devices = le64_to_cpu(sb->failed_devices);
+
+ rdev_for_each(r, t, mddev)
+ if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
+ failed_devices |= (1ULL << r->raid_disk);
+
+ memset(sb, 0, sizeof(*sb));
+
+ sb->magic = cpu_to_le32(DM_RAID_MAGIC);
+ sb->features = cpu_to_le32(0); /* No features yet */
+
+ sb->num_devices = cpu_to_le32(mddev->raid_disks);
+ sb->array_position = cpu_to_le32(rdev->raid_disk);
+
+ sb->events = cpu_to_le64(mddev->events);
+ sb->failed_devices = cpu_to_le64(failed_devices);
+
+ sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
+ sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
+
+ sb->level = cpu_to_le32(mddev->level);
+ sb->layout = cpu_to_le32(mddev->layout);
+ sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+}
+
+/*
+ * super_load
+ *
+ * This function creates a superblock if one is not found on the device
+ * and will decide which superblock to use if there's a choice.
+ *
+ * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
+ */
+static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
+{
+ int ret;
+ struct dm_raid_superblock *sb;
+ struct dm_raid_superblock *refsb;
+ uint64_t events_sb, events_refsb;
+
+ rdev->sb_start = 0;
+ rdev->sb_size = sizeof(*sb);
+
+ ret = read_disk_sb(rdev, rdev->sb_size);
+ if (ret)
+ return ret;
+
+ sb = page_address(rdev->sb_page);
+ if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
+ super_sync(rdev->mddev, rdev);
+
+ set_bit(FirstUse, &rdev->flags);
+
+ /* Force writing of superblocks to disk */
+ set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+
+ /* Any superblock is better than none, choose that if given */
+ return refdev ? 0 : 1;
+ }
+
+ if (!refdev)
+ return 1;
+
+ events_sb = le64_to_cpu(sb->events);
+
+ refsb = page_address(refdev->sb_page);
+ events_refsb = le64_to_cpu(refsb->events);
+
+ return (events_sb > events_refsb) ? 1 : 0;
+}
+
+static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ int role;
+ struct raid_set *rs = container_of(mddev, struct raid_set, md);
+ uint64_t events_sb;
+ uint64_t failed_devices;
+ struct dm_raid_superblock *sb;
+ uint32_t new_devs = 0;
+ uint32_t rebuilds = 0;
+ mdk_rdev_t *r, *t;
+ struct dm_raid_superblock *sb2;
+
+ sb = page_address(rdev->sb_page);
+ events_sb = le64_to_cpu(sb->events);
+ failed_devices = le64_to_cpu(sb->failed_devices);
+
+ /*
+ * Initialise to 1 if this is a new superblock.
+ */
+ mddev->events = events_sb ? : 1;
+
+ /*
+ * Reshaping is not currently allowed
+ */
+ if ((le32_to_cpu(sb->level) != mddev->level) ||
+ (le32_to_cpu(sb->layout) != mddev->layout) ||
+ (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
+ DMERR("Reshaping arrays not yet supported.");
+ return -EINVAL;
+ }
+
+ /* We can only change the number of devices in RAID1 right now */
+ if ((rs->raid_type->level != 1) &&
+ (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
+ DMERR("Reshaping arrays not yet supported.");
+ return -EINVAL;
+ }
+
+ if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+ mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
+
+ /*
+ * During load, we set FirstUse if a new superblock was written.
+ * There are two reasons we might not have a superblock:
+ * 1) The array is brand new - in which case, all of the
+ * devices must have their In_sync bit set. Also,
+ * recovery_cp must be 0, unless forced.
+ * 2) This is a new device being added to an old array
+ * and the new device needs to be rebuilt - in which
+ * case the In_sync bit will /not/ be set and
+ * recovery_cp must be MaxSector.
+ */
+ rdev_for_each(r, t, mddev) {
+ if (!test_bit(In_sync, &r->flags)) {
+ if (!test_bit(FirstUse, &r->flags))
+ DMERR("Superblock area of "
+ "rebuild device %d should have been "
+ "cleared.", r->raid_disk);
+ set_bit(FirstUse, &r->flags);
+ rebuilds++;
+ } else if (test_bit(FirstUse, &r->flags))
+ new_devs++;
+ }
+
+ if (!rebuilds) {
+ if (new_devs == mddev->raid_disks) {
+ DMINFO("Superblocks created for new array");
+ set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+ } else if (new_devs) {
+ DMERR("New device injected "
+ "into existing array without 'rebuild' "
+ "parameter specified");
+ return -EINVAL;
+ }
+ } else if (new_devs) {
+ DMERR("'rebuild' devices cannot be "
+ "injected into an array with other first-time devices");
+ return -EINVAL;
+ } else if (mddev->recovery_cp != MaxSector) {
+ DMERR("'rebuild' specified while array is not in-sync");
+ return -EINVAL;
+ }
+
+ /*
+ * Now we set the Faulty bit for those devices that are
+ * recorded in the superblock as failed.
+ */
+ rdev_for_each(r, t, mddev) {
+ if (!r->sb_page)
+ continue;
+ sb2 = page_address(r->sb_page);
+ sb2->failed_devices = 0;
+
+ /*
+ * Check for any device re-ordering.
+ */
+ if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
+ role = le32_to_cpu(sb2->array_position);
+ if (role != r->raid_disk) {
+ if (rs->raid_type->level != 1) {
+ rs->ti->error = "Cannot change device "
+ "positions in RAID array";
+ return -EINVAL;
+ }
+ DMINFO("RAID1 device #%d now at position #%d",
+ role, r->raid_disk);
+ }
+
+ /*
+ * Partial recovery is performed on
+ * returning failed devices.
+ */
+ if (failed_devices & (1 << role))
+ set_bit(Faulty, &r->flags);
+ }
+ }
+
+ return 0;
+}
+
+static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct dm_raid_superblock *sb = page_address(rdev->sb_page);
+
+ /*
+ * If mddev->events is not set, we know we have not yet initialized
+ * the array.
+ */
+ if (!mddev->events && super_init_validation(mddev, rdev))
+ return -EINVAL;
+
+ mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
+ rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+ if (!test_bit(FirstUse, &rdev->flags)) {
+ rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
+ if (rdev->recovery_offset != MaxSector)
+ clear_bit(In_sync, &rdev->flags);
+ }
+
+ /*
+ * If a device comes back, set it as not In_sync and no longer faulty.
+ */
+ if (test_bit(Faulty, &rdev->flags)) {
+ clear_bit(Faulty, &rdev->flags);
+ clear_bit(In_sync, &rdev->flags);
+ rdev->saved_raid_disk = rdev->raid_disk;
+ rdev->recovery_offset = 0;
+ }
+
+ clear_bit(FirstUse, &rdev->flags);
+
+ return 0;
+}
+
+/*
+ * Analyse superblocks and select the freshest.
+ */
+static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
+{
+ int ret;
+ mdk_rdev_t *rdev, *freshest, *tmp;
+ mddev_t *mddev = &rs->md;
+
+ freshest = NULL;
+ rdev_for_each(rdev, tmp, mddev) {
+ if (!rdev->meta_bdev)
+ continue;
+
+ ret = super_load(rdev, freshest);
+
+ switch (ret) {
+ case 1:
+ freshest = rdev;
+ break;
+ case 0:
+ break;
+ default:
+ ti->error = "Failed to load superblock";
+ return ret;
+ }
+ }
+
+ if (!freshest)
+ return 0;
+
+ /*
+ * Validation of the freshest device provides the source of
+ * validation for the remaining devices.
+ */
+ ti->error = "Unable to assemble array: Invalid superblocks";
+ if (super_validate(mddev, freshest))
+ return -EINVAL;
+
+ rdev_for_each(rdev, tmp, mddev)
+ if ((rdev != freshest) && super_validate(mddev, rdev))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
* Construct a RAID4/5/6 mapping:
* Args:
* <raid_type> <#raid_params> <raid_params> \
* <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
*
- * ** metadata devices are not supported yet, use '-' instead **
- *
* <raid_params> varies by <raid_type>. See 'parse_raid_params' for
* details on possible <raid_params>.
*/
@@ -465,8 +962,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (ret)
goto bad;
+ rs->md.sync_super = super_sync;
+ ret = analyse_superblocks(ti, rs);
+ if (ret)
+ goto bad;
+
INIT_WORK(&rs->md.event_work, do_table_event);
- ti->split_io = rs->md.chunk_sectors;
ti->private = rs;
mutex_lock(&rs->md.reconfig_mutex);
@@ -482,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
rs->callbacks.congested_fn = raid_is_congested;
dm_table_add_target_callbacks(ti->table, &rs->callbacks);
+ mddev_suspend(&rs->md);
return 0;
bad:
@@ -546,12 +1048,17 @@ static int raid_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
/* The string you would use to construct this array */
- for (i = 0; i < rs->md.raid_disks; i++)
- if (rs->dev[i].data_dev &&
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if ((rs->print_flags & DMPF_REBUILD) &&
+ rs->dev[i].data_dev &&
!test_bit(In_sync, &rs->dev[i].rdev.flags))
- raid_param_cnt++; /* for rebuilds */
+ raid_param_cnt += 2; /* for rebuilds */
+ if (rs->dev[i].data_dev &&
+ test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+ raid_param_cnt += 2;
+ }
- raid_param_cnt += (hweight64(rs->print_flags) * 2);
+ raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
raid_param_cnt--;
@@ -565,7 +1072,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
DMEMIT(" nosync");
for (i = 0; i < rs->md.raid_disks; i++)
- if (rs->dev[i].data_dev &&
+ if ((rs->print_flags & DMPF_REBUILD) &&
+ rs->dev[i].data_dev &&
!test_bit(In_sync, &rs->dev[i].rdev.flags))
DMEMIT(" rebuild %u", i);
@@ -579,6 +1087,11 @@ static int raid_status(struct dm_target *ti, status_type_t type,
if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev &&
+ test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+ DMEMIT(" write_mostly %u", i);
+
if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
DMEMIT(" max_write_behind %lu",
rs->md.bitmap_info.max_write_behind);
@@ -591,9 +1104,16 @@ static int raid_status(struct dm_target *ti, status_type_t type,
conf ? conf->max_nr_stripes * 2 : 0);
}
+ if (rs->print_flags & DMPF_REGION_SIZE)
+ DMEMIT(" region_size %lu",
+ rs->md.bitmap_info.chunksize >> 9);
+
DMEMIT(" %d", rs->md.raid_disks);
for (i = 0; i < rs->md.raid_disks; i++) {
- DMEMIT(" -"); /* metadata device */
+ if (rs->dev[i].meta_dev)
+ DMEMIT(" %s", rs->dev[i].meta_dev->name);
+ else
+ DMEMIT(" -");
if (rs->dev[i].data_dev)
DMEMIT(" %s", rs->dev[i].data_dev->name);
@@ -650,12 +1170,13 @@ static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
+ bitmap_load(&rs->md);
mddev_resume(&rs->md);
}
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 135c2f1fdbf..d1f1d701710 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -58,25 +58,30 @@
#define NUM_SNAPSHOT_HDR_CHUNKS 1
struct disk_header {
- uint32_t magic;
+ __le32 magic;
/*
* Is this snapshot valid. There is no way of recovering
* an invalid snapshot.
*/
- uint32_t valid;
+ __le32 valid;
/*
* Simple, incrementing version. no backward
* compatibility.
*/
- uint32_t version;
+ __le32 version;
/* In sectors */
- uint32_t chunk_size;
-};
+ __le32 chunk_size;
+} __packed;
struct disk_exception {
+ __le64 old_chunk;
+ __le64 new_chunk;
+} __packed;
+
+struct core_exception {
uint64_t old_chunk;
uint64_t new_chunk;
};
@@ -169,10 +174,9 @@ static int alloc_area(struct pstore *ps)
if (!ps->area)
goto err_area;
- ps->zero_area = vmalloc(len);
+ ps->zero_area = vzalloc(len);
if (!ps->zero_area)
goto err_zero_area;
- memset(ps->zero_area, 0, len);
ps->header_area = vmalloc(len);
if (!ps->header_area)
@@ -396,32 +400,32 @@ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
}
static void read_exception(struct pstore *ps,
- uint32_t index, struct disk_exception *result)
+ uint32_t index, struct core_exception *result)
{
- struct disk_exception *e = get_exception(ps, index);
+ struct disk_exception *de = get_exception(ps, index);
/* copy it */
- result->old_chunk = le64_to_cpu(e->old_chunk);
- result->new_chunk = le64_to_cpu(e->new_chunk);
+ result->old_chunk = le64_to_cpu(de->old_chunk);
+ result->new_chunk = le64_to_cpu(de->new_chunk);
}
static void write_exception(struct pstore *ps,
- uint32_t index, struct disk_exception *de)
+ uint32_t index, struct core_exception *e)
{
- struct disk_exception *e = get_exception(ps, index);
+ struct disk_exception *de = get_exception(ps, index);
/* copy it */
- e->old_chunk = cpu_to_le64(de->old_chunk);
- e->new_chunk = cpu_to_le64(de->new_chunk);
+ de->old_chunk = cpu_to_le64(e->old_chunk);
+ de->new_chunk = cpu_to_le64(e->new_chunk);
}
static void clear_exception(struct pstore *ps, uint32_t index)
{
- struct disk_exception *e = get_exception(ps, index);
+ struct disk_exception *de = get_exception(ps, index);
/* clear it */
- e->old_chunk = 0;
- e->new_chunk = 0;
+ de->old_chunk = 0;
+ de->new_chunk = 0;
}
/*
@@ -437,13 +441,13 @@ static int insert_exceptions(struct pstore *ps,
{
int r;
unsigned int i;
- struct disk_exception de;
+ struct core_exception e;
/* presume the area is full */
*full = 1;
for (i = 0; i < ps->exceptions_per_area; i++) {
- read_exception(ps, i, &de);
+ read_exception(ps, i, &e);
/*
* If the new_chunk is pointing at the start of
@@ -451,7 +455,7 @@ static int insert_exceptions(struct pstore *ps,
* is we know that we've hit the end of the
* exceptions. Therefore the area is not full.
*/
- if (de.new_chunk == 0LL) {
+ if (e.new_chunk == 0LL) {
ps->current_committed = i;
*full = 0;
break;
@@ -460,13 +464,13 @@ static int insert_exceptions(struct pstore *ps,
/*
* Keep track of the start of the free chunks.
*/
- if (ps->next_free <= de.new_chunk)
- ps->next_free = de.new_chunk + 1;
+ if (ps->next_free <= e.new_chunk)
+ ps->next_free = e.new_chunk + 1;
/*
* Otherwise we add the exception to the snapshot.
*/
- r = callback(callback_context, de.old_chunk, de.new_chunk);
+ r = callback(callback_context, e.old_chunk, e.new_chunk);
if (r)
return r;
}
@@ -563,7 +567,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
sizeof(struct disk_exception);
ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
- sizeof(*ps->callbacks));
+ sizeof(*ps->callbacks));
if (!ps->callbacks)
return -ENOMEM;
@@ -641,12 +645,12 @@ static void persistent_commit_exception(struct dm_exception_store *store,
{
unsigned int i;
struct pstore *ps = get_info(store);
- struct disk_exception de;
+ struct core_exception ce;
struct commit_callback *cb;
- de.old_chunk = e->old_chunk;
- de.new_chunk = e->new_chunk;
- write_exception(ps, ps->current_committed++, &de);
+ ce.old_chunk = e->old_chunk;
+ ce.new_chunk = e->new_chunk;
+ write_exception(ps, ps->current_committed++, &ce);
/*
* Add the callback to the back of the array. This code
@@ -670,7 +674,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
* If we completely filled the current area, then wipe the next one.
*/
if ((ps->current_committed == ps->exceptions_per_area) &&
- zero_disk_area(ps, ps->current_area + 1))
+ zero_disk_area(ps, ps->current_area + 1))
ps->valid = 0;
/*
@@ -701,7 +705,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
chunk_t *last_new_chunk)
{
struct pstore *ps = get_info(store);
- struct disk_exception de;
+ struct core_exception ce;
int nr_consecutive;
int r;
@@ -722,9 +726,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
ps->current_committed = ps->exceptions_per_area;
}
- read_exception(ps, ps->current_committed - 1, &de);
- *last_old_chunk = de.old_chunk;
- *last_new_chunk = de.new_chunk;
+ read_exception(ps, ps->current_committed - 1, &ce);
+ *last_old_chunk = ce.old_chunk;
+ *last_new_chunk = ce.new_chunk;
/*
* Find number of consecutive chunks within the current area,
@@ -733,9 +737,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
nr_consecutive++) {
read_exception(ps, ps->current_committed - 1 - nr_consecutive,
- &de);
- if (de.old_chunk != *last_old_chunk - nr_consecutive ||
- de.new_chunk != *last_new_chunk - nr_consecutive)
+ &ce);
+ if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
+ ce.new_chunk != *last_new_chunk - nr_consecutive)
break;
}
@@ -753,7 +757,7 @@ static int persistent_commit_merge(struct dm_exception_store *store,
for (i = 0; i < nr_merged; i++)
clear_exception(ps, ps->current_committed - 1 - i);
- r = area_io(ps, WRITE);
+ r = area_io(ps, WRITE_FLUSH_FUA);
if (r < 0)
return r;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9ecff5f3023..6f758870fc1 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -30,16 +30,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
((ti)->type->name == dm_snapshot_merge_target_name)
/*
- * The percentage increment we will wake up users at
- */
-#define WAKE_UP_PERCENT 5
-
-/*
- * kcopyd priority of snapshot operations
- */
-#define SNAPSHOT_COPY_PRIORITY 2
-
-/*
* The size of the mempool used to track chunks in use.
*/
#define MIN_IOS 256
@@ -180,6 +170,13 @@ struct dm_snap_pending_exception {
* kcopyd.
*/
int started;
+
+ /*
+ * For writing a complete chunk, bypassing the copy.
+ */
+ struct bio *full_bio;
+ bio_end_io_t *full_bio_end_io;
+ void *full_bio_private;
};
/*
@@ -1055,8 +1052,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
- ti->error = "Cannot allocate snapshot context private "
- "structure";
+ ti->error = "Cannot allocate private snapshot structure";
r = -ENOMEM;
goto bad;
}
@@ -1380,6 +1376,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
struct dm_snapshot *s = pe->snap;
struct bio *origin_bios = NULL;
struct bio *snapshot_bios = NULL;
+ struct bio *full_bio = NULL;
int error = 0;
if (!success) {
@@ -1415,10 +1412,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
*/
dm_insert_exception(&s->complete, e);
- out:
+out:
dm_remove_exception(&pe->e);
snapshot_bios = bio_list_get(&pe->snapshot_bios);
origin_bios = bio_list_get(&pe->origin_bios);
+ full_bio = pe->full_bio;
+ if (full_bio) {
+ full_bio->bi_end_io = pe->full_bio_end_io;
+ full_bio->bi_private = pe->full_bio_private;
+ }
free_pending_exception(pe);
increment_pending_exceptions_done_count();
@@ -1426,10 +1428,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
up_write(&s->lock);
/* Submit any pending write bios */
- if (error)
+ if (error) {
+ if (full_bio)
+ bio_io_error(full_bio);
error_bios(snapshot_bios);
- else
+ } else {
+ if (full_bio)
+ bio_endio(full_bio, 0);
flush_bios(snapshot_bios);
+ }
retry_origin_bios(s, origin_bios);
}
@@ -1480,8 +1487,33 @@ static void start_copy(struct dm_snap_pending_exception *pe)
dest.count = src.count;
/* Hand over to kcopyd */
- dm_kcopyd_copy(s->kcopyd_client,
- &src, 1, &dest, 0, copy_callback, pe);
+ dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
+}
+
+static void full_bio_end_io(struct bio *bio, int error)
+{
+ void *callback_data = bio->bi_private;
+
+ dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
+}
+
+static void start_full_bio(struct dm_snap_pending_exception *pe,
+ struct bio *bio)
+{
+ struct dm_snapshot *s = pe->snap;
+ void *callback_data;
+
+ pe->full_bio = bio;
+ pe->full_bio_end_io = bio->bi_end_io;
+ pe->full_bio_private = bio->bi_private;
+
+ callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
+ copy_callback, pe);
+
+ bio->bi_end_io = full_bio_end_io;
+ bio->bi_private = callback_data;
+
+ generic_make_request(bio);
}
static struct dm_snap_pending_exception *
@@ -1519,6 +1551,7 @@ __find_pending_exception(struct dm_snapshot *s,
bio_list_init(&pe->origin_bios);
bio_list_init(&pe->snapshot_bios);
pe->started = 0;
+ pe->full_bio = NULL;
if (s->store->type->prepare_exception(s->store, &pe->e)) {
free_pending_exception(pe);
@@ -1612,10 +1645,19 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
}
remap_exception(s, &pe->e, bio, chunk);
- bio_list_add(&pe->snapshot_bios, bio);
r = DM_MAPIO_SUBMITTED;
+ if (!pe->started &&
+ bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
+ pe->started = 1;
+ up_write(&s->lock);
+ start_full_bio(pe, bio);
+ goto out;
+ }
+
+ bio_list_add(&pe->snapshot_bios, bio);
+
if (!pe->started) {
/* this is protected by snap->lock */
pe->started = 1;
@@ -1628,9 +1670,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
map_context->ptr = track_chunk(s, chunk);
}
- out_unlock:
+out_unlock:
up_write(&s->lock);
- out:
+out:
return r;
}
@@ -1974,7 +2016,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
pe_to_start_now = pe;
}
- next_snapshot:
+next_snapshot:
up_write(&snap->lock);
if (pe_to_start_now) {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 451c3bb176d..986b8754bb0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -17,7 +17,7 @@
#include <linux/interrupt.h>
#include <linux/mutex.h>
#include <linux/delay.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#define DM_MSG_PREFIX "table"
@@ -54,7 +54,6 @@ struct dm_table {
sector_t *highs;
struct dm_target *targets;
- unsigned discards_supported:1;
unsigned integrity_supported:1;
/*
@@ -154,12 +153,11 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
return NULL;
size = nmemb * elem_size;
- addr = vmalloc(size);
- if (addr)
- memset(addr, 0, size);
+ addr = vzalloc(size);
return addr;
}
+EXPORT_SYMBOL(dm_vcalloc);
/*
* highs, and targets are managed as dynamic arrays during a
@@ -209,7 +207,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
INIT_LIST_HEAD(&t->devices);
INIT_LIST_HEAD(&t->target_callbacks);
atomic_set(&t->holders, 0);
- t->discards_supported = 1;
if (!num_targets)
num_targets = KEYS_PER_NODE;
@@ -281,6 +278,7 @@ void dm_table_get(struct dm_table *t)
{
atomic_inc(&t->holders);
}
+EXPORT_SYMBOL(dm_table_get);
void dm_table_put(struct dm_table *t)
{
@@ -290,6 +288,7 @@ void dm_table_put(struct dm_table *t)
smp_mb__before_atomic_dec();
atomic_dec(&t->holders);
}
+EXPORT_SYMBOL(dm_table_put);
/*
* Checks to see if we need to extend highs or targets.
@@ -455,13 +454,14 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
* Add a device to the list, or just increment the usage count if
* it's already present.
*/
-static int __table_get_device(struct dm_table *t, struct dm_target *ti,
- const char *path, fmode_t mode, struct dm_dev **result)
+int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
+ struct dm_dev **result)
{
int r;
dev_t uninitialized_var(dev);
struct dm_dev_internal *dd;
unsigned int major, minor;
+ struct dm_table *t = ti->table;
BUG_ON(!t);
@@ -509,6 +509,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
*result = &dd->dm_dev;
return 0;
}
+EXPORT_SYMBOL(dm_get_device);
int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
@@ -539,23 +540,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
* If not we'll force DM to use PAGE_SIZE or
* smaller I/O, just to be safe.
*/
-
- if (q->merge_bvec_fn && !ti->type->merge)
+ if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
blk_limits_max_hw_sectors(limits,
(unsigned int) (PAGE_SIZE >> 9));
return 0;
}
EXPORT_SYMBOL_GPL(dm_set_device_limits);
-int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
- struct dm_dev **result)
-{
- return __table_get_device(ti->table, ti, path, mode, result);
-}
-
-
/*
- * Decrement a devices use count and remove it if necessary.
+ * Decrement a device's use count and remove it if necessary.
*/
void dm_put_device(struct dm_target *ti, struct dm_dev *d)
{
@@ -568,6 +561,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
kfree(dd);
}
}
+EXPORT_SYMBOL(dm_put_device);
/*
* Checks to see if the target joins onto the end of the table.
@@ -791,8 +785,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
- if (!tgt->num_discard_requests)
- t->discards_supported = 0;
+ if (!tgt->num_discard_requests && tgt->discards_supported)
+ DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
+ dm_device_name(t->md), type);
return 0;
@@ -802,6 +797,63 @@ int dm_table_add_target(struct dm_table *t, const char *type,
return r;
}
+/*
+ * Target argument parsing helpers.
+ */
+static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+ unsigned *value, char **error, unsigned grouped)
+{
+ const char *arg_str = dm_shift_arg(arg_set);
+
+ if (!arg_str ||
+ (sscanf(arg_str, "%u", value) != 1) ||
+ (*value < arg->min) ||
+ (*value > arg->max) ||
+ (grouped && arg_set->argc < *value)) {
+ *error = arg->error;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+ unsigned *value, char **error)
+{
+ return validate_next_arg(arg, arg_set, value, error, 0);
+}
+EXPORT_SYMBOL(dm_read_arg);
+
+int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
+ unsigned *value, char **error)
+{
+ return validate_next_arg(arg, arg_set, value, error, 1);
+}
+EXPORT_SYMBOL(dm_read_arg_group);
+
+const char *dm_shift_arg(struct dm_arg_set *as)
+{
+ char *r;
+
+ if (as->argc) {
+ as->argc--;
+ r = *as->argv;
+ as->argv++;
+ return r;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(dm_shift_arg);
+
+void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
+{
+ BUG_ON(as->argc < num_args);
+ as->argc -= num_args;
+ as->argv += num_args;
+}
+EXPORT_SYMBOL(dm_consume_args);
+
static int dm_table_set_type(struct dm_table *t)
{
unsigned i;
@@ -1077,11 +1129,13 @@ void dm_table_event(struct dm_table *t)
t->event_fn(t->event_context);
mutex_unlock(&_event_lock);
}
+EXPORT_SYMBOL(dm_table_event);
sector_t dm_table_get_size(struct dm_table *t)
{
return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
}
+EXPORT_SYMBOL(dm_table_get_size);
struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
{
@@ -1194,9 +1248,45 @@ static void dm_table_set_integrity(struct dm_table *t)
blk_get_integrity(template_disk));
}
+static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ unsigned flush = (*(unsigned *)data);
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && (q->flush_flags & flush);
+}
+
+static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ /*
+ * Require at least one underlying device to support flushes.
+ * t->devices includes internal dm devices such as mirror logs
+ * so we need to use iterate_devices here, which targets
+ * supporting flushes must provide.
+ */
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (!ti->num_flush_requests)
+ continue;
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, device_flush_capable, &flush))
+ return 1;
+ }
+
+ return 0;
+}
+
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
+ unsigned flush = 0;
+
/*
* Copy table's limits to the DM device's request_queue
*/
@@ -1207,6 +1297,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ if (dm_table_supports_flush(t, REQ_FLUSH)) {
+ flush |= REQ_FLUSH;
+ if (dm_table_supports_flush(t, REQ_FUA))
+ flush |= REQ_FUA;
+ }
+ blk_queue_flush(q, flush);
+
dm_table_set_integrity(t);
/*
@@ -1237,6 +1334,7 @@ fmode_t dm_table_get_mode(struct dm_table *t)
{
return t->mode;
}
+EXPORT_SYMBOL(dm_table_get_mode);
static void suspend_targets(struct dm_table *t, unsigned postsuspend)
{
@@ -1345,6 +1443,7 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
{
return t->md;
}
+EXPORT_SYMBOL(dm_table_get_md);
static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
@@ -1359,19 +1458,19 @@ bool dm_table_supports_discards(struct dm_table *t)
struct dm_target *ti;
unsigned i = 0;
- if (!t->discards_supported)
- return 0;
-
/*
* Unless any target used by the table set discards_supported,
* require at least one underlying device to support discards.
* t->devices includes internal dm devices such as mirror logs
* so we need to use iterate_devices here, which targets
- * supporting discard must provide.
+ * supporting discard selectively must provide.
*/
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
+ if (!ti->num_discard_requests)
+ continue;
+
if (ti->discards_supported)
return 1;
@@ -1382,13 +1481,3 @@ bool dm_table_supports_discards(struct dm_table *t)
return 0;
}
-
-EXPORT_SYMBOL(dm_vcalloc);
-EXPORT_SYMBOL(dm_get_device);
-EXPORT_SYMBOL(dm_put_device);
-EXPORT_SYMBOL(dm_table_event);
-EXPORT_SYMBOL(dm_table_get_size);
-EXPORT_SYMBOL(dm_table_get_mode);
-EXPORT_SYMBOL(dm_table_get_md);
-EXPORT_SYMBOL(dm_table_put);
-EXPORT_SYMBOL(dm_table_get);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0cf68b47887..52b39f335bb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -37,6 +37,8 @@ static const char *_name = DM_NAME;
static unsigned int major = 0;
static unsigned int _major = 0;
+static DEFINE_IDR(_minor_idr);
+
static DEFINE_SPINLOCK(_minor_lock);
/*
* For bio-based dm.
@@ -109,6 +111,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_MERGE_IS_OPTIONAL 6
/*
* Work processed by per-device workqueue.
@@ -313,6 +316,12 @@ static void __exit dm_exit(void)
while (i--)
_exits[i]();
+
+ /*
+ * Should be empty by this point.
+ */
+ idr_remove_all(&_minor_idr);
+ idr_destroy(&_minor_idr);
}
/*
@@ -1171,7 +1180,8 @@ static int __clone_and_map_discard(struct clone_info *ci)
/*
* Even though the device advertised discard support,
- * reconfiguration might have changed that since the
+ * that does not mean every target supports it, and
+ * reconfiguration might also have changed that since the
* check was performed.
*/
if (!ti->num_discard_requests)
@@ -1705,8 +1715,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
/*-----------------------------------------------------------------
* An IDR is used to keep track of allocated minor numbers.
*---------------------------------------------------------------*/
-static DEFINE_IDR(_minor_idr);
-
static void free_minor(int minor)
{
spin_lock(&_minor_lock);
@@ -1800,7 +1808,6 @@ static void dm_init_md_queue(struct mapped_device *md)
blk_queue_make_request(md->queue, dm_request);
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
blk_queue_merge_bvec(md->queue, dm_merge_bvec);
- blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
}
/*
@@ -1986,6 +1993,59 @@ static void __set_size(struct mapped_device *md, sector_t size)
}
/*
+ * Return 1 if the queue has a compulsory merge_bvec_fn function.
+ *
+ * If this function returns 0, then the device is either a non-dm
+ * device without a merge_bvec_fn, or it is a dm device that is
+ * able to split any bios it receives that are too big.
+ */
+int dm_queue_merge_is_compulsory(struct request_queue *q)
+{
+ struct mapped_device *dev_md;
+
+ if (!q->merge_bvec_fn)
+ return 0;
+
+ if (q->make_request_fn == dm_request) {
+ dev_md = q->queuedata;
+ if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
+ return 0;
+ }
+
+ return 1;
+}
+
+static int dm_device_merge_is_compulsory(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
+{
+ struct block_device *bdev = dev->bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ return dm_queue_merge_is_compulsory(q);
+}
+
+/*
+ * Return 1 if it is acceptable to ignore merge_bvec_fn based
+ * on the properties of the underlying devices.
+ */
+static int dm_table_merge_is_optional(struct dm_table *table)
+{
+ unsigned i = 0;
+ struct dm_target *ti;
+
+ while (i < dm_table_get_num_targets(table)) {
+ ti = dm_table_get_target(table, i++);
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
* Returns old map, which caller must destroy.
*/
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
@@ -1995,6 +2055,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
struct request_queue *q = md->queue;
sector_t size;
unsigned long flags;
+ int merge_is_optional;
size = dm_table_get_size(t);
@@ -2020,10 +2081,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
__bind_mempools(md, t);
+ merge_is_optional = dm_table_merge_is_optional(t);
+
write_lock_irqsave(&md->map_lock, flags);
old_map = md->map;
md->map = t;
dm_table_set_restrictions(t, q, limits);
+ if (merge_is_optional)
+ set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
+ else
+ clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
write_unlock_irqrestore(&md->map_lock, flags);
return old_map;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1aaf16746da..6745dbd278a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -66,6 +66,8 @@ int dm_table_alloc_md_mempools(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
+int dm_queue_merge_is_compulsory(struct request_queue *q);
+
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
void dm_set_md_type(struct mapped_device *md, unsigned type);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dfc9425db70..8e221a20f5d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
}
EXPORT_SYMBOL_GPL(bio_clone_mddev);
+void md_trim_bio(struct bio *bio, int offset, int size)
+{
+ /* 'bio' is a cloned bio which we need to trim to match
+ * the given offset and size.
+ * This requires adjusting bi_sector, bi_size, and bi_io_vec
+ */
+ int i;
+ struct bio_vec *bvec;
+ int sofar = 0;
+
+ size <<= 9;
+ if (offset == 0 && size == bio->bi_size)
+ return;
+
+ bio->bi_sector += offset;
+ bio->bi_size = size;
+ offset <<= 9;
+ clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+ while (bio->bi_idx < bio->bi_vcnt &&
+ bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
+ /* remove this whole bio_vec */
+ offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
+ bio->bi_idx++;
+ }
+ if (bio->bi_idx < bio->bi_vcnt) {
+ bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
+ bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
+ }
+ /* avoid any complications with bi_idx being non-zero*/
+ if (bio->bi_idx) {
+ memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
+ (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
+ bio->bi_vcnt -= bio->bi_idx;
+ bio->bi_idx = 0;
+ }
+ /* Make sure vcnt and last bv are not too big */
+ bio_for_each_segment(bvec, bio, i) {
+ if (sofar + bvec->bv_len > size)
+ bvec->bv_len = size - sofar;
+ if (bvec->bv_len == 0) {
+ bio->bi_vcnt = i;
+ break;
+ }
+ sofar += bvec->bv_len;
+ }
+}
+EXPORT_SYMBOL_GPL(md_trim_bio);
+
/*
* We have a system wide 'event count' that is incremented
* on any 'interesting' event, and readers of /proc/mdstat
@@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev)
rdev->sb_start = 0;
rdev->sectors = 0;
}
+ if (rdev->bb_page) {
+ put_page(rdev->bb_page);
+ rdev->bb_page = NULL;
+ }
}
@@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
ret = -EINVAL;
bdevname(rdev->bdev, b);
- sb = (mdp_super_t*)page_address(rdev->sb_page);
+ sb = page_address(rdev->sb_page);
if (sb->md_magic != MD_SB_MAGIC) {
printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
@@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
rdev->preferred_minor = sb->md_minor;
rdev->data_offset = 0;
rdev->sb_size = MD_SB_BYTES;
+ rdev->badblocks.shift = -1;
if (sb->level == LEVEL_MULTIPATH)
rdev->desc_nr = -1;
@@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
ret = 1;
} else {
__u64 ev1, ev2;
- mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+ mdp_super_t *refsb = page_address(refdev->sb_page);
if (!uuid_equal(refsb, sb)) {
printk(KERN_WARNING "md: %s has different UUID to %s\n",
b, bdevname(refdev->bdev,b2));
@@ -1099,7 +1153,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
mdp_disk_t *desc;
- mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+ mdp_super_t *sb = page_address(rdev->sb_page);
__u64 ev1 = md_event(sb);
rdev->raid_disk = -1;
@@ -1230,7 +1284,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->sb_size = MD_SB_BYTES;
- sb = (mdp_super_t*)page_address(rdev->sb_page);
+ sb = page_address(rdev->sb_page);
memset(sb, 0, sizeof(*sb));
@@ -1395,6 +1449,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
return cpu_to_le32(csum);
}
+static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged);
static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
struct mdp_superblock_1 *sb;
@@ -1435,7 +1491,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
if (ret) return ret;
- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+ sb = page_address(rdev->sb_page);
if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
sb->major_version != cpu_to_le32(1) ||
@@ -1473,12 +1529,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
else
rdev->desc_nr = le32_to_cpu(sb->dev_number);
+ if (!rdev->bb_page) {
+ rdev->bb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->bb_page)
+ return -ENOMEM;
+ }
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
+ rdev->badblocks.count == 0) {
+ /* need to load the bad block list.
+ * Currently we limit it to one page.
+ */
+ s32 offset;
+ sector_t bb_sector;
+ u64 *bbp;
+ int i;
+ int sectors = le16_to_cpu(sb->bblog_size);
+ if (sectors > (PAGE_SIZE / 512))
+ return -EINVAL;
+ offset = le32_to_cpu(sb->bblog_offset);
+ if (offset == 0)
+ return -EINVAL;
+ bb_sector = (long long)offset;
+ if (!sync_page_io(rdev, bb_sector, sectors << 9,
+ rdev->bb_page, READ, true))
+ return -EIO;
+ bbp = (u64 *)page_address(rdev->bb_page);
+ rdev->badblocks.shift = sb->bblog_shift;
+ for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
+ u64 bb = le64_to_cpu(*bbp);
+ int count = bb & (0x3ff);
+ u64 sector = bb >> 10;
+ sector <<= sb->bblog_shift;
+ count <<= sb->bblog_shift;
+ if (bb + 1 == 0)
+ break;
+ if (md_set_badblocks(&rdev->badblocks,
+ sector, count, 1) == 0)
+ return -EINVAL;
+ }
+ } else if (sb->bblog_offset == 0)
+ rdev->badblocks.shift = -1;
+
if (!refdev) {
ret = 1;
} else {
__u64 ev1, ev2;
- struct mdp_superblock_1 *refsb =
- (struct mdp_superblock_1*)page_address(refdev->sb_page);
+ struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
sb->level != refsb->level ||
@@ -1513,7 +1609,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
- struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
__u64 ev1 = le64_to_cpu(sb->events);
rdev->raid_disk = -1;
@@ -1619,13 +1715,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
int max_dev, i;
/* make rdev->sb match mddev and rdev data. */
- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+ sb = page_address(rdev->sb_page);
sb->feature_map = 0;
sb->pad0 = 0;
sb->recovery_offset = cpu_to_le64(0);
memset(sb->pad1, 0, sizeof(sb->pad1));
- memset(sb->pad2, 0, sizeof(sb->pad2));
memset(sb->pad3, 0, sizeof(sb->pad3));
sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1665,6 +1760,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
}
+ if (rdev->badblocks.count == 0)
+ /* Nothing to do for bad blocks*/ ;
+ else if (sb->bblog_offset == 0)
+ /* Cannot record bad blocks on this device */
+ md_error(mddev, rdev);
+ else {
+ struct badblocks *bb = &rdev->badblocks;
+ u64 *bbp = (u64 *)page_address(rdev->bb_page);
+ u64 *p = bb->page;
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
+ if (bb->changed) {
+ unsigned seq;
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ memset(bbp, 0xff, PAGE_SIZE);
+
+ for (i = 0 ; i < bb->count ; i++) {
+ u64 internal_bb = *p++;
+ u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
+ | BB_LEN(internal_bb));
+ *bbp++ = cpu_to_le64(store_bb);
+ }
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ bb->sector = (rdev->sb_start +
+ (int)le32_to_cpu(sb->bblog_offset));
+ bb->size = le16_to_cpu(sb->bblog_size);
+ bb->changed = 0;
+ }
+ }
+
max_dev = 0;
list_for_each_entry(rdev2, &mddev->disks, same_set)
if (rdev2->desc_nr+1 > max_dev)
@@ -1724,7 +1853,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
num_sectors = max_sectors;
rdev->sb_start = sb_start;
}
- sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
+ sb = page_address(rdev->sb_page);
sb->data_size = cpu_to_le64(num_sectors);
sb->super_offset = rdev->sb_start;
sb->sb_csum = calc_sb_1_csum(sb);
@@ -1922,7 +2051,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
/* May as well allow recovery to be retried once */
- mddev->recovery_disabled = 0;
+ mddev->recovery_disabled++;
return 0;
@@ -1953,6 +2082,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
rdev->sysfs_state = NULL;
+ kfree(rdev->badblocks.page);
+ rdev->badblocks.count = 0;
+ rdev->badblocks.page = NULL;
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state". We also need
* to delay it due to rcu usage.
@@ -2127,10 +2259,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version)
printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
switch (major_version) {
case 0:
- print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
+ print_sb_90(page_address(rdev->sb_page));
break;
case 1:
- print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
+ print_sb_1(page_address(rdev->sb_page));
break;
}
} else
@@ -2194,6 +2326,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
mdk_rdev_t *rdev;
int sync_req;
int nospares = 0;
+ int any_badblocks_changed = 0;
repeat:
/* First make sure individual recovery_offsets are correct */
@@ -2208,8 +2341,18 @@ repeat:
if (!mddev->persistent) {
clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
clear_bit(MD_CHANGE_DEVS, &mddev->flags);
- if (!mddev->external)
+ if (!mddev->external) {
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (rdev->badblocks.changed) {
+ md_ack_all_badblocks(&rdev->badblocks);
+ md_error(mddev, rdev);
+ }
+ clear_bit(Blocked, &rdev->flags);
+ clear_bit(BlockedBadBlocks, &rdev->flags);
+ wake_up(&rdev->blocked_wait);
+ }
+ }
wake_up(&mddev->sb_wait);
return;
}
@@ -2265,6 +2408,14 @@ repeat:
MD_BUG();
mddev->events --;
}
+
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (rdev->badblocks.changed)
+ any_badblocks_changed++;
+ if (test_bit(Faulty, &rdev->flags))
+ set_bit(FaultRecorded, &rdev->flags);
+ }
+
sync_sbs(mddev, nospares);
spin_unlock_irq(&mddev->write_lock);
@@ -2290,6 +2441,13 @@ repeat:
bdevname(rdev->bdev,b),
(unsigned long long)rdev->sb_start);
rdev->sb_events = mddev->events;
+ if (rdev->badblocks.size) {
+ md_super_write(mddev, rdev,
+ rdev->badblocks.sector,
+ rdev->badblocks.size << 9,
+ rdev->bb_page);
+ rdev->badblocks.size = 0;
+ }
} else
dprintk(")\n");
@@ -2313,6 +2471,15 @@ repeat:
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (test_and_clear_bit(FaultRecorded, &rdev->flags))
+ clear_bit(Blocked, &rdev->flags);
+
+ if (any_badblocks_changed)
+ md_ack_all_badblocks(&rdev->badblocks);
+ clear_bit(BlockedBadBlocks, &rdev->flags);
+ wake_up(&rdev->blocked_wait);
+ }
}
/* words written to sysfs files may, or may not, be \n terminated.
@@ -2347,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page)
char *sep = "";
size_t len = 0;
- if (test_bit(Faulty, &rdev->flags)) {
+ if (test_bit(Faulty, &rdev->flags) ||
+ rdev->badblocks.unacked_exist) {
len+= sprintf(page+len, "%sfaulty",sep);
sep = ",";
}
@@ -2359,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
}
- if (test_bit(Blocked, &rdev->flags)) {
+ if (test_bit(Blocked, &rdev->flags) ||
+ rdev->badblocks.unacked_exist) {
len += sprintf(page+len, "%sblocked", sep);
sep = ",";
}
@@ -2368,6 +2537,10 @@ state_show(mdk_rdev_t *rdev, char *page)
len += sprintf(page+len, "%sspare", sep);
sep = ",";
}
+ if (test_bit(WriteErrorSeen, &rdev->flags)) {
+ len += sprintf(page+len, "%swrite_error", sep);
+ sep = ",";
+ }
return len+sprintf(page+len, "\n");
}
@@ -2375,13 +2548,15 @@ static ssize_t
state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
/* can write
- * faulty - simulates and error
+ * faulty - simulates an error
* remove - disconnects the device
* writemostly - sets write_mostly
* -writemostly - clears write_mostly
- * blocked - sets the Blocked flag
- * -blocked - clears the Blocked flag
+ * blocked - sets the Blocked flags
+ * -blocked - clears the Blocked and possibly simulates an error
* insync - sets Insync providing device isn't active
+ * write_error - sets WriteErrorSeen
+ * -write_error - clears WriteErrorSeen
*/
int err = -EINVAL;
if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -2408,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
set_bit(Blocked, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "-blocked")) {
+ if (!test_bit(Faulty, &rdev->flags) &&
+ test_bit(BlockedBadBlocks, &rdev->flags)) {
+ /* metadata handler doesn't understand badblocks,
+ * so we need to fail the device
+ */
+ md_error(rdev->mddev, rdev);
+ }
clear_bit(Blocked, &rdev->flags);
+ clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
@@ -2417,6 +2600,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
set_bit(In_sync, &rdev->flags);
err = 0;
+ } else if (cmd_match(buf, "write_error")) {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ err = 0;
+ } else if (cmd_match(buf, "-write_error")) {
+ clear_bit(WriteErrorSeen, &rdev->flags);
+ err = 0;
}
if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2459,7 +2648,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
char *e;
int err;
- char nm[20];
int slot = simple_strtoul(buf, &e, 10);
if (strncmp(buf, "none", 4)==0)
slot = -1;
@@ -2482,8 +2670,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
hot_remove_disk(rdev->mddev, rdev->raid_disk);
if (err)
return err;
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&rdev->mddev->kobj, nm);
+ sysfs_unlink_rdev(rdev->mddev, rdev);
rdev->raid_disk = -1;
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
@@ -2522,8 +2709,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
return err;
} else
sysfs_notify_dirent_safe(rdev->sysfs_state);
- sprintf(nm, "rd%d", rdev->raid_disk);
- if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+ if (sysfs_link_rdev(rdev->mddev, rdev))
/* failure here is OK */;
/* don't wakeup anyone, leave that to userspace. */
} else {
@@ -2712,6 +2898,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le
static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
+
+static ssize_t
+badblocks_show(struct badblocks *bb, char *page, int unack);
+static ssize_t
+badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
+
+static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
+{
+ return badblocks_show(&rdev->badblocks, page, 0);
+}
+static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
+{
+ int rv = badblocks_store(&rdev->badblocks, page, len, 0);
+ /* Maybe that ack was all we needed */
+ if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
+ wake_up(&rdev->blocked_wait);
+ return rv;
+}
+static struct rdev_sysfs_entry rdev_bad_blocks =
+__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
+
+
+static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
+{
+ return badblocks_show(&rdev->badblocks, page, 1);
+}
+static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
+{
+ return badblocks_store(&rdev->badblocks, page, len, 1);
+}
+static struct rdev_sysfs_entry rdev_unack_bad_blocks =
+__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
+
static struct attribute *rdev_default_attrs[] = {
&rdev_state.attr,
&rdev_errors.attr,
@@ -2719,6 +2938,8 @@ static struct attribute *rdev_default_attrs[] = {
&rdev_offset.attr,
&rdev_size.attr,
&rdev_recovery_start.attr,
+ &rdev_bad_blocks.attr,
+ &rdev_unack_bad_blocks.attr,
NULL,
};
static ssize_t
@@ -2782,7 +3003,7 @@ static struct kobj_type rdev_ktype = {
.default_attrs = rdev_default_attrs,
};
-void md_rdev_init(mdk_rdev_t *rdev)
+int md_rdev_init(mdk_rdev_t *rdev)
{
rdev->desc_nr = -1;
rdev->saved_raid_disk = -1;
@@ -2792,12 +3013,27 @@ void md_rdev_init(mdk_rdev_t *rdev)
rdev->sb_events = 0;
rdev->last_read_error.tv_sec = 0;
rdev->last_read_error.tv_nsec = 0;
+ rdev->sb_loaded = 0;
+ rdev->bb_page = NULL;
atomic_set(&rdev->nr_pending, 0);
atomic_set(&rdev->read_errors, 0);
atomic_set(&rdev->corrected_errors, 0);
INIT_LIST_HEAD(&rdev->same_set);
init_waitqueue_head(&rdev->blocked_wait);
+
+ /* Add space to store bad block list.
+ * This reserves the space even on arrays where it cannot
+ * be used - I wonder if that matters
+ */
+ rdev->badblocks.count = 0;
+ rdev->badblocks.shift = 0;
+ rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ seqlock_init(&rdev->badblocks.lock);
+ if (rdev->badblocks.page == NULL)
+ return -ENOMEM;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(md_rdev_init);
/*
@@ -2823,8 +3059,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
return ERR_PTR(-ENOMEM);
}
- md_rdev_init(rdev);
- if ((err = alloc_disk_sb(rdev)))
+ err = md_rdev_init(rdev);
+ if (err)
+ goto abort_free;
+ err = alloc_disk_sb(rdev);
+ if (err)
goto abort_free;
err = lock_rdev(rdev, newdev, super_format == -2);
@@ -2860,15 +3099,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
goto abort_free;
}
}
+ if (super_format == -1)
+ /* hot-add for 0.90, or non-persistent: so no badblocks */
+ rdev->badblocks.shift = -1;
return rdev;
abort_free:
- if (rdev->sb_page) {
- if (rdev->bdev)
- unlock_rdev(rdev);
- free_disk_sb(rdev);
- }
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ kfree(rdev->badblocks.page);
kfree(rdev);
return ERR_PTR(err);
}
@@ -3149,15 +3390,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
}
list_for_each_entry(rdev, &mddev->disks, same_set) {
- char nm[20];
if (rdev->raid_disk < 0)
continue;
if (rdev->new_raid_disk >= mddev->raid_disks)
rdev->new_raid_disk = -1;
if (rdev->new_raid_disk == rdev->raid_disk)
continue;
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
+ sysfs_unlink_rdev(mddev, rdev);
}
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk < 0)
@@ -3168,11 +3407,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
if (rdev->raid_disk < 0)
clear_bit(In_sync, &rdev->flags);
else {
- char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
- printk("md: cannot register %s for %s after level change\n",
- nm, mdname(mddev));
+ if (sysfs_link_rdev(mddev, rdev))
+ printk(KERN_WARNING "md: cannot register rd%d"
+ " for %s after level change\n",
+ rdev->raid_disk, mdname(mddev));
}
}
@@ -4504,7 +4742,8 @@ int md_run(mddev_t *mddev)
}
if (mddev->bio_set == NULL)
- mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
+ mddev->bio_set = bioset_create(BIO_POOL_SIZE,
+ sizeof(mddev_t *));
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
@@ -4621,12 +4860,9 @@ int md_run(mddev_t *mddev)
smp_wmb();
mddev->ready = 1;
list_for_each_entry(rdev, &mddev->disks, same_set)
- if (rdev->raid_disk >= 0) {
- char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
+ if (rdev->raid_disk >= 0)
+ if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
- }
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -4854,11 +5090,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
sysfs_notify_dirent_safe(mddev->sysfs_state);
list_for_each_entry(rdev, &mddev->disks, same_set)
- if (rdev->raid_disk >= 0) {
- char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
- }
+ if (rdev->raid_disk >= 0)
+ sysfs_unlink_rdev(mddev, rdev);
set_capacity(disk, 0);
mutex_unlock(&mddev->open_mutex);
@@ -6198,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
if (!rdev || test_bit(Faulty, &rdev->flags))
return;
- if (mddev->external)
- set_bit(Blocked, &rdev->flags);
-/*
- dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
- mdname(mddev),
- MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
- __builtin_return_address(0),__builtin_return_address(1),
- __builtin_return_address(2),__builtin_return_address(3));
-*/
- if (!mddev->pers)
- return;
- if (!mddev->pers->error_handler)
+ if (!mddev->pers || !mddev->pers->error_handler)
return;
mddev->pers->error_handler(mddev,rdev);
if (mddev->degraded)
@@ -6933,11 +7155,14 @@ void md_do_sync(mddev_t *mddev)
atomic_add(sectors, &mddev->recovery_active);
}
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ break;
+
j += sectors;
if (j>1) mddev->curr_resync = j;
mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
- /* this is the earliers that rebuilt will be
+ /* this is the earliest that rebuild will be
* visible in /proc/mdstat
*/
md_new_event(mddev);
@@ -6946,10 +7171,6 @@ void md_do_sync(mddev_t *mddev)
continue;
last_check = io_sectors;
-
- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
- break;
-
repeat:
if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
/* step marks */
@@ -7067,29 +7288,23 @@ static int remove_and_add_spares(mddev_t *mddev)
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev->raid_disk)==0) {
- char nm[20];
- sprintf(nm,"rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
+ sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
}
}
- if (mddev->degraded && !mddev->recovery_disabled) {
+ if (mddev->degraded) {
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags) &&
- !test_bit(Blocked, &rdev->flags))
+ !test_bit(Faulty, &rdev->flags))
spares++;
if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
- char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- if (sysfs_create_link(&mddev->kobj,
- &rdev->kobj, nm))
+ if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
spares++;
md_new_event(mddev);
@@ -7138,6 +7353,8 @@ static void reap_sync_thread(mddev_t *mddev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event(mddev);
+ if (mddev->event_work.func)
+ queue_work(md_misc_wq, &mddev->event_work);
}
/*
@@ -7170,9 +7387,6 @@ void md_check_recovery(mddev_t *mddev)
if (mddev->bitmap)
bitmap_daemon_work(mddev);
- if (mddev->ro)
- return;
-
if (signal_pending(current)) {
if (mddev->pers->sync_request && !mddev->external) {
printk(KERN_INFO "md: %s in immediate safe mode\n",
@@ -7209,9 +7423,7 @@ void md_check_recovery(mddev_t *mddev)
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev->raid_disk)==0) {
- char nm[20];
- sprintf(nm,"rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
+ sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
}
}
@@ -7331,12 +7543,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
{
sysfs_notify_dirent_safe(rdev->sysfs_state);
wait_event_timeout(rdev->blocked_wait,
- !test_bit(Blocked, &rdev->flags),
+ !test_bit(Blocked, &rdev->flags) &&
+ !test_bit(BlockedBadBlocks, &rdev->flags),
msecs_to_jiffies(5000));
rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);
+
+/* Bad block management.
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64bits wide. This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ * A 'shift' can be set so that larger blocks are tracked and
+ * consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so md_is_badblock
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad. So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ * We return
+ * 0 if there are no known bad blocks in the range
+ * 1 if there are known bad block which are all acknowledged
+ * -1 if there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ int hi;
+ int lo = 0;
+ u64 *p = bb->page;
+ int rv = 0;
+ sector_t target = s + sectors;
+ unsigned seq;
+
+ if (bb->shift > 0) {
+ /* round the start down, and the end up */
+ s >>= bb->shift;
+ target += (1<<bb->shift) - 1;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+ /* 'target' is now the first block after the bad range */
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ hi = bb->count;
+
+ /* Binary search between lo and hi for 'target'
+ * i.e. for the last range that starts before 'target'
+ */
+ /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+ * are known not to be the last range before target.
+ * VARIANT: hi-lo is the number of possible
+ * ranges, and decreases until it reaches 1
+ */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+ if (a < target)
+ /* This could still be the one, earlier ranges
+ * could not. */
+ lo = mid;
+ else
+ /* This and later ranges are definitely out. */
+ hi = mid;
+ }
+ /* 'lo' might be the last that started before target, but 'hi' isn't */
+ if (hi > lo) {
+ /* need to check all range that end after 's' to see if
+ * any are unacknowledged.
+ */
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ if (BB_OFFSET(p[lo]) < target) {
+ /* starts before the end, and finishes after
+ * the start, so they must overlap
+ */
+ if (rv != -1 && BB_ACK(p[lo]))
+ rv = 1;
+ else
+ rv = -1;
+ *first_bad = BB_OFFSET(p[lo]);
+ *bad_sectors = BB_LEN(p[lo]);
+ }
+ lo--;
+ }
+ }
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(md_is_badblock);
+
+/*
+ * Add a range of bad blocks to the table.
+ * This might extend the table, or might contract it
+ * if two adjacent ranges can be merged.
+ * We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ */
+static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+{
+ u64 *p;
+ int lo, hi;
+ int rv = 1;
+
+ if (bb->shift < 0)
+ /* badblocks are disabled */
+ return 0;
+
+ if (bb->shift) {
+ /* round the start down, and the end up */
+ sector_t next = s + sectors;
+ s >>= bb->shift;
+ next += (1<<bb->shift) - 1;
+ next >>= bb->shift;
+ sectors = next - s;
+ }
+
+ write_seqlock_irq(&bb->lock);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts at-or-before 's' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+ if (a <= s)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo && BB_OFFSET(p[lo]) > s)
+ hi = lo;
+
+ if (hi > lo) {
+ /* we found a range that might merge with the start
+ * of our new range
+ */
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t e = a + BB_LEN(p[lo]);
+ int ack = BB_ACK(p[lo]);
+ if (e >= s) {
+ /* Yes, we can merge with a previous range */
+ if (s == a && s + sectors >= e)
+ /* new range covers old */
+ ack = acknowledged;
+ else
+ ack = ack && acknowledged;
+
+ if (e < s + sectors)
+ e = s + sectors;
+ if (e - a <= BB_MAX_LEN) {
+ p[lo] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ /* does not all fit in one range,
+ * make p[lo] maximal
+ */
+ if (BB_LEN(p[lo]) != BB_MAX_LEN)
+ p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ }
+ }
+ if (sectors && hi < bb->count) {
+ /* 'hi' points to the first range that starts after 's'.
+ * Maybe we can merge with the start of that range */
+ sector_t a = BB_OFFSET(p[hi]);
+ sector_t e = a + BB_LEN(p[hi]);
+ int ack = BB_ACK(p[hi]);
+ if (a <= s + sectors) {
+ /* merging is possible */
+ if (e <= s + sectors) {
+ /* full overlap */
+ e = s + sectors;
+ ack = acknowledged;
+ } else
+ ack = ack && acknowledged;
+
+ a = s;
+ if (e - a <= BB_MAX_LEN) {
+ p[hi] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ lo = hi;
+ hi++;
+ }
+ }
+ if (sectors == 0 && hi < bb->count) {
+ /* we might be able to combine lo and hi */
+ /* Note: 's' is at the end of 'lo' */
+ sector_t a = BB_OFFSET(p[hi]);
+ int lolen = BB_LEN(p[lo]);
+ int hilen = BB_LEN(p[hi]);
+ int newlen = lolen + hilen - (s - a);
+ if (s >= a && newlen < BB_MAX_LEN) {
+ /* yes, we can combine them */
+ int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+ p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+ memmove(p + hi, p + hi + 1,
+ (bb->count - hi - 1) * 8);
+ bb->count--;
+ }
+ }
+ while (sectors) {
+ /* didn't merge (it all).
+ * Need to add a range just before 'hi' */
+ if (bb->count >= MD_MAX_BADBLOCKS) {
+ /* No room for more */
+ rv = 0;
+ break;
+ } else {
+ int this_sectors = sectors;
+ memmove(p + hi + 1, p + hi,
+ (bb->count - hi) * 8);
+ bb->count++;
+
+ if (this_sectors > BB_MAX_LEN)
+ this_sectors = BB_MAX_LEN;
+ p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+ sectors -= this_sectors;
+ s += this_sectors;
+ }
+ }
+
+ bb->changed = 1;
+ if (!acknowledged)
+ bb->unacked_exist = 1;
+ write_sequnlock_irq(&bb->lock);
+
+ return rv;
+}
+
+int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
+ int acknowledged)
+{
+ int rv = md_set_badblocks(&rdev->badblocks,
+ s + rdev->data_offset, sectors, acknowledged);
+ if (rv) {
+ /* Make sure they get written out promptly */
+ set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
+ md_wakeup_thread(rdev->mddev->thread);
+ }
+ return rv;
+}
+EXPORT_SYMBOL_GPL(rdev_set_badblocks);
+
+/*
+ * Remove a range of bad blocks from the table.
+ * This may involve extending the table if we spilt a region,
+ * but it must not fail. So if the table becomes full, we just
+ * drop the remove request.
+ */
+static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
+{
+ u64 *p;
+ int lo, hi;
+ sector_t target = s + sectors;
+ int rv = 0;
+
+ if (bb->shift > 0) {
+ /* When clearing we round the start up and the end down.
+ * This should not matter as the shift should align with
+ * the block size and no rounding should ever be needed.
+ * However it is better the think a block is bad when it
+ * isn't than to think a block is not bad when it is.
+ */
+ s += (1<<bb->shift) - 1;
+ s >>= bb->shift;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+
+ write_seqlock_irq(&bb->lock);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts before 'target' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+ if (a < target)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo) {
+ /* p[lo] is the last range that could overlap the
+ * current range. Earlier ranges could also overlap,
+ * but only this one can overlap the end of the range.
+ */
+ if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
+ /* Partial overlap, leave the tail of this range */
+ int ack = BB_ACK(p[lo]);
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t end = a + BB_LEN(p[lo]);
+
+ if (a < s) {
+ /* we need to split this range */
+ if (bb->count >= MD_MAX_BADBLOCKS) {
+ rv = 0;
+ goto out;
+ }
+ memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+ bb->count++;
+ p[lo] = BB_MAKE(a, s-a, ack);
+ lo++;
+ }
+ p[lo] = BB_MAKE(target, end - target, ack);
+ /* there is no longer an overlap */
+ hi = lo;
+ lo--;
+ }
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ /* This range does overlap */
+ if (BB_OFFSET(p[lo]) < s) {
+ /* Keep the early parts of this range. */
+ int ack = BB_ACK(p[lo]);
+ sector_t start = BB_OFFSET(p[lo]);
+ p[lo] = BB_MAKE(start, s - start, ack);
+ /* now low doesn't overlap, so.. */
+ break;
+ }
+ lo--;
+ }
+ /* 'lo' is strictly before, 'hi' is strictly after,
+ * anything between needs to be discarded
+ */
+ if (hi - lo > 1) {
+ memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+ bb->count -= (hi - lo - 1);
+ }
+ }
+
+ bb->changed = 1;
+out:
+ write_sequnlock_irq(&bb->lock);
+ return rv;
+}
+
+int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
+{
+ return md_clear_badblocks(&rdev->badblocks,
+ s + rdev->data_offset,
+ sectors);
+}
+EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
+
+/*
+ * Acknowledge all bad blocks in a list.
+ * This only succeeds if ->changed is clear. It is used by
+ * in-kernel metadata updates
+ */
+void md_ack_all_badblocks(struct badblocks *bb)
+{
+ if (bb->page == NULL || bb->changed)
+ /* no point even trying */
+ return;
+ write_seqlock_irq(&bb->lock);
+
+ if (bb->changed == 0) {
+ u64 *p = bb->page;
+ int i;
+ for (i = 0; i < bb->count ; i++) {
+ if (!BB_ACK(p[i])) {
+ sector_t start = BB_OFFSET(p[i]);
+ int len = BB_LEN(p[i]);
+ p[i] = BB_MAKE(start, len, 1);
+ }
+ }
+ bb->unacked_exist = 0;
+ }
+ write_sequnlock_irq(&bb->lock);
+}
+EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
+
+/* sysfs access to bad-blocks list.
+ * We present two files.
+ * 'bad-blocks' lists sector numbers and lengths of ranges that
+ * are recorded as bad. The list is truncated to fit within
+ * the one-page limit of sysfs.
+ * Writing "sector length" to this file adds an acknowledged
+ * bad block list.
+ * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
+ * been acknowledged. Writing to this file adds bad blocks
+ * without acknowledging them. This is largely for testing.
+ */
+
+static ssize_t
+badblocks_show(struct badblocks *bb, char *page, int unack)
+{
+ size_t len;
+ int i;
+ u64 *p = bb->page;
+ unsigned seq;
+
+ if (bb->shift < 0)
+ return 0;
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ len = 0;
+ i = 0;
+
+ while (len < PAGE_SIZE && i < bb->count) {
+ sector_t s = BB_OFFSET(p[i]);
+ unsigned int length = BB_LEN(p[i]);
+ int ack = BB_ACK(p[i]);
+ i++;
+
+ if (unack && ack)
+ continue;
+
+ len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
+ (unsigned long long)s << bb->shift,
+ length << bb->shift);
+ }
+ if (unack && len == 0)
+ bb->unacked_exist = 0;
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return len;
+}
+
+#define DO_DEBUG 1
+
+static ssize_t
+badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
+{
+ unsigned long long sector;
+ int length;
+ char newline;
+#ifdef DO_DEBUG
+ /* Allow clearing via sysfs *only* for testing/debugging.
+ * Normally only a successful write may clear a badblock
+ */
+ int clear = 0;
+ if (page[0] == '-') {
+ clear = 1;
+ page++;
+ }
+#endif /* DO_DEBUG */
+
+ switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
+ case 3:
+ if (newline != '\n')
+ return -EINVAL;
+ case 2:
+ if (length <= 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+#ifdef DO_DEBUG
+ if (clear) {
+ md_clear_badblocks(bb, sector, length);
+ return len;
+ }
+#endif /* DO_DEBUG */
+ if (md_set_badblocks(bb, sector, length, !unack))
+ return len;
+ else
+ return -ENOSPC;
+}
+
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c26c7a08ae..1e586bb4452 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,6 +29,13 @@
typedef struct mddev_s mddev_t;
typedef struct mdk_rdev_s mdk_rdev_t;
+/* Bad block numbers are stored sorted in a single page.
+ * 64bits is used for each block or extent.
+ * 54 bits are sector number, 9 bits are extent size,
+ * 1 bit is an 'acknowledged' flag.
+ */
+#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
+
/*
* MD's 'extended' device
*/
@@ -48,7 +55,7 @@ struct mdk_rdev_s
struct block_device *meta_bdev;
struct block_device *bdev; /* block device handle */
- struct page *sb_page;
+ struct page *sb_page, *bb_page;
int sb_loaded;
__u64 sb_events;
sector_t data_offset; /* start of data in array */
@@ -74,9 +81,29 @@ struct mdk_rdev_s
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
#define AutoDetected 7 /* added by auto-detect */
-#define Blocked 8 /* An error occurred on an externally
- * managed array, don't allow writes
+#define Blocked 8 /* An error occurred but has not yet
+ * been acknowledged by the metadata
+ * handler, so don't allow writes
* until it is cleared */
+#define WriteErrorSeen 9 /* A write error has been seen on this
+ * device
+ */
+#define FaultRecorded 10 /* Intermediate state for clearing
+ * Blocked. The Fault is/will-be
+ * recorded in the metadata, but that
+ * metadata hasn't been stored safely
+ * on disk yet.
+ */
+#define BlockedBadBlocks 11 /* A writer is blocked because they
+ * found an unacknowledged bad-block.
+ * This can safely be cleared at any
+ * time, and the writer will re-check.
+ * It may be set at any time, and at
+ * worst the writer will timeout and
+ * re-check. So setting it as
+ * accurately as possible is good, but
+ * not absolutely critical.
+ */
wait_queue_head_t blocked_wait;
int desc_nr; /* descriptor index in the superblock */
@@ -111,8 +138,54 @@ struct mdk_rdev_s
struct sysfs_dirent *sysfs_state; /* handle for 'state'
* sysfs entry */
+
+ struct badblocks {
+ int count; /* count of bad blocks */
+ int unacked_exist; /* there probably are unacknowledged
+ * bad blocks. This is only cleared
+ * when a read discovers none
+ */
+ int shift; /* shift from sectors to block size
+ * a -ve shift means badblocks are
+ * disabled.*/
+ u64 *page; /* badblock list */
+ int changed;
+ seqlock_t lock;
+
+ sector_t sector;
+ sector_t size; /* in sectors */
+ } badblocks;
};
+#define BB_LEN_MASK (0x00000000000001FFULL)
+#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
+#define BB_ACK_MASK (0x8000000000000000ULL)
+#define BB_MAX_LEN 512
+#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
+#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
+#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
+#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
+
+extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors);
+static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ if (unlikely(rdev->badblocks.count)) {
+ int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
+ sectors,
+ first_bad, bad_sectors);
+ if (rv)
+ *first_bad -= rdev->data_offset;
+ return rv;
+ }
+ return 0;
+}
+extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
+ int acknowledged);
+extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors);
+extern void md_ack_all_badblocks(struct badblocks *bb);
+
struct mddev_s
{
void *private;
@@ -239,9 +312,12 @@ struct mddev_s
#define MD_RECOVERY_FROZEN 9
unsigned long recovery;
- int recovery_disabled; /* if we detect that recovery
- * will always fail, set this
- * so we don't loop trying */
+ /* If a RAID personality determines that recovery (of a particular
+ * device) will fail due to a read error on the source device, it
+ * takes a copy of this number and does not attempt recovery again
+ * until this number changes.
+ */
+ int recovery_disabled;
int in_sync; /* know to not need resync */
/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
@@ -304,11 +380,6 @@ struct mddev_s
* hot-adding a bitmap. It should
* eventually be settable by sysfs.
*/
- /* When md is serving under dm, it might use a
- * dirty_log to store the bits.
- */
- struct dm_dirty_log *log;
-
struct mutex mutex;
unsigned long chunksize;
unsigned long daemon_sleep; /* how many jiffies between updates? */
@@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev)
return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
}
+static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+}
+
+static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+}
+
/*
* iterates through some rdev ringlist. It's safe to remove the
* current 'rdev'. Dont touch 'tmp' though.
@@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev);
extern int md_run(mddev_t *mddev);
extern void md_stop(mddev_t *mddev);
extern void md_stop_writes(mddev_t *mddev);
-extern void md_rdev_init(mdk_rdev_t *rdev);
+extern int md_rdev_init(mdk_rdev_t *rdev);
extern void mddev_suspend(mddev_t *mddev);
extern void mddev_resume(mddev_t *mddev);
@@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
mddev_t *mddev);
extern int mddev_check_plugged(mddev_t *mddev);
+extern void md_trim_bio(struct bio *bio, int offset, int size);
#endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f7431b6d844..32323f0afd8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -35,16 +35,13 @@
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
+#include <linux/ratelimit.h>
#include "md.h"
#include "raid1.h"
#include "bitmap.h"
#define DEBUG 0
-#if DEBUG
-#define PRINTK(x...) printk(x)
-#else
-#define PRINTK(x...)
-#endif
+#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)
/*
* Number of guaranteed r1bios in case of extreme VM load:
@@ -166,7 +163,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
for (i = 0; i < conf->raid_disks; i++) {
struct bio **bio = r1_bio->bios + i;
- if (*bio && *bio != IO_BLOCKED)
+ if (!BIO_SPECIAL(*bio))
bio_put(*bio);
*bio = NULL;
}
@@ -176,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio)
{
conf_t *conf = r1_bio->mddev->private;
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf);
-
put_all_bios(conf, r1_bio);
mempool_free(r1_bio, conf->r1bio_pool);
}
@@ -222,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio)
* operation and are ready to return a success/failure code to the buffer
* cache layer.
*/
+static void call_bio_endio(r1bio_t *r1_bio)
+{
+ struct bio *bio = r1_bio->master_bio;
+ int done;
+ conf_t *conf = r1_bio->mddev->private;
+
+ if (bio->bi_phys_segments) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio->bi_phys_segments--;
+ done = (bio->bi_phys_segments == 0);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ } else
+ done = 1;
+
+ if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (done) {
+ bio_endio(bio, 0);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf);
+ }
+}
+
static void raid_end_bio_io(r1bio_t *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
@@ -234,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
(unsigned long long) bio->bi_sector +
(bio->bi_size >> 9) - 1);
- bio_endio(bio,
- test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+ call_bio_endio(r1_bio);
}
free_r1bio(r1_bio);
}
@@ -287,36 +304,52 @@ static void raid1_end_read_request(struct bio *bio, int error)
* oops, read error:
*/
char b[BDEVNAME_SIZE];
- if (printk_ratelimit())
- printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
- mdname(conf->mddev),
- bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
+ printk_ratelimited(
+ KERN_ERR "md/raid1:%s: %s: "
+ "rescheduling sector %llu\n",
+ mdname(conf->mddev),
+ bdevname(conf->mirrors[mirror].rdev->bdev,
+ b),
+ (unsigned long long)r1_bio->sector);
+ set_bit(R1BIO_ReadError, &r1_bio->state);
reschedule_retry(r1_bio);
}
rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
}
+static void close_write(r1bio_t *r1_bio)
+{
+ /* it really is the end of this request */
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ /* free extra copy of the data pages */
+ int i = r1_bio->behind_page_count;
+ while (i--)
+ safe_put_page(r1_bio->behind_bvecs[i].bv_page);
+ kfree(r1_bio->behind_bvecs);
+ r1_bio->behind_bvecs = NULL;
+ }
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ !test_bit(R1BIO_Degraded, &r1_bio->state),
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ md_write_end(r1_bio->mddev);
+}
+
static void r1_bio_write_done(r1bio_t *r1_bio)
{
- if (atomic_dec_and_test(&r1_bio->remaining))
- {
- /* it really is the end of this request */
- if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- /* free extra copy of the data pages */
- int i = r1_bio->behind_page_count;
- while (i--)
- safe_put_page(r1_bio->behind_pages[i]);
- kfree(r1_bio->behind_pages);
- r1_bio->behind_pages = NULL;
- }
- /* clear the bitmap if all writes complete successfully */
- bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
- r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state),
- test_bit(R1BIO_BehindIO, &r1_bio->state));
- md_write_end(r1_bio->mddev);
- raid_end_bio_io(r1_bio);
+ if (!atomic_dec_and_test(&r1_bio->remaining))
+ return;
+
+ if (test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ close_write(r1_bio);
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else
+ raid_end_bio_io(r1_bio);
}
}
@@ -336,13 +369,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
/*
* 'one mirror IO has finished' event handler:
*/
- r1_bio->bios[mirror] = NULL;
- to_put = bio;
if (!uptodate) {
- md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
- /* an I/O failed, we can't clear the bitmap */
- set_bit(R1BIO_Degraded, &r1_bio->state);
- } else
+ set_bit(WriteErrorSeen,
+ &conf->mirrors[mirror].rdev->flags);
+ set_bit(R1BIO_WriteError, &r1_bio->state);
+ } else {
/*
* Set R1BIO_Uptodate in our master bio, so that we
* will return a good error code for to the higher
@@ -353,8 +384,22 @@ static void raid1_end_write_request(struct bio *bio, int error)
* to user-side. So if something waits for IO, then it
* will wait for the 'master' bio.
*/
+ sector_t first_bad;
+ int bad_sectors;
+
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
set_bit(R1BIO_Uptodate, &r1_bio->state);
+ /* Maybe we can clear some bad blocks. */
+ if (is_badblock(conf->mirrors[mirror].rdev,
+ r1_bio->sector, r1_bio->sectors,
+ &first_bad, &bad_sectors)) {
+ r1_bio->bios[mirror] = IO_MADE_GOOD;
+ set_bit(R1BIO_MadeGood, &r1_bio->state);
+ }
+ }
+
update_head_pos(mirror, r1_bio);
if (behind) {
@@ -377,11 +422,13 @@ static void raid1_end_write_request(struct bio *bio, int error)
(unsigned long long) mbio->bi_sector,
(unsigned long long) mbio->bi_sector +
(mbio->bi_size >> 9) - 1);
- bio_endio(mbio, 0);
+ call_bio_endio(r1_bio);
}
}
}
- rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+ if (r1_bio->bios[mirror] == NULL)
+ rdev_dec_pending(conf->mirrors[mirror].rdev,
+ conf->mddev);
/*
* Let's see if all mirrored write operations have finished
@@ -408,10 +455,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
*
* The rdev for the device selected will have nr_pending incremented.
*/
-static int read_balance(conf_t *conf, r1bio_t *r1_bio)
+static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
{
const sector_t this_sector = r1_bio->sector;
- const int sectors = r1_bio->sectors;
+ int sectors;
+ int best_good_sectors;
int start_disk;
int best_disk;
int i;
@@ -426,8 +474,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
* We take the first readable disk when above the resync window.
*/
retry:
+ sectors = r1_bio->sectors;
best_disk = -1;
best_dist = MaxSector;
+ best_good_sectors = 0;
+
if (conf->mddev->recovery_cp < MaxSector &&
(this_sector + sectors >= conf->next_resync)) {
choose_first = 1;
@@ -439,6 +490,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
for (i = 0 ; i < conf->raid_disks ; i++) {
sector_t dist;
+ sector_t first_bad;
+ int bad_sectors;
+
int disk = start_disk + i;
if (disk >= conf->raid_disks)
disk -= conf->raid_disks;
@@ -461,6 +515,35 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
/* This is a reasonable device to use. It might
* even be best.
*/
+ if (is_badblock(rdev, this_sector, sectors,
+ &first_bad, &bad_sectors)) {
+ if (best_dist < MaxSector)
+ /* already have a better device */
+ continue;
+ if (first_bad <= this_sector) {
+ /* cannot read here. If this is the 'primary'
+ * device, then we must not read beyond
+ * bad_sectors from another device..
+ */
+ bad_sectors -= (this_sector - first_bad);
+ if (choose_first && sectors > bad_sectors)
+ sectors = bad_sectors;
+ if (best_good_sectors > sectors)
+ best_good_sectors = sectors;
+
+ } else {
+ sector_t good_sectors = first_bad - this_sector;
+ if (good_sectors > best_good_sectors) {
+ best_good_sectors = good_sectors;
+ best_disk = disk;
+ }
+ if (choose_first)
+ break;
+ }
+ continue;
+ } else
+ best_good_sectors = sectors;
+
dist = abs(this_sector - conf->mirrors[disk].head_position);
if (choose_first
/* Don't change to another disk for sequential reads */
@@ -489,10 +572,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
rdev_dec_pending(rdev, conf->mddev);
goto retry;
}
+ sectors = best_good_sectors;
conf->next_seq_sect = this_sector + sectors;
conf->last_used = best_disk;
}
rcu_read_unlock();
+ *max_sectors = sectors;
return best_disk;
}
@@ -672,30 +757,31 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
{
int i;
struct bio_vec *bvec;
- struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
+ struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
GFP_NOIO);
- if (unlikely(!pages))
+ if (unlikely(!bvecs))
return;
bio_for_each_segment(bvec, bio, i) {
- pages[i] = alloc_page(GFP_NOIO);
- if (unlikely(!pages[i]))
+ bvecs[i] = *bvec;
+ bvecs[i].bv_page = alloc_page(GFP_NOIO);
+ if (unlikely(!bvecs[i].bv_page))
goto do_sync_io;
- memcpy(kmap(pages[i]) + bvec->bv_offset,
- kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
- kunmap(pages[i]);
+ memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
+ kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
+ kunmap(bvecs[i].bv_page);
kunmap(bvec->bv_page);
}
- r1_bio->behind_pages = pages;
+ r1_bio->behind_bvecs = bvecs;
r1_bio->behind_page_count = bio->bi_vcnt;
set_bit(R1BIO_BehindIO, &r1_bio->state);
return;
do_sync_io:
for (i = 0; i < bio->bi_vcnt; i++)
- if (pages[i])
- put_page(pages[i]);
- kfree(pages);
+ if (bvecs[i].bv_page)
+ put_page(bvecs[i].bv_page);
+ kfree(bvecs);
PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
}
@@ -705,7 +791,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
mirror_info_t *mirror;
r1bio_t *r1_bio;
struct bio *read_bio;
- int i, targets = 0, disks;
+ int i, disks;
struct bitmap *bitmap;
unsigned long flags;
const int rw = bio_data_dir(bio);
@@ -713,6 +799,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
mdk_rdev_t *blocked_rdev;
int plugged;
+ int first_clone;
+ int sectors_handled;
+ int max_sectors;
/*
* Register the new request and wait if the reconstruction
@@ -759,11 +848,24 @@ static int make_request(mddev_t *mddev, struct bio * bio)
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector;
+ /* We might need to issue multiple reads to different
+ * devices if there are bad blocks around, so we keep
+ * track of the number of reads in bio->bi_phys_segments.
+ * If this is 0, there is only one r1_bio and no locking
+ * will be needed when requests complete. If it is
+ * non-zero, then it is the number of not-completed requests.
+ */
+ bio->bi_phys_segments = 0;
+ clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
if (rw == READ) {
/*
* read balancing logic:
*/
- int rdisk = read_balance(conf, r1_bio);
+ int rdisk;
+
+read_again:
+ rdisk = read_balance(conf, r1_bio, &max_sectors);
if (rdisk < 0) {
/* couldn't find anywhere to read from */
@@ -784,6 +886,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
r1_bio->read_disk = rdisk;
read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
+ max_sectors);
r1_bio->bios[rdisk] = read_bio;
@@ -793,16 +897,52 @@ static int make_request(mddev_t *mddev, struct bio * bio)
read_bio->bi_rw = READ | do_sync;
read_bio->bi_private = r1_bio;
- generic_make_request(read_bio);
+ if (max_sectors < r1_bio->sectors) {
+ /* could not read all from this device, so we will
+ * need another r1_bio.
+ */
+
+ sectors_handled = (r1_bio->sector + max_sectors
+ - bio->bi_sector);
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ /* Cannot call generic_make_request directly
+ * as that will be queued in __make_request
+ * and subsequent mempool_alloc might block waiting
+ * for it. So hand bio over to raid1d.
+ */
+ reschedule_retry(r1_bio);
+
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_sector + sectors_handled;
+ goto read_again;
+ } else
+ generic_make_request(read_bio);
return 0;
}
/*
* WRITE:
*/
- /* first select target devices under spinlock and
+ /* first select target devices under rcu_lock and
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
+ * If there are known/acknowledged bad blocks on any device on
+ * which we have seen a write error, we want to avoid writing those
+ * blocks.
+ * This potentially requires several writes to write around
+ * the bad blocks. Each set of writes gets it's own r1bio
+ * with a set of bios attached.
*/
plugged = mddev_check_plugged(mddev);
@@ -810,6 +950,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
retry_write:
blocked_rdev = NULL;
rcu_read_lock();
+ max_sectors = r1_bio->sectors;
for (i = 0; i < disks; i++) {
mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -817,17 +958,56 @@ static int make_request(mddev_t *mddev, struct bio * bio)
blocked_rdev = rdev;
break;
}
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- if (test_bit(Faulty, &rdev->flags)) {
+ r1_bio->bios[i] = NULL;
+ if (!rdev || test_bit(Faulty, &rdev->flags)) {
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ continue;
+ }
+
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(WriteErrorSeen, &rdev->flags)) {
+ sector_t first_bad;
+ int bad_sectors;
+ int is_bad;
+
+ is_bad = is_badblock(rdev, r1_bio->sector,
+ max_sectors,
+ &first_bad, &bad_sectors);
+ if (is_bad < 0) {
+ /* mustn't write here until the bad block is
+ * acknowledged*/
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (is_bad && first_bad <= r1_bio->sector) {
+ /* Cannot write here at all */
+ bad_sectors -= (r1_bio->sector - first_bad);
+ if (bad_sectors < max_sectors)
+ /* mustn't write more than bad_sectors
+ * to other devices yet
+ */
+ max_sectors = bad_sectors;
rdev_dec_pending(rdev, mddev);
- r1_bio->bios[i] = NULL;
- } else {
- r1_bio->bios[i] = bio;
- targets++;
+ /* We don't set R1BIO_Degraded as that
+ * only applies if the disk is
+ * missing, so it might be re-added,
+ * and we want to know to recover this
+ * chunk.
+ * In this case the device is here,
+ * and the fact that this chunk is not
+ * in-sync is recorded in the bad
+ * block log
+ */
+ continue;
}
- } else
- r1_bio->bios[i] = NULL;
+ if (is_bad) {
+ int good_sectors = first_bad - r1_bio->sector;
+ if (good_sectors < max_sectors)
+ max_sectors = good_sectors;
+ }
+ }
+ r1_bio->bios[i] = bio;
}
rcu_read_unlock();
@@ -838,51 +1018,57 @@ static int make_request(mddev_t *mddev, struct bio * bio)
for (j = 0; j < i; j++)
if (r1_bio->bios[j])
rdev_dec_pending(conf->mirrors[j].rdev, mddev);
-
+ r1_bio->state = 0;
allow_barrier(conf);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf);
goto retry_write;
}
- BUG_ON(targets == 0); /* we never fail the last device */
-
- if (targets < conf->raid_disks) {
- /* array is degraded, we will not clear the bitmap
- * on I/O completion (see raid1_end_write_request) */
- set_bit(R1BIO_Degraded, &r1_bio->state);
+ if (max_sectors < r1_bio->sectors) {
+ /* We are splitting this write into multiple parts, so
+ * we need to prepare for allocating another r1_bio.
+ */
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
}
-
- /* do behind I/O ?
- * Not if there are too many, or cannot allocate memory,
- * or a reader on WriteMostly is waiting for behind writes
- * to flush */
- if (bitmap &&
- (atomic_read(&bitmap->behind_writes)
- < mddev->bitmap_info.max_write_behind) &&
- !waitqueue_active(&bitmap->behind_wait))
- alloc_behind_pages(bio, r1_bio);
+ sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
- bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ first_clone = 1;
for (i = 0; i < disks; i++) {
struct bio *mbio;
if (!r1_bio->bios[i])
continue;
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- r1_bio->bios[i] = mbio;
-
- mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
- mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
- mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_rw = WRITE | do_flush_fua | do_sync;
- mbio->bi_private = r1_bio;
-
- if (r1_bio->behind_pages) {
+ md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+
+ if (first_clone) {
+ /* do behind I/O ?
+ * Not if there are too many, or cannot
+ * allocate memory, or a reader on WriteMostly
+ * is waiting for behind writes to flush */
+ if (bitmap &&
+ (atomic_read(&bitmap->behind_writes)
+ < mddev->bitmap_info.max_write_behind) &&
+ !waitqueue_active(&bitmap->behind_wait))
+ alloc_behind_pages(mbio, r1_bio);
+
+ bitmap_startwrite(bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ test_bit(R1BIO_BehindIO,
+ &r1_bio->state));
+ first_clone = 0;
+ }
+ if (r1_bio->behind_bvecs) {
struct bio_vec *bvec;
int j;
@@ -894,11 +1080,20 @@ static int make_request(mddev_t *mddev, struct bio * bio)
* them all
*/
__bio_for_each_segment(bvec, mbio, j, 0)
- bvec->bv_page = r1_bio->behind_pages[j];
+ bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
+ r1_bio->bios[i] = mbio;
+
+ mbio->bi_sector = (r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset);
+ mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ mbio->bi_end_io = raid1_end_write_request;
+ mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+ mbio->bi_private = r1_bio;
+
atomic_inc(&r1_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
@@ -909,6 +1104,19 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/* In case raid1d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
+ if (sectors_handled < (bio->bi_size >> 9)) {
+ /* We need another r1_bio. It has already been counted
+ * in bio->bi_phys_segments
+ */
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_sector + sectors_handled;
+ goto retry_write;
+ }
+
if (do_sync || !bitmap || !plugged)
md_wakeup_thread(mddev->thread);
@@ -952,9 +1160,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* However don't try a recovery from this drive as
* it is very likely to fail.
*/
- mddev->recovery_disabled = 1;
+ conf->recovery_disabled = mddev->recovery_disabled;
return;
}
+ set_bit(Blocked, &rdev->flags);
if (test_and_clear_bit(In_sync, &rdev->flags)) {
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
@@ -1027,7 +1236,7 @@ static int raid1_spare_active(mddev_t *mddev)
&& !test_bit(Faulty, &rdev->flags)
&& !test_and_set_bit(In_sync, &rdev->flags)) {
count++;
- sysfs_notify_dirent(rdev->sysfs_state);
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
@@ -1048,6 +1257,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
int first = 0;
int last = mddev->raid_disks - 1;
+ if (mddev->recovery_disabled == conf->recovery_disabled)
+ return -EBUSY;
+
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
@@ -1103,7 +1315,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
* is not possible.
*/
if (!test_bit(Faulty, &rdev->flags) &&
- !mddev->recovery_disabled &&
+ mddev->recovery_disabled != conf->recovery_disabled &&
mddev->degraded < conf->raid_disks) {
err = -EBUSY;
goto abort;
@@ -1155,6 +1367,8 @@ static void end_sync_write(struct bio *bio, int error)
conf_t *conf = mddev->private;
int i;
int mirror=0;
+ sector_t first_bad;
+ int bad_sectors;
for (i = 0; i < conf->raid_disks; i++)
if (r1_bio->bios[i] == bio) {
@@ -1172,18 +1386,48 @@ static void end_sync_write(struct bio *bio, int error)
s += sync_blocks;
sectors_to_go -= sync_blocks;
} while (sectors_to_go > 0);
- md_error(mddev, conf->mirrors[mirror].rdev);
- }
+ set_bit(WriteErrorSeen,
+ &conf->mirrors[mirror].rdev->flags);
+ set_bit(R1BIO_WriteError, &r1_bio->state);
+ } else if (is_badblock(conf->mirrors[mirror].rdev,
+ r1_bio->sector,
+ r1_bio->sectors,
+ &first_bad, &bad_sectors) &&
+ !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
+ r1_bio->sector,
+ r1_bio->sectors,
+ &first_bad, &bad_sectors)
+ )
+ set_bit(R1BIO_MadeGood, &r1_bio->state);
update_head_pos(mirror, r1_bio);
if (atomic_dec_and_test(&r1_bio->remaining)) {
- sector_t s = r1_bio->sectors;
- put_buf(r1_bio);
- md_done_sync(mddev, s, uptodate);
+ int s = r1_bio->sectors;
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, uptodate);
+ }
}
}
+static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
+ int sectors, struct page *page, int rw)
+{
+ if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
+ /* success */
+ return 1;
+ if (rw == WRITE)
+ set_bit(WriteErrorSeen, &rdev->flags);
+ /* need to record an error - either for the block or the device */
+ if (!rdev_set_badblocks(rdev, sector, sectors, 0))
+ md_error(rdev->mddev, rdev);
+ return 0;
+}
+
static int fix_sync_read_error(r1bio_t *r1_bio)
{
/* Try some synchronous reads of other devices to get
@@ -1193,6 +1437,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
* We don't need to freeze the array, because being in an
* active sync request, there is no normal IO, and
* no overlapping syncs.
+ * We don't need to check is_badblock() again as we
+ * made sure that anything with a bad block in range
+ * will have bi_end_io clear.
*/
mddev_t *mddev = r1_bio->mddev;
conf_t *conf = mddev->private;
@@ -1217,9 +1464,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
* active, and resync is currently active
*/
rdev = conf->mirrors[d].rdev;
- if (sync_page_io(rdev,
- sect,
- s<<9,
+ if (sync_page_io(rdev, sect, s<<9,
bio->bi_io_vec[idx].bv_page,
READ, false)) {
success = 1;
@@ -1233,16 +1478,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
if (!success) {
char b[BDEVNAME_SIZE];
- /* Cannot read from anywhere, array is toast */
- md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+ int abort = 0;
+ /* Cannot read from anywhere, this block is lost.
+ * Record a bad block on each device. If that doesn't
+ * work just disable and interrupt the recovery.
+ * Don't fail devices as that won't really help.
+ */
printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
" for block %llu\n",
mdname(mddev),
bdevname(bio->bi_bdev, b),
(unsigned long long)r1_bio->sector);
- md_done_sync(mddev, r1_bio->sectors, 0);
- put_buf(r1_bio);
- return 0;
+ for (d = 0; d < conf->raid_disks; d++) {
+ rdev = conf->mirrors[d].rdev;
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ continue;
+ if (!rdev_set_badblocks(rdev, sect, s, 0))
+ abort = 1;
+ }
+ if (abort) {
+ mddev->recovery_disabled = 1;
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_done_sync(mddev, r1_bio->sectors, 0);
+ put_buf(r1_bio);
+ return 0;
+ }
+ /* Try next page */
+ sectors -= s;
+ sect += s;
+ idx++;
+ continue;
}
start = d;
@@ -1254,16 +1519,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
rdev = conf->mirrors[d].rdev;
- if (sync_page_io(rdev,
- sect,
- s<<9,
- bio->bi_io_vec[idx].bv_page,
- WRITE, false) == 0) {
+ if (r1_sync_page_io(rdev, sect, s,
+ bio->bi_io_vec[idx].bv_page,
+ WRITE) == 0) {
r1_bio->bios[d]->bi_end_io = NULL;
rdev_dec_pending(rdev, mddev);
- md_error(mddev, rdev);
- } else
- atomic_add(s, &rdev->corrected_errors);
+ }
}
d = start;
while (d != r1_bio->read_disk) {
@@ -1273,12 +1534,10 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
rdev = conf->mirrors[d].rdev;
- if (sync_page_io(rdev,
- sect,
- s<<9,
- bio->bi_io_vec[idx].bv_page,
- READ, false) == 0)
- md_error(mddev, rdev);
+ if (r1_sync_page_io(rdev, sect, s,
+ bio->bi_io_vec[idx].bv_page,
+ READ) != 0)
+ atomic_add(s, &rdev->corrected_errors);
}
sectors -= s;
sect += s;
@@ -1420,7 +1679,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
*
* 1. Retries failed read operations on working mirrors.
* 2. Updates the raid superblock when problems encounter.
- * 3. Performs writes following reads for array syncronising.
+ * 3. Performs writes following reads for array synchronising.
*/
static void fix_read_error(conf_t *conf, int read_disk,
@@ -1443,9 +1702,14 @@ static void fix_read_error(conf_t *conf, int read_disk,
* which is the thread that might remove
* a device. If raid1d ever becomes multi-threaded....
*/
+ sector_t first_bad;
+ int bad_sectors;
+
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
+ is_badblock(rdev, sect, s,
+ &first_bad, &bad_sectors) == 0 &&
sync_page_io(rdev, sect, s<<9,
conf->tmppage, READ, false))
success = 1;
@@ -1457,8 +1721,10 @@ static void fix_read_error(conf_t *conf, int read_disk,
} while (!success && d != read_disk);
if (!success) {
- /* Cannot read from anywhere -- bye bye array */
- md_error(mddev, conf->mirrors[read_disk].rdev);
+ /* Cannot read from anywhere - mark it bad */
+ mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev;
+ if (!rdev_set_badblocks(rdev, sect, s, 0))
+ md_error(mddev, rdev);
break;
}
/* write it back and re-read */
@@ -1469,13 +1735,9 @@ static void fix_read_error(conf_t *conf, int read_disk,
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev, sect, s<<9,
- conf->tmppage, WRITE, false)
- == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- }
+ test_bit(In_sync, &rdev->flags))
+ r1_sync_page_io(rdev, sect, s,
+ conf->tmppage, WRITE);
}
d = start;
while (d != read_disk) {
@@ -1486,12 +1748,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev, sect, s<<9,
- conf->tmppage, READ, false)
- == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- else {
+ if (r1_sync_page_io(rdev, sect, s,
+ conf->tmppage, READ)) {
atomic_add(s, &rdev->corrected_errors);
printk(KERN_INFO
"md/raid1:%s: read error corrected "
@@ -1508,21 +1766,255 @@ static void fix_read_error(conf_t *conf, int read_disk,
}
}
+static void bi_complete(struct bio *bio, int error)
+{
+ complete((struct completion *)bio->bi_private);
+}
+
+static int submit_bio_wait(int rw, struct bio *bio)
+{
+ struct completion event;
+ rw |= REQ_SYNC;
+
+ init_completion(&event);
+ bio->bi_private = &event;
+ bio->bi_end_io = bi_complete;
+ submit_bio(rw, bio);
+ wait_for_completion(&event);
+
+ return test_bit(BIO_UPTODATE, &bio->bi_flags);
+}
+
+static int narrow_write_error(r1bio_t *r1_bio, int i)
+{
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+ mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+ int vcnt, idx;
+ struct bio_vec *vec;
+
+ /* bio has the data to be written to device 'i' where
+ * we just recently had a write error.
+ * We repeatedly clone the bio and trim down to one block,
+ * then try the write. Where the write fails we record
+ * a bad block.
+ * It is conceivable that the bio doesn't exactly align with
+ * blocks. We must handle this somehow.
+ *
+ * We currently own a reference on the rdev.
+ */
+
+ int block_sectors;
+ sector_t sector;
+ int sectors;
+ int sect_to_write = r1_bio->sectors;
+ int ok = 1;
+
+ if (rdev->badblocks.shift < 0)
+ return 0;
+
+ block_sectors = 1 << rdev->badblocks.shift;
+ sector = r1_bio->sector;
+ sectors = ((sector + block_sectors)
+ & ~(sector_t)(block_sectors - 1))
+ - sector;
+
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ vcnt = r1_bio->behind_page_count;
+ vec = r1_bio->behind_bvecs;
+ idx = 0;
+ while (vec[idx].bv_page == NULL)
+ idx++;
+ } else {
+ vcnt = r1_bio->master_bio->bi_vcnt;
+ vec = r1_bio->master_bio->bi_io_vec;
+ idx = r1_bio->master_bio->bi_idx;
+ }
+ while (sect_to_write) {
+ struct bio *wbio;
+ if (sectors > sect_to_write)
+ sectors = sect_to_write;
+ /* Write at 'sector' for 'sectors'*/
+
+ wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+ memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+ wbio->bi_sector = r1_bio->sector;
+ wbio->bi_rw = WRITE;
+ wbio->bi_vcnt = vcnt;
+ wbio->bi_size = r1_bio->sectors << 9;
+ wbio->bi_idx = idx;
+
+ md_trim_bio(wbio, sector - r1_bio->sector, sectors);
+ wbio->bi_sector += rdev->data_offset;
+ wbio->bi_bdev = rdev->bdev;
+ if (submit_bio_wait(WRITE, wbio) == 0)
+ /* failure! */
+ ok = rdev_set_badblocks(rdev, sector,
+ sectors, 0)
+ && ok;
+
+ bio_put(wbio);
+ sect_to_write -= sectors;
+ sector += sectors;
+ sectors = block_sectors;
+ }
+ return ok;
+}
+
+static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio)
+{
+ int m;
+ int s = r1_bio->sectors;
+ for (m = 0; m < conf->raid_disks ; m++) {
+ mdk_rdev_t *rdev = conf->mirrors[m].rdev;
+ struct bio *bio = r1_bio->bios[m];
+ if (bio->bi_end_io == NULL)
+ continue;
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ test_bit(R1BIO_MadeGood, &r1_bio->state)) {
+ rdev_clear_badblocks(rdev, r1_bio->sector, s);
+ }
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ test_bit(R1BIO_WriteError, &r1_bio->state)) {
+ if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
+ md_error(conf->mddev, rdev);
+ }
+ }
+ put_buf(r1_bio);
+ md_done_sync(conf->mddev, s, 1);
+}
+
+static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio)
+{
+ int m;
+ for (m = 0; m < conf->raid_disks ; m++)
+ if (r1_bio->bios[m] == IO_MADE_GOOD) {
+ mdk_rdev_t *rdev = conf->mirrors[m].rdev;
+ rdev_clear_badblocks(rdev,
+ r1_bio->sector,
+ r1_bio->sectors);
+ rdev_dec_pending(rdev, conf->mddev);
+ } else if (r1_bio->bios[m] != NULL) {
+ /* This drive got a write error. We need to
+ * narrow down and record precise write
+ * errors.
+ */
+ if (!narrow_write_error(r1_bio, m)) {
+ md_error(conf->mddev,
+ conf->mirrors[m].rdev);
+ /* an I/O failed, we can't clear the bitmap */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ }
+ rdev_dec_pending(conf->mirrors[m].rdev,
+ conf->mddev);
+ }
+ if (test_bit(R1BIO_WriteError, &r1_bio->state))
+ close_write(r1_bio);
+ raid_end_bio_io(r1_bio);
+}
+
+static void handle_read_error(conf_t *conf, r1bio_t *r1_bio)
+{
+ int disk;
+ int max_sectors;
+ mddev_t *mddev = conf->mddev;
+ struct bio *bio;
+ char b[BDEVNAME_SIZE];
+ mdk_rdev_t *rdev;
+
+ clear_bit(R1BIO_ReadError, &r1_bio->state);
+ /* we got a read error. Maybe the drive is bad. Maybe just
+ * the block and we can fix it.
+ * We freeze all other IO, and try reading the block from
+ * other devices. When we find one, we re-write
+ * and check it that fixes the read error.
+ * This is all done synchronously while the array is
+ * frozen
+ */
+ if (mddev->ro == 0) {
+ freeze_array(conf);
+ fix_read_error(conf, r1_bio->read_disk,
+ r1_bio->sector, r1_bio->sectors);
+ unfreeze_array(conf);
+ } else
+ md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+
+ bio = r1_bio->bios[r1_bio->read_disk];
+ bdevname(bio->bi_bdev, b);
+read_more:
+ disk = read_balance(conf, r1_bio, &max_sectors);
+ if (disk == -1) {
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
+ " read error for block %llu\n",
+ mdname(mddev), b, (unsigned long long)r1_bio->sector);
+ raid_end_bio_io(r1_bio);
+ } else {
+ const unsigned long do_sync
+ = r1_bio->master_bio->bi_rw & REQ_SYNC;
+ if (bio) {
+ r1_bio->bios[r1_bio->read_disk] =
+ mddev->ro ? IO_BLOCKED : NULL;
+ bio_put(bio);
+ }
+ r1_bio->read_disk = disk;
+ bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+ md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors);
+ r1_bio->bios[r1_bio->read_disk] = bio;
+ rdev = conf->mirrors[disk].rdev;
+ printk_ratelimited(KERN_ERR
+ "md/raid1:%s: redirecting sector %llu"
+ " to other mirror: %s\n",
+ mdname(mddev),
+ (unsigned long long)r1_bio->sector,
+ bdevname(rdev->bdev, b));
+ bio->bi_sector = r1_bio->sector + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_end_io = raid1_end_read_request;
+ bio->bi_rw = READ | do_sync;
+ bio->bi_private = r1_bio;
+ if (max_sectors < r1_bio->sectors) {
+ /* Drat - have to split this up more */
+ struct bio *mbio = r1_bio->master_bio;
+ int sectors_handled = (r1_bio->sector + max_sectors
+ - mbio->bi_sector);
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (mbio->bi_phys_segments == 0)
+ mbio->bi_phys_segments = 2;
+ else
+ mbio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ generic_make_request(bio);
+ bio = NULL;
+
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+ r1_bio->master_bio = mbio;
+ r1_bio->sectors = (mbio->bi_size >> 9)
+ - sectors_handled;
+ r1_bio->state = 0;
+ set_bit(R1BIO_ReadError, &r1_bio->state);
+ r1_bio->mddev = mddev;
+ r1_bio->sector = mbio->bi_sector + sectors_handled;
+
+ goto read_more;
+ } else
+ generic_make_request(bio);
+ }
+}
+
static void raid1d(mddev_t *mddev)
{
r1bio_t *r1_bio;
- struct bio *bio;
unsigned long flags;
conf_t *conf = mddev->private;
struct list_head *head = &conf->retry_list;
- mdk_rdev_t *rdev;
struct blk_plug plug;
md_check_recovery(mddev);
blk_start_plug(&plug);
for (;;) {
- char b[BDEVNAME_SIZE];
if (atomic_read(&mddev->plug_cnt) == 0)
flush_pending_writes(conf);
@@ -1539,62 +2031,26 @@ static void raid1d(mddev_t *mddev)
mddev = r1_bio->mddev;
conf = mddev->private;
- if (test_bit(R1BIO_IsSync, &r1_bio->state))
- sync_request_write(mddev, r1_bio);
- else {
- int disk;
-
- /* we got a read error. Maybe the drive is bad. Maybe just
- * the block and we can fix it.
- * We freeze all other IO, and try reading the block from
- * other devices. When we find one, we re-write
- * and check it that fixes the read error.
- * This is all done synchronously while the array is
- * frozen
+ if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ handle_sync_write_finished(conf, r1_bio);
+ else
+ sync_request_write(mddev, r1_bio);
+ } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ handle_write_finished(conf, r1_bio);
+ else if (test_bit(R1BIO_ReadError, &r1_bio->state))
+ handle_read_error(conf, r1_bio);
+ else
+ /* just a partial read to be scheduled from separate
+ * context
*/
- if (mddev->ro == 0) {
- freeze_array(conf);
- fix_read_error(conf, r1_bio->read_disk,
- r1_bio->sector,
- r1_bio->sectors);
- unfreeze_array(conf);
- } else
- md_error(mddev,
- conf->mirrors[r1_bio->read_disk].rdev);
-
- bio = r1_bio->bios[r1_bio->read_disk];
- if ((disk=read_balance(conf, r1_bio)) == -1) {
- printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
- " read error for block %llu\n",
- mdname(mddev),
- bdevname(bio->bi_bdev,b),
- (unsigned long long)r1_bio->sector);
- raid_end_bio_io(r1_bio);
- } else {
- const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
- r1_bio->bios[r1_bio->read_disk] =
- mddev->ro ? IO_BLOCKED : NULL;
- r1_bio->read_disk = disk;
- bio_put(bio);
- bio = bio_clone_mddev(r1_bio->master_bio,
- GFP_NOIO, mddev);
- r1_bio->bios[r1_bio->read_disk] = bio;
- rdev = conf->mirrors[disk].rdev;
- if (printk_ratelimit())
- printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
- " other mirror: %s\n",
- mdname(mddev),
- (unsigned long long)r1_bio->sector,
- bdevname(rdev->bdev,b));
- bio->bi_sector = r1_bio->sector + rdev->data_offset;
- bio->bi_bdev = rdev->bdev;
- bio->bi_end_io = raid1_end_read_request;
- bio->bi_rw = READ | do_sync;
- bio->bi_private = r1_bio;
- generic_make_request(bio);
- }
- }
+ generic_make_request(r1_bio->bios[r1_bio->read_disk]);
+
cond_resched();
+ if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+ md_check_recovery(mddev);
}
blk_finish_plug(&plug);
}
@@ -1636,6 +2092,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
int write_targets = 0, read_targets = 0;
sector_t sync_blocks;
int still_degraded = 0;
+ int good_sectors = RESYNC_SECTORS;
+ int min_bad = 0; /* number of sectors that are bad in all devices */
if (!conf->r1buf_pool)
if (init_resync(conf))
@@ -1723,36 +2181,89 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev == NULL ||
- test_bit(Faulty, &rdev->flags)) {
+ test_bit(Faulty, &rdev->flags)) {
still_degraded = 1;
- continue;
} else if (!test_bit(In_sync, &rdev->flags)) {
bio->bi_rw = WRITE;
bio->bi_end_io = end_sync_write;
write_targets ++;
} else {
/* may need to read from here */
- bio->bi_rw = READ;
- bio->bi_end_io = end_sync_read;
- if (test_bit(WriteMostly, &rdev->flags)) {
- if (wonly < 0)
- wonly = i;
- } else {
- if (disk < 0)
- disk = i;
+ sector_t first_bad = MaxSector;
+ int bad_sectors;
+
+ if (is_badblock(rdev, sector_nr, good_sectors,
+ &first_bad, &bad_sectors)) {
+ if (first_bad > sector_nr)
+ good_sectors = first_bad - sector_nr;
+ else {
+ bad_sectors -= (sector_nr - first_bad);
+ if (min_bad == 0 ||
+ min_bad > bad_sectors)
+ min_bad = bad_sectors;
+ }
+ }
+ if (sector_nr < first_bad) {
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ if (wonly < 0)
+ wonly = i;
+ } else {
+ if (disk < 0)
+ disk = i;
+ }
+ bio->bi_rw = READ;
+ bio->bi_end_io = end_sync_read;
+ read_targets++;
}
- read_targets++;
}
- atomic_inc(&rdev->nr_pending);
- bio->bi_sector = sector_nr + rdev->data_offset;
- bio->bi_bdev = rdev->bdev;
- bio->bi_private = r1_bio;
+ if (bio->bi_end_io) {
+ atomic_inc(&rdev->nr_pending);
+ bio->bi_sector = sector_nr + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_private = r1_bio;
+ }
}
rcu_read_unlock();
if (disk < 0)
disk = wonly;
r1_bio->read_disk = disk;
+ if (read_targets == 0 && min_bad > 0) {
+ /* These sectors are bad on all InSync devices, so we
+ * need to mark them bad on all write targets
+ */
+ int ok = 1;
+ for (i = 0 ; i < conf->raid_disks ; i++)
+ if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
+ mdk_rdev_t *rdev =
+ rcu_dereference(conf->mirrors[i].rdev);
+ ok = rdev_set_badblocks(rdev, sector_nr,
+ min_bad, 0
+ ) && ok;
+ }
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ *skipped = 1;
+ put_buf(r1_bio);
+
+ if (!ok) {
+ /* Cannot record the badblocks, so need to
+ * abort the resync.
+ * If there are multiple read targets, could just
+ * fail the really bad ones ???
+ */
+ conf->recovery_disabled = mddev->recovery_disabled;
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ return 0;
+ } else
+ return min_bad;
+
+ }
+ if (min_bad > 0 && min_bad < good_sectors) {
+ /* only resync enough to reach the next bad->good
+ * transition */
+ good_sectors = min_bad;
+ }
+
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
/* extra read targets are also write targets */
write_targets += read_targets-1;
@@ -1769,6 +2280,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (max_sector > mddev->resync_max)
max_sector = mddev->resync_max; /* Don't do IO beyond here */
+ if (max_sector > sector_nr + good_sectors)
+ max_sector = sector_nr + good_sectors;
nr_sectors = 0;
sync_blocks = 0;
do {
@@ -2154,18 +2667,13 @@ static int raid1_reshape(mddev_t *mddev)
for (d = d2 = 0; d < conf->raid_disks; d++) {
mdk_rdev_t *rdev = conf->mirrors[d].rdev;
if (rdev && rdev->raid_disk != d2) {
- char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
+ sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = d2;
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
- if (sysfs_create_link(&mddev->kobj,
- &rdev->kobj, nm))
+ sysfs_unlink_rdev(mddev, rdev);
+ if (sysfs_link_rdev(mddev, rdev))
printk(KERN_WARNING
- "md/raid1:%s: cannot register "
- "%s\n",
- mdname(mddev), nm);
+ "md/raid1:%s: cannot register rd%d\n",
+ mdname(mddev), rdev->raid_disk);
}
if (rdev)
newmirrors[d2++].rdev = rdev;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e743a64fac4..e0d676b4897 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -48,6 +48,12 @@ struct r1_private_data_s {
* (fresh device added).
* Cleared when a sync completes.
*/
+ int recovery_disabled; /* when the same as
+ * mddev->recovery_disabled
+ * we don't allow recovery
+ * to be attempted as we
+ * expect a read error
+ */
wait_queue_head_t wait_barrier;
@@ -95,7 +101,7 @@ struct r1bio_s {
struct list_head retry_list;
/* Next two are only valid when R1BIO_BehindIO is set */
- struct page **behind_pages;
+ struct bio_vec *behind_bvecs;
int behind_page_count;
/*
* if the IO is in WRITE direction, then multiple bios are used.
@@ -110,13 +116,24 @@ struct r1bio_s {
* correct the read error. To keep track of bad blocks on a per-bio
* level, we store IO_BLOCKED in the appropriate 'bios' pointer
*/
-#define IO_BLOCKED ((struct bio*)1)
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting bios[n] to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* bits for r1bio.state */
#define R1BIO_Uptodate 0
#define R1BIO_IsSync 1
#define R1BIO_Degraded 2
#define R1BIO_BehindIO 3
+/* Set ReadError on bios that experience a readerror so that
+ * raid1d knows what to do with them.
+ */
+#define R1BIO_ReadError 4
/* For write-behind requests, we call bi_end_io when
* the last non-write-behind device completes, providing
* any write was successful. Otherwise we call when
@@ -125,6 +142,11 @@ struct r1bio_s {
* Record that bi_end_io was called with this flag...
*/
#define R1BIO_Returned 6
+/* If a write for this request means we can clear some
+ * known-bad-block records, we set this flag
+ */
+#define R1BIO_MadeGood 7
+#define R1BIO_WriteError 8
extern int md_raid1_congested(mddev_t *mddev, int bits);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e846688962..8b29cd4f01c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,6 +22,7 @@
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
+#include <linux/ratelimit.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
@@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
for (j = 0 ; j < nalloc; j++) {
bio = r10_bio->devs[j].bio;
for (i = 0; i < RESYNC_PAGES; i++) {
- page = alloc_page(gfp_flags);
+ if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
+ &conf->mddev->recovery)) {
+ /* we can share bv_page's during recovery */
+ struct bio *rbio = r10_bio->devs[0].bio;
+ page = rbio->bi_io_vec[i].bv_page;
+ get_page(page);
+ } else
+ page = alloc_page(gfp_flags);
if (unlikely(!page))
goto out_free_pages;
@@ -173,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
for (i = 0; i < conf->copies; i++) {
struct bio **bio = & r10_bio->devs[i].bio;
- if (*bio && *bio != IO_BLOCKED)
+ if (!BIO_SPECIAL(*bio))
bio_put(*bio);
*bio = NULL;
}
@@ -183,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
{
conf_t *conf = r10_bio->mddev->private;
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf);
-
put_all_bios(conf, r10_bio);
mempool_free(r10_bio, conf->r10bio_pool);
}
@@ -227,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
static void raid_end_bio_io(r10bio_t *r10_bio)
{
struct bio *bio = r10_bio->master_bio;
+ int done;
+ conf_t *conf = r10_bio->mddev->private;
- bio_endio(bio,
- test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
+ if (bio->bi_phys_segments) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio->bi_phys_segments--;
+ done = (bio->bi_phys_segments == 0);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ } else
+ done = 1;
+ if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (done) {
+ bio_endio(bio, 0);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf);
+ }
free_r10bio(r10_bio);
}
@@ -244,6 +264,26 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)
r10_bio->devs[slot].addr + (r10_bio->sectors);
}
+/*
+ * Find the disk number which triggered given bio
+ */
+static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio,
+ struct bio *bio, int *slotp)
+{
+ int slot;
+
+ for (slot = 0; slot < conf->copies; slot++)
+ if (r10_bio->devs[slot].bio == bio)
+ break;
+
+ BUG_ON(slot == conf->copies);
+ update_head_pos(slot, r10_bio);
+
+ if (slotp)
+ *slotp = slot;
+ return r10_bio->devs[slot].devnum;
+}
+
static void raid10_end_read_request(struct bio *bio, int error)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -277,34 +317,45 @@ static void raid10_end_read_request(struct bio *bio, int error)
* oops, read error - keep the refcount on the rdev
*/
char b[BDEVNAME_SIZE];
- if (printk_ratelimit())
- printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n",
- mdname(conf->mddev),
- bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
+ printk_ratelimited(KERN_ERR
+ "md/raid10:%s: %s: rescheduling sector %llu\n",
+ mdname(conf->mddev),
+ bdevname(conf->mirrors[dev].rdev->bdev, b),
+ (unsigned long long)r10_bio->sector);
+ set_bit(R10BIO_ReadError, &r10_bio->state);
reschedule_retry(r10_bio);
}
}
+static void close_write(r10bio_t *r10_bio)
+{
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+ r10_bio->sectors,
+ !test_bit(R10BIO_Degraded, &r10_bio->state),
+ 0);
+ md_write_end(r10_bio->mddev);
+}
+
static void raid10_end_write_request(struct bio *bio, int error)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
r10bio_t *r10_bio = bio->bi_private;
- int slot, dev;
+ int dev;
+ int dec_rdev = 1;
conf_t *conf = r10_bio->mddev->private;
+ int slot;
- for (slot = 0; slot < conf->copies; slot++)
- if (r10_bio->devs[slot].bio == bio)
- break;
- dev = r10_bio->devs[slot].devnum;
+ dev = find_bio_disk(conf, r10_bio, bio, &slot);
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
if (!uptodate) {
- md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
- /* an I/O failed, we can't clear the bitmap */
- set_bit(R10BIO_Degraded, &r10_bio->state);
- } else
+ set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
+ set_bit(R10BIO_WriteError, &r10_bio->state);
+ dec_rdev = 0;
+ } else {
/*
* Set R10BIO_Uptodate in our master bio, so that
* we will return a good error code for to the higher
@@ -314,9 +365,22 @@ static void raid10_end_write_request(struct bio *bio, int error)
* user-side. So if something waits for IO, then it will
* wait for the 'master' bio.
*/
+ sector_t first_bad;
+ int bad_sectors;
+
set_bit(R10BIO_Uptodate, &r10_bio->state);
- update_head_pos(slot, r10_bio);
+ /* Maybe we can clear some bad blocks. */
+ if (is_badblock(conf->mirrors[dev].rdev,
+ r10_bio->devs[slot].addr,
+ r10_bio->sectors,
+ &first_bad, &bad_sectors)) {
+ bio_put(bio);
+ r10_bio->devs[slot].bio = IO_MADE_GOOD;
+ dec_rdev = 0;
+ set_bit(R10BIO_MadeGood, &r10_bio->state);
+ }
+ }
/*
*
@@ -324,16 +388,18 @@ static void raid10_end_write_request(struct bio *bio, int error)
* already.
*/
if (atomic_dec_and_test(&r10_bio->remaining)) {
- /* clear the bitmap if all writes complete successfully */
- bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
- r10_bio->sectors,
- !test_bit(R10BIO_Degraded, &r10_bio->state),
- 0);
- md_write_end(r10_bio->mddev);
- raid_end_bio_io(r10_bio);
+ if (test_bit(R10BIO_WriteError, &r10_bio->state))
+ reschedule_retry(r10_bio);
+ else {
+ close_write(r10_bio);
+ if (test_bit(R10BIO_MadeGood, &r10_bio->state))
+ reschedule_retry(r10_bio);
+ else
+ raid_end_bio_io(r10_bio);
+ }
}
-
- rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+ if (dec_rdev)
+ rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}
@@ -484,11 +550,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
* FIXME: possibly should rethink readbalancing and do it differently
* depending on near_copies / far_copies geometry.
*/
-static int read_balance(conf_t *conf, r10bio_t *r10_bio)
+static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
{
const sector_t this_sector = r10_bio->sector;
int disk, slot;
- const int sectors = r10_bio->sectors;
+ int sectors = r10_bio->sectors;
+ int best_good_sectors;
sector_t new_distance, best_dist;
mdk_rdev_t *rdev;
int do_balance;
@@ -497,8 +564,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
raid10_find_phys(conf, r10_bio);
rcu_read_lock();
retry:
+ sectors = r10_bio->sectors;
best_slot = -1;
best_dist = MaxSector;
+ best_good_sectors = 0;
do_balance = 1;
/*
* Check if we can balance. We can balance on the whole
@@ -511,6 +580,10 @@ retry:
do_balance = 0;
for (slot = 0; slot < conf->copies ; slot++) {
+ sector_t first_bad;
+ int bad_sectors;
+ sector_t dev_sector;
+
if (r10_bio->devs[slot].bio == IO_BLOCKED)
continue;
disk = r10_bio->devs[slot].devnum;
@@ -520,6 +593,37 @@ retry:
if (!test_bit(In_sync, &rdev->flags))
continue;
+ dev_sector = r10_bio->devs[slot].addr;
+ if (is_badblock(rdev, dev_sector, sectors,
+ &first_bad, &bad_sectors)) {
+ if (best_dist < MaxSector)
+ /* Already have a better slot */
+ continue;
+ if (first_bad <= dev_sector) {
+ /* Cannot read here. If this is the
+ * 'primary' device, then we must not read
+ * beyond 'bad_sectors' from another device.
+ */
+ bad_sectors -= (dev_sector - first_bad);
+ if (!do_balance && sectors > bad_sectors)
+ sectors = bad_sectors;
+ if (best_good_sectors > sectors)
+ best_good_sectors = sectors;
+ } else {
+ sector_t good_sectors =
+ first_bad - dev_sector;
+ if (good_sectors > best_good_sectors) {
+ best_good_sectors = good_sectors;
+ best_slot = slot;
+ }
+ if (!do_balance)
+ /* Must read from here */
+ break;
+ }
+ continue;
+ } else
+ best_good_sectors = sectors;
+
if (!do_balance)
break;
@@ -561,6 +665,7 @@ retry:
} else
disk = -1;
rcu_read_unlock();
+ *max_sectors = best_good_sectors;
return disk;
}
@@ -734,6 +839,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
unsigned long flags;
mdk_rdev_t *blocked_rdev;
int plugged;
+ int sectors_handled;
+ int max_sectors;
if (unlikely(bio->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bio);
@@ -808,12 +915,26 @@ static int make_request(mddev_t *mddev, struct bio * bio)
r10_bio->sector = bio->bi_sector;
r10_bio->state = 0;
+ /* We might need to issue multiple reads to different
+ * devices if there are bad blocks around, so we keep
+ * track of the number of reads in bio->bi_phys_segments.
+ * If this is 0, there is only one r10_bio and no locking
+ * will be needed when the request completes. If it is
+ * non-zero, then it is the number of not-completed requests.
+ */
+ bio->bi_phys_segments = 0;
+ clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
if (rw == READ) {
/*
* read balancing logic:
*/
- int disk = read_balance(conf, r10_bio);
- int slot = r10_bio->read_slot;
+ int disk;
+ int slot;
+
+read_again:
+ disk = read_balance(conf, r10_bio, &max_sectors);
+ slot = r10_bio->read_slot;
if (disk < 0) {
raid_end_bio_io(r10_bio);
return 0;
@@ -821,6 +942,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
mirror = conf->mirrors + disk;
read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
+ max_sectors);
r10_bio->devs[slot].bio = read_bio;
@@ -831,7 +954,37 @@ static int make_request(mddev_t *mddev, struct bio * bio)
read_bio->bi_rw = READ | do_sync;
read_bio->bi_private = r10_bio;
- generic_make_request(read_bio);
+ if (max_sectors < r10_bio->sectors) {
+ /* Could not read all from this device, so we will
+ * need another r10_bio.
+ */
+ sectors_handled = (r10_bio->sectors + max_sectors
+ - bio->bi_sector);
+ r10_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock(&conf->device_lock);
+ /* Cannot call generic_make_request directly
+ * as that will be queued in __generic_make_request
+ * and subsequent mempool_alloc might block
+ * waiting for it. so hand bio over to raid10d.
+ */
+ reschedule_retry(r10_bio);
+
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = ((bio->bi_size >> 9)
+ - sectors_handled);
+ r10_bio->state = 0;
+ r10_bio->mddev = mddev;
+ r10_bio->sector = bio->bi_sector + sectors_handled;
+ goto read_again;
+ } else
+ generic_make_request(read_bio);
return 0;
}
@@ -841,13 +994,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/* first select target devices under rcu_lock and
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
+ * If there are known/acknowledged bad blocks on any device
+ * on which we have seen a write error, we want to avoid
+ * writing to those blocks. This potentially requires several
+ * writes to write around the bad blocks. Each set of writes
+ * gets its own r10_bio with a set of bios attached. The number
+ * of r10_bios is recored in bio->bi_phys_segments just as with
+ * the read case.
*/
plugged = mddev_check_plugged(mddev);
raid10_find_phys(conf, r10_bio);
- retry_write:
+retry_write:
blocked_rdev = NULL;
rcu_read_lock();
+ max_sectors = r10_bio->sectors;
+
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -856,13 +1018,55 @@ static int make_request(mddev_t *mddev, struct bio * bio)
blocked_rdev = rdev;
break;
}
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- r10_bio->devs[i].bio = bio;
- } else {
- r10_bio->devs[i].bio = NULL;
+ r10_bio->devs[i].bio = NULL;
+ if (!rdev || test_bit(Faulty, &rdev->flags)) {
set_bit(R10BIO_Degraded, &r10_bio->state);
+ continue;
}
+ if (test_bit(WriteErrorSeen, &rdev->flags)) {
+ sector_t first_bad;
+ sector_t dev_sector = r10_bio->devs[i].addr;
+ int bad_sectors;
+ int is_bad;
+
+ is_bad = is_badblock(rdev, dev_sector,
+ max_sectors,
+ &first_bad, &bad_sectors);
+ if (is_bad < 0) {
+ /* Mustn't write here until the bad block
+ * is acknowledged
+ */
+ atomic_inc(&rdev->nr_pending);
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (is_bad && first_bad <= dev_sector) {
+ /* Cannot write here at all */
+ bad_sectors -= (dev_sector - first_bad);
+ if (bad_sectors < max_sectors)
+ /* Mustn't write more than bad_sectors
+ * to other devices yet
+ */
+ max_sectors = bad_sectors;
+ /* We don't set R10BIO_Degraded as that
+ * only applies if the disk is missing,
+ * so it might be re-added, and we want to
+ * know to recover this chunk.
+ * In this case the device is here, and the
+ * fact that this chunk is not in-sync is
+ * recorded in the bad block log.
+ */
+ continue;
+ }
+ if (is_bad) {
+ int good_sectors = first_bad - dev_sector;
+ if (good_sectors < max_sectors)
+ max_sectors = good_sectors;
+ }
+ }
+ r10_bio->devs[i].bio = bio;
+ atomic_inc(&rdev->nr_pending);
}
rcu_read_unlock();
@@ -882,8 +1086,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
goto retry_write;
}
+ if (max_sectors < r10_bio->sectors) {
+ /* We are splitting this into multiple parts, so
+ * we need to prepare for allocating another r10_bio.
+ */
+ r10_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ }
+ sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
+
atomic_set(&r10_bio->remaining, 1);
- bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
+ bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
for (i = 0; i < conf->copies; i++) {
struct bio *mbio;
@@ -892,10 +1110,12 @@ static int make_request(mddev_t *mddev, struct bio * bio)
continue;
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+ max_sectors);
r10_bio->devs[i].bio = mbio;
- mbio->bi_sector = r10_bio->devs[i].addr+
- conf->mirrors[d].rdev->data_offset;
+ mbio->bi_sector = (r10_bio->devs[i].addr+
+ conf->mirrors[d].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -920,6 +1140,21 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
+ if (sectors_handled < (bio->bi_size >> 9)) {
+ /* We need another r10_bio. It has already been counted
+ * in bio->bi_phys_segments.
+ */
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+
+ r10_bio->mddev = mddev;
+ r10_bio->sector = bio->bi_sector + sectors_handled;
+ r10_bio->state = 0;
+ goto retry_write;
+ }
+
if (do_sync || !mddev->bitmap || !plugged)
md_wakeup_thread(mddev->thread);
return 0;
@@ -949,6 +1184,30 @@ static void status(struct seq_file *seq, mddev_t *mddev)
seq_printf(seq, "]");
}
+/* check if there are enough drives for
+ * every block to appear on atleast one.
+ * Don't consider the device numbered 'ignore'
+ * as we might be about to remove it.
+ */
+static int enough(conf_t *conf, int ignore)
+{
+ int first = 0;
+
+ do {
+ int n = conf->copies;
+ int cnt = 0;
+ while (n--) {
+ if (conf->mirrors[first].rdev &&
+ first != ignore)
+ cnt++;
+ first = (first+1) % conf->raid_disks;
+ }
+ if (cnt == 0)
+ return 0;
+ } while (first != 0);
+ return 1;
+}
+
static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{
char b[BDEVNAME_SIZE];
@@ -961,13 +1220,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* else mark the drive as failed
*/
if (test_bit(In_sync, &rdev->flags)
- && conf->raid_disks-mddev->degraded == 1)
+ && !enough(conf, rdev->raid_disk))
/*
* Don't fail the drive, just return an IO error.
- * The test should really be more sophisticated than
- * "working_disks == 1", but it isn't critical, and
- * can wait until we do more sophisticated "is the drive
- * really dead" tests...
*/
return;
if (test_and_clear_bit(In_sync, &rdev->flags)) {
@@ -980,6 +1235,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
}
+ set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT
@@ -1022,27 +1278,6 @@ static void close_sync(conf_t *conf)
conf->r10buf_pool = NULL;
}
-/* check if there are enough drives for
- * every block to appear on atleast one
- */
-static int enough(conf_t *conf)
-{
- int first = 0;
-
- do {
- int n = conf->copies;
- int cnt = 0;
- while (n--) {
- if (conf->mirrors[first].rdev)
- cnt++;
- first = (first+1) % conf->raid_disks;
- }
- if (cnt == 0)
- return 0;
- } while (first != 0);
- return 1;
-}
-
static int raid10_spare_active(mddev_t *mddev)
{
int i;
@@ -1078,7 +1313,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
conf_t *conf = mddev->private;
int err = -EEXIST;
int mirror;
- mirror_info_t *p;
int first = 0;
int last = conf->raid_disks - 1;
@@ -1087,44 +1321,47 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
* very different from resync
*/
return -EBUSY;
- if (!enough(conf))
+ if (!enough(conf, -1))
return -EINVAL;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
- if (rdev->saved_raid_disk >= 0 &&
- rdev->saved_raid_disk >= first &&
+ if (rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk;
else
mirror = first;
- for ( ; mirror <= last ; mirror++)
- if ( !(p=conf->mirrors+mirror)->rdev) {
-
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must
- * never risk violating it, so limit
- * ->max_segments to one lying with a single
- * page, as a one page request is never in
- * violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
+ for ( ; mirror <= last ; mirror++) {
+ mirror_info_t *p = &conf->mirrors[mirror];
+ if (p->recovery_disabled == mddev->recovery_disabled)
+ continue;
+ if (!p->rdev)
+ continue;
- p->head_position = 0;
- rdev->raid_disk = mirror;
- err = 0;
- if (rdev->saved_raid_disk != mirror)
- conf->fullsync = 1;
- rcu_assign_pointer(p->rdev, rdev);
- break;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ /* as we don't honour merge_bvec_fn, we must
+ * never risk violating it, so limit
+ * ->max_segments to one lying with a single
+ * page, as a one page request is never in
+ * violation.
+ */
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
}
+ p->head_position = 0;
+ rdev->raid_disk = mirror;
+ err = 0;
+ if (rdev->saved_raid_disk != mirror)
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->rdev, rdev);
+ break;
+ }
+
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
@@ -1149,7 +1386,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
* is not possible.
*/
if (!test_bit(Faulty, &rdev->flags) &&
- enough(conf)) {
+ mddev->recovery_disabled != p->recovery_disabled &&
+ enough(conf, -1)) {
err = -EBUSY;
goto abort;
}
@@ -1174,24 +1412,18 @@ static void end_sync_read(struct bio *bio, int error)
{
r10bio_t *r10_bio = bio->bi_private;
conf_t *conf = r10_bio->mddev->private;
- int i,d;
+ int d;
- for (i=0; i<conf->copies; i++)
- if (r10_bio->devs[i].bio == bio)
- break;
- BUG_ON(i == conf->copies);
- update_head_pos(i, r10_bio);
- d = r10_bio->devs[i].devnum;
+ d = find_bio_disk(conf, r10_bio, bio, NULL);
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
set_bit(R10BIO_Uptodate, &r10_bio->state);
- else {
+ else
+ /* The write handler will notice the lack of
+ * R10BIO_Uptodate and record any errors etc
+ */
atomic_add(r10_bio->sectors,
&conf->mirrors[d].rdev->corrected_errors);
- if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
- md_error(r10_bio->mddev,
- conf->mirrors[d].rdev);
- }
/* for reconstruct, we always reschedule after a read.
* for resync, only after all reads
@@ -1206,40 +1438,60 @@ static void end_sync_read(struct bio *bio, int error)
}
}
-static void end_sync_write(struct bio *bio, int error)
+static void end_sync_request(r10bio_t *r10_bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- r10bio_t *r10_bio = bio->bi_private;
mddev_t *mddev = r10_bio->mddev;
- conf_t *conf = mddev->private;
- int i,d;
-
- for (i = 0; i < conf->copies; i++)
- if (r10_bio->devs[i].bio == bio)
- break;
- d = r10_bio->devs[i].devnum;
- if (!uptodate)
- md_error(mddev, conf->mirrors[d].rdev);
-
- update_head_pos(i, r10_bio);
-
- rdev_dec_pending(conf->mirrors[d].rdev, mddev);
while (atomic_dec_and_test(&r10_bio->remaining)) {
if (r10_bio->master_bio == NULL) {
/* the primary of several recovery bios */
sector_t s = r10_bio->sectors;
- put_buf(r10_bio);
+ if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
+ test_bit(R10BIO_WriteError, &r10_bio->state))
+ reschedule_retry(r10_bio);
+ else
+ put_buf(r10_bio);
md_done_sync(mddev, s, 1);
break;
} else {
r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
- put_buf(r10_bio);
+ if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
+ test_bit(R10BIO_WriteError, &r10_bio->state))
+ reschedule_retry(r10_bio);
+ else
+ put_buf(r10_bio);
r10_bio = r10_bio2;
}
}
}
+static void end_sync_write(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ r10bio_t *r10_bio = bio->bi_private;
+ mddev_t *mddev = r10_bio->mddev;
+ conf_t *conf = mddev->private;
+ int d;
+ sector_t first_bad;
+ int bad_sectors;
+ int slot;
+
+ d = find_bio_disk(conf, r10_bio, bio, &slot);
+
+ if (!uptodate) {
+ set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
+ set_bit(R10BIO_WriteError, &r10_bio->state);
+ } else if (is_badblock(conf->mirrors[d].rdev,
+ r10_bio->devs[slot].addr,
+ r10_bio->sectors,
+ &first_bad, &bad_sectors))
+ set_bit(R10BIO_MadeGood, &r10_bio->state);
+
+ rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+
+ end_sync_request(r10_bio);
+}
+
/*
* Note: sync and recover and handled very differently for raid10
* This code is for resync.
@@ -1299,11 +1551,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
if (j == vcnt)
continue;
mddev->resync_mismatches += r10_bio->sectors;
+ if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ /* Don't fix anything. */
+ continue;
}
- if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
- /* Don't fix anything. */
- continue;
- /* Ok, we need to write this bio
+ /* Ok, we need to write this bio, either to correct an
+ * inconsistency or to correct an unreadable block.
* First we need to fixup bv_offset, bv_len and
* bi_vecs, as the read request might have corrupted these
*/
@@ -1355,32 +1608,107 @@ done:
* The second for writing.
*
*/
+static void fix_recovery_read_error(r10bio_t *r10_bio)
+{
+ /* We got a read error during recovery.
+ * We repeat the read in smaller page-sized sections.
+ * If a read succeeds, write it to the new device or record
+ * a bad block if we cannot.
+ * If a read fails, record a bad block on both old and
+ * new devices.
+ */
+ mddev_t *mddev = r10_bio->mddev;
+ conf_t *conf = mddev->private;
+ struct bio *bio = r10_bio->devs[0].bio;
+ sector_t sect = 0;
+ int sectors = r10_bio->sectors;
+ int idx = 0;
+ int dr = r10_bio->devs[0].devnum;
+ int dw = r10_bio->devs[1].devnum;
+
+ while (sectors) {
+ int s = sectors;
+ mdk_rdev_t *rdev;
+ sector_t addr;
+ int ok;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ rdev = conf->mirrors[dr].rdev;
+ addr = r10_bio->devs[0].addr + sect,
+ ok = sync_page_io(rdev,
+ addr,
+ s << 9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false);
+ if (ok) {
+ rdev = conf->mirrors[dw].rdev;
+ addr = r10_bio->devs[1].addr + sect;
+ ok = sync_page_io(rdev,
+ addr,
+ s << 9,
+ bio->bi_io_vec[idx].bv_page,
+ WRITE, false);
+ if (!ok)
+ set_bit(WriteErrorSeen, &rdev->flags);
+ }
+ if (!ok) {
+ /* We don't worry if we cannot set a bad block -
+ * it really is bad so there is no loss in not
+ * recording it yet
+ */
+ rdev_set_badblocks(rdev, addr, s, 0);
+
+ if (rdev != conf->mirrors[dw].rdev) {
+ /* need bad block on destination too */
+ mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev;
+ addr = r10_bio->devs[1].addr + sect;
+ ok = rdev_set_badblocks(rdev2, addr, s, 0);
+ if (!ok) {
+ /* just abort the recovery */
+ printk(KERN_NOTICE
+ "md/raid10:%s: recovery aborted"
+ " due to read error\n",
+ mdname(mddev));
+
+ conf->mirrors[dw].recovery_disabled
+ = mddev->recovery_disabled;
+ set_bit(MD_RECOVERY_INTR,
+ &mddev->recovery);
+ break;
+ }
+ }
+ }
+
+ sectors -= s;
+ sect += s;
+ idx++;
+ }
+}
static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
{
conf_t *conf = mddev->private;
- int i, d;
- struct bio *bio, *wbio;
+ int d;
+ struct bio *wbio;
+ if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
+ fix_recovery_read_error(r10_bio);
+ end_sync_request(r10_bio);
+ return;
+ }
- /* move the pages across to the second bio
+ /*
+ * share the pages with the first bio
* and submit the write request
*/
- bio = r10_bio->devs[0].bio;
wbio = r10_bio->devs[1].bio;
- for (i=0; i < wbio->bi_vcnt; i++) {
- struct page *p = bio->bi_io_vec[i].bv_page;
- bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
- wbio->bi_io_vec[i].bv_page = p;
- }
d = r10_bio->devs[1].devnum;
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
- if (test_bit(R10BIO_Uptodate, &r10_bio->state))
- generic_make_request(wbio);
- else
- bio_endio(wbio, -EIO);
+ generic_make_request(wbio);
}
@@ -1421,6 +1749,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
}
+static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
+ int sectors, struct page *page, int rw)
+{
+ sector_t first_bad;
+ int bad_sectors;
+
+ if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
+ && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
+ return -1;
+ if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
+ /* success */
+ return 1;
+ if (rw == WRITE)
+ set_bit(WriteErrorSeen, &rdev->flags);
+ /* need to record an error - either for the block or the device */
+ if (!rdev_set_badblocks(rdev, sector, sectors, 0))
+ md_error(rdev->mddev, rdev);
+ return 0;
+}
+
/*
* This is a kernel thread which:
*
@@ -1476,10 +1824,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_lock();
do {
+ sector_t first_bad;
+ int bad_sectors;
+
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
+ test_bit(In_sync, &rdev->flags) &&
+ is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
+ &first_bad, &bad_sectors) == 0) {
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
success = sync_page_io(rdev,
@@ -1499,9 +1852,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_unlock();
if (!success) {
- /* Cannot read from anywhere -- bye bye array */
+ /* Cannot read from anywhere, just mark the block
+ * as bad on the first device to discourage future
+ * reads.
+ */
int dn = r10_bio->devs[r10_bio->read_slot].devnum;
- md_error(mddev, conf->mirrors[dn].rdev);
+ rdev = conf->mirrors[dn].rdev;
+
+ if (!rdev_set_badblocks(
+ rdev,
+ r10_bio->devs[r10_bio->read_slot].addr
+ + sect,
+ s, 0))
+ md_error(mddev, rdev);
break;
}
@@ -1516,80 +1879,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
sl--;
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- atomic_add(s, &rdev->corrected_errors);
- if (sync_page_io(rdev,
- r10_bio->devs[sl].addr +
- sect,
- s<<9, conf->tmppage, WRITE, false)
- == 0) {
- /* Well, this device is dead */
- printk(KERN_NOTICE
- "md/raid10:%s: read correction "
- "write failed"
- " (%d sectors at %llu on %s)\n",
- mdname(mddev), s,
- (unsigned long long)(
- sect + rdev->data_offset),
- bdevname(rdev->bdev, b));
- printk(KERN_NOTICE "md/raid10:%s: %s: failing "
- "drive\n",
- mdname(mddev),
- bdevname(rdev->bdev, b));
- md_error(mddev, rdev);
- }
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
+ if (!rdev ||
+ !test_bit(In_sync, &rdev->flags))
+ continue;
+
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ if (r10_sync_page_io(rdev,
+ r10_bio->devs[sl].addr +
+ sect,
+ s<<9, conf->tmppage, WRITE)
+ == 0) {
+ /* Well, this device is dead */
+ printk(KERN_NOTICE
+ "md/raid10:%s: read correction "
+ "write failed"
+ " (%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(
+ sect + rdev->data_offset),
+ bdevname(rdev->bdev, b));
+ printk(KERN_NOTICE "md/raid10:%s: %s: failing "
+ "drive\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b));
}
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
}
sl = start;
while (sl != r10_bio->read_slot) {
+ char b[BDEVNAME_SIZE];
if (sl==0)
sl = conf->copies;
sl--;
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- char b[BDEVNAME_SIZE];
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- if (sync_page_io(rdev,
- r10_bio->devs[sl].addr +
- sect,
- s<<9, conf->tmppage,
- READ, false) == 0) {
- /* Well, this device is dead */
- printk(KERN_NOTICE
- "md/raid10:%s: unable to read back "
- "corrected sectors"
- " (%d sectors at %llu on %s)\n",
- mdname(mddev), s,
- (unsigned long long)(
- sect + rdev->data_offset),
- bdevname(rdev->bdev, b));
- printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
- mdname(mddev),
- bdevname(rdev->bdev, b));
-
- md_error(mddev, rdev);
- } else {
- printk(KERN_INFO
- "md/raid10:%s: read error corrected"
- " (%d sectors at %llu on %s)\n",
- mdname(mddev), s,
- (unsigned long long)(
- sect + rdev->data_offset),
- bdevname(rdev->bdev, b));
- }
+ if (!rdev ||
+ !test_bit(In_sync, &rdev->flags))
+ continue;
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ switch (r10_sync_page_io(rdev,
+ r10_bio->devs[sl].addr +
+ sect,
+ s<<9, conf->tmppage,
+ READ)) {
+ case 0:
+ /* Well, this device is dead */
+ printk(KERN_NOTICE
+ "md/raid10:%s: unable to read back "
+ "corrected sectors"
+ " (%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(
+ sect + rdev->data_offset),
+ bdevname(rdev->bdev, b));
+ printk(KERN_NOTICE "md/raid10:%s: %s: failing "
+ "drive\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b));
+ break;
+ case 1:
+ printk(KERN_INFO
+ "md/raid10:%s: read error corrected"
+ " (%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(
+ sect + rdev->data_offset),
+ bdevname(rdev->bdev, b));
+ atomic_add(s, &rdev->corrected_errors);
}
+
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
}
rcu_read_unlock();
@@ -1598,21 +1963,254 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
}
}
+static void bi_complete(struct bio *bio, int error)
+{
+ complete((struct completion *)bio->bi_private);
+}
+
+static int submit_bio_wait(int rw, struct bio *bio)
+{
+ struct completion event;
+ rw |= REQ_SYNC;
+
+ init_completion(&event);
+ bio->bi_private = &event;
+ bio->bi_end_io = bi_complete;
+ submit_bio(rw, bio);
+ wait_for_completion(&event);
+
+ return test_bit(BIO_UPTODATE, &bio->bi_flags);
+}
+
+static int narrow_write_error(r10bio_t *r10_bio, int i)
+{
+ struct bio *bio = r10_bio->master_bio;
+ mddev_t *mddev = r10_bio->mddev;
+ conf_t *conf = mddev->private;
+ mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
+ /* bio has the data to be written to slot 'i' where
+ * we just recently had a write error.
+ * We repeatedly clone the bio and trim down to one block,
+ * then try the write. Where the write fails we record
+ * a bad block.
+ * It is conceivable that the bio doesn't exactly align with
+ * blocks. We must handle this.
+ *
+ * We currently own a reference to the rdev.
+ */
+
+ int block_sectors;
+ sector_t sector;
+ int sectors;
+ int sect_to_write = r10_bio->sectors;
+ int ok = 1;
+
+ if (rdev->badblocks.shift < 0)
+ return 0;
+
+ block_sectors = 1 << rdev->badblocks.shift;
+ sector = r10_bio->sector;
+ sectors = ((r10_bio->sector + block_sectors)
+ & ~(sector_t)(block_sectors - 1))
+ - sector;
+
+ while (sect_to_write) {
+ struct bio *wbio;
+ if (sectors > sect_to_write)
+ sectors = sect_to_write;
+ /* Write at 'sector' for 'sectors' */
+ wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ md_trim_bio(wbio, sector - bio->bi_sector, sectors);
+ wbio->bi_sector = (r10_bio->devs[i].addr+
+ rdev->data_offset+
+ (sector - r10_bio->sector));
+ wbio->bi_bdev = rdev->bdev;
+ if (submit_bio_wait(WRITE, wbio) == 0)
+ /* Failure! */
+ ok = rdev_set_badblocks(rdev, sector,
+ sectors, 0)
+ && ok;
+
+ bio_put(wbio);
+ sect_to_write -= sectors;
+ sector += sectors;
+ sectors = block_sectors;
+ }
+ return ok;
+}
+
+static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
+{
+ int slot = r10_bio->read_slot;
+ int mirror = r10_bio->devs[slot].devnum;
+ struct bio *bio;
+ conf_t *conf = mddev->private;
+ mdk_rdev_t *rdev;
+ char b[BDEVNAME_SIZE];
+ unsigned long do_sync;
+ int max_sectors;
+
+ /* we got a read error. Maybe the drive is bad. Maybe just
+ * the block and we can fix it.
+ * We freeze all other IO, and try reading the block from
+ * other devices. When we find one, we re-write
+ * and check it that fixes the read error.
+ * This is all done synchronously while the array is
+ * frozen.
+ */
+ if (mddev->ro == 0) {
+ freeze_array(conf);
+ fix_read_error(conf, mddev, r10_bio);
+ unfreeze_array(conf);
+ }
+ rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
+
+ bio = r10_bio->devs[slot].bio;
+ bdevname(bio->bi_bdev, b);
+ r10_bio->devs[slot].bio =
+ mddev->ro ? IO_BLOCKED : NULL;
+read_more:
+ mirror = read_balance(conf, r10_bio, &max_sectors);
+ if (mirror == -1) {
+ printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
+ " read error for block %llu\n",
+ mdname(mddev), b,
+ (unsigned long long)r10_bio->sector);
+ raid_end_bio_io(r10_bio);
+ bio_put(bio);
+ return;
+ }
+
+ do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
+ if (bio)
+ bio_put(bio);
+ slot = r10_bio->read_slot;
+ rdev = conf->mirrors[mirror].rdev;
+ printk_ratelimited(
+ KERN_ERR
+ "md/raid10:%s: %s: redirecting"
+ "sector %llu to another mirror\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ (unsigned long long)r10_bio->sector);
+ bio = bio_clone_mddev(r10_bio->master_bio,
+ GFP_NOIO, mddev);
+ md_trim_bio(bio,
+ r10_bio->sector - bio->bi_sector,
+ max_sectors);
+ r10_bio->devs[slot].bio = bio;
+ bio->bi_sector = r10_bio->devs[slot].addr
+ + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_rw = READ | do_sync;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = raid10_end_read_request;
+ if (max_sectors < r10_bio->sectors) {
+ /* Drat - have to split this up more */
+ struct bio *mbio = r10_bio->master_bio;
+ int sectors_handled =
+ r10_bio->sector + max_sectors
+ - mbio->bi_sector;
+ r10_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (mbio->bi_phys_segments == 0)
+ mbio->bi_phys_segments = 2;
+ else
+ mbio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ generic_make_request(bio);
+ bio = NULL;
+
+ r10_bio = mempool_alloc(conf->r10bio_pool,
+ GFP_NOIO);
+ r10_bio->master_bio = mbio;
+ r10_bio->sectors = (mbio->bi_size >> 9)
+ - sectors_handled;
+ r10_bio->state = 0;
+ set_bit(R10BIO_ReadError,
+ &r10_bio->state);
+ r10_bio->mddev = mddev;
+ r10_bio->sector = mbio->bi_sector
+ + sectors_handled;
+
+ goto read_more;
+ } else
+ generic_make_request(bio);
+}
+
+static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio)
+{
+ /* Some sort of write request has finished and it
+ * succeeded in writing where we thought there was a
+ * bad block. So forget the bad block.
+ * Or possibly if failed and we need to record
+ * a bad block.
+ */
+ int m;
+ mdk_rdev_t *rdev;
+
+ if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
+ test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+ for (m = 0; m < conf->copies; m++) {
+ int dev = r10_bio->devs[m].devnum;
+ rdev = conf->mirrors[dev].rdev;
+ if (r10_bio->devs[m].bio == NULL)
+ continue;
+ if (test_bit(BIO_UPTODATE,
+ &r10_bio->devs[m].bio->bi_flags)) {
+ rdev_clear_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors);
+ } else {
+ if (!rdev_set_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0))
+ md_error(conf->mddev, rdev);
+ }
+ }
+ put_buf(r10_bio);
+ } else {
+ for (m = 0; m < conf->copies; m++) {
+ int dev = r10_bio->devs[m].devnum;
+ struct bio *bio = r10_bio->devs[m].bio;
+ rdev = conf->mirrors[dev].rdev;
+ if (bio == IO_MADE_GOOD) {
+ rdev_clear_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors);
+ rdev_dec_pending(rdev, conf->mddev);
+ } else if (bio != NULL &&
+ !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ if (!narrow_write_error(r10_bio, m)) {
+ md_error(conf->mddev, rdev);
+ set_bit(R10BIO_Degraded,
+ &r10_bio->state);
+ }
+ rdev_dec_pending(rdev, conf->mddev);
+ }
+ }
+ if (test_bit(R10BIO_WriteError,
+ &r10_bio->state))
+ close_write(r10_bio);
+ raid_end_bio_io(r10_bio);
+ }
+}
+
static void raid10d(mddev_t *mddev)
{
r10bio_t *r10_bio;
- struct bio *bio;
unsigned long flags;
conf_t *conf = mddev->private;
struct list_head *head = &conf->retry_list;
- mdk_rdev_t *rdev;
struct blk_plug plug;
md_check_recovery(mddev);
blk_start_plug(&plug);
for (;;) {
- char b[BDEVNAME_SIZE];
flush_pending_writes(conf);
@@ -1628,64 +2226,26 @@ static void raid10d(mddev_t *mddev)
mddev = r10_bio->mddev;
conf = mddev->private;
- if (test_bit(R10BIO_IsSync, &r10_bio->state))
+ if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
+ test_bit(R10BIO_WriteError, &r10_bio->state))
+ handle_write_completed(conf, r10_bio);
+ else if (test_bit(R10BIO_IsSync, &r10_bio->state))
sync_request_write(mddev, r10_bio);
else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
recovery_request_write(mddev, r10_bio);
+ else if (test_bit(R10BIO_ReadError, &r10_bio->state))
+ handle_read_error(mddev, r10_bio);
else {
- int slot = r10_bio->read_slot;
- int mirror = r10_bio->devs[slot].devnum;
- /* we got a read error. Maybe the drive is bad. Maybe just
- * the block and we can fix it.
- * We freeze all other IO, and try reading the block from
- * other devices. When we find one, we re-write
- * and check it that fixes the read error.
- * This is all done synchronously while the array is
- * frozen.
+ /* just a partial read to be scheduled from a
+ * separate context
*/
- if (mddev->ro == 0) {
- freeze_array(conf);
- fix_read_error(conf, mddev, r10_bio);
- unfreeze_array(conf);
- }
- rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
-
- bio = r10_bio->devs[slot].bio;
- r10_bio->devs[slot].bio =
- mddev->ro ? IO_BLOCKED : NULL;
- mirror = read_balance(conf, r10_bio);
- if (mirror == -1) {
- printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
- " read error for block %llu\n",
- mdname(mddev),
- bdevname(bio->bi_bdev,b),
- (unsigned long long)r10_bio->sector);
- raid_end_bio_io(r10_bio);
- bio_put(bio);
- } else {
- const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
- bio_put(bio);
- slot = r10_bio->read_slot;
- rdev = conf->mirrors[mirror].rdev;
- if (printk_ratelimit())
- printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
- " another mirror\n",
- mdname(mddev),
- bdevname(rdev->bdev,b),
- (unsigned long long)r10_bio->sector);
- bio = bio_clone_mddev(r10_bio->master_bio,
- GFP_NOIO, mddev);
- r10_bio->devs[slot].bio = bio;
- bio->bi_sector = r10_bio->devs[slot].addr
- + rdev->data_offset;
- bio->bi_bdev = rdev->bdev;
- bio->bi_rw = READ | do_sync;
- bio->bi_private = r10_bio;
- bio->bi_end_io = raid10_end_read_request;
- generic_make_request(bio);
- }
+ int slot = r10_bio->read_slot;
+ generic_make_request(r10_bio->devs[slot].bio);
}
+
cond_resched();
+ if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+ md_check_recovery(mddev);
}
blk_finish_plug(&plug);
}
@@ -1746,7 +2306,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
int i;
int max_sync;
sector_t sync_blocks;
-
sector_t sectors_skipped = 0;
int chunks_skipped = 0;
@@ -1828,7 +2387,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* recovery... the complicated one */
- int j, k;
+ int j;
r10_bio = NULL;
for (i=0 ; i<conf->raid_disks; i++) {
@@ -1836,6 +2395,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
r10bio_t *rb2;
sector_t sect;
int must_sync;
+ int any_working;
if (conf->mirrors[i].rdev == NULL ||
test_bit(In_sync, &conf->mirrors[i].rdev->flags))
@@ -1887,19 +2447,42 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
must_sync = bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, still_degraded);
+ any_working = 0;
for (j=0; j<conf->copies;j++) {
+ int k;
int d = r10_bio->devs[j].devnum;
+ sector_t from_addr, to_addr;
+ mdk_rdev_t *rdev;
+ sector_t sector, first_bad;
+ int bad_sectors;
if (!conf->mirrors[d].rdev ||
!test_bit(In_sync, &conf->mirrors[d].rdev->flags))
continue;
/* This is where we read from */
+ any_working = 1;
+ rdev = conf->mirrors[d].rdev;
+ sector = r10_bio->devs[j].addr;
+
+ if (is_badblock(rdev, sector, max_sync,
+ &first_bad, &bad_sectors)) {
+ if (first_bad > sector)
+ max_sync = first_bad - sector;
+ else {
+ bad_sectors -= (sector
+ - first_bad);
+ if (max_sync > bad_sectors)
+ max_sync = bad_sectors;
+ continue;
+ }
+ }
bio = r10_bio->devs[0].bio;
bio->bi_next = biolist;
biolist = bio;
bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio->bi_rw = READ;
- bio->bi_sector = r10_bio->devs[j].addr +
+ from_addr = r10_bio->devs[j].addr;
+ bio->bi_sector = from_addr +
conf->mirrors[d].rdev->data_offset;
bio->bi_bdev = conf->mirrors[d].rdev->bdev;
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
@@ -1916,26 +2499,48 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio->bi_rw = WRITE;
- bio->bi_sector = r10_bio->devs[k].addr +
+ to_addr = r10_bio->devs[k].addr;
+ bio->bi_sector = to_addr +
conf->mirrors[i].rdev->data_offset;
bio->bi_bdev = conf->mirrors[i].rdev->bdev;
r10_bio->devs[0].devnum = d;
+ r10_bio->devs[0].addr = from_addr;
r10_bio->devs[1].devnum = i;
+ r10_bio->devs[1].addr = to_addr;
break;
}
if (j == conf->copies) {
- /* Cannot recover, so abort the recovery */
+ /* Cannot recover, so abort the recovery or
+ * record a bad block */
put_buf(r10_bio);
if (rb2)
atomic_dec(&rb2->remaining);
r10_bio = rb2;
- if (!test_and_set_bit(MD_RECOVERY_INTR,
- &mddev->recovery))
- printk(KERN_INFO "md/raid10:%s: insufficient "
- "working devices for recovery.\n",
- mdname(mddev));
+ if (any_working) {
+ /* problem is that there are bad blocks
+ * on other device(s)
+ */
+ int k;
+ for (k = 0; k < conf->copies; k++)
+ if (r10_bio->devs[k].devnum == i)
+ break;
+ if (!rdev_set_badblocks(
+ conf->mirrors[i].rdev,
+ r10_bio->devs[k].addr,
+ max_sync, 0))
+ any_working = 0;
+ }
+ if (!any_working) {
+ if (!test_and_set_bit(MD_RECOVERY_INTR,
+ &mddev->recovery))
+ printk(KERN_INFO "md/raid10:%s: insufficient "
+ "working devices for recovery.\n",
+ mdname(mddev));
+ conf->mirrors[i].recovery_disabled
+ = mddev->recovery_disabled;
+ }
break;
}
}
@@ -1979,12 +2584,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
for (i=0; i<conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
+ sector_t first_bad, sector;
+ int bad_sectors;
+
bio = r10_bio->devs[i].bio;
bio->bi_end_io = NULL;
clear_bit(BIO_UPTODATE, &bio->bi_flags);
if (conf->mirrors[d].rdev == NULL ||
test_bit(Faulty, &conf->mirrors[d].rdev->flags))
continue;
+ sector = r10_bio->devs[i].addr;
+ if (is_badblock(conf->mirrors[d].rdev,
+ sector, max_sync,
+ &first_bad, &bad_sectors)) {
+ if (first_bad > sector)
+ max_sync = first_bad - sector;
+ else {
+ bad_sectors -= (sector - first_bad);
+ if (max_sync > bad_sectors)
+ max_sync = max_sync;
+ continue;
+ }
+ }
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
bio->bi_next = biolist;
@@ -1992,7 +2613,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio->bi_rw = READ;
- bio->bi_sector = r10_bio->devs[i].addr +
+ bio->bi_sector = sector +
conf->mirrors[d].rdev->data_offset;
bio->bi_bdev = conf->mirrors[d].rdev->bdev;
count++;
@@ -2079,7 +2700,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
return sectors_skipped + nr_sectors;
giveup:
/* There is nowhere to write, so all non-sync
- * drives must be failed, so try the next chunk...
+ * drives must be failed or in resync, all drives
+ * have a bad block, so try the next chunk...
*/
if (sector_nr + max_sync < max_sector)
max_sector = sector_nr + max_sync;
@@ -2249,6 +2871,7 @@ static int run(mddev_t *mddev)
(conf->raid_disks / conf->near_copies));
list_for_each_entry(rdev, &mddev->disks, same_set) {
+
disk_idx = rdev->raid_disk;
if (disk_idx >= conf->raid_disks
|| disk_idx < 0)
@@ -2271,7 +2894,7 @@ static int run(mddev_t *mddev)
disk->head_position = 0;
}
/* need to check that every block has at least one working mirror */
- if (!enough(conf)) {
+ if (!enough(conf, -1)) {
printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
mdname(mddev));
goto out_free_conf;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 944b1104d3b..79cb52a0d4a 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t;
struct mirror_info {
mdk_rdev_t *rdev;
sector_t head_position;
+ int recovery_disabled; /* matches
+ * mddev->recovery_disabled
+ * when we shouldn't try
+ * recovering this device.
+ */
};
typedef struct r10bio_s r10bio_t;
@@ -113,10 +118,26 @@ struct r10bio_s {
* level, we store IO_BLOCKED in the appropriate 'bios' pointer
*/
#define IO_BLOCKED ((struct bio*)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* bits for r10bio.state */
#define R10BIO_Uptodate 0
#define R10BIO_IsSync 1
#define R10BIO_IsRecover 2
#define R10BIO_Degraded 3
+/* Set ReadError on bios that experience a read error
+ * so that raid10d knows what to do with them.
+ */
+#define R10BIO_ReadError 4
+/* If a write for this request means we can clear some
+ * known-bad-block records, we set this flag.
+ */
+#define R10BIO_MadeGood 5
+#define R10BIO_WriteError 6
#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b72edf35ec5..dbae459fb02 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -51,6 +51,7 @@
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
@@ -96,8 +97,6 @@
#define __inline__
#endif
-#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
-
/*
* We maintain a biased count of active stripes in the bottom 16 bits of
* bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -341,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
(unsigned long long)sh->sector, i, dev->toread,
dev->read, dev->towrite, dev->written,
test_bit(R5_LOCKED, &dev->flags));
- BUG();
+ WARN_ON(1);
}
dev->flags = 0;
raid5_build_block(sh, i, previous);
@@ -527,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
+ /* We have already checked bad blocks for reads. Now
+ * need to check for writes.
+ */
+ while ((rw & WRITE) && rdev &&
+ test_bit(WriteErrorSeen, &rdev->flags)) {
+ sector_t first_bad;
+ int bad_sectors;
+ int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ &first_bad, &bad_sectors);
+ if (!bad)
+ break;
+
+ if (bad < 0) {
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ if (!conf->mddev->external &&
+ conf->mddev->flags) {
+ /* It is very unlikely, but we might
+ * still need to write out the
+ * bad block log - better give it
+ * a chance*/
+ md_check_recovery(conf->mddev);
+ }
+ md_wait_for_blocked_rdev(rdev, conf->mddev);
+ } else {
+ /* Acknowledged bad block - skip the write */
+ rdev_dec_pending(rdev, conf->mddev);
+ rdev = NULL;
+ }
+ }
+
if (rdev) {
if (s->syncing || s->expanding || s->expanded)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
@@ -548,10 +577,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi->bi_io_vec[0].bv_offset = 0;
bi->bi_size = STRIPE_SIZE;
bi->bi_next = NULL;
- if ((rw & WRITE) &&
- test_bit(R5_ReWrite, &sh->dev[i].flags))
- atomic_add(STRIPE_SECTORS,
- &rdev->corrected_errors);
generic_make_request(bi);
} else {
if (rw & WRITE)
@@ -1020,12 +1045,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
struct bio *wbi;
- spin_lock(&sh->lock);
+ spin_lock_irq(&sh->raid_conf->device_lock);
chosen = dev->towrite;
dev->towrite = NULL;
BUG_ON(dev->written);
wbi = dev->written = chosen;
- spin_unlock(&sh->lock);
+ spin_unlock_irq(&sh->raid_conf->device_lock);
while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
@@ -1315,12 +1340,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
static int grow_one_stripe(raid5_conf_t *conf)
{
struct stripe_head *sh;
- sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
+ sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
if (!sh)
return 0;
- memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
+
sh->raid_conf = conf;
- spin_lock_init(&sh->lock);
#ifdef CONFIG_MULTICORE_RAID456
init_waitqueue_head(&sh->ops.wait_for_ops);
#endif
@@ -1435,14 +1459,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
return -ENOMEM;
for (i = conf->max_nr_stripes; i; i--) {
- nsh = kmem_cache_alloc(sc, GFP_KERNEL);
+ nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
if (!nsh)
break;
- memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
-
nsh->raid_conf = conf;
- spin_lock_init(&nsh->lock);
#ifdef CONFIG_MULTICORE_RAID456
init_waitqueue_head(&nsh->ops.wait_for_ops);
#endif
@@ -1587,12 +1608,15 @@ static void raid5_end_read_request(struct bio * bi, int error)
set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
rdev = conf->disks[i].rdev;
- printk_rl(KERN_INFO "md/raid:%s: read error corrected"
- " (%lu sectors at %llu on %s)\n",
- mdname(conf->mddev), STRIPE_SECTORS,
- (unsigned long long)(sh->sector
- + rdev->data_offset),
- bdevname(rdev->bdev, b));
+ printk_ratelimited(
+ KERN_INFO
+ "md/raid:%s: read error corrected"
+ " (%lu sectors at %llu on %s)\n",
+ mdname(conf->mddev), STRIPE_SECTORS,
+ (unsigned long long)(sh->sector
+ + rdev->data_offset),
+ bdevname(rdev->bdev, b));
+ atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
}
@@ -1606,22 +1630,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
atomic_inc(&rdev->read_errors);
if (conf->mddev->degraded >= conf->max_degraded)
- printk_rl(KERN_WARNING
- "md/raid:%s: read error not correctable "
- "(sector %llu on %s).\n",
- mdname(conf->mddev),
- (unsigned long long)(sh->sector
- + rdev->data_offset),
- bdn);
+ printk_ratelimited(
+ KERN_WARNING
+ "md/raid:%s: read error not correctable "
+ "(sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)(sh->sector
+ + rdev->data_offset),
+ bdn);
else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
/* Oh, no!!! */
- printk_rl(KERN_WARNING
- "md/raid:%s: read error NOT corrected!! "
- "(sector %llu on %s).\n",
- mdname(conf->mddev),
- (unsigned long long)(sh->sector
- + rdev->data_offset),
- bdn);
+ printk_ratelimited(
+ KERN_WARNING
+ "md/raid:%s: read error NOT corrected!! "
+ "(sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)(sh->sector
+ + rdev->data_offset),
+ bdn);
else if (atomic_read(&rdev->read_errors)
> conf->max_nr_stripes)
printk(KERN_WARNING
@@ -1649,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error)
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ sector_t first_bad;
+ int bad_sectors;
for (i=0 ; i<disks; i++)
if (bi == &sh->dev[i].req)
@@ -1662,8 +1690,12 @@ static void raid5_end_write_request(struct bio *bi, int error)
return;
}
- if (!uptodate)
- md_error(conf->mddev, conf->disks[i].rdev);
+ if (!uptodate) {
+ set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
+ set_bit(R5_WriteError, &sh->dev[i].flags);
+ } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
+ &first_bad, &bad_sectors))
+ set_bit(R5_MadeGood, &sh->dev[i].flags);
rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
@@ -1710,6 +1742,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
}
+ set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT
@@ -1760,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
/*
* Select the parity disk based on the user selected algorithm.
*/
- pd_idx = qd_idx = ~0;
+ pd_idx = qd_idx = -1;
switch(conf->level) {
case 4:
pd_idx = data_disks;
@@ -2143,12 +2176,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
raid5_conf_t *conf = sh->raid_conf;
int firstwrite=0;
- pr_debug("adding bh b#%llu to stripe s#%llu\n",
+ pr_debug("adding bi b#%llu to stripe s#%llu\n",
(unsigned long long)bi->bi_sector,
(unsigned long long)sh->sector);
- spin_lock(&sh->lock);
spin_lock_irq(&conf->device_lock);
if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
@@ -2169,19 +2201,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
bi->bi_next = *bip;
*bip = bi;
bi->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- spin_unlock(&sh->lock);
-
- pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
- (unsigned long long)bi->bi_sector,
- (unsigned long long)sh->sector, dd_idx);
-
- if (conf->mddev->bitmap && firstwrite) {
- bitmap_startwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS, 0);
- sh->bm_seq = conf->seq_flush+1;
- set_bit(STRIPE_BIT_DELAY, &sh->state);
- }
if (forwrite) {
/* check if page is covered */
@@ -2196,12 +2215,23 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
}
+ spin_unlock_irq(&conf->device_lock);
+
+ pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
+ (unsigned long long)(*bip)->bi_sector,
+ (unsigned long long)sh->sector, dd_idx);
+
+ if (conf->mddev->bitmap && firstwrite) {
+ bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0);
+ sh->bm_seq = conf->seq_flush+1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
return 1;
overlap:
set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
spin_unlock_irq(&conf->device_lock);
- spin_unlock(&sh->lock);
return 0;
}
@@ -2238,9 +2268,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev && test_bit(In_sync, &rdev->flags))
- /* multiple read failures in one stripe */
- md_error(conf->mddev, rdev);
+ atomic_inc(&rdev->nr_pending);
+ else
+ rdev = NULL;
rcu_read_unlock();
+ if (rdev) {
+ if (!rdev_set_badblocks(
+ rdev,
+ sh->sector,
+ STRIPE_SECTORS, 0))
+ md_error(conf->mddev, rdev);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
}
spin_lock_irq(&conf->device_lock);
/* fail all writes first */
@@ -2308,6 +2347,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
if (bitmap_end)
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0, 0);
+ /* If we were in the middle of a write the parity block might
+ * still be locked - so just clear all R5_LOCKED flags
+ */
+ clear_bit(R5_LOCKED, &sh->dev[i].flags);
}
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
@@ -2315,109 +2358,73 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
md_wakeup_thread(conf->mddev->thread);
}
-/* fetch_block5 - checks the given member device to see if its data needs
- * to be read or computed to satisfy a request.
- *
- * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill5 to continue
- */
-static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
- int disk_idx, int disks)
-{
- struct r5dev *dev = &sh->dev[disk_idx];
- struct r5dev *failed_dev = &sh->dev[s->failed_num];
-
- /* is the data in this block needed, and can we get it? */
- if (!test_bit(R5_LOCKED, &dev->flags) &&
- !test_bit(R5_UPTODATE, &dev->flags) &&
- (dev->toread ||
- (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
- s->syncing || s->expanding ||
- (s->failed &&
- (failed_dev->toread ||
- (failed_dev->towrite &&
- !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
- /* We would like to get this block, possibly by computing it,
- * otherwise read it if the backing disk is insync
- */
- if ((s->uptodate == disks - 1) &&
- (s->failed && disk_idx == s->failed_num)) {
- set_bit(STRIPE_COMPUTE_RUN, &sh->state);
- set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
- set_bit(R5_Wantcompute, &dev->flags);
- sh->ops.target = disk_idx;
- sh->ops.target2 = -1;
- s->req_compute = 1;
- /* Careful: from this point on 'uptodate' is in the eye
- * of raid_run_ops which services 'compute' operations
- * before writes. R5_Wantcompute flags a block that will
- * be R5_UPTODATE by the time it is needed for a
- * subsequent operation.
- */
- s->uptodate++;
- return 1; /* uptodate + compute == disks */
- } else if (test_bit(R5_Insync, &dev->flags)) {
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantread, &dev->flags);
- s->locked++;
- pr_debug("Reading block %d (sync=%d)\n", disk_idx,
- s->syncing);
- }
- }
-
- return 0;
-}
-
-/**
- * handle_stripe_fill5 - read or compute data to satisfy pending requests.
- */
-static void handle_stripe_fill5(struct stripe_head *sh,
- struct stripe_head_state *s, int disks)
+static void
+handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
+ struct stripe_head_state *s)
{
+ int abort = 0;
int i;
- /* look for blocks to read/compute, skip this if a compute
- * is already in flight, or if the stripe contents are in the
- * midst of changing due to a write
+ md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
+ clear_bit(STRIPE_SYNCING, &sh->state);
+ s->syncing = 0;
+ /* There is nothing more to do for sync/check/repair.
+ * For recover we need to record a bad block on all
+ * non-sync devices, or abort the recovery
*/
- if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
- !sh->reconstruct_state)
- for (i = disks; i--; )
- if (fetch_block5(sh, s, i, disks))
- break;
- set_bit(STRIPE_HANDLE, &sh->state);
+ if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
+ return;
+ /* During recovery devices cannot be removed, so locking and
+ * refcounting of rdevs is not needed
+ */
+ for (i = 0; i < conf->raid_disks; i++) {
+ mdk_rdev_t *rdev = conf->disks[i].rdev;
+ if (!rdev
+ || test_bit(Faulty, &rdev->flags)
+ || test_bit(In_sync, &rdev->flags))
+ continue;
+ if (!rdev_set_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0))
+ abort = 1;
+ }
+ if (abort) {
+ conf->recovery_disabled = conf->mddev->recovery_disabled;
+ set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
+ }
}
-/* fetch_block6 - checks the given member device to see if its data needs
+/* fetch_block - checks the given member device to see if its data needs
* to be read or computed to satisfy a request.
*
* Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill6 to continue
+ * 0 to tell the loop in handle_stripe_fill to continue
*/
-static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
- struct r6_state *r6s, int disk_idx, int disks)
+static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
{
struct r5dev *dev = &sh->dev[disk_idx];
- struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
- &sh->dev[r6s->failed_num[1]] };
+ struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
+ &sh->dev[s->failed_num[1]] };
+ /* is the data in this block needed, and can we get it? */
if (!test_bit(R5_LOCKED, &dev->flags) &&
!test_bit(R5_UPTODATE, &dev->flags) &&
(dev->toread ||
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
s->syncing || s->expanding ||
- (s->failed >= 1 &&
- (fdev[0]->toread || s->to_write)) ||
- (s->failed >= 2 &&
- (fdev[1]->toread || s->to_write)))) {
+ (s->failed >= 1 && fdev[0]->toread) ||
+ (s->failed >= 2 && fdev[1]->toread) ||
+ (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
+ !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
+ (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
*/
BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
BUG_ON(test_bit(R5_Wantread, &dev->flags));
if ((s->uptodate == disks - 1) &&
- (s->failed && (disk_idx == r6s->failed_num[0] ||
- disk_idx == r6s->failed_num[1]))) {
+ (s->failed && (disk_idx == s->failed_num[0] ||
+ disk_idx == s->failed_num[1]))) {
/* have disk failed, and we're requested to fetch it;
* do compute it
*/
@@ -2429,6 +2436,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
sh->ops.target = disk_idx;
sh->ops.target2 = -1; /* no 2nd target */
s->req_compute = 1;
+ /* Careful: from this point on 'uptodate' is in the eye
+ * of raid_run_ops which services 'compute' operations
+ * before writes. R5_Wantcompute flags a block that will
+ * be R5_UPTODATE by the time it is needed for a
+ * subsequent operation.
+ */
s->uptodate++;
return 1;
} else if (s->uptodate == disks-2 && s->failed >= 2) {
@@ -2469,11 +2482,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
}
/**
- * handle_stripe_fill6 - read or compute data to satisfy pending requests.
+ * handle_stripe_fill - read or compute data to satisfy pending requests.
*/
-static void handle_stripe_fill6(struct stripe_head *sh,
- struct stripe_head_state *s, struct r6_state *r6s,
- int disks)
+static void handle_stripe_fill(struct stripe_head *sh,
+ struct stripe_head_state *s,
+ int disks)
{
int i;
@@ -2484,7 +2497,7 @@ static void handle_stripe_fill6(struct stripe_head *sh,
if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
!sh->reconstruct_state)
for (i = disks; i--; )
- if (fetch_block6(sh, s, r6s, i, disks))
+ if (fetch_block(sh, s, i, disks))
break;
set_bit(STRIPE_HANDLE, &sh->state);
}
@@ -2540,11 +2553,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
md_wakeup_thread(conf->mddev->thread);
}
-static void handle_stripe_dirtying5(raid5_conf_t *conf,
- struct stripe_head *sh, struct stripe_head_state *s, int disks)
+static void handle_stripe_dirtying(raid5_conf_t *conf,
+ struct stripe_head *sh,
+ struct stripe_head_state *s,
+ int disks)
{
int rmw = 0, rcw = 0, i;
- for (i = disks; i--; ) {
+ if (conf->max_degraded == 2) {
+ /* RAID6 requires 'rcw' in current implementation
+ * Calculate the real rcw later - for now fake it
+ * look like rcw is cheaper
+ */
+ rcw = 1; rmw = 2;
+ } else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
@@ -2591,16 +2612,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
}
}
}
- if (rcw <= rmw && rcw > 0)
+ if (rcw <= rmw && rcw > 0) {
/* want reconstruct write, but need to get some data */
+ rcw = 0;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (!test_bit(R5_OVERWRITE, &dev->flags) &&
- i != sh->pd_idx &&
+ i != sh->pd_idx && i != sh->qd_idx &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_Wantcompute, &dev->flags)) &&
- test_bit(R5_Insync, &dev->flags)) {
+ test_bit(R5_Wantcompute, &dev->flags))) {
+ rcw++;
+ if (!test_bit(R5_Insync, &dev->flags))
+ continue; /* it's a failed drive */
if (
test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
pr_debug("Read_old block "
@@ -2614,6 +2638,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
}
}
}
+ }
/* now if nothing is locked, and if we have enough data,
* we can start a write request
*/
@@ -2630,53 +2655,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
schedule_reconstruction(sh, s, rcw == 0, 0);
}
-static void handle_stripe_dirtying6(raid5_conf_t *conf,
- struct stripe_head *sh, struct stripe_head_state *s,
- struct r6_state *r6s, int disks)
-{
- int rcw = 0, pd_idx = sh->pd_idx, i;
- int qd_idx = sh->qd_idx;
-
- set_bit(STRIPE_HANDLE, &sh->state);
- for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- /* check if we haven't enough data */
- if (!test_bit(R5_OVERWRITE, &dev->flags) &&
- i != pd_idx && i != qd_idx &&
- !test_bit(R5_LOCKED, &dev->flags) &&
- !(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_Wantcompute, &dev->flags))) {
- rcw++;
- if (!test_bit(R5_Insync, &dev->flags))
- continue; /* it's a failed drive */
-
- if (
- test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- pr_debug("Read_old stripe %llu "
- "block %d for Reconstruct\n",
- (unsigned long long)sh->sector, i);
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantread, &dev->flags);
- s->locked++;
- } else {
- pr_debug("Request delayed stripe %llu "
- "block %d for Reconstruct\n",
- (unsigned long long)sh->sector, i);
- set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
- }
- }
- /* now if nothing is locked, and if we have enough data, we can start a
- * write request
- */
- if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
- s->locked == 0 && rcw == 0 &&
- !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
- schedule_reconstruction(sh, s, 1, 0);
- }
-}
-
static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks)
{
@@ -2695,7 +2673,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
s->uptodate--;
break;
}
- dev = &sh->dev[s->failed_num];
+ dev = &sh->dev[s->failed_num[0]];
/* fall through */
case check_state_compute_result:
sh->check_state = check_state_idle;
@@ -2767,7 +2745,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
struct stripe_head_state *s,
- struct r6_state *r6s, int disks)
+ int disks)
{
int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx;
@@ -2786,14 +2764,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
switch (sh->check_state) {
case check_state_idle:
/* start a new check operation if there are < 2 failures */
- if (s->failed == r6s->q_failed) {
+ if (s->failed == s->q_failed) {
/* The only possible failed device holds Q, so it
* makes sense to check P (If anything else were failed,
* we would have used P to recreate it).
*/
sh->check_state = check_state_run;
}
- if (!r6s->q_failed && s->failed < 2) {
+ if (!s->q_failed && s->failed < 2) {
/* Q is not failed, and we didn't use it to generate
* anything, so it makes sense to check it
*/
@@ -2835,13 +2813,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
*/
BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
if (s->failed == 2) {
- dev = &sh->dev[r6s->failed_num[1]];
+ dev = &sh->dev[s->failed_num[1]];
s->locked++;
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
}
if (s->failed >= 1) {
- dev = &sh->dev[r6s->failed_num[0]];
+ dev = &sh->dev[s->failed_num[0]];
s->locked++;
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
@@ -2928,8 +2906,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
}
}
-static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
- struct r6_state *r6s)
+static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
{
int i;
@@ -2971,7 +2948,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
for (j = 0; j < conf->raid_disks; j++)
if (j != sh2->pd_idx &&
- (!r6s || j != sh2->qd_idx) &&
+ j != sh2->qd_idx &&
!test_bit(R5_Expanded, &sh2->dev[j].flags))
break;
if (j == conf->raid_disks) {
@@ -3006,43 +2983,35 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
*
*/
-static void handle_stripe5(struct stripe_head *sh)
+static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
{
raid5_conf_t *conf = sh->raid_conf;
- int disks = sh->disks, i;
- struct bio *return_bi = NULL;
- struct stripe_head_state s;
+ int disks = sh->disks;
struct r5dev *dev;
- mdk_rdev_t *blocked_rdev = NULL;
- int prexor;
- int dec_preread_active = 0;
+ int i;
- memset(&s, 0, sizeof(s));
- pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
- "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
- atomic_read(&sh->count), sh->pd_idx, sh->check_state,
- sh->reconstruct_state);
+ memset(s, 0, sizeof(*s));
- spin_lock(&sh->lock);
- clear_bit(STRIPE_HANDLE, &sh->state);
- clear_bit(STRIPE_DELAYED, &sh->state);
-
- s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
- s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
- s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+ s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
+ s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+ s->failed_num[0] = -1;
+ s->failed_num[1] = -1;
/* Now to look around and see what can be done */
rcu_read_lock();
+ spin_lock_irq(&conf->device_lock);
for (i=disks; i--; ) {
mdk_rdev_t *rdev;
+ sector_t first_bad;
+ int bad_sectors;
+ int is_bad = 0;
dev = &sh->dev[i];
- pr_debug("check %d: state 0x%lx toread %p read %p write %p "
- "written %p\n", i, dev->flags, dev->toread, dev->read,
- dev->towrite, dev->written);
-
- /* maybe we can request a biofill operation
+ pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
+ i, dev->flags, dev->toread, dev->towrite, dev->written);
+ /* maybe we can reply to a read
*
* new wantfill requests are only permitted while
* ops_complete_biofill is guaranteed to be inactive
@@ -3052,37 +3021,74 @@ static void handle_stripe5(struct stripe_head *sh)
set_bit(R5_Wantfill, &dev->flags);
/* now count some things */
- if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
- if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
- if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
+ if (test_bit(R5_LOCKED, &dev->flags))
+ s->locked++;
+ if (test_bit(R5_UPTODATE, &dev->flags))
+ s->uptodate++;
+ if (test_bit(R5_Wantcompute, &dev->flags)) {
+ s->compute++;
+ BUG_ON(s->compute > 2);
+ }
if (test_bit(R5_Wantfill, &dev->flags))
- s.to_fill++;
+ s->to_fill++;
else if (dev->toread)
- s.to_read++;
+ s->to_read++;
if (dev->towrite) {
- s.to_write++;
+ s->to_write++;
if (!test_bit(R5_OVERWRITE, &dev->flags))
- s.non_overwrite++;
+ s->non_overwrite++;
}
if (dev->written)
- s.written++;
+ s->written++;
rdev = rcu_dereference(conf->disks[i].rdev);
- if (blocked_rdev == NULL &&
- rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
- blocked_rdev = rdev;
- atomic_inc(&rdev->nr_pending);
+ if (rdev) {
+ is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ &first_bad, &bad_sectors);
+ if (s->blocked_rdev == NULL
+ && (test_bit(Blocked, &rdev->flags)
+ || is_bad < 0)) {
+ if (is_bad < 0)
+ set_bit(BlockedBadBlocks,
+ &rdev->flags);
+ s->blocked_rdev = rdev;
+ atomic_inc(&rdev->nr_pending);
+ }
}
clear_bit(R5_Insync, &dev->flags);
if (!rdev)
/* Not in-sync */;
- else if (test_bit(In_sync, &rdev->flags))
+ else if (is_bad) {
+ /* also not in-sync */
+ if (!test_bit(WriteErrorSeen, &rdev->flags)) {
+ /* treat as in-sync, but with a read error
+ * which we can now try to correct
+ */
+ set_bit(R5_Insync, &dev->flags);
+ set_bit(R5_ReadError, &dev->flags);
+ }
+ } else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
else {
- /* could be in-sync depending on recovery/reshape status */
+ /* in sync if before recovery_offset */
if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
set_bit(R5_Insync, &dev->flags);
}
+ if (test_bit(R5_WriteError, &dev->flags)) {
+ clear_bit(R5_Insync, &dev->flags);
+ if (!test_bit(Faulty, &rdev->flags)) {
+ s->handle_bad_blocks = 1;
+ atomic_inc(&rdev->nr_pending);
+ } else
+ clear_bit(R5_WriteError, &dev->flags);
+ }
+ if (test_bit(R5_MadeGood, &dev->flags)) {
+ if (!test_bit(Faulty, &rdev->flags)) {
+ s->handle_bad_blocks = 1;
+ atomic_inc(&rdev->nr_pending);
+ } else
+ clear_bit(R5_MadeGood, &dev->flags);
+ }
if (!test_bit(R5_Insync, &dev->flags)) {
/* The ReadError flag will just be confusing now */
clear_bit(R5_ReadError, &dev->flags);
@@ -3091,313 +3097,60 @@ static void handle_stripe5(struct stripe_head *sh)
if (test_bit(R5_ReadError, &dev->flags))
clear_bit(R5_Insync, &dev->flags);
if (!test_bit(R5_Insync, &dev->flags)) {
- s.failed++;
- s.failed_num = i;
+ if (s->failed < 2)
+ s->failed_num[s->failed] = i;
+ s->failed++;
}
}
+ spin_unlock_irq(&conf->device_lock);
rcu_read_unlock();
-
- if (unlikely(blocked_rdev)) {
- if (s.syncing || s.expanding || s.expanded ||
- s.to_write || s.written) {
- set_bit(STRIPE_HANDLE, &sh->state);
- goto unlock;
- }
- /* There is nothing for the blocked_rdev to block */
- rdev_dec_pending(blocked_rdev, conf->mddev);
- blocked_rdev = NULL;
- }
-
- if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
- set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
- set_bit(STRIPE_BIOFILL_RUN, &sh->state);
- }
-
- pr_debug("locked=%d uptodate=%d to_read=%d"
- " to_write=%d failed=%d failed_num=%d\n",
- s.locked, s.uptodate, s.to_read, s.to_write,
- s.failed, s.failed_num);
- /* check if the array has lost two devices and, if so, some requests might
- * need to be failed
- */
- if (s.failed > 1 && s.to_read+s.to_write+s.written)
- handle_failed_stripe(conf, sh, &s, disks, &return_bi);
- if (s.failed > 1 && s.syncing) {
- md_done_sync(conf->mddev, STRIPE_SECTORS,0);
- clear_bit(STRIPE_SYNCING, &sh->state);
- s.syncing = 0;
- }
-
- /* might be able to return some write requests if the parity block
- * is safe, or on a failed drive
- */
- dev = &sh->dev[sh->pd_idx];
- if ( s.written &&
- ((test_bit(R5_Insync, &dev->flags) &&
- !test_bit(R5_LOCKED, &dev->flags) &&
- test_bit(R5_UPTODATE, &dev->flags)) ||
- (s.failed == 1 && s.failed_num == sh->pd_idx)))
- handle_stripe_clean_event(conf, sh, disks, &return_bi);
-
- /* Now we might consider reading some blocks, either to check/generate
- * parity, or to satisfy requests
- * or to load a block that is being partially written.
- */
- if (s.to_read || s.non_overwrite ||
- (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
- handle_stripe_fill5(sh, &s, disks);
-
- /* Now we check to see if any write operations have recently
- * completed
- */
- prexor = 0;
- if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
- prexor = 1;
- if (sh->reconstruct_state == reconstruct_state_drain_result ||
- sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
- sh->reconstruct_state = reconstruct_state_idle;
-
- /* All the 'written' buffers and the parity block are ready to
- * be written back to disk
- */
- BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
- for (i = disks; i--; ) {
- dev = &sh->dev[i];
- if (test_bit(R5_LOCKED, &dev->flags) &&
- (i == sh->pd_idx || dev->written)) {
- pr_debug("Writing block %d\n", i);
- set_bit(R5_Wantwrite, &dev->flags);
- if (prexor)
- continue;
- if (!test_bit(R5_Insync, &dev->flags) ||
- (i == sh->pd_idx && s.failed == 0))
- set_bit(STRIPE_INSYNC, &sh->state);
- }
- }
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
- dec_preread_active = 1;
- }
-
- /* Now to consider new write requests and what else, if anything
- * should be read. We do not handle new writes when:
- * 1/ A 'write' operation (copy+xor) is already in flight.
- * 2/ A 'check' operation is in flight, as it may clobber the parity
- * block.
- */
- if (s.to_write && !sh->reconstruct_state && !sh->check_state)
- handle_stripe_dirtying5(conf, sh, &s, disks);
-
- /* maybe we need to check and possibly fix the parity for this stripe
- * Any reads will already have been scheduled, so we just see if enough
- * data is available. The parity check is held off while parity
- * dependent operations are in flight.
- */
- if (sh->check_state ||
- (s.syncing && s.locked == 0 &&
- !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
- !test_bit(STRIPE_INSYNC, &sh->state)))
- handle_parity_checks5(conf, sh, &s, disks);
-
- if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
- md_done_sync(conf->mddev, STRIPE_SECTORS,1);
- clear_bit(STRIPE_SYNCING, &sh->state);
- }
-
- /* If the failed drive is just a ReadError, then we might need to progress
- * the repair/check process
- */
- if (s.failed == 1 && !conf->mddev->ro &&
- test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
- && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
- && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
- ) {
- dev = &sh->dev[s.failed_num];
- if (!test_bit(R5_ReWrite, &dev->flags)) {
- set_bit(R5_Wantwrite, &dev->flags);
- set_bit(R5_ReWrite, &dev->flags);
- set_bit(R5_LOCKED, &dev->flags);
- s.locked++;
- } else {
- /* let's read it back */
- set_bit(R5_Wantread, &dev->flags);
- set_bit(R5_LOCKED, &dev->flags);
- s.locked++;
- }
- }
-
- /* Finish reconstruct operations initiated by the expansion process */
- if (sh->reconstruct_state == reconstruct_state_result) {
- struct stripe_head *sh2
- = get_active_stripe(conf, sh->sector, 1, 1, 1);
- if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
- /* sh cannot be written until sh2 has been read.
- * so arrange for sh to be delayed a little
- */
- set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
- if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
- &sh2->state))
- atomic_inc(&conf->preread_active_stripes);
- release_stripe(sh2);
- goto unlock;
- }
- if (sh2)
- release_stripe(sh2);
-
- sh->reconstruct_state = reconstruct_state_idle;
- clear_bit(STRIPE_EXPANDING, &sh->state);
- for (i = conf->raid_disks; i--; ) {
- set_bit(R5_Wantwrite, &sh->dev[i].flags);
- set_bit(R5_LOCKED, &sh->dev[i].flags);
- s.locked++;
- }
- }
-
- if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
- !sh->reconstruct_state) {
- /* Need to write out all blocks after computing parity */
- sh->disks = conf->raid_disks;
- stripe_set_idx(sh->sector, conf, 0, sh);
- schedule_reconstruction(sh, &s, 1, 1);
- } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
- clear_bit(STRIPE_EXPAND_READY, &sh->state);
- atomic_dec(&conf->reshape_stripes);
- wake_up(&conf->wait_for_overlap);
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
- }
-
- if (s.expanding && s.locked == 0 &&
- !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
- handle_stripe_expansion(conf, sh, NULL);
-
- unlock:
- spin_unlock(&sh->lock);
-
- /* wait for this device to become unblocked */
- if (unlikely(blocked_rdev))
- md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
-
- if (s.ops_request)
- raid_run_ops(sh, s.ops_request);
-
- ops_run_io(sh, &s);
-
- if (dec_preread_active) {
- /* We delay this until after ops_run_io so that if make_request
- * is waiting on a flush, it won't continue until the writes
- * have actually been submitted.
- */
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) <
- IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- }
- return_io(return_bi);
}
-static void handle_stripe6(struct stripe_head *sh)
+static void handle_stripe(struct stripe_head *sh)
{
+ struct stripe_head_state s;
raid5_conf_t *conf = sh->raid_conf;
+ int i;
+ int prexor;
int disks = sh->disks;
- struct bio *return_bi = NULL;
- int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
- struct stripe_head_state s;
- struct r6_state r6s;
- struct r5dev *dev, *pdev, *qdev;
- mdk_rdev_t *blocked_rdev = NULL;
- int dec_preread_active = 0;
+ struct r5dev *pdev, *qdev;
+
+ clear_bit(STRIPE_HANDLE, &sh->state);
+ if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
+ /* already being handled, ensure it gets handled
+ * again when current action finishes */
+ set_bit(STRIPE_HANDLE, &sh->state);
+ return;
+ }
+
+ if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ set_bit(STRIPE_SYNCING, &sh->state);
+ clear_bit(STRIPE_INSYNC, &sh->state);
+ }
+ clear_bit(STRIPE_DELAYED, &sh->state);
pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
(unsigned long long)sh->sector, sh->state,
- atomic_read(&sh->count), pd_idx, qd_idx,
+ atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
sh->check_state, sh->reconstruct_state);
- memset(&s, 0, sizeof(s));
-
- spin_lock(&sh->lock);
- clear_bit(STRIPE_HANDLE, &sh->state);
- clear_bit(STRIPE_DELAYED, &sh->state);
-
- s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
- s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
- s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
- /* Now to look around and see what can be done */
-
- rcu_read_lock();
- for (i=disks; i--; ) {
- mdk_rdev_t *rdev;
- dev = &sh->dev[i];
- pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
- i, dev->flags, dev->toread, dev->towrite, dev->written);
- /* maybe we can reply to a read
- *
- * new wantfill requests are only permitted while
- * ops_complete_biofill is guaranteed to be inactive
- */
- if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
- !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
- set_bit(R5_Wantfill, &dev->flags);
+ analyse_stripe(sh, &s);
- /* now count some things */
- if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
- if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
- if (test_bit(R5_Wantcompute, &dev->flags)) {
- s.compute++;
- BUG_ON(s.compute > 2);
- }
-
- if (test_bit(R5_Wantfill, &dev->flags)) {
- s.to_fill++;
- } else if (dev->toread)
- s.to_read++;
- if (dev->towrite) {
- s.to_write++;
- if (!test_bit(R5_OVERWRITE, &dev->flags))
- s.non_overwrite++;
- }
- if (dev->written)
- s.written++;
- rdev = rcu_dereference(conf->disks[i].rdev);
- if (blocked_rdev == NULL &&
- rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
- blocked_rdev = rdev;
- atomic_inc(&rdev->nr_pending);
- }
- clear_bit(R5_Insync, &dev->flags);
- if (!rdev)
- /* Not in-sync */;
- else if (test_bit(In_sync, &rdev->flags))
- set_bit(R5_Insync, &dev->flags);
- else {
- /* in sync if before recovery_offset */
- if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
- set_bit(R5_Insync, &dev->flags);
- }
- if (!test_bit(R5_Insync, &dev->flags)) {
- /* The ReadError flag will just be confusing now */
- clear_bit(R5_ReadError, &dev->flags);
- clear_bit(R5_ReWrite, &dev->flags);
- }
- if (test_bit(R5_ReadError, &dev->flags))
- clear_bit(R5_Insync, &dev->flags);
- if (!test_bit(R5_Insync, &dev->flags)) {
- if (s.failed < 2)
- r6s.failed_num[s.failed] = i;
- s.failed++;
- }
+ if (s.handle_bad_blocks) {
+ set_bit(STRIPE_HANDLE, &sh->state);
+ goto finish;
}
- rcu_read_unlock();
- if (unlikely(blocked_rdev)) {
+ if (unlikely(s.blocked_rdev)) {
if (s.syncing || s.expanding || s.expanded ||
s.to_write || s.written) {
set_bit(STRIPE_HANDLE, &sh->state);
- goto unlock;
+ goto finish;
}
/* There is nothing for the blocked_rdev to block */
- rdev_dec_pending(blocked_rdev, conf->mddev);
- blocked_rdev = NULL;
+ rdev_dec_pending(s.blocked_rdev, conf->mddev);
+ s.blocked_rdev = NULL;
}
if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
@@ -3408,83 +3161,88 @@ static void handle_stripe6(struct stripe_head *sh)
pr_debug("locked=%d uptodate=%d to_read=%d"
" to_write=%d failed=%d failed_num=%d,%d\n",
s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
- r6s.failed_num[0], r6s.failed_num[1]);
- /* check if the array has lost >2 devices and, if so, some requests
- * might need to be failed
+ s.failed_num[0], s.failed_num[1]);
+ /* check if the array has lost more than max_degraded devices and,
+ * if so, some requests might need to be failed.
*/
- if (s.failed > 2 && s.to_read+s.to_write+s.written)
- handle_failed_stripe(conf, sh, &s, disks, &return_bi);
- if (s.failed > 2 && s.syncing) {
- md_done_sync(conf->mddev, STRIPE_SECTORS,0);
- clear_bit(STRIPE_SYNCING, &sh->state);
- s.syncing = 0;
- }
+ if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written)
+ handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+ if (s.failed > conf->max_degraded && s.syncing)
+ handle_failed_sync(conf, sh, &s);
/*
* might be able to return some write requests if the parity blocks
* are safe, or on a failed drive
*/
- pdev = &sh->dev[pd_idx];
- r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
- || (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
- qdev = &sh->dev[qd_idx];
- r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
- || (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
-
- if ( s.written &&
- ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+ pdev = &sh->dev[sh->pd_idx];
+ s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
+ || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
+ qdev = &sh->dev[sh->qd_idx];
+ s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
+ || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
+ || conf->level < 6;
+
+ if (s.written &&
+ (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
&& !test_bit(R5_LOCKED, &pdev->flags)
&& test_bit(R5_UPTODATE, &pdev->flags)))) &&
- ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+ (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
&& !test_bit(R5_LOCKED, &qdev->flags)
&& test_bit(R5_UPTODATE, &qdev->flags)))))
- handle_stripe_clean_event(conf, sh, disks, &return_bi);
+ handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
* or to load a block that is being partially written.
*/
- if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
- (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
- handle_stripe_fill6(sh, &s, &r6s, disks);
+ if (s.to_read || s.non_overwrite
+ || (conf->level == 6 && s.to_write && s.failed)
+ || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
+ handle_stripe_fill(sh, &s, disks);
/* Now we check to see if any write operations have recently
* completed
*/
- if (sh->reconstruct_state == reconstruct_state_drain_result) {
-
+ prexor = 0;
+ if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
+ prexor = 1;
+ if (sh->reconstruct_state == reconstruct_state_drain_result ||
+ sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
sh->reconstruct_state = reconstruct_state_idle;
- /* All the 'written' buffers and the parity blocks are ready to
+
+ /* All the 'written' buffers and the parity block are ready to
* be written back to disk
*/
BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
- BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
+ BUG_ON(sh->qd_idx >= 0 &&
+ !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
for (i = disks; i--; ) {
- dev = &sh->dev[i];
+ struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_LOCKED, &dev->flags) &&
- (i == sh->pd_idx || i == qd_idx ||
- dev->written)) {
+ (i == sh->pd_idx || i == sh->qd_idx ||
+ dev->written)) {
pr_debug("Writing block %d\n", i);
- BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
set_bit(R5_Wantwrite, &dev->flags);
+ if (prexor)
+ continue;
if (!test_bit(R5_Insync, &dev->flags) ||
- ((i == sh->pd_idx || i == qd_idx) &&
- s.failed == 0))
+ ((i == sh->pd_idx || i == sh->qd_idx) &&
+ s.failed == 0))
set_bit(STRIPE_INSYNC, &sh->state);
}
}
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
- dec_preread_active = 1;
+ s.dec_preread_active = 1;
}
/* Now to consider new write requests and what else, if anything
* should be read. We do not handle new writes when:
- * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
+ * 1/ A 'write' operation (copy+xor) is already in flight.
* 2/ A 'check' operation is in flight, as it may clobber the parity
* block.
*/
if (s.to_write && !sh->reconstruct_state && !sh->check_state)
- handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
+ handle_stripe_dirtying(conf, sh, &s, disks);
/* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough
@@ -3494,20 +3252,24 @@ static void handle_stripe6(struct stripe_head *sh)
if (sh->check_state ||
(s.syncing && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
- !test_bit(STRIPE_INSYNC, &sh->state)))
- handle_parity_checks6(conf, sh, &s, &r6s, disks);
+ !test_bit(STRIPE_INSYNC, &sh->state))) {
+ if (conf->level == 6)
+ handle_parity_checks6(conf, sh, &s, disks);
+ else
+ handle_parity_checks5(conf, sh, &s, disks);
+ }
if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
- md_done_sync(conf->mddev, STRIPE_SECTORS,1);
+ md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
clear_bit(STRIPE_SYNCING, &sh->state);
}
/* If the failed drives are just a ReadError, then we might need
* to progress the repair/check process
*/
- if (s.failed <= 2 && !conf->mddev->ro)
+ if (s.failed <= conf->max_degraded && !conf->mddev->ro)
for (i = 0; i < s.failed; i++) {
- dev = &sh->dev[r6s.failed_num[i]];
+ struct r5dev *dev = &sh->dev[s.failed_num[i]];
if (test_bit(R5_ReadError, &dev->flags)
&& !test_bit(R5_LOCKED, &dev->flags)
&& test_bit(R5_UPTODATE, &dev->flags)
@@ -3526,8 +3288,26 @@ static void handle_stripe6(struct stripe_head *sh)
}
}
+
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
+ struct stripe_head *sh_src
+ = get_active_stripe(conf, sh->sector, 1, 1, 1);
+ if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
+ /* sh cannot be written until sh_src has been read.
+ * so arrange for sh to be delayed a little
+ */
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+ &sh_src->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe(sh_src);
+ goto finish;
+ }
+ if (sh_src)
+ release_stripe(sh_src);
+
sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
for (i = conf->raid_disks; i--; ) {
@@ -3539,24 +3319,7 @@ static void handle_stripe6(struct stripe_head *sh)
if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
!sh->reconstruct_state) {
- struct stripe_head *sh2
- = get_active_stripe(conf, sh->sector, 1, 1, 1);
- if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
- /* sh cannot be written until sh2 has been read.
- * so arrange for sh to be delayed a little
- */
- set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
- if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
- &sh2->state))
- atomic_inc(&conf->preread_active_stripes);
- release_stripe(sh2);
- goto unlock;
- }
- if (sh2)
- release_stripe(sh2);
-
- /* Need to write out all blocks after computing P&Q */
+ /* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks;
stripe_set_idx(sh->sector, conf, 0, sh);
schedule_reconstruction(sh, &s, 1, 1);
@@ -3569,22 +3332,39 @@ static void handle_stripe6(struct stripe_head *sh)
if (s.expanding && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state))
- handle_stripe_expansion(conf, sh, &r6s);
-
- unlock:
- spin_unlock(&sh->lock);
+ handle_stripe_expansion(conf, sh);
+finish:
/* wait for this device to become unblocked */
- if (unlikely(blocked_rdev))
- md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
+ if (unlikely(s.blocked_rdev))
+ md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
+
+ if (s.handle_bad_blocks)
+ for (i = disks; i--; ) {
+ mdk_rdev_t *rdev;
+ struct r5dev *dev = &sh->dev[i];
+ if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
+ /* We own a safe reference to the rdev */
+ rdev = conf->disks[i].rdev;
+ if (!rdev_set_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0))
+ md_error(conf->mddev, rdev);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
+ if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
+ rdev = conf->disks[i].rdev;
+ rdev_clear_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
+ }
if (s.ops_request)
raid_run_ops(sh, s.ops_request);
ops_run_io(sh, &s);
-
- if (dec_preread_active) {
+ if (s.dec_preread_active) {
/* We delay this until after ops_run_io so that if make_request
* is waiting on a flush, it won't continue until the writes
* have actually been submitted.
@@ -3595,15 +3375,9 @@ static void handle_stripe6(struct stripe_head *sh)
md_wakeup_thread(conf->mddev->thread);
}
- return_io(return_bi);
-}
+ return_io(s.return_bi);
-static void handle_stripe(struct stripe_head *sh)
-{
- if (sh->raid_conf->level == 6)
- handle_stripe6(sh);
- else
- handle_stripe5(sh);
+ clear_bit(STRIPE_ACTIVE, &sh->state);
}
static void raid5_activate_delayed(raid5_conf_t *conf)
@@ -3833,6 +3607,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
rcu_read_lock();
rdev = rcu_dereference(conf->disks[dd_idx].rdev);
if (rdev && test_bit(In_sync, &rdev->flags)) {
+ sector_t first_bad;
+ int bad_sectors;
+
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
raid_bio->bi_next = (void*)rdev;
@@ -3840,8 +3617,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
align_bi->bi_sector += rdev->data_offset;
- if (!bio_fits_rdev(align_bi)) {
- /* too big in some way */
+ if (!bio_fits_rdev(align_bi) ||
+ is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+ &first_bad, &bad_sectors)) {
+ /* too big in some way, or has a known bad block */
bio_put(align_bi);
rdev_dec_pending(rdev, mddev);
return 0;
@@ -4016,7 +3795,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
}
}
- if (bio_data_dir(bi) == WRITE &&
+ if (rw == WRITE &&
logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
release_stripe(sh);
@@ -4034,7 +3813,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
}
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
- !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+ !add_stripe_bio(sh, bi, dd_idx, rw)) {
/* Stripe is busy expanding or
* add failed due to overlap. Flush everything
* and wait a while
@@ -4375,10 +4154,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
- spin_lock(&sh->lock);
- set_bit(STRIPE_SYNCING, &sh->state);
- clear_bit(STRIPE_INSYNC, &sh->state);
- spin_unlock(&sh->lock);
+ set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
handle_stripe(sh);
release_stripe(sh);
@@ -4509,6 +4285,9 @@ static void raid5d(mddev_t *mddev)
release_stripe(sh);
cond_resched();
+ if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+ md_check_recovery(mddev);
+
spin_lock_irq(&conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -5313,6 +5092,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
* isn't possible.
*/
if (!test_bit(Faulty, &rdev->flags) &&
+ mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) &&
number < conf->raid_disks) {
err = -EBUSY;
@@ -5341,6 +5121,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
int first = 0;
int last = conf->raid_disks - 1;
+ if (mddev->recovery_disabled == conf->recovery_disabled)
+ return -EBUSY;
+
if (has_failed(conf))
/* no point adding a device */
return -EINVAL;
@@ -5519,16 +5302,14 @@ static int raid5_start_reshape(mddev_t *mddev)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) {
if (raid5_add_disk(mddev, rdev) == 0) {
- char nm[20];
if (rdev->raid_disk
>= conf->previous_raid_disks) {
set_bit(In_sync, &rdev->flags);
added_devices++;
} else
rdev->recovery_offset = 0;
- sprintf(nm, "rd%d", rdev->raid_disk);
- if (sysfs_create_link(&mddev->kobj,
- &rdev->kobj, nm))
+
+ if (sysfs_link_rdev(mddev, rdev))
/* Failure here is OK */;
}
} else if (rdev->raid_disk >= conf->previous_raid_disks
@@ -5624,9 +5405,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
d++) {
mdk_rdev_t *rdev = conf->disks[d].rdev;
if (rdev && raid5_remove_disk(mddev, d) == 0) {
- char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
+ sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
}
}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3ca77a2613b..11b9566184b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -6,11 +6,11 @@
/*
*
- * Each stripe contains one buffer per disc. Each buffer can be in
+ * Each stripe contains one buffer per device. Each buffer can be in
* one of a number of states stored in "flags". Changes between
- * these states happen *almost* exclusively under a per-stripe
- * spinlock. Some very specific changes can happen in bi_end_io, and
- * these are not protected by the spin lock.
+ * these states happen *almost* exclusively under the protection of the
+ * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and
+ * these are not protected by STRIPE_ACTIVE.
*
* The flag bits that are used to represent these states are:
* R5_UPTODATE and R5_LOCKED
@@ -76,12 +76,10 @@
* block and the cached buffer are successfully written, any buffer on
* a written list can be returned with b_end_io.
*
- * The write list and read list both act as fifos. The read list is
- * protected by the device_lock. The write and written lists are
- * protected by the stripe lock. The device_lock, which can be
- * claimed while the stipe lock is held, is only for list
- * manipulations and will only be held for a very short time. It can
- * be claimed from interrupts.
+ * The write list and read list both act as fifos. The read list,
+ * write list and written list are protected by the device_lock.
+ * The device_lock is only for list manipulations and will only be
+ * held for a very short time. It can be claimed from interrupts.
*
*
* Stripes in the stripe cache can be on one of two lists (or on
@@ -96,7 +94,6 @@
*
* The inactive_list, handle_list and hash bucket lists are all protected by the
* device_lock.
- * - stripes on the inactive_list never have their stripe_lock held.
* - stripes have a reference counter. If count==0, they are on a list.
* - If a stripe might need handling, STRIPE_HANDLE is set.
* - When refcount reaches zero, then if STRIPE_HANDLE it is put on
@@ -116,10 +113,10 @@
* attach a request to an active stripe (add_stripe_bh())
* lockdev attach-buffer unlockdev
* handle a stripe (handle_stripe())
- * lockstripe clrSTRIPE_HANDLE ...
+ * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
* (lockdev check-buffers unlockdev) ..
* change-state ..
- * record io/ops needed unlockstripe schedule io/ops
+ * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
* release an active stripe (release_stripe())
* lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
*
@@ -128,8 +125,7 @@
* on a cached buffer, and plus one if the stripe is undergoing stripe
* operations.
*
- * Stripe operations are performed outside the stripe lock,
- * the stripe operations are:
+ * The stripe operations are:
* -copying data between the stripe cache and user application buffers
* -computing blocks to save a disk access, or to recover a missing block
* -updating the parity on a write operation (reconstruct write and
@@ -159,7 +155,8 @@
*/
/*
- * Operations state - intermediate states that are visible outside of sh->lock
+ * Operations state - intermediate states that are visible outside of
+ * STRIPE_ACTIVE.
* In general _idle indicates nothing is running, _run indicates a data
* processing operation is active, and _result means the data processing result
* is stable and can be acted upon. For simple operations like biofill and
@@ -209,7 +206,6 @@ struct stripe_head {
short ddf_layout;/* use DDF ordering to calculate Q */
unsigned long state; /* state flags */
atomic_t count; /* nr of active thread/requests */
- spinlock_t lock;
int bm_seq; /* sequence number for bitmap flushes */
int disks; /* disks in stripe */
enum check_states check_state;
@@ -240,19 +236,20 @@ struct stripe_head {
};
/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
- * for handle_stripe. It is only valid under spin_lock(sh->lock);
+ * for handle_stripe.
*/
struct stripe_head_state {
int syncing, expanding, expanded;
int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite;
- int failed_num;
+ int failed_num[2];
+ int p_failed, q_failed;
+ int dec_preread_active;
unsigned long ops_request;
-};
-/* r6_state - extra state data only relevant to r6 */
-struct r6_state {
- int p_failed, q_failed, failed_num[2];
+ struct bio *return_bi;
+ mdk_rdev_t *blocked_rdev;
+ int handle_bad_blocks;
};
/* Flags */
@@ -268,14 +265,16 @@ struct r6_state {
#define R5_ReWrite 9 /* have tried to over-write the readerror */
#define R5_Expanded 10 /* This block now has post-expand data */
-#define R5_Wantcompute 11 /* compute_block in progress treat as
- * uptodate
- */
-#define R5_Wantfill 12 /* dev->toread contains a bio that needs
- * filling
- */
-#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
-#define R5_WantFUA 14 /* Write should be FUA */
+#define R5_Wantcompute 11 /* compute_block in progress treat as
+ * uptodate
+ */
+#define R5_Wantfill 12 /* dev->toread contains a bio that needs
+ * filling
+ */
+#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
+#define R5_WantFUA 14 /* Write should be FUA */
+#define R5_WriteError 15 /* got a write error - need to record it */
+#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/
/*
* Write method
*/
@@ -289,21 +288,25 @@ struct r6_state {
/*
* Stripe state
*/
-#define STRIPE_HANDLE 2
-#define STRIPE_SYNCING 3
-#define STRIPE_INSYNC 4
-#define STRIPE_PREREAD_ACTIVE 5
-#define STRIPE_DELAYED 6
-#define STRIPE_DEGRADED 7
-#define STRIPE_BIT_DELAY 8
-#define STRIPE_EXPANDING 9
-#define STRIPE_EXPAND_SOURCE 10
-#define STRIPE_EXPAND_READY 11
-#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
-#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
-#define STRIPE_BIOFILL_RUN 14
-#define STRIPE_COMPUTE_RUN 15
-#define STRIPE_OPS_REQ_PENDING 16
+enum {
+ STRIPE_ACTIVE,
+ STRIPE_HANDLE,
+ STRIPE_SYNC_REQUESTED,
+ STRIPE_SYNCING,
+ STRIPE_INSYNC,
+ STRIPE_PREREAD_ACTIVE,
+ STRIPE_DELAYED,
+ STRIPE_DEGRADED,
+ STRIPE_BIT_DELAY,
+ STRIPE_EXPANDING,
+ STRIPE_EXPAND_SOURCE,
+ STRIPE_EXPAND_READY,
+ STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */
+ STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
+ STRIPE_BIOFILL_RUN,
+ STRIPE_COMPUTE_RUN,
+ STRIPE_OPS_REQ_PENDING,
+};
/*
* Operation request flags
@@ -336,7 +339,7 @@ struct r6_state {
* PREREAD_ACTIVE.
* In stripe_handle, if we find pre-reading is necessary, we do it if
* PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
- * HANDLE gets cleared if stripe_handle leave nothing locked.
+ * HANDLE gets cleared if stripe_handle leaves nothing locked.
*/
@@ -399,7 +402,7 @@ struct raid5_private_data {
* (fresh device added).
* Cleared when a sync completes.
*/
-
+ int recovery_disabled;
/* per cpu variables */
struct raid5_percpu {
struct page *spare_page; /* Used when checking P/Q in raid6 */