From 0e0497c0c017664994819f4602dc07fd95896c52 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 22 Jun 2009 10:08:02 +0100 Subject: dm mpath: validate table argument count The parser reads the argument count as a number but doesn't check that sufficient arguments are supplied. This command triggers the bug: dmsetup create mpath --table "0 `blockdev --getsize /dev/mapper/cr0` multipath 0 0 2 1 round-robin 1000 0 1 1 /dev/mapper/cr0 round-robin 0 1 1 /dev/mapper/cr1 1000" kernel BUG at drivers/md/dm-mpath.c:530! Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/md/dm-mpath.c') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6a386ab4f7e..d880299334e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -553,6 +553,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, return -EINVAL; } + if (ps_argc > as->argc) { + dm_put_path_selector(pst); + ti->error = "not enough arguments for path selector"; + return -EINVAL; + } + r = pst->create(&pg->ps, ps_argc, as->argv); if (r) { dm_put_path_selector(pst); -- cgit v1.2.3-70-g09d2 From e094f4f15f5169526c7200b9bde44b900548a81e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 22 Jun 2009 10:12:10 +0100 Subject: dm mpath: validate hw_handler argument count Fix arg count parsing error in hw handlers. Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md/dm-mpath.c') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d880299334e..f25bdebcb4a 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -705,6 +705,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) if (!hw_argc) return 0; + if (hw_argc > as->argc) { + ti->error = "not enough arguments for hardware handler"; + return -EINVAL; + } + m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); request_module("scsi_dh_%s", m->hw_handler_name); if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { -- cgit v1.2.3-70-g09d2 From a0cf7ea9549ec60988369f90e5c0f855f08abac9 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 22 Jun 2009 10:12:11 +0100 Subject: dm mpath: change attached scsi_dh When specifying a different hardware handler via multipath features we should be able to override the built-in defaults. The problem here is the hardware table from scsi_dh is compiled in and cannot be changed from userland. The multipath.conf OTOH is purely user-defined and, what's more, the user might have a valid reason for modifying it. (EG EMC Clariion can well be run in PNR mode even though ALUA is active, or the user might want to try ALUA on any as-of-yet unknown devices) So _not_ allowing multipath to override the device handler setting will just add to the confusion and makes error tracking even more difficult. Signed-off-by: Hannes Reinecke Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'drivers/md/dm-mpath.c') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f25bdebcb4a..545abcc25c4 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -597,9 +597,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, } if (m->hw_handler_name) { - r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), - m->hw_handler_name); + struct request_queue *q = bdev_get_queue(p->path.dev->bdev); + + r = scsi_dh_attach(q, m->hw_handler_name); + if (r == -EBUSY) { + /* + * Already attached to different hw_handler, + * try to reattach with correct one. + */ + scsi_dh_detach(q); + r = scsi_dh_attach(q, m->hw_handler_name); + } + if (r < 0) { + ti->error = "error attaching hardware handler"; dm_put_device(ti, p->path.dev); goto bad; } -- cgit v1.2.3-70-g09d2 From e54f77ddda72781ec1c1696b21aabd6a30cbb7c6 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Mon, 22 Jun 2009 10:12:12 +0100 Subject: dm mpath: call activate fn for each path in pg_init Fixed a problem affecting reinstatement of passive paths. Before we moved the hardware handler from dm to SCSI, it performed a pg_init for a path group and didn't maintain any state about each path in hardware handler code. But in SCSI dh, such state is now maintained, as we want to fail I/O early on a path if it is not the active path. All the hardware handlers have a state now and set to active or some form of inactive. They have prep_fn() which uses this state to fail the I/O without it ever being sent to the device. So in effect when dm-multipath calls scsi_dh_activate(), activate is sent to only one path and the "state" of that path is changed appropriately to "active" while other paths in the same path group are never changed as they never got an "activate". In order make sure all the paths in a path group gets their state set properly when a pg_init happens, we need to call scsi_dh_activate() on all paths in a path group. Doing this at the hardware handler layer is not a good option as we want the multipath layer to define the relationship between path and path groups and not the hardware handler. Attached patch sends an "activate" on each path in a path group when a path group is switched. It also sends an activate when a path is reinstated. Signed-off-by: Chandra Seetharaman Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 63 +++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 37 deletions(-) (limited to 'drivers/md/dm-mpath.c') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 545abcc25c4..838f01b1dd3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -35,6 +35,7 @@ struct pgpath { struct dm_path path; struct work_struct deactivate_path; + struct work_struct activate_path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -64,8 +65,6 @@ struct multipath { spinlock_t lock; const char *hw_handler_name; - struct work_struct activate_path; - struct pgpath *pgpath_to_activate; unsigned nr_priority_groups; struct list_head priority_groups; unsigned pg_init_required; /* pg_init needs calling? */ @@ -128,6 +127,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = 1; INIT_WORK(&pgpath->deactivate_path, deactivate_path); + INIT_WORK(&pgpath->activate_path, activate_path); } return pgpath; @@ -160,7 +160,6 @@ static struct priority_group *alloc_priority_group(void) static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) { - unsigned long flags; struct pgpath *pgpath, *tmp; struct multipath *m = ti->private; @@ -169,10 +168,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) if (m->hw_handler_name) scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); dm_put_device(ti, pgpath->path.dev); - spin_lock_irqsave(&m->lock, flags); - if (m->pgpath_to_activate == pgpath) - m->pgpath_to_activate = NULL; - spin_unlock_irqrestore(&m->lock, flags); free_pgpath(pgpath); } } @@ -202,7 +197,6 @@ static struct multipath *alloc_multipath(struct dm_target *ti) m->queue_io = 1; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); - INIT_WORK(&m->activate_path, activate_path); m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); if (!m->mpio_pool) { kfree(m); @@ -427,8 +421,8 @@ static void process_queued_ios(struct work_struct *work) { struct multipath *m = container_of(work, struct multipath, process_queued_ios); - struct pgpath *pgpath = NULL; - unsigned init_required = 0, must_queue = 1; + struct pgpath *pgpath = NULL, *tmp; + unsigned must_queue = 1; unsigned long flags; spin_lock_irqsave(&m->lock, flags); @@ -446,19 +440,15 @@ static void process_queued_ios(struct work_struct *work) must_queue = 0; if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { - m->pgpath_to_activate = pgpath; m->pg_init_count++; m->pg_init_required = 0; - m->pg_init_in_progress = 1; - init_required = 1; + list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { + if (queue_work(kmpath_handlerd, &tmp->activate_path)) + m->pg_init_in_progress++; + } } - out: spin_unlock_irqrestore(&m->lock, flags); - - if (init_required) - queue_work(kmpath_handlerd, &m->activate_path); - if (!must_queue) dispatch_queued_ios(m); } @@ -946,9 +936,13 @@ static int reinstate_path(struct pgpath *pgpath) pgpath->is_active = 1; - m->current_pgpath = NULL; - if (!m->nr_valid_paths++ && m->queue_size) + if (!m->nr_valid_paths++ && m->queue_size) { + m->current_pgpath = NULL; queue_work(kmultipathd, &m->process_queued_ios); + } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { + if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + m->pg_init_in_progress++; + } dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, pgpath->path.dev->name, m->nr_valid_paths); @@ -1124,35 +1118,30 @@ static void pg_init_done(struct dm_path *path, int errors) spin_lock_irqsave(&m->lock, flags); if (errors) { - DMERR("Could not failover device. Error %d.", errors); - m->current_pgpath = NULL; - m->current_pg = NULL; + if (pgpath == m->current_pgpath) { + DMERR("Could not failover device. Error %d.", errors); + m->current_pgpath = NULL; + m->current_pg = NULL; + } } else if (!m->pg_init_required) { m->queue_io = 0; pg->bypassed = 0; } - m->pg_init_in_progress = 0; - queue_work(kmultipathd, &m->process_queued_ios); + m->pg_init_in_progress--; + if (!m->pg_init_in_progress) + queue_work(kmultipathd, &m->process_queued_ios); spin_unlock_irqrestore(&m->lock, flags); } static void activate_path(struct work_struct *work) { int ret; - struct multipath *m = - container_of(work, struct multipath, activate_path); - struct dm_path *path; - unsigned long flags; + struct pgpath *pgpath = + container_of(work, struct pgpath, activate_path); - spin_lock_irqsave(&m->lock, flags); - path = &m->pgpath_to_activate->path; - m->pgpath_to_activate = NULL; - spin_unlock_irqrestore(&m->lock, flags); - if (!path) - return; - ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); - pg_init_done(path, ret); + ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); + pg_init_done(&pgpath->path, ret); } /* -- cgit v1.2.3-70-g09d2 From 53b351f972a882ea8b6cdb19602535f1057c884a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 22 Jun 2009 10:12:13 +0100 Subject: dm mpath: flush keventd queue in destructor The commit fe9cf30eb8186ef267d1868dc9f12f2d0f40835a moves dm table event submission from kmultipath queue to kernel kevent queue to avoid a deadlock. There is a possibility of race condition because kevent queue is not flushed in the multipath destructor. The scenario is: - some event happens and is queued to keventd - keventd thread is delayed due to scheuling latency or some other work - multipath device is destroyed - keventd now attempts to process work_struct that is residing in already released memory. The patch flushes the keventd queue in multipath constructor. I've already fixed similar bug in dm-raid1. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon Cc: stable@kernel.org --- drivers/md/dm-mpath.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md/dm-mpath.c') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 838f01b1dd3..92bdcbb7c93 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -848,6 +848,7 @@ static void multipath_dtr(struct dm_target *ti) flush_workqueue(kmpath_handlerd); flush_workqueue(kmultipathd); + flush_scheduled_work(); free_multipath(m); } -- cgit v1.2.3-70-g09d2 From 8627921fa2ef6d40fd9b787e163ba3a9ff8f471d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 22 Jun 2009 10:12:24 +0100 Subject: dm mpath: support barriers Flush support for dm-multipath target. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/dm-mpath.c') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 92bdcbb7c93..f1cf8f7449d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -835,6 +835,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, goto bad; } + ti->num_flush_requests = 1; + return 0; bad: -- cgit v1.2.3-70-g09d2 From 02ab823fd1a27d193bda06b74fdad685a20a3e5e Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Mon, 22 Jun 2009 10:12:27 +0100 Subject: dm mpath: add start_io and nr_bytes to path selectors This patch makes two additions to the dm path selector interface for dynamic load balancers: o a new hook, start_io() o a new parameter 'nr_bytes' to select_path()/start_io()/end_io() to pass the size of the I/O start_io() is called when a target driver actually submits I/O to the selected path. Path selectors can use it to start accounting of the I/O. (e.g. counting the number of in-flight I/Os.) The start_io hook is based on the patch posted by Stefan Bader: https://www.redhat.com/archives/dm-devel/2005-October/msg00050.html nr_bytes, the size of the I/O, is so path selectors can take the size of the I/O into account when deciding which path to use. dm-service-time uses it to estimate service time, for example. (Added the nr_bytes member to dm_mpath_io instead of using existing details.bi_size, since request-based dm patch deletes it.) Signed-off-by: Stefan Bader Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 28 ++++++++++++++++++---------- drivers/md/dm-path-selector.h | 8 ++++++-- drivers/md/dm-round-robin.c | 2 +- 3 files changed, 25 insertions(+), 13 deletions(-) (limited to 'drivers/md/dm-mpath.c') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f1cf8f7449d..890c0e8ed13 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -101,6 +101,7 @@ struct multipath { struct dm_mpath_io { struct pgpath *pgpath; struct dm_bio_details details; + size_t nr_bytes; }; typedef int (*action_fn) (struct pgpath *pgpath); @@ -244,11 +245,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) m->pg_init_count = 0; } -static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) +static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, + size_t nr_bytes) { struct dm_path *path; - path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); + path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); if (!path) return -ENXIO; @@ -260,7 +262,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) return 0; } -static void __choose_pgpath(struct multipath *m) +static void __choose_pgpath(struct multipath *m, size_t nr_bytes) { struct priority_group *pg; unsigned bypassed = 1; @@ -272,12 +274,12 @@ static void __choose_pgpath(struct multipath *m) if (m->next_pg) { pg = m->next_pg; m->next_pg = NULL; - if (!__choose_path_in_pg(m, pg)) + if (!__choose_path_in_pg(m, pg, nr_bytes)) return; } /* Don't change PG until it has no remaining paths */ - if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) + if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) return; /* @@ -289,7 +291,7 @@ static void __choose_pgpath(struct multipath *m) list_for_each_entry(pg, &m->priority_groups, list) { if (pg->bypassed == bypassed) continue; - if (!__choose_path_in_pg(m, pg)) + if (!__choose_path_in_pg(m, pg, nr_bytes)) return; } } while (bypassed--); @@ -320,6 +322,7 @@ static int map_io(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio, unsigned was_queued) { int r = DM_MAPIO_REMAPPED; + size_t nr_bytes = bio->bi_size; unsigned long flags; struct pgpath *pgpath; @@ -328,7 +331,7 @@ static int map_io(struct multipath *m, struct bio *bio, /* Do we need to select a new pgpath? */ if (!m->current_pgpath || (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) - __choose_pgpath(m); + __choose_pgpath(m, nr_bytes); pgpath = m->current_pgpath; @@ -353,6 +356,11 @@ static int map_io(struct multipath *m, struct bio *bio, r = -EIO; /* Failed */ mpio->pgpath = pgpath; + mpio->nr_bytes = nr_bytes; + + if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) + pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, + nr_bytes); spin_unlock_irqrestore(&m->lock, flags); @@ -431,7 +439,7 @@ static void process_queued_ios(struct work_struct *work) goto out; if (!m->current_pgpath) - __choose_pgpath(m); + __choose_pgpath(m, 0); pgpath = m->current_pgpath; @@ -1209,7 +1217,7 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } if (r != DM_ENDIO_INCOMPLETE) mempool_free(mpio, m->mpio_pool); @@ -1425,7 +1433,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, spin_lock_irqsave(&m->lock, flags); if (!m->current_pgpath) - __choose_pgpath(m); + __choose_pgpath(m, 0); if (m->current_pgpath) { bdev = m->current_pgpath->path.dev->bdev; diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 27357b85d73..e7d1fa8b045 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -56,7 +56,8 @@ struct path_selector_type { * the path fails. */ struct dm_path *(*select_path) (struct path_selector *ps, - unsigned *repeat_count); + unsigned *repeat_count, + size_t nr_bytes); /* * Notify the selector that a path has failed. @@ -75,7 +76,10 @@ struct path_selector_type { int (*status) (struct path_selector *ps, struct dm_path *path, status_type_t type, char *result, unsigned int maxlen); - int (*end_io) (struct path_selector *ps, struct dm_path *path); + int (*start_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); + int (*end_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); }; /* Register a path selector */ diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index cdfbf65b28c..24752f449be 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) } static struct dm_path *rr_select_path(struct path_selector *ps, - unsigned *repeat_count) + unsigned *repeat_count, size_t nr_bytes) { struct selector *s = (struct selector *) ps->context; struct path_info *pi = NULL; -- cgit v1.2.3-70-g09d2 From af4874e03ed82f050d5872d8c39ce64bf16b5c38 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 22 Jun 2009 10:12:33 +0100 Subject: dm target:s introduce iterate devices fn Add .iterate_devices to 'struct target_type' to allow a function to be called for all devices in a DM target. Implemented it for all targets except those in dm-snap.c (origin and snapshot). (The raid1 version number jumps to 1.12 because we originally reserved 1.1 to 1.11 for 'block_on_error' but ended up using 'handle_errors' instead.) Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon Cc: martin.petersen@oracle.com --- drivers/md/dm-crypt.c | 11 ++++++++++- drivers/md/dm-delay.c | 20 +++++++++++++++++++- drivers/md/dm-linear.c | 11 ++++++++++- drivers/md/dm-mpath.c | 23 ++++++++++++++++++++++- drivers/md/dm-raid1.c | 17 ++++++++++++++++- drivers/md/dm-stripe.c | 18 +++++++++++++++++- include/linux/device-mapper.h | 11 +++++++++++ 7 files changed, 105 insertions(+), 6 deletions(-) (limited to 'drivers/md/dm-mpath.c') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 04db6c4004a..9933eb861c7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1313,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static int crypt_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct crypt_config *cc = ti->private; + + return fn(ti, cc->dev, cc->start, data); +} + static struct target_type crypt_target = { .name = "crypt", - .version= {1, 6, 0}, + .version = {1, 7, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, @@ -1326,6 +1334,7 @@ static struct target_type crypt_target = { .resume = crypt_resume, .message = crypt_message, .merge = crypt_merge, + .iterate_devices = crypt_iterate_devices, }; static int __init dm_crypt_init(void) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 8ad8a9044bb..4e5b843cd4d 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -318,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type, return 0; } +static int delay_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct delay_c *dc = ti->private; + int ret = 0; + + ret = fn(ti, dc->dev_read, dc->start_read, data); + if (ret) + goto out; + + if (dc->dev_write) + ret = fn(ti, dc->dev_write, dc->start_write, data); + +out: + return ret; +} + static struct target_type delay_target = { .name = "delay", - .version = {1, 0, 2}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, @@ -328,6 +345,7 @@ static struct target_type delay_target = { .presuspend = delay_presuspend, .resume = delay_resume, .status = delay_status, + .iterate_devices = delay_iterate_devices, }; static int __init dm_delay_init(void) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index ecbb17421da..9184b6deb86 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -134,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static int linear_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct linear_c *lc = ti->private; + + return fn(ti, lc->dev, lc->start, data); +} + static struct target_type linear_target = { .name = "linear", - .version= {1, 0, 3}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, @@ -144,6 +152,7 @@ static struct target_type linear_target = { .status = linear_status, .ioctl = linear_ioctl, .merge = linear_merge, + .iterate_devices = linear_iterate_devices, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 890c0e8ed13..f8aeaaa54af 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1450,12 +1450,32 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); } +static int multipath_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *p; + int ret = 0; + + list_for_each_entry(pg, &m->priority_groups, list) { + list_for_each_entry(p, &pg->pgpaths, list) { + ret = fn(ti, p->path.dev, ti->begin, data); + if (ret) + goto out; + } + } + +out: + return ret; +} + /*----------------------------------------------------------------- * Module setup *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 0, 5}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, @@ -1466,6 +1486,7 @@ static struct target_type multipath_target = { .status = multipath_status, .message = multipath_message, .ioctl = multipath_ioctl, + .iterate_devices = multipath_iterate_devices, }; static int __init dm_multipath_init(void) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 076fbb4e967..ce8868c768c 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type, return 0; } +static int mirror_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct mirror_set *ms = ti->private; + int ret = 0; + unsigned i; + + for (i = 0; !ret && i < ms->nr_mirrors; i++) + ret = fn(ti, ms->mirror[i].dev, + ms->mirror[i].offset, data); + + return ret; +} + static struct target_type mirror_target = { .name = "mirror", - .version = {1, 0, 20}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, @@ -1295,6 +1309,7 @@ static struct target_type mirror_target = { .postsuspend = mirror_postsuspend, .resume = mirror_resume, .status = mirror_status, + .iterate_devices = mirror_iterate_devices, }; static int __init dm_mirror_init(void) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index c64fe827a5f..b240e85ae39 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -313,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, return error; } +static int stripe_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct stripe_c *sc = ti->private; + int ret = 0; + unsigned i = 0; + + do + ret = fn(ti, sc->stripe[i].dev, + sc->stripe[i].physical_start, data); + while (!ret && ++i < sc->stripes); + + return ret; +} + static struct target_type stripe_target = { .name = "striped", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, .map = stripe_map, .end_io = stripe_end_io, .status = stripe_status, + .iterate_devices = stripe_iterate_devices, }; int __init dm_stripe_init(void) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 236880c1dc3..deac3b4e5e1 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -11,6 +11,7 @@ #include #include +struct dm_dev; struct dm_target; struct dm_table; struct mapped_device; @@ -81,6 +82,15 @@ typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd, typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm, struct bio_vec *biovec, int max_size); +typedef int (*iterate_devices_callout_fn) (struct dm_target *ti, + struct dm_dev *dev, + sector_t physical_start, + void *data); + +typedef int (*dm_iterate_devices_fn) (struct dm_target *ti, + iterate_devices_callout_fn fn, + void *data); + /* * Returns: * 0: The target can handle the next I/O immediately. @@ -139,6 +149,7 @@ struct target_type { dm_ioctl_fn ioctl; dm_merge_fn merge; dm_busy_fn busy; + dm_iterate_devices_fn iterate_devices; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3-70-g09d2 From f40c67f0f7e2767f80f7cbcbc1ab86c4113c202e Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Mon, 22 Jun 2009 10:12:37 +0100 Subject: dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Acked-by: Hannes Reinecke Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 193 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 128 insertions(+), 65 deletions(-) (limited to 'drivers/md/dm-mpath.c') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f8aeaaa54af..c70604a2089 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -8,7 +8,6 @@ #include #include "dm-path-selector.h" -#include "dm-bio-record.h" #include "dm-uevent.h" #include @@ -83,7 +82,7 @@ struct multipath { unsigned pg_init_count; /* Number of times pg_init called */ struct work_struct process_queued_ios; - struct bio_list queued_ios; + struct list_head queued_ios; unsigned queue_size; struct work_struct trigger_event; @@ -100,7 +99,6 @@ struct multipath { */ struct dm_mpath_io { struct pgpath *pgpath; - struct dm_bio_details details; size_t nr_bytes; }; @@ -194,6 +192,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) m = kzalloc(sizeof(*m), GFP_KERNEL); if (m) { INIT_LIST_HEAD(&m->priority_groups); + INIT_LIST_HEAD(&m->queued_ios); spin_lock_init(&m->lock); m->queue_io = 1; INIT_WORK(&m->process_queued_ios, process_queued_ios); @@ -318,13 +317,14 @@ static int __must_push_back(struct multipath *m) dm_noflush_suspending(m->ti)); } -static int map_io(struct multipath *m, struct bio *bio, +static int map_io(struct multipath *m, struct request *clone, struct dm_mpath_io *mpio, unsigned was_queued) { int r = DM_MAPIO_REMAPPED; - size_t nr_bytes = bio->bi_size; + size_t nr_bytes = blk_rq_bytes(clone); unsigned long flags; struct pgpath *pgpath; + struct block_device *bdev; spin_lock_irqsave(&m->lock, flags); @@ -341,16 +341,18 @@ static int map_io(struct multipath *m, struct bio *bio, if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) { /* Queue for the daemon to resubmit */ - bio_list_add(&m->queued_ios, bio); + list_add_tail(&clone->queuelist, &m->queued_ios); m->queue_size++; if ((m->pg_init_required && !m->pg_init_in_progress) || !m->queue_io) queue_work(kmultipathd, &m->process_queued_ios); pgpath = NULL; r = DM_MAPIO_SUBMITTED; - } else if (pgpath) - bio->bi_bdev = pgpath->path.dev->bdev; - else if (__must_push_back(m)) + } else if (pgpath) { + bdev = pgpath->path.dev->bdev; + clone->q = bdev_get_queue(bdev); + clone->rq_disk = bdev->bd_disk; + } else if (__must_push_back(m)) r = DM_MAPIO_REQUEUE; else r = -EIO; /* Failed */ @@ -398,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m) { int r; unsigned long flags; - struct bio *bio = NULL, *next; struct dm_mpath_io *mpio; union map_info *info; + struct request *clone, *n; + LIST_HEAD(cl); spin_lock_irqsave(&m->lock, flags); - bio = bio_list_get(&m->queued_ios); + list_splice_init(&m->queued_ios, &cl); spin_unlock_irqrestore(&m->lock, flags); - while (bio) { - next = bio->bi_next; - bio->bi_next = NULL; + list_for_each_entry_safe(clone, n, &cl, queuelist) { + list_del_init(&clone->queuelist); - info = dm_get_mapinfo(bio); + info = dm_get_rq_mapinfo(clone); mpio = info->ptr; - r = map_io(m, bio, mpio, 1); - if (r < 0) - bio_endio(bio, r); - else if (r == DM_MAPIO_REMAPPED) - generic_make_request(bio); - else if (r == DM_MAPIO_REQUEUE) - bio_endio(bio, -EIO); - - bio = next; + r = map_io(m, clone, mpio, 1); + if (r < 0) { + mempool_free(mpio, m->mpio_pool); + dm_kill_unmapped_request(clone, r); + } else if (r == DM_MAPIO_REMAPPED) + dm_dispatch_request(clone); + else if (r == DM_MAPIO_REQUEUE) { + mempool_free(mpio, m->mpio_pool); + dm_requeue_unmapped_request(clone); + } } } @@ -863,21 +866,24 @@ static void multipath_dtr(struct dm_target *ti) } /* - * Map bios, recording original fields for later in case we have to resubmit + * Map cloned requests */ -static int multipath_map(struct dm_target *ti, struct bio *bio, +static int multipath_map(struct dm_target *ti, struct request *clone, union map_info *map_context) { int r; struct dm_mpath_io *mpio; struct multipath *m = (struct multipath *) ti->private; - mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); - dm_bio_record(&mpio->details, bio); + mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); + if (!mpio) + /* ENOMEM, requeue */ + return DM_MAPIO_REQUEUE; + memset(mpio, 0, sizeof(*mpio)); map_context->ptr = mpio; - bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); - r = map_io(m, bio, mpio, 0); + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + r = map_io(m, clone, mpio, 0); if (r < 0 || r == DM_MAPIO_REQUEUE) mempool_free(mpio, m->mpio_pool); @@ -1158,53 +1164,41 @@ static void activate_path(struct work_struct *work) /* * end_io handling */ -static int do_end_io(struct multipath *m, struct bio *bio, +static int do_end_io(struct multipath *m, struct request *clone, int error, struct dm_mpath_io *mpio) { + /* + * We don't queue any clone request inside the multipath target + * during end I/O handling, since those clone requests don't have + * bio clones. If we queue them inside the multipath target, + * we need to make bio clones, that requires memory allocation. + * (See drivers/md/dm.c:end_clone_bio() about why the clone requests + * don't have bio clones.) + * Instead of queueing the clone request here, we queue the original + * request into dm core, which will remake a clone request and + * clone bios for it and resubmit it later. + */ + int r = DM_ENDIO_REQUEUE; unsigned long flags; - if (!error) + if (!error && !clone->errors) return 0; /* I/O complete */ - if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) - return error; - if (error == -EOPNOTSUPP) return error; - spin_lock_irqsave(&m->lock, flags); - if (!m->nr_valid_paths) { - if (__must_push_back(m)) { - spin_unlock_irqrestore(&m->lock, flags); - return DM_ENDIO_REQUEUE; - } else if (!m->queue_if_no_path) { - spin_unlock_irqrestore(&m->lock, flags); - return -EIO; - } else { - spin_unlock_irqrestore(&m->lock, flags); - goto requeue; - } - } - spin_unlock_irqrestore(&m->lock, flags); - if (mpio->pgpath) fail_path(mpio->pgpath); - requeue: - dm_bio_restore(&mpio->details, bio); - - /* queue for the daemon to resubmit or fail */ spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_ios, bio); - m->queue_size++; - if (!m->queue_io) - queue_work(kmultipathd, &m->process_queued_ios); + if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) + r = -EIO; spin_unlock_irqrestore(&m->lock, flags); - return DM_ENDIO_INCOMPLETE; /* io not complete */ + return r; } -static int multipath_end_io(struct dm_target *ti, struct bio *bio, +static int multipath_end_io(struct dm_target *ti, struct request *clone, int error, union map_info *map_context) { struct multipath *m = ti->private; @@ -1213,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, struct path_selector *ps; int r; - r = do_end_io(m, bio, error, mpio); + r = do_end_io(m, clone, error, mpio); if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } - if (r != DM_ENDIO_INCOMPLETE) - mempool_free(mpio, m->mpio_pool); + mempool_free(mpio, m->mpio_pool); return r; } @@ -1470,6 +1463,75 @@ out: return ret; } +static int __pgpath_busy(struct pgpath *pgpath) +{ + struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); + + return dm_underlying_device_busy(q); +} + +/* + * We return "busy", only when we can map I/Os but underlying devices + * are busy (so even if we map I/Os now, the I/Os will wait on + * the underlying queue). + * In other words, if we want to kill I/Os or queue them inside us + * due to map unavailability, we don't return "busy". Otherwise, + * dm core won't give us the I/Os and we can't do what we want. + */ +static int multipath_busy(struct dm_target *ti) +{ + int busy = 0, has_active = 0; + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *pgpath; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + /* Guess which priority_group will be used at next mapping time */ + if (unlikely(!m->current_pgpath && m->next_pg)) + pg = m->next_pg; + else if (likely(m->current_pg)) + pg = m->current_pg; + else + /* + * We don't know which pg will be used at next mapping time. + * We don't call __choose_pgpath() here to avoid to trigger + * pg_init just by busy checking. + * So we don't know whether underlying devices we will be using + * at next mapping time are busy or not. Just try mapping. + */ + goto out; + + /* + * If there is one non-busy active path at least, the path selector + * will be able to select it. So we consider such a pg as not busy. + */ + busy = 1; + list_for_each_entry(pgpath, &pg->pgpaths, list) + if (pgpath->is_active) { + has_active = 1; + + if (!__pgpath_busy(pgpath)) { + busy = 0; + break; + } + } + + if (!has_active) + /* + * No active path in this pg, so this pg won't be used and + * the current_pg will be changed at next mapping time. + * We need to try mapping to determine it. + */ + busy = 0; + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return busy; +} + /*----------------------------------------------------------------- * Module setup *---------------------------------------------------------------*/ @@ -1479,14 +1541,15 @@ static struct target_type multipath_target = { .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, - .map = multipath_map, - .end_io = multipath_end_io, + .map_rq = multipath_map, + .rq_end_io = multipath_end_io, .presuspend = multipath_presuspend, .resume = multipath_resume, .status = multipath_status, .message = multipath_message, .ioctl = multipath_ioctl, .iterate_devices = multipath_iterate_devices, + .busy = multipath_busy, }; static int __init dm_multipath_init(void) -- cgit v1.2.3-70-g09d2