diff options
Diffstat (limited to 'fs/ocfs2/cluster/heartbeat.c')
-rw-r--r-- | fs/ocfs2/cluster/heartbeat.c | 787 |
1 files changed, 716 insertions, 71 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 41d5f1f92d5..b108e863d8f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -62,10 +62,53 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); +/* + * In global heartbeat, we maintain a series of region bitmaps. + * - o2hb_region_bitmap allows us to limit the region number to max region. + * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). + * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes + * heartbeat on it. + * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts. + */ +static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; + +#define O2HB_DB_TYPE_LIVENODES 0 +#define O2HB_DB_TYPE_LIVEREGIONS 1 +#define O2HB_DB_TYPE_QUORUMREGIONS 2 +#define O2HB_DB_TYPE_FAILEDREGIONS 3 +#define O2HB_DB_TYPE_REGION_LIVENODES 4 +#define O2HB_DB_TYPE_REGION_NUMBER 5 +#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 +#define O2HB_DB_TYPE_REGION_PINNED 7 +struct o2hb_debug_buf { + int db_type; + int db_size; + int db_len; + void *db_data; +}; + +static struct o2hb_debug_buf *o2hb_db_livenodes; +static struct o2hb_debug_buf *o2hb_db_liveregions; +static struct o2hb_debug_buf *o2hb_db_quorumregions; +static struct o2hb_debug_buf *o2hb_db_failedregions; + #define O2HB_DEBUG_DIR "o2hb" #define O2HB_DEBUG_LIVENODES "livenodes" +#define O2HB_DEBUG_LIVEREGIONS "live_regions" +#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" +#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" +#define O2HB_DEBUG_REGION_NUMBER "num" +#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" +#define O2HB_DEBUG_REGION_PINNED "pinned" + static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; +static struct dentry *o2hb_debug_liveregions; +static struct dentry *o2hb_debug_quorumregions; +static struct dentry *o2hb_debug_failedregions; static LIST_HEAD(o2hb_all_regions); @@ -77,7 +120,46 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); #define O2HB_DEFAULT_BLOCK_BITS 9 +enum o2hb_heartbeat_modes { + O2HB_HEARTBEAT_LOCAL = 0, + O2HB_HEARTBEAT_GLOBAL, + O2HB_HEARTBEAT_NUM_MODES, +}; + +char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ +}; + unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; +unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; + +/* + * o2hb_dependent_users tracks the number of registered callbacks that depend + * on heartbeat. o2net and o2dlm are two entities that register this callback. + * However only o2dlm depends on the heartbeat. It does not want the heartbeat + * to stop while a dlm domain is still active. + */ +unsigned int o2hb_dependent_users; + +/* + * In global heartbeat mode, all regions are pinned if there are one or more + * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All + * regions are unpinned if the region count exceeds the cut off or the number + * of dependent users falls to zero. + */ +#define O2HB_PIN_CUT_OFF 3 + +/* + * In local heartbeat mode, we assume the dlm domain name to be the same as + * region uuid. This is true for domains created for the file system but not + * necessarily true for userdlm domains. This is a known limitation. + * + * In global heartbeat mode, we pin/unpin all o2hb regions. This solution + * works for both file system and userdlm domains. + */ +static int o2hb_region_pin(const char *region_uuid); +static void o2hb_region_unpin(const char *region_uuid); /* Only sets a new threshold if there are no active regions. * @@ -94,6 +176,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold) } } +static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) +{ + int ret = -1; + + if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) { + o2hb_heartbeat_mode = hb_mode; + ret = 0; + } + spin_unlock(&o2hb_live_lock); + } + + return ret; +} + struct o2hb_node_event { struct list_head hn_item; enum o2hb_callback_type hn_event_type; @@ -117,7 +215,9 @@ struct o2hb_region { struct config_item hr_item; struct list_head hr_all_item; - unsigned hr_unclean_stop:1; + unsigned hr_unclean_stop:1, + hr_item_pinned:1, + hr_item_dropped:1; /* protected by the hr_callback_sem */ struct task_struct *hr_task; @@ -135,6 +235,20 @@ struct o2hb_region { struct block_device *hr_bdev; struct o2hb_disk_slot *hr_slots; + /* live node map of this region */ + unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned int hr_region_num; + + struct dentry *hr_debug_dir; + struct dentry *hr_debug_livenodes; + struct dentry *hr_debug_regnum; + struct dentry *hr_debug_elapsed_time; + struct dentry *hr_debug_pinned; + struct o2hb_debug_buf *hr_db_livenodes; + struct o2hb_debug_buf *hr_db_regnum; + struct o2hb_debug_buf *hr_db_elapsed_time; + struct o2hb_debug_buf *hr_db_pinned; + /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have * a more complete api that doesn't lead to this sort of fragility. */ @@ -163,8 +277,19 @@ struct o2hb_bio_wait_ctxt { int wc_error; }; +static int o2hb_pop_count(void *map, int count) +{ + int i = -1, pop = 0; + + while ((i = find_next_bit(map, count, i + 1)) < count) + pop++; + return pop; +} + static void o2hb_write_timeout(struct work_struct *work) { + int failed, quorum; + unsigned long flags; struct o2hb_region *reg = container_of(work, struct o2hb_region, hr_write_timeout_work.work); @@ -172,6 +297,28 @@ static void o2hb_write_timeout(struct work_struct *work) mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " "milliseconds\n", reg->hr_dev_name, jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); + + if (o2hb_global_heartbeat_active()) { + spin_lock_irqsave(&o2hb_live_lock, flags); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + failed = o2hb_pop_count(&o2hb_failed_region_bitmap, + O2NM_MAX_REGIONS); + quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS); + spin_unlock_irqrestore(&o2hb_live_lock, flags); + + mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", + quorum, failed); + + /* + * Fence if the number of failed regions >= half the number + * of quorum regions + */ + if ((failed << 1) < quorum) + return; + } + o2quo_disk_timeout(); } @@ -180,6 +327,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + spin_unlock(&o2hb_live_lock); + } cancel_delayed_work(®->hr_write_timeout_work); reg->hr_last_timeout_start = jiffies; schedule_delayed_work(®->hr_write_timeout_work, @@ -188,8 +340,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) static void o2hb_disarm_write_timeout(struct o2hb_region *reg) { - cancel_delayed_work(®->hr_write_timeout_work); - flush_scheduled_work(); + cancel_delayed_work_sync(®->hr_write_timeout_work); } static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) @@ -513,6 +664,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event, { assert_spin_locked(&o2hb_live_lock); + BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB)); + event->hn_event_type = type; event->hn_node = node; event->hn_node_num = node_num; @@ -554,6 +707,43 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2nm_node_put(node); } +static void o2hb_set_quorum_device(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + assert_spin_locked(&o2hb_live_lock); + + if (!o2hb_global_heartbeat_active()) + return; + + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + return; + + /* + * A region can be added to the quorum only when it sees all + * live nodes heartbeat on it. In other words, the region has been + * added to all nodes. + */ + if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + sizeof(o2hb_live_node_bitmap))) + return; + + if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) + return; + + printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", + config_item_name(®->hr_item)); + + set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + + /* + * If global heartbeat active, unpin all regions if the + * region count > CUT_OFF + */ + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) + o2hb_region_unpin(NULL); +} + static int o2hb_check_slot(struct o2hb_region *reg, struct o2hb_disk_slot *slot) { @@ -565,14 +755,22 @@ static int o2hb_check_slot(struct o2hb_region *reg, u64 cputime; unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; unsigned int slot_dead_ms; + int tmp; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); - /* Is this correct? Do we assume that the node doesn't exist - * if we're not configured for him? */ + /* + * If a node is no longer configured but is still in the livemap, we + * may need to clear that bit from the livemap. + */ node = o2nm_get_node_by_num(slot->ds_node_num); - if (!node) - return 0; + if (!node) { + spin_lock(&o2hb_live_lock); + tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap); + spin_unlock(&o2hb_live_lock); + if (!tmp) + return 0; + } if (!o2hb_verify_crc(reg, hb_block)) { /* all paths from here will drop o2hb_live_lock for @@ -639,8 +837,12 @@ fire_callbacks: mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", slot->ds_node_num, (long long)slot->ds_last_generation); + set_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + /* first on the list generates a callback */ if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes " + "bitmap\n", slot->ds_node_num); set_bit(slot->ds_node_num, o2hb_live_node_bitmap); o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, @@ -684,13 +886,18 @@ fire_callbacks: mlog(ML_HEARTBEAT, "Node %d left my region\n", slot->ds_node_num); + clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + /* last off the live_slot generates a callback */ list_del_init(&slot->ds_live_item); if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live " + "nodes bitmap\n", slot->ds_node_num); clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); - o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, - slot->ds_node_num); + /* node can be null */ + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, + node, slot->ds_node_num); changed = 1; } @@ -706,11 +913,14 @@ fire_callbacks: slot->ds_equal_samples = 0; } out: + o2hb_set_quorum_device(reg, slot); + spin_unlock(&o2hb_live_lock); o2hb_run_event_list(&event); - o2nm_node_put(node); + if (node) + o2nm_node_put(node); return changed; } @@ -737,6 +947,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct o2hb_bio_wait_ctxt write_wc; ret = o2nm_configured_node_map(configured_nodes, @@ -746,6 +957,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) return ret; } + /* + * If a node is not configured but is in the livemap, we still need + * to read the slot so as to be able to remove it from the livemap. + */ + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + set_bit(i, configured_nodes); + } + highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); @@ -860,6 +1082,9 @@ static int o2hb_thread(void *data) set_user_nice(current, -20); + /* Pin node */ + o2nm_depend_this_node(); + while (!kthread_should_stop() && !reg->hr_unclean_stop) { /* We track the time spent inside * o2hb_do_disk_heartbeat so that we avoid more than @@ -909,6 +1134,9 @@ static int o2hb_thread(void *data) mlog_errno(ret); } + /* Unpin node */ + o2nm_undepend_this_node(); + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); return 0; @@ -917,21 +1145,65 @@ static int o2hb_thread(void *data) #ifdef CONFIG_DEBUG_FS static int o2hb_debug_open(struct inode *inode, struct file *file) { + struct o2hb_debug_buf *db = inode->i_private; + struct o2hb_region *reg; unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; char *buf = NULL; int i = -1; int out = 0; + /* max_nodes should be the largest bitmap we pass here */ + BUG_ON(sizeof(map) < db->db_size); + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) goto bail; - o2hb_fill_node_map(map, sizeof(map)); + switch (db->db_type) { + case O2HB_DB_TYPE_LIVENODES: + case O2HB_DB_TYPE_LIVEREGIONS: + case O2HB_DB_TYPE_QUORUMREGIONS: + case O2HB_DB_TYPE_FAILEDREGIONS: + spin_lock(&o2hb_live_lock); + memcpy(map, db->db_data, db->db_size); + spin_unlock(&o2hb_live_lock); + break; - while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) + case O2HB_DB_TYPE_REGION_LIVENODES: + spin_lock(&o2hb_live_lock); + reg = (struct o2hb_region *)db->db_data; + memcpy(map, reg->hr_live_node_bitmap, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_NUMBER: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + reg->hr_region_num); + goto done; + + case O2HB_DB_TYPE_REGION_ELAPSED_TIME: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + jiffies_to_msecs(jiffies - + reg->hr_last_timeout_start)); + goto done; + + case O2HB_DB_TYPE_REGION_PINNED: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + !!reg->hr_item_pinned); + goto done; + + default: + goto done; + } + + while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); out += snprintf(buf + out, PAGE_SIZE - out, "\n"); +done: i_size_write(inode, out); file->private_data = buf; @@ -978,10 +1250,104 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { - if (o2hb_debug_livenodes) - debugfs_remove(o2hb_debug_livenodes); - if (o2hb_debug_dir) - debugfs_remove(o2hb_debug_dir); + kfree(o2hb_db_livenodes); + kfree(o2hb_db_liveregions); + kfree(o2hb_db_quorumregions); + kfree(o2hb_db_failedregions); + debugfs_remove(o2hb_debug_failedregions); + debugfs_remove(o2hb_debug_quorumregions); + debugfs_remove(o2hb_debug_liveregions); + debugfs_remove(o2hb_debug_livenodes); + debugfs_remove(o2hb_debug_dir); +} + +static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, + int type, int size, int len, void *data) +{ + *db = kmalloc(db_len, GFP_KERNEL); + if (!*db) + return NULL; + + (*db)->db_type = type; + (*db)->db_size = size; + (*db)->db_len = len; + (*db)->db_data = data; + + return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, + &o2hb_debug_fops); +} + +static int o2hb_debug_init(void) +{ + int ret = -ENOMEM; + + o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); + if (!o2hb_debug_dir) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, + o2hb_debug_dir, + &o2hb_db_livenodes, + sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, + sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, + o2hb_live_node_bitmap); + if (!o2hb_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, + o2hb_debug_dir, + &o2hb_db_liveregions, + sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); + if (!o2hb_debug_liveregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_quorumregions = + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, + o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); + if (!o2hb_debug_quorumregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_failedregions = + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, + o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); + if (!o2hb_debug_failedregions) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + if (ret) + o2hb_exit(); + + return ret; } int o2hb_init(void) @@ -997,24 +1363,14 @@ int o2hb_init(void) INIT_LIST_HEAD(&o2hb_node_events); memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); + memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); + memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); + memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); + memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); - o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (!o2hb_debug_dir) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } + o2hb_dependent_users = 0; - o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES, - S_IFREG|S_IRUSR, - o2hb_debug_dir, NULL, - &o2hb_debug_fops); - if (!o2hb_debug_livenodes) { - mlog_errno(-ENOMEM); - debugfs_remove(o2hb_debug_dir); - return -ENOMEM; - } - - return 0; + return o2hb_debug_init(); } /* if we're already in a callback then we're already serialized by the sem */ @@ -1078,6 +1434,14 @@ static void o2hb_region_release(struct config_item *item) if (reg->hr_slots) kfree(reg->hr_slots); + kfree(reg->hr_db_regnum); + kfree(reg->hr_db_livenodes); + debugfs_remove(reg->hr_debug_livenodes); + debugfs_remove(reg->hr_debug_regnum); + debugfs_remove(reg->hr_debug_elapsed_time); + debugfs_remove(reg->hr_debug_pinned); + debugfs_remove(reg->hr_debug_dir); + spin_lock(&o2hb_live_lock); list_del(®->hr_all_item); spin_unlock(&o2hb_live_lock); @@ -1365,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out; reg->hr_bdev = I_BDEV(filp->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); if (ret) { reg->hr_bdev = NULL; goto out; @@ -1441,6 +1805,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, /* Ok, we were woken. Make sure it wasn't by drop_item() */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; + if (o2hb_global_heartbeat_active()) + set_bit(reg->hr_region_num, o2hb_live_region_bitmap); spin_unlock(&o2hb_live_lock); if (hb_task) @@ -1448,6 +1814,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, else ret = -EIO; + if (hb_task && o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", + config_item_name(®->hr_item)); + out: if (filp) fput(filp); @@ -1586,22 +1956,113 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } +static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +{ + int ret = -ENOMEM; + + reg->hr_debug_dir = + debugfs_create_dir(config_item_name(®->hr_item), dir); + if (!reg->hr_debug_dir) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_livenodes = + o2hb_debug_create(O2HB_DEBUG_LIVENODES, + reg->hr_debug_dir, + &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), + O2NM_MAX_NODES, reg); + if (!reg->hr_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_regnum = + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, + reg->hr_debug_dir, + &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, + 0, O2NM_MAX_NODES, reg); + if (!reg->hr_debug_regnum) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_elapsed_time = + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, + reg->hr_debug_dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, + 0, 0, reg); + if (!reg->hr_debug_elapsed_time) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_pinned = + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, + reg->hr_debug_dir, + &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, + 0, 0, reg); + if (!reg->hr_debug_pinned) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + return ret; +} + static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, const char *name) { struct o2hb_region *reg = NULL; + int ret; reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); if (reg == NULL) return ERR_PTR(-ENOMEM); - config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) { + ret = -ENAMETOOLONG; + goto free; + } spin_lock(&o2hb_live_lock); + reg->hr_region_num = 0; + if (o2hb_global_heartbeat_active()) { + reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap, + O2NM_MAX_REGIONS); + if (reg->hr_region_num >= O2NM_MAX_REGIONS) { + spin_unlock(&o2hb_live_lock); + ret = -EFBIG; + goto free; + } + set_bit(reg->hr_region_num, o2hb_region_bitmap); + } list_add_tail(®->hr_all_item, &o2hb_all_regions); spin_unlock(&o2hb_live_lock); + config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + + ret = o2hb_debug_region_init(reg, o2hb_debug_dir); + if (ret) { + config_item_put(®->hr_item); + goto free; + } + return ®->hr_item; +free: + kfree(reg); + return ERR_PTR(ret); } static void o2hb_heartbeat_group_drop_item(struct config_group *group, @@ -1609,11 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, { struct task_struct *hb_task; struct o2hb_region *reg = to_o2hb_region(item); + int quorum_region = 0; /* stop the thread when the user removes the region dir */ spin_lock(&o2hb_live_lock); + if (o2hb_global_heartbeat_active()) { + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + quorum_region = 1; + clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + } hb_task = reg->hr_task; reg->hr_task = NULL; + reg->hr_item_dropped = 1; spin_unlock(&o2hb_live_lock); if (hb_task) @@ -1628,7 +2098,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, wake_up(&o2hb_steady_queue); } + if (o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", + config_item_name(®->hr_item)); + config_item_put(item); + + if (!o2hb_global_heartbeat_active() || !quorum_region) + return; + + /* + * If global heartbeat active and there are dependent users, + * pin all regions if quorum region count <= CUT_OFF + */ + spin_lock(&o2hb_live_lock); + + if (!o2hb_dependent_users) + goto unlock; + + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); } struct o2hb_heartbeat_group_attribute { @@ -1688,6 +2181,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group return count; } +static +ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%s\n", + o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); +} + +static +ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, + const char *page, size_t count) +{ + unsigned int i; + int ret; + size_t len; + + len = (page[count - 1] == '\n') ? count - 1 : count; + if (!len) + return -EINVAL; + + for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { + if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) + continue; + + ret = o2hb_global_hearbeat_mode_set(i); + if (!ret) + printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", + o2hb_heartbeat_mode_desc[i]); + return count; + } + + return -EINVAL; + +} + static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { .attr = { .ca_owner = THIS_MODULE, .ca_name = "dead_threshold", @@ -1696,8 +2224,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold .store = o2hb_heartbeat_group_threshold_store, }; +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "mode", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_mode_show, + .store = o2hb_heartbeat_group_mode_store, +}; + static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { &o2hb_heartbeat_group_attr_threshold.attr, + &o2hb_heartbeat_group_attr_mode.attr, NULL, }; @@ -1770,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc, } EXPORT_SYMBOL_GPL(o2hb_setup_callback); -static struct o2hb_region *o2hb_find_region(const char *region_uuid) +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only pin the matching region. In global we pin all the active + * regions. + */ +static int o2hb_region_pin(const char *region_uuid) { - struct o2hb_region *p, *reg = NULL; + int ret = 0, found = 0; + struct o2hb_region *reg; + char *uuid; assert_spin_locked(&o2hb_live_lock); - list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { - if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { - reg = p; - break; + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + uuid = config_item_name(®->hr_item); + + /* local heartbeat */ + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } + + if (reg->hr_item_pinned || reg->hr_item_dropped) + goto skip_pin; + + /* Ignore ENOENT only for local hb (userdlm domain) */ + ret = o2nm_depend_item(®->hr_item); + if (!ret) { + mlog(ML_CLUSTER, "Pin region %s\n", uuid); + reg->hr_item_pinned = 1; + } else { + if (ret == -ENOENT && found) + ret = 0; + else { + mlog(ML_ERROR, "Pin region %s fails with %d\n", + uuid, ret); + break; + } } +skip_pin: + if (found) + break; } - return reg; + return ret; } -static int o2hb_region_get(const char *region_uuid) +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only unpin the matching region. In global we unpin all the + * active regions. + */ +static void o2hb_region_unpin(const char *region_uuid) { - int ret = 0; struct o2hb_region *reg; + char *uuid; + int found = 0; - spin_lock(&o2hb_live_lock); + assert_spin_locked(&o2hb_live_lock); - reg = o2hb_find_region(region_uuid); - if (!reg) - ret = -ENOENT; - spin_unlock(&o2hb_live_lock); + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + uuid = config_item_name(®->hr_item); + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } - if (ret) - goto out; + if (reg->hr_item_pinned) { + mlog(ML_CLUSTER, "Unpin region %s\n", uuid); + o2nm_undepend_item(®->hr_item); + reg->hr_item_pinned = 0; + } + if (found) + break; + } +} - ret = o2nm_depend_this_node(); - if (ret) - goto out; +static int o2hb_region_inc_user(const char *region_uuid) +{ + int ret = 0; - ret = o2nm_depend_item(®->hr_item); - if (ret) - o2nm_undepend_this_node(); + spin_lock(&o2hb_live_lock); -out: + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + ret = o2hb_region_pin(region_uuid); + goto unlock; + } + + /* + * if global heartbeat active and this is the first dependent user, + * pin all regions if quorum region count <= CUT_OFF + */ + o2hb_dependent_users++; + if (o2hb_dependent_users > 1) + goto unlock; + + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + ret = o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); return ret; } -static void o2hb_region_put(const char *region_uuid) +void o2hb_region_dec_user(const char *region_uuid) { - struct o2hb_region *reg; - spin_lock(&o2hb_live_lock); - reg = o2hb_find_region(region_uuid); + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + o2hb_region_unpin(region_uuid); + goto unlock; + } - spin_unlock(&o2hb_live_lock); + /* + * if global heartbeat active and there are no dependent users, + * unpin all quorum regions + */ + o2hb_dependent_users--; + if (!o2hb_dependent_users) + o2hb_region_unpin(NULL); - if (reg) { - o2nm_undepend_item(®->hr_item); - o2nm_undepend_this_node(); - } +unlock: + spin_unlock(&o2hb_live_lock); } int o2hb_register_callback(const char *region_uuid, @@ -1847,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid, } if (region_uuid) { - ret = o2hb_region_get(region_uuid); - if (ret) + ret = o2hb_region_inc_user(region_uuid); + if (ret) { + mlog_errno(ret); goto out; + } } down_write(&o2hb_callback_sem); @@ -1867,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid, up_write(&o2hb_callback_sem); ret = 0; out: - mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", + mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n", ret, __builtin_return_address(0), hc); return ret; } @@ -1878,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid, { BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); - mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", + mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n", __builtin_return_address(0), hc); /* XXX Can this happen _with_ a region reference? */ @@ -1886,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid, return; if (region_uuid) - o2hb_region_put(region_uuid); + o2hb_region_dec_user(region_uuid); down_write(&o2hb_callback_sem); @@ -1963,3 +2577,34 @@ void o2hb_stop_all_regions(void) spin_unlock(&o2hb_live_lock); } EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); + +int o2hb_get_all_regions(char *region_uuids, u8 max_regions) +{ + struct o2hb_region *reg; + int numregs = 0; + char *p; + + spin_lock(&o2hb_live_lock); + + p = region_uuids; + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + mlog(0, "Region: %s\n", config_item_name(®->hr_item)); + if (numregs < max_regions) { + memcpy(p, config_item_name(®->hr_item), + O2HB_MAX_REGION_NAME_LEN); + p += O2HB_MAX_REGION_NAME_LEN; + } + numregs++; + } + + spin_unlock(&o2hb_live_lock); + + return numregs; +} +EXPORT_SYMBOL_GPL(o2hb_get_all_regions); + +int o2hb_global_heartbeat_active(void) +{ + return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL); +} +EXPORT_SYMBOL(o2hb_global_heartbeat_active); |