Diffstat (limited to 'fs/ocfs2')
 37 files changed, 1949 insertions(+), 679 deletions(-)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index edc91ca3792..f27e5378caf 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1959,7 +1959,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	*tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
+	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
 	if (!(*tc)) {
 		status = -ENOMEM;
 		mlog_errno(status);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 2f7268e8152..93628b02ef5 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -540,8 +540,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 				     struct buffer_head *bh_result, int create)
 {
 	int ret;
-	u64 vbo_max; /* file offset, max_blocks from iblock */
-	u64 p_blkno;
+	u64 p_blkno, inode_blocks;
 	int contig_blocks;
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
@@ -550,12 +549,23 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	 * nicely aligned and of the right size, so there's no need
 	 * for us to check any of that. */
-	vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
-
 	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if ((iblock + max_blocks) >
-	    ocfs2_clusters_to_blocks(inode->i_sb,
-				     OCFS2_I(inode)->ip_clusters)) {
+	inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
+						OCFS2_I(inode)->ip_clusters);
+
+	/*
+	 * For a read which begins past the end of file, we return a hole.
+	 */
+	if (!create && (iblock >= inode_blocks)) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		ret = 0;
+		goto bail;
+	}
+
+	/*
+	 * Any write past EOF is not allowed because we'd be extending.
+	 */
+	if (create && (iblock + max_blocks) > inode_blocks) {
 		spin_unlock(&OCFS2_I(inode)->ip_lock);
 		ret = -EIO;
 		goto bail;
@@ -595,7 +605,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     ssize_t bytes,
 			     void *private)
 {
-	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 
 	/* this io's submitter should not have unlocked this before we could */
 	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -611,7 +621,7 @@ static ssize_t ocfs2_direct_IO(int rw,
 			       unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
 	int ret;
 
 	mlog_entry_void();
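
The kcalloc(1, ...) to kzalloc(...) conversions above recur throughout this diff and are behavior-preserving: both return zeroed memory, kzalloc just drops the pointless n=1 multiplication. A minimal sketch of the equivalence, assuming a hypothetical struct foo:

	#include <linux/slab.h>

	struct foo *p;

	/* before: one zeroed element, via the array allocator */
	p = kcalloc(1, sizeof(*p), GFP_KERNEL);

	/* after: identical semantics, clearer intent */
	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;
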
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4cd9a958045..5a9779bb923 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -184,10 +184,9 @@ static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
 	flush_scheduled_work();
 }
 
-static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
-				      unsigned int num_ios)
+static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
 {
-	atomic_set(&wc->wc_num_reqs, num_ios);
+	atomic_set(&wc->wc_num_reqs, 1);
 	init_completion(&wc->wc_io_complete);
 	wc->wc_error = 0;
 }
@@ -212,6 +211,7 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
 	struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
 
 	blk_run_address_space(mapping);
+	o2hb_bio_wait_dec(wc, 1);
 
 	wait_for_completion(&wc->wc_io_complete);
 }
@@ -231,6 +231,7 @@ static int o2hb_bio_end_io(struct bio *bio,
 		return 1;
 
 	o2hb_bio_wait_dec(wc, 1);
+	bio_put(bio);
 	return 0;
 }
 
@@ -238,23 +239,22 @@ static int o2hb_bio_end_io(struct bio *bio,
  * start_slot.
  */
 static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 				      struct o2hb_bio_wait_ctxt *wc,
-				      unsigned int start_slot,
-				      unsigned int num_slots)
+				      unsigned int *current_slot,
+				      unsigned int max_slots)
 {
-	int i, nr_vecs, len, first_page, last_page;
+	int len, current_page;
 	unsigned int vec_len, vec_start;
 	unsigned int bits = reg->hr_block_bits;
 	unsigned int spp = reg->hr_slots_per_page;
+	unsigned int cs = *current_slot;
 	struct bio *bio;
 	struct page *page;
 
-	nr_vecs = (num_slots + spp - 1) / spp;
-
 	/* Testing has shown this allocation to take long enough under
 	 * GFP_KERNEL that the local node can get fenced. It would be
 	 * nicest if we could pre-allocate these bios and avoid this
 	 * all together. */
-	bio = bio_alloc(GFP_ATOMIC, nr_vecs);
+	bio = bio_alloc(GFP_ATOMIC, 16);
 	if (!bio) {
 		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
 		bio = ERR_PTR(-ENOMEM);
@@ -262,137 +262,53 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 	}
 
 	/* Must put everything in 512 byte sectors for the bio... */
-	bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
+	bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
 	bio->bi_bdev = reg->hr_bdev;
 	bio->bi_private = wc;
 	bio->bi_end_io = o2hb_bio_end_io;
 
-	first_page = start_slot / spp;
-	last_page = first_page + nr_vecs;
-	vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
-	for(i = first_page; i < last_page; i++) {
-		page = reg->hr_slot_data[i];
+	vec_start = (cs << bits) % PAGE_CACHE_SIZE;
+	while(cs < max_slots) {
+		current_page = cs / spp;
+		page = reg->hr_slot_data[current_page];
 
-		vec_len = PAGE_CACHE_SIZE;
-		/* last page might be short */
-		if (((i + 1) * spp) > (start_slot + num_slots))
-			vec_len = ((num_slots + start_slot) % spp) << bits;
-		vec_len -= vec_start;
+		vec_len = min(PAGE_CACHE_SIZE,
+			      (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
 
 		mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
-		     i, vec_len, vec_start);
+		     current_page, vec_len, vec_start);
 
 		len = bio_add_page(bio, page, vec_len, vec_start);
-		if (len != vec_len) {
-			bio_put(bio);
-			bio = ERR_PTR(-EIO);
-
-			mlog(ML_ERROR, "Error adding page to bio i = %d, "
-			     "vec_len = %u, len = %d\n, start = %u\n",
-			     i, vec_len, len, vec_start);
-			goto bail;
-		}
+		if (len != vec_len) break;
 
+		cs += vec_len / (PAGE_CACHE_SIZE/spp);
 		vec_start = 0;
 	}
 
bail:
+	*current_slot = cs;
 	return bio;
 }
 
-/*
- * Compute the maximum number of sectors the bdev can handle in one bio,
- * as a power of two.
- *
- * Stolen from oracleasm, thanks Joel!
- */
-static int compute_max_sectors(struct block_device *bdev)
-{
-	int max_pages, max_sectors, pow_two_sectors;
-
-	struct request_queue *q;
-
-	q = bdev_get_queue(bdev);
-	max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
-	if (max_pages > BIO_MAX_PAGES)
-		max_pages = BIO_MAX_PAGES;
-	if (max_pages > q->max_phys_segments)
-		max_pages = q->max_phys_segments;
-	if (max_pages > q->max_hw_segments)
-		max_pages = q->max_hw_segments;
-	max_pages--; /* Handle I/Os that straddle a page */
-
-	if (max_pages) {
-		max_sectors = max_pages << (PAGE_SHIFT - 9);
-	} else {
-		/* If BIO contains 1 or less than 1 page. */
-		max_sectors = q->max_sectors;
-	}
-	/* Why is fls() 1-based???? */
-	pow_two_sectors = 1 << (fls(max_sectors) - 1);
-
-	return pow_two_sectors;
-}
-
-static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
-					       unsigned int num_slots,
-					       unsigned int *num_bios,
-					       unsigned int *slots_per_bio)
-{
-	unsigned int max_sectors, io_sectors;
-
-	max_sectors = compute_max_sectors(reg->hr_bdev);
-
-	io_sectors = num_slots << (reg->hr_block_bits - 9);
-
-	*num_bios = (io_sectors + max_sectors - 1) / max_sectors;
-	*slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
-
-	mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
-	     "device can handle %u sectors of I/O\n", io_sectors, num_slots,
-	     max_sectors);
-	mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
-	     *num_bios, *slots_per_bio);
-}
-
 static int o2hb_read_slots(struct o2hb_region *reg,
 			   unsigned int max_slots)
 {
-	unsigned int num_bios, slots_per_bio, start_slot, num_slots;
-	int i, status;
+	unsigned int current_slot=0;
+	int status;
 	struct o2hb_bio_wait_ctxt wc;
-	struct bio **bios;
 	struct bio *bio;
 
-	o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
-
-	bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
-	if (!bios) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		return status;
-	}
+	o2hb_bio_wait_init(&wc);
 
-	o2hb_bio_wait_init(&wc, num_bios);
-
-	num_slots = slots_per_bio;
-	for(i = 0; i < num_bios; i++) {
-		start_slot = i * slots_per_bio;
-
-		/* adjust num_slots at last bio */
-		if (max_slots < (start_slot + num_slots))
-			num_slots = max_slots - start_slot;
-
-		bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
+	while(current_slot < max_slots) {
+		bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
 		if (IS_ERR(bio)) {
-			o2hb_bio_wait_dec(&wc, num_bios - i);
-
 			status = PTR_ERR(bio);
 			mlog_errno(status);
 			goto bail_and_wait;
 		}
-		bios[i] = bio;
 
+		atomic_inc(&wc.wc_num_reqs);
 		submit_bio(READ, bio);
 	}
 
@@ -403,38 +319,30 @@ bail_and_wait:
 	if (wc.wc_error && !status)
 		status = wc.wc_error;
 
-	if (bios) {
-		for(i = 0; i < num_bios; i++)
-			if (bios[i])
-				bio_put(bios[i]);
-		kfree(bios);
-	}
-
 	return status;
 }
 
 static int o2hb_issue_node_write(struct o2hb_region *reg,
-				 struct bio **write_bio,
 				 struct o2hb_bio_wait_ctxt *write_wc)
 {
 	int status;
 	unsigned int slot;
 	struct bio *bio;
 
-	o2hb_bio_wait_init(write_wc, 1);
+	o2hb_bio_wait_init(write_wc);
 
 	slot = o2nm_this_node();
 
-	bio = o2hb_setup_one_bio(reg, write_wc, slot, 1);
+	bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
 	if (IS_ERR(bio)) {
 		status = PTR_ERR(bio);
 		mlog_errno(status);
 		goto bail;
 	}
 
+	atomic_inc(&write_wc->wc_num_reqs);
 	submit_bio(WRITE, bio);
 
-	*write_bio = bio;
 	status = 0;
bail:
 	return status;
@@ -826,7 +734,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 {
 	int i, ret, highest_node, change = 0;
 	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
-	struct bio *write_bio;
 	struct o2hb_bio_wait_ctxt write_wc;
 
 	ret = o2nm_configured_node_map(configured_nodes,
@@ -864,7 +771,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 
 	/* And fire off the write. Note that we don't wait on this I/O
 	 * until later. */
-	ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+	ret = o2hb_issue_node_write(reg, &write_wc);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -882,7 +789,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 	 * people we find in our steady state have seen us.
 	 */
 	o2hb_wait_on_io(reg, &write_wc);
-	bio_put(write_bio);
 	if (write_wc.wc_error) {
 		/* Do not re-arm the write timeout on I/O error - we
 		 * can't be sure that the new block ever made it to
@@ -943,7 +849,6 @@ static int o2hb_thread(void *data)
 {
 	int i, ret;
 	struct o2hb_region *reg = data;
-	struct bio *write_bio;
 	struct o2hb_bio_wait_ctxt write_wc;
 	struct timeval before_hb, after_hb;
 	unsigned int elapsed_msec;
@@ -993,10 +898,9 @@ static int o2hb_thread(void *data)
 	 *
 	 * XXX: Should we skip this on unclean_stop? */
 	o2hb_prepare_block(reg, 0);
-	ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+	ret = o2hb_issue_node_write(reg, &write_wc);
 	if (ret == 0) {
 		o2hb_wait_on_io(reg, &write_wc);
-		bio_put(write_bio);
 	} else {
 		mlog_errno(ret);
 	}
@@ -1447,6 +1351,15 @@ out:
 	return ret;
 }
 
+static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
+				    char *page)
+{
+	if (!reg->hr_task)
+		return 0;
+
+	return sprintf(page, "%u\n", reg->hr_task->pid);
+}
+
 struct o2hb_region_attribute {
 	struct configfs_attribute attr;
 	ssize_t (*show)(struct o2hb_region *, char *);
@@ -1485,11 +1398,19 @@ static struct o2hb_region_attribute o2hb_region_attr_dev = {
 	.store	= o2hb_region_dev_write,
 };
 
+static struct o2hb_region_attribute o2hb_region_attr_pid = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "pid",
+		    .ca_mode = S_IRUGO | S_IRUSR },
+	.show	= o2hb_region_pid_read,
+};
+
 static struct configfs_attribute *o2hb_region_attrs[] = {
 	&o2hb_region_attr_block_bytes.attr,
 	&o2hb_region_attr_start_block.attr,
 	&o2hb_region_attr_blocks.attr,
 	&o2hb_region_attr_dev.attr,
+	&o2hb_region_attr_pid.attr,
 	NULL,
 };
 
@@ -1553,7 +1474,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
 	struct o2hb_region *reg = NULL;
 	struct config_item *ret = NULL;
 
-	reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL);
+	reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
 	if (reg == NULL)
 		goto out; /* ENOMEM */
 
@@ -1679,7 +1600,7 @@ struct config_group *o2hb_alloc_hb_set(void)
 	struct o2hb_heartbeat_group *hs = NULL;
 	struct config_group *ret = NULL;
 
-	hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
+	hs = kzalloc(sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
 	if (hs == NULL)
 		goto out;
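
The heartbeat rework above replaces per-bio array bookkeeping with a biased counter: wc_num_reqs starts at 1, each submitted bio adds 1, and o2hb_wait_on_io drops the initial bias before sleeping, so the completion cannot fire until every real I/O has ended. A standalone sketch of that pattern (names are hypothetical, not the o2hb API):

	#include <linux/completion.h>
	#include <linux/atomic.h>

	struct io_wait_ctxt {
		atomic_t num_reqs;
		struct completion done;
	};

	static void io_wait_init(struct io_wait_ctxt *wc)
	{
		atomic_set(&wc->num_reqs, 1);	/* bias holds off completion */
		init_completion(&wc->done);
	}

	static void io_req_done(struct io_wait_ctxt *wc)
	{
		if (atomic_dec_and_test(&wc->num_reqs))
			complete(&wc->done);
	}

	/* submit path does atomic_inc(&wc->num_reqs) per request, then: */
	static void io_wait(struct io_wait_ctxt *wc)
	{
		io_req_done(wc);		/* drop the initial bias */
		wait_for_completion(&wc->done);
	}
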
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index d11753c50bc..b17333a0606 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -35,7 +35,7 @@
 /* for now we operate under the assertion that there can be only one
  * cluster active at a time. Changing this will require trickling
  * cluster references throughout where nodes are looked up */
-static struct o2nm_cluster *o2nm_single_cluster = NULL;
+struct o2nm_cluster *o2nm_single_cluster = NULL;
 
 #define OCFS2_MAX_HB_CTL_PATH 256
 static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
@@ -97,17 +97,6 @@ const char *o2nm_get_hb_ctl_path(void)
 }
 EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
 
-struct o2nm_cluster {
-	struct config_group	cl_group;
-	unsigned		cl_has_local:1;
-	u8			cl_local_node;
-	rwlock_t		cl_nodes_lock;
-	struct o2nm_node	*cl_nodes[O2NM_MAX_NODES];
-	struct rb_root		cl_node_ip_tree;
-	/* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
-	unsigned long	cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
-};
-
 struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
 {
 	struct o2nm_node *node = NULL;
@@ -543,6 +532,179 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
 }
 #endif
 
+struct o2nm_cluster_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2nm_cluster *, char *);
+	ssize_t (*store)(struct o2nm_cluster *, const char *, size_t);
+};
+
+static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
+                                       unsigned int *val)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp == 0)
+		return -EINVAL;
+	if (tmp >= (u32)-1)
+		return -ERANGE;
+
+	*val = tmp;
+
+	return count;
+}
+
+static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
+	struct o2nm_cluster *cluster, char *page)
+{
+	return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
+}
+
+static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
+	struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+	ssize_t ret;
+	unsigned int val;
+
+	ret = o2nm_cluster_attr_write(page, count, &val);
+
+	if (ret > 0) {
+		if (cluster->cl_idle_timeout_ms != val
+			&& o2net_num_connected_peers()) {
+			mlog(ML_NOTICE,
+			     "o2net: cannot change idle timeout after "
+			     "the first peer has agreed to it."
+			     " %d connected peers\n",
+			     o2net_num_connected_peers());
+			ret = -EINVAL;
+		} else if (val <= cluster->cl_keepalive_delay_ms) {
+			mlog(ML_NOTICE, "o2net: idle timeout must be larger "
+			     "than keepalive delay\n");
+			ret = -EINVAL;
+		} else {
+			cluster->cl_idle_timeout_ms = val;
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
+	struct o2nm_cluster *cluster, char *page)
+{
+	return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
+}
+
+static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
+	struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+	ssize_t ret;
+	unsigned int val;
+
+	ret = o2nm_cluster_attr_write(page, count, &val);
+
+	if (ret > 0) {
+		if (cluster->cl_keepalive_delay_ms != val
+		    && o2net_num_connected_peers()) {
+			mlog(ML_NOTICE,
+			     "o2net: cannot change keepalive delay after"
+			     " the first peer has agreed to it."
+ " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val >= cluster->cl_idle_timeout_ms) { + mlog(ML_NOTICE, "o2net: keepalive delay must be " + "smaller than idle timeout\n"); + ret = -EINVAL; + } else { + cluster->cl_keepalive_delay_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + return o2nm_cluster_attr_write(page, count, + &cluster->cl_reconnect_delay_ms); +} +static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "idle_timeout_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_idle_timeout_ms_read, + .store = o2nm_cluster_attr_idle_timeout_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "keepalive_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_keepalive_delay_ms_read, + .store = o2nm_cluster_attr_keepalive_delay_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "reconnect_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_reconnect_delay_ms_read, + .store = o2nm_cluster_attr_reconnect_delay_ms_write, +}; + +static struct configfs_attribute *o2nm_cluster_attrs[] = { + &o2nm_cluster_attr_idle_timeout_ms.attr, + &o2nm_cluster_attr_keepalive_delay_ms.attr, + &o2nm_cluster_attr_reconnect_delay_ms.attr, + NULL, +}; +static ssize_t o2nm_cluster_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute *o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret = 0; + + if (o2nm_cluster_attr->show) + ret = o2nm_cluster_attr->show(cluster, page); + return ret; +} + +static ssize_t o2nm_cluster_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute *o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret; + + if (o2nm_cluster_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + ret = o2nm_cluster_attr->store(cluster, page, count); + if (ret < count) + goto out; +out: + return ret; +} + static struct config_item *o2nm_node_group_make_item(struct config_group *group, const char *name) { @@ -552,7 +714,7 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group, if (strlen(name) > O2NM_MAX_NAME_LEN) goto out; /* ENAMETOOLONG */ - node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL); + node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); if (node == NULL) goto out; /* ENOMEM */ @@ -624,10 +786,13 @@ static void o2nm_cluster_release(struct config_item *item) static struct configfs_item_operations o2nm_cluster_item_ops = { .release = o2nm_cluster_release, + .show_attribute = o2nm_cluster_show, + .store_attribute = o2nm_cluster_store, }; static struct config_item_type o2nm_cluster_type = { .ct_item_ops = &o2nm_cluster_item_ops, + .ct_attrs = o2nm_cluster_attrs, .ct_owner = THIS_MODULE, }; @@ -660,8 +825,8 
@@ -660,8 +825,8 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
 	if (o2nm_single_cluster)
 		goto out; /* ENOSPC */
 
-	cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL);
-	ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL);
+	cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
+	ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
 	defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
 	o2hb_group = o2hb_alloc_hb_set();
 	if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
@@ -678,6 +843,9 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
 	cluster->cl_group.default_groups[2] = NULL;
 	rwlock_init(&cluster->cl_nodes_lock);
 	cluster->cl_node_ip_tree = RB_ROOT;
+	cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
+	cluster->cl_idle_timeout_ms    = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
+	cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
 
 	ret = &cluster->cl_group;
 	o2nm_single_cluster = cluster;
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index fce8033c310..8fb23cacc2f 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -53,6 +53,23 @@ struct o2nm_node {
 	unsigned long		nd_set_attributes;
 };
 
+struct o2nm_cluster {
+	struct config_group	cl_group;
+	unsigned		cl_has_local:1;
+	u8			cl_local_node;
+	rwlock_t		cl_nodes_lock;
+	struct o2nm_node	*cl_nodes[O2NM_MAX_NODES];
+	struct rb_root		cl_node_ip_tree;
+	unsigned int		cl_idle_timeout_ms;
+	unsigned int		cl_keepalive_delay_ms;
+	unsigned int		cl_reconnect_delay_ms;
+
+	/* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
+	unsigned long	cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+extern struct o2nm_cluster *o2nm_single_cluster;
+
 u8 o2nm_this_node(void);
 
 int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
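
o2nm_cluster_attr_write above is the usual strict-configfs-integer pattern: parse with simple_strtoul, reject trailing garbage except a newline, and range-check before storing. A condensed sketch of the same validation (the helper name is hypothetical, and the end-pointer check is written in its stricter end == page form):

	static ssize_t parse_u32_strict(const char *page, ssize_t count,
					unsigned int *val)
	{
		char *end = (char *)page;
		unsigned long tmp = simple_strtoul(page, &end, 0);

		if (end == page || (*end && *end != '\n'))
			return -EINVAL;	/* no digits, or trailing junk */
		if (tmp == 0)
			return -EINVAL;	/* zero timeouts make no sense here */
		if (tmp >= (u32)-1)
			return -ERANGE;	/* must fit in 32 bits */

		*val = tmp;
		return count;	/* configfs convention: consumed whole buffer */
	}
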
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9b3209dc0b1..1718215fc01 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -147,6 +147,28 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes);
 static void o2net_sc_send_keep_req(struct work_struct *work);
 static void o2net_idle_timer(unsigned long data);
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
+static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
+
+/*
+ * FIXME: These should use to_o2nm_cluster_from_node(), but we end up
+ * losing our parent link to the cluster during shutdown. This can be
+ * solved by adding a pre-removal callback to configfs, or passing
+ * around the cluster with the node. -jeffm
+ */
+static inline int o2net_reconnect_delay(struct o2nm_node *node)
+{
+	return o2nm_single_cluster->cl_reconnect_delay_ms;
+}
+
+static inline int o2net_keepalive_delay(struct o2nm_node *node)
+{
+	return o2nm_single_cluster->cl_keepalive_delay_ms;
+}
+
+static inline int o2net_idle_timeout(struct o2nm_node *node)
+{
+	return o2nm_single_cluster->cl_idle_timeout_ms;
+}
 
 static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
 {
@@ -271,6 +293,8 @@ static void sc_kref_release(struct kref *kref)
 {
 	struct o2net_sock_container *sc = container_of(kref,
 					struct o2net_sock_container, sc_kref);
+	BUG_ON(timer_pending(&sc->sc_idle_timeout));
+
 	sclog(sc, "releasing\n");
 
 	if (sc->sc_sock) {
@@ -300,7 +324,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
 	struct page *page = NULL;
 
 	page = alloc_page(GFP_NOFS);
-	sc = kcalloc(1, sizeof(*sc), GFP_NOFS);
+	sc = kzalloc(sizeof(*sc), GFP_NOFS);
 	if (sc == NULL || page == NULL)
 		goto out;
 
@@ -356,6 +380,13 @@ static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
 	sc_put(sc);
 }
 
+static atomic_t o2net_connected_peers = ATOMIC_INIT(0);
+
+int o2net_num_connected_peers(void)
+{
+	return atomic_read(&o2net_connected_peers);
+}
+
 static void o2net_set_nn_state(struct o2net_node *nn,
 			       struct o2net_sock_container *sc,
 			       unsigned valid, int err)
@@ -366,6 +397,11 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 
 	assert_spin_locked(&nn->nn_lock);
 
+	if (old_sc && !sc)
+		atomic_dec(&o2net_connected_peers);
+	else if (!old_sc && sc)
+		atomic_inc(&o2net_connected_peers);
+
 	/* the node num comparison and single connect/accept path should stop
 	 * an non-null sc from being overwritten with another */
 	BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
@@ -424,9 +460,9 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 		/* delay if we're withing a RECONNECT_DELAY of the
 		 * last attempt */
 		delay = (nn->nn_last_connect_attempt +
-			 msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+			 msecs_to_jiffies(o2net_reconnect_delay(sc->sc_node)))
 			- jiffies;
-		if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+		if (delay > msecs_to_jiffies(o2net_reconnect_delay(sc->sc_node)))
 			delay = 0;
 		mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
 		queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
@@ -520,6 +556,8 @@ static void o2net_register_callbacks(struct sock *sk,
 	sk->sk_data_ready = o2net_data_ready;
 	sk->sk_state_change = o2net_state_change;
 
+	mutex_init(&sc->sc_send_lock);
+
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
@@ -652,6 +690,7 @@ static void o2net_handler_put(struct o2net_msg_handler *nmh)
 * be given to the handler if their payload is longer than the max.
 */
 int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
 			   o2net_msg_handler_func *func, void *data,
+			   o2net_post_msg_handler_func *post_func,
 			   struct list_head *unreg_list)
 {
 	struct o2net_msg_handler *nmh = NULL;
@@ -678,7 +717,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
 		goto out;
 	}
 
-	nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS);
+	nmh = kzalloc(sizeof(struct o2net_msg_handler), GFP_NOFS);
 	if (nmh == NULL) {
 		ret = -ENOMEM;
 		goto out;
@@ -686,6 +725,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
 
 	nmh->nh_func = func;
 	nmh->nh_func_data = data;
+	nmh->nh_post_func = post_func;
 	nmh->nh_msg_type = msg_type;
 	nmh->nh_max_len = max_len;
 	nmh->nh_key = key;
@@ -820,10 +860,12 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
 	ssize_t ret;
 
+	mutex_lock(&sc->sc_send_lock);
 	ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
 					 virt_to_page(kmalloced_virt),
 					 (long)kmalloced_virt & ~PAGE_MASK,
 					 size, MSG_DONTWAIT);
+	mutex_unlock(&sc->sc_send_lock);
 	if (ret != size) {
 		mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
 		     " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
@@ -938,8 +980,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 
 	/* finally, convert the message header to network byte-order
 	 * and send */
+	mutex_lock(&sc->sc_send_lock);
 	ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
 				 sizeof(struct o2net_msg) + caller_bytes);
+	mutex_unlock(&sc->sc_send_lock);
 	msglog(msg, "sending returned %d\n", ret);
 	if (ret < 0) {
 		mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
@@ -1013,6 +1057,7 @@ static int o2net_process_message(struct o2net_sock_container *sc,
 	int ret = 0, handler_status;
 	enum o2net_system_error syserr;
 	struct o2net_msg_handler *nmh = NULL;
+	void *ret_data = NULL;
 
 	msglog(hdr, "processing message\n");
 
@@ -1065,17 +1110,26 @@ static int o2net_process_message(struct o2net_sock_container *sc,
 	sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
 	handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
 					be16_to_cpu(hdr->data_len),
-					nmh->nh_func_data);
+					nmh->nh_func_data, &ret_data);
 	do_gettimeofday(&sc->sc_tv_func_stop);
 
out_respond:
 	/* this destroys the hdr, so don't use it after this */
+	mutex_lock(&sc->sc_send_lock);
 	ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
 				      handler_status);
+	mutex_unlock(&sc->sc_send_lock);
 	hdr = NULL;
 	mlog(0, "sending handler status %d, syserr %d returned %d\n",
 	     handler_status, syserr, ret);
 
+	if (nmh) {
+		BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL);
+		if (nmh->nh_post_func)
+			(nmh->nh_post_func)(handler_status, nmh->nh_func_data,
+					    ret_data);
+	}
+
out:
 	if (nmh)
 		o2net_handler_put(nmh);
@@ -1099,13 +1153,51 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
 		return -1;
 	}
 
+	/*
+	 * Ensure timeouts are consistent with other nodes, otherwise
+	 * we can end up with one node thinking that the other must be down,
+	 * but isn't. This can ultimately cause corruption.
+	 */
+	if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
+	    o2net_idle_timeout(sc->sc_node)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
+		     "%u ms, but we use %u ms locally. disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     be32_to_cpu(hand->o2net_idle_timeout_ms),
+		     o2net_idle_timeout(sc->sc_node));
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
+	    o2net_keepalive_delay(sc->sc_node)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_keepalive_delay_ms), + o2net_keepalive_delay(sc->sc_node)); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != + O2HB_MAX_WRITE_TIMEOUT_MS) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " + "%u ms, but we use %u ms locally. disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + sc->sc_handshake_ok = 1; spin_lock(&nn->nn_lock); /* set valid and queue the idle timers only if it hasn't been * shut down already */ if (nn->nn_sc == sc) { - o2net_sc_postpone_idle(sc); + o2net_sc_reset_idle_timer(sc); o2net_set_nn_state(nn, sc, 1, 0); } spin_unlock(&nn->nn_lock); @@ -1131,6 +1223,23 @@ static int o2net_advance_rx(struct o2net_sock_container *sc) sclog(sc, "receiving\n"); do_gettimeofday(&sc->sc_tv_advance_start); + if (unlikely(sc->sc_handshake_ok == 0)) { + if(sc->sc_page_off < sizeof(struct o2net_handshake)) { + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = sizeof(struct o2net_handshake) - sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) + sc->sc_page_off += ret; + } + + if (sc->sc_page_off == sizeof(struct o2net_handshake)) { + o2net_check_handshake(sc); + if (unlikely(sc->sc_handshake_ok == 0)) + ret = -EPROTO; + } + goto out; + } + /* do we need more header? */ if (sc->sc_page_off < sizeof(struct o2net_msg)) { data = page_address(sc->sc_page) + sc->sc_page_off; @@ -1138,15 +1247,6 @@ static int o2net_advance_rx(struct o2net_sock_container *sc) ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); if (ret > 0) { sc->sc_page_off += ret; - - /* this working relies on the handshake being - * smaller than the normal message header */ - if (sc->sc_page_off >= sizeof(struct o2net_handshake)&& - !sc->sc_handshake_ok && o2net_check_handshake(sc)) { - ret = -EPROTO; - goto out; - } - /* only swab incoming here.. we can * only get here once as we cross from * being under to over */ @@ -1248,6 +1348,18 @@ static int o2net_set_nodelay(struct socket *sock) return ret; } +static void o2net_initialize_handshake(void) +{ + o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( + o2net_idle_timeout(NULL)); + o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( + o2net_keepalive_delay(NULL)); + o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( + o2net_reconnect_delay(NULL)); +} + /* ------------------------------------------------------------ */ /* called when a connect completes and after a sock is accepted. 
@@ -1262,6 +1374,7 @@ static void o2net_sc_connect_completed(struct work_struct *work)
 	     (unsigned long long)O2NET_PROTOCOL_VERSION,
 	     (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
 
+	o2net_initialize_handshake();
 	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
 	sc_put(sc);
 }
@@ -1287,8 +1400,10 @@ static void o2net_idle_timer(unsigned long data)
 
 	do_gettimeofday(&now);
 
-	printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for 10 "
-	     "seconds, shutting it down.\n", SC_NODEF_ARGS(sc));
+	printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
+	     "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
+	     o2net_idle_timeout(sc->sc_node) / 1000,
+	     o2net_idle_timeout(sc->sc_node) % 1000);
 	mlog(ML_NOTICE, "here are some times that might help debug the "
 	     "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
 	     "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
@@ -1306,14 +1421,21 @@ static void o2net_idle_timer(unsigned long data)
 	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 }
 
-static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
+static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
 {
 	o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
 	o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
-				    O2NET_KEEPALIVE_DELAY_SECS * HZ);
+		      msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node)));
 	do_gettimeofday(&sc->sc_tv_timer);
 	mod_timer(&sc->sc_idle_timeout,
-		  jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ));
+	       jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node)));
+}
+
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
+{
+	/* Only push out an existing timer */
+	if (timer_pending(&sc->sc_idle_timeout))
+		o2net_sc_reset_idle_timer(sc);
 }
 
 /* this work func is kicked whenever a path sets the nn state which doesn't
@@ -1435,9 +1557,12 @@ static void o2net_connect_expired(struct work_struct *work)
 
 	spin_lock(&nn->nn_lock);
 	if (!nn->nn_sc_valid) {
+		struct o2nm_node *node = nn->nn_sc->sc_node;
 		mlog(ML_ERROR, "no connection established with node %u after "
-		     "%u seconds, giving up and returning errors.\n",
-		     o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS);
+		     "%u.%u seconds, giving up and returning errors.\n",
+		     o2net_num_from_nn(nn),
+		     o2net_idle_timeout(node) / 1000,
+		     o2net_idle_timeout(node) % 1000);
 
 		o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
 	}
@@ -1478,6 +1603,8 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
 
 	if (node_num != o2nm_this_node())
 		o2net_disconnect_node(node);
+
+	BUG_ON(atomic_read(&o2net_connected_peers) < 0);
 }
 
 static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
@@ -1489,14 +1616,14 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 
 	/* ensure an immediate connect attempt */
 	nn->nn_last_connect_attempt = jiffies -
-		(msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
+		(msecs_to_jiffies(o2net_reconnect_delay(node)) + 1);
 
 	if (node_num != o2nm_this_node()) {
 		/* heartbeat doesn't work unless a local node number is
 		 * configured and doing so brings up the o2net_wq, so we can
 		 * use it.. */
 		queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
-				   O2NET_IDLE_TIMEOUT_SECS * HZ);
+				   msecs_to_jiffies(o2net_idle_timeout(node)));
 
 		/* believe it or not, accept and node hearbeating testing
 		 * can succeed for this node before we got here.. so
@@ -1641,6 +1768,7 @@ static int o2net_accept_one(struct socket *sock)
 	o2net_register_callbacks(sc->sc_sock->sk, sc);
 	o2net_sc_queue_work(sc, &sc->sc_rx_work);
 
+	o2net_initialize_handshake();
 	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
 
out:
@@ -1685,13 +1813,13 @@ out:
 		ready(sk, bytes);
 }
 
-static int o2net_open_listening_sock(__be16 port)
+static int o2net_open_listening_sock(__be32 addr, __be16 port)
 {
 	struct socket *sock = NULL;
 	int ret;
 	struct sockaddr_in sin = {
 		.sin_family = PF_INET,
-		.sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) },
+		.sin_addr = { .s_addr = (__force u32)addr },
 		.sin_port = (__force u16)port,
 	};
 
@@ -1714,15 +1842,15 @@ static int o2net_open_listening_sock(__be16 port)
 	sock->sk->sk_reuse = 1;
 	ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
 	if (ret < 0) {
-		mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n",
-		     ntohs(port), ret);
+		mlog(ML_ERROR, "unable to bind socket at %u.%u.%u.%u:%u, "
+		     "ret=%d\n", NIPQUAD(addr), ntohs(port), ret);
 		goto out;
 	}
 
 	ret = sock->ops->listen(sock, 64);
 	if (ret < 0) {
-		mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n",
-		     ntohs(port), ret);
+		mlog(ML_ERROR, "unable to listen on %u.%u.%u.%u:%u, ret=%d\n",
+		     NIPQUAD(addr), ntohs(port), ret);
 	}
 
out:
@@ -1755,7 +1883,8 @@ int o2net_start_listening(struct o2nm_node *node)
 		return -ENOMEM; /* ? */
 	}
 
-	ret = o2net_open_listening_sock(node->nd_ipv4_port);
+	ret = o2net_open_listening_sock(node->nd_ipv4_address,
+					node->nd_ipv4_port);
 	if (ret) {
 		destroy_workqueue(o2net_wq);
 		o2net_wq = NULL;
@@ -1808,9 +1937,9 @@ int o2net_init(void)
 
 	o2quo_init();
 
-	o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
-	o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
-	o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
+	o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
+	o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
+	o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
 	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
 		kfree(o2net_hand);
 		kfree(o2net_keep_req);
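
The new sc_send_lock taken in o2net_sendpage, o2net_send_message_vec, and around o2net_send_status_magic serializes every writer on a socket, since handler status replies can now race with outgoing requests and interleaved writes would corrupt the byte stream. The shape of that discipline, reduced to a sketch (conn and send_bytes are placeholders, not o2net names):

	struct conn {
		struct mutex send_lock;	/* serializes all writers on the socket */
		struct socket *sock;
	};

	static int conn_send(struct conn *c, void *buf, size_t len)
	{
		int ret;

		mutex_lock(&c->send_lock);
		ret = send_bytes(c->sock, buf, len);	/* placeholder transport call */
		mutex_unlock(&c->send_lock);
		return ret;
	}
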
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 616ff2b8434..da880fc215f 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -50,10 +50,20 @@ struct o2net_msg
 	__u8  buf[0];
 };
 
-typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data);
+typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data,
+				     void **ret_data);
+typedef void (o2net_post_msg_handler_func)(int status, void *data,
+					   void *ret_data);
 
 #define O2NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(struct o2net_msg))
 
+/* same as hb delay, we're waiting for another node to recognize our hb */
+#define O2NET_RECONNECT_DELAY_MS_DEFAULT	2000
+
+#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT	5000
+#define O2NET_IDLE_TIMEOUT_MS_DEFAULT		10000
+
+
 /* TODO: figure this out.... */
 static inline int o2net_link_down(int err, struct socket *sock)
 {
@@ -92,6 +102,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
 
 int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
 			   o2net_msg_handler_func *func, void *data,
+			   o2net_post_msg_handler_func *post_func,
 			   struct list_head *unreg_list);
 void o2net_unregister_handler_list(struct list_head *list);
 
@@ -101,6 +112,7 @@ void o2net_unregister_hb_callbacks(void);
 int o2net_start_listening(struct o2nm_node *node);
 void o2net_stop_listening(struct o2nm_node *node);
 void o2net_disconnect_node(struct o2nm_node *node);
+int o2net_num_connected_peers(void);
 
 int o2net_init(void);
 void o2net_exit(void);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index daebbd3a2c8..4dae5df5e46 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -27,23 +27,26 @@
 #define O2NET_MSG_KEEP_REQ_MAGIC  ((u16)0xfa57)
 #define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
 
-/* same as hb delay, we're waiting for another node to recognize our hb */
-#define O2NET_RECONNECT_DELAY_MS	O2HB_REGION_TIMEOUT_MS
-
 /* we're delaying our quorum decision so that heartbeat will have timed
 * out truly dead nodes by the time we come around to making decisions
 * on their number */
 #define O2NET_QUORUM_DELAY_MS	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
 
-#define O2NET_KEEPALIVE_DELAY_SECS	5
-#define O2NET_IDLE_TIMEOUT_SECS		10
-
 /*
 * This version number represents quite a lot, unfortunately. It not
 * only represents the raw network message protocol on the wire but also
 * locking semantics of the file system using the protocol. It should
 * be somewhere else, I'm sure, but right now it isn't.
 *
+ * New in version 7:
+ * 	- DLM join domain includes the live nodemap
+ *
+ * New in version 6:
+ * 	- DLM lockres remote refcount fixes.
+ *
+ * New in version 5:
+ * 	- Network timeout checking protocol
+ *
 * New in version 4:
 * 	- Remove i_generation from lock names for better stat performance.
 *
@@ -54,10 +57,14 @@
 *	- full 64 bit i_size in the metadata lock lvbs
 * 	- introduction of "rw" lock and pushing meta/data locking down
 */
-#define O2NET_PROTOCOL_VERSION 4ULL
+#define O2NET_PROTOCOL_VERSION 7ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
+	__be32  o2hb_heartbeat_timeout_ms;
+	__be32  o2net_idle_timeout_ms;
+	__be32  o2net_keepalive_delay_ms;
+	__be32  o2net_reconnect_delay_ms;
 };
 
 struct o2net_node {
@@ -148,6 +155,8 @@ struct o2net_sock_container {
 	struct timeval 		sc_tv_func_stop;
 	u32 			sc_msg_key;
 	u16 			sc_msg_type;
+
+	struct mutex		sc_send_lock;
 };
 
 struct o2net_msg_handler {
@@ -157,6 +166,8 @@ struct o2net_msg_handler {
 	u32			nh_key;
 	o2net_msg_handler_func	*nh_func;
 	o2net_msg_handler_func	*nh_func_data;
+	o2net_post_msg_handler_func
+				*nh_post_func;
 	struct kref		nh_kref;
 	struct list_head	nh_unregister_item;
 };
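
Protocol version 7's handshake carries the timeout values as fixed big-endian fields, and o2net_check_handshake (earlier in this diff) refuses peers whose values differ, since mismatched idle timeouts let one side declare the other dead while it is merely quiet. A sketch of the wire-format round trip using the struct layout above (local_idle_timeout_ms stands in for the configfs value):

	/* minimal sketch: the sender filled the field with cpu_to_be32(...) */
	static int check_idle_timeout(struct o2net_handshake *hand,
				      u32 local_idle_timeout_ms)
	{
		if (be32_to_cpu(hand->o2net_idle_timeout_ms) != local_idle_timeout_ms)
			return -ENOTCONN;	/* mismatched config can cause false fencing */
		return 0;
	}
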
"" : "b", - dlm_get_lock_cookie_node(cookie), dlm_get_lock_cookie_seq(cookie), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name, locklen); ret = DLM_NORMAL; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 6b6ff76538c..e90b92f9ece 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -180,6 +180,11 @@ struct dlm_assert_master_priv unsigned ignore_higher:1; }; +struct dlm_deref_lockres_priv +{ + struct dlm_lock_resource *deref_res; + u8 deref_node; +}; struct dlm_work_item { @@ -191,6 +196,7 @@ struct dlm_work_item struct dlm_request_all_locks_priv ral; struct dlm_mig_lockres_priv ml; struct dlm_assert_master_priv am; + struct dlm_deref_lockres_priv dl; } u; }; @@ -222,6 +228,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_DIRTY 0x00000008 #define DLM_LOCK_RES_IN_PROGRESS 0x00000010 #define DLM_LOCK_RES_MIGRATING 0x00000020 +#define DLM_LOCK_RES_DROPPING_REF 0x00000040 +#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 +#define DLM_LOCK_RES_SETREF_INPROG 0x00002000 /* max milliseconds to wait to sync up a network failure with a node death */ #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) @@ -265,6 +274,8 @@ struct dlm_lock_resource u8 owner; //node which owns the lock resource, or unknown u16 state; char lvb[DLM_LVB_LEN]; + unsigned int inflight_locks; + unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; struct dlm_migratable_lock @@ -367,7 +378,7 @@ enum { DLM_CONVERT_LOCK_MSG, /* 504 */ DLM_PROXY_AST_MSG, /* 505 */ DLM_UNLOCK_LOCK_MSG, /* 506 */ - DLM_UNUSED_MSG2, /* 507 */ + DLM_DEREF_LOCKRES_MSG, /* 507 */ DLM_MIGRATE_REQUEST_MSG, /* 508 */ DLM_MIG_LOCKRES_MSG, /* 509 */ DLM_QUERY_JOIN_MSG, /* 510 */ @@ -417,6 +428,9 @@ struct dlm_master_request u8 name[O2NM_MAX_NAME_LEN]; }; +#define DLM_ASSERT_RESPONSE_REASSERT 0x00000001 +#define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002 + #define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001 #define DLM_ASSERT_MASTER_REQUERY 0x00000002 #define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004 @@ -430,6 +444,8 @@ struct dlm_assert_master u8 name[O2NM_MAX_NAME_LEN]; }; +#define DLM_MIGRATE_RESPONSE_MASTERY_REF 0x00000001 + struct dlm_migrate_request { u8 master; @@ -609,12 +625,16 @@ struct dlm_begin_reco }; +#define BITS_PER_BYTE 8 +#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) + struct dlm_query_join_request { u8 node_idx; u8 pad1[2]; u8 name_len; u8 domain[O2NM_MAX_NAME_LEN]; + u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)]; }; struct dlm_assert_joined @@ -648,6 +668,16 @@ struct dlm_finalize_reco __be32 pad2; }; +struct dlm_deref_lockres +{ + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + static inline enum dlm_status __dlm_lockres_state_to_status(struct dlm_lock_resource *res) { @@ -688,16 +718,20 @@ void dlm_lock_put(struct dlm_lock *lock); void dlm_lock_attach_lockres(struct dlm_lock *lock, struct dlm_lock_resource *res); -int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); void dlm_revert_pending_convert(struct 
 				struct dlm_lock *lock);
 void dlm_revert_pending_lock(struct dlm_lock_resource *res,
 			     struct dlm_lock *lock);
 
-int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+			    void **ret_data);
 void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
 			       struct dlm_lock *lock);
 void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
@@ -721,8 +755,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 			      struct dlm_lock_resource *res);
 void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 			    struct dlm_lock_resource *res);
-void dlm_purge_lockres(struct dlm_ctxt *dlm,
-		       struct dlm_lock_resource *lockres);
 static inline void dlm_lockres_get(struct dlm_lock_resource *res)
 {
 	/* This is called on every lookup, so it might be worth
@@ -733,6 +765,10 @@ void dlm_lockres_put(struct dlm_lock_resource *res);
 void __dlm_unhash_lockres(struct dlm_lock_resource *res);
 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 			  struct dlm_lock_resource *res);
+struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
+						     const char *name,
+						     unsigned int len,
+						     unsigned int hash);
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 						const char *name,
 						unsigned int len,
@@ -753,6 +789,47 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 					  const char *name,
 					  unsigned int namelen);
 
+#define dlm_lockres_set_refmap_bit(bit,res)  \
+	__dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__)
+#define dlm_lockres_clear_refmap_bit(bit,res)  \
+	__dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__)
+
+static inline void __dlm_lockres_set_refmap_bit(int bit,
+						struct dlm_lock_resource *res,
+						const char *file,
+						int line)
+{
+	//printk("%s:%d:%.*s: setting bit %d\n", file, line,
+	//     res->lockname.len, res->lockname.name, bit);
+	set_bit(bit, res->refmap);
+}
+
+static inline void __dlm_lockres_clear_refmap_bit(int bit,
+						  struct dlm_lock_resource *res,
+						  const char *file,
+						  int line)
+{
+	//printk("%s:%d:%.*s: clearing bit %d\n", file, line,
+	//     res->lockname.len, res->lockname.name, bit);
+	clear_bit(bit, res->refmap);
+}
+
+void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     const char *file,
+				     int line);
+void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     int new_lockres,
+				     const char *file,
+				     int line);
+#define dlm_lockres_drop_inflight_ref(d,r)  \
+	__dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
+#define dlm_lockres_grab_inflight_ref(d,r)  \
+	__dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
+#define dlm_lockres_grab_inflight_ref_new(d,r)  \
+	__dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
+
 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_do_local_ast(struct dlm_ctxt *dlm,
@@ -801,10 +878,7 @@ int dlm_heartbeat_init(struct dlm_ctxt *dlm);
 void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
 void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
 
-int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
-int dlm_migrate_lockres(struct dlm_ctxt *dlm,
-			struct dlm_lock_resource *res,
-			u8 target);
+int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
 int dlm_finish_migration(struct dlm_ctxt *dlm,
 			 struct dlm_lock_resource *res,
 			 u8 old_master);
@@ -812,15 +886,27 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
 			     struct dlm_lock_resource *res);
 void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
 
-int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
+			       void **ret_data);
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
+			      void **ret_data);
+void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
+int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+			      void **ret_data);
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
+				void **ret_data);
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+			    void **ret_data);
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
+			       void **ret_data);
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
+				  void **ret_data);
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
+			       void **ret_data);
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+			   void **ret_data);
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+			      void **ret_data);
 int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			  u8 nodenum, u8 *real_master);
 
@@ -856,10 +942,12 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 int dlm_init_mle_cache(void);
 void dlm_destroy_mle_cache(void);
 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
+int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
+			 struct dlm_lock_resource *res);
 void dlm_clean_master_list(struct dlm_ctxt *dlm,
 			   u8 dead_node);
 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
-
+int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
 int __dlm_lockres_unused(struct dlm_lock_resource *res);
 
 static inline const char * dlm_lock_mode_name(int mode)
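
dlmcommon.h now tracks, per lock resource, which nodes hold a reference (refmap, cleared when a node sends DLM_DEREF_LOCKRES_MSG) plus a count of locally in-flight locks; a master may only purge a resource once both are empty. Stripped of their debug printks, the set/clear wrappers above reduce to plain bitmap operations, roughly (illustrative helpers, assuming O2NM_MAX_NODES from nodemanager.h):

	#include <linux/bitmap.h>

	static void refmap_add_node(unsigned long *refmap, int node)
	{
		set_bit(node, refmap);		/* node now holds a reference */
	}

	static void refmap_drop_node(unsigned long *refmap, int node)
	{
		clear_bit(node, refmap);	/* node dropped its reference */
	}

	static int refmap_empty(unsigned long *refmap)
	{
		return bitmap_empty(refmap, O2NM_MAX_NODES);
	}
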
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index c764dc8e40a..ecb4d997221 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -286,8 +286,8 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
 		__dlm_print_one_lock_resource(res);
 		mlog(ML_ERROR, "converting a remote lock that is already "
 		     "converting! (cookie=%u:%llu, conv=%d)\n",
-		     dlm_get_lock_cookie_node(lock->ml.cookie),
-		     dlm_get_lock_cookie_seq(lock->ml.cookie),
+		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
 		     lock->ml.convert_type);
 		status = DLM_DENIED;
 		goto bail;
@@ -418,7 +418,8 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
 * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
 *          status from __dlmconvert_master
 */
-int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+			     void **ret_data)
 {
 	struct dlm_ctxt *dlm = data;
 	struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
@@ -428,7 +429,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 	struct dlm_lockstatus *lksb;
 	enum dlm_status status = DLM_NORMAL;
 	u32 flags;
-	int call_ast = 0, kick_thread = 0, ast_reserved = 0;
+	int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0;
 
 	if (!dlm_grab(dlm)) {
 		dlm_error(DLM_REJECTED);
@@ -479,25 +480,14 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 		}
 		lock = NULL;
 	}
-	if (!lock) {
-		__dlm_print_one_lock_resource(res);
-		list_for_each(iter, &res->granted) {
-			lock = list_entry(iter, struct dlm_lock, list);
-			if (lock->ml.node == cnv->node_idx) {
-				mlog(ML_ERROR, "There is something here "
-				     "for node %u, lock->ml.cookie=%llu, "
-				     "cnv->cookie=%llu\n", cnv->node_idx,
-				     (unsigned long long)lock->ml.cookie,
-				     (unsigned long long)cnv->cookie);
-				break;
-			}
-		}
-		lock = NULL;
-	}
 	spin_unlock(&res->spinlock);
 	if (!lock) {
 		status = DLM_IVLOCKID;
-		dlm_error(status);
+		mlog(ML_ERROR, "did not find lock to convert on grant queue! "
+			       "cookie=%u:%llu\n",
+		     dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie)));
+		__dlm_print_one_lock_resource(res);
 		goto leave;
 	}
 
@@ -524,8 +514,11 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 						 cnv->requested_type,
 						 &call_ast, &kick_thread);
 		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+		wake = 1;
 	}
 	spin_unlock(&res->spinlock);
+	if (wake)
+		wake_up(&res->wq);
 
 	if (status != DLM_NORMAL) {
 		if (status != DLM_NOTQUEUED)
@@ -534,12 +527,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 	}
 
leave:
-	if (!lock)
-		mlog(ML_ERROR, "did not find lock to convert on grant queue! "
" - "cookie=%u:%llu\n", - dlm_get_lock_cookie_node(cnv->cookie), - dlm_get_lock_cookie_seq(cnv->cookie)); - else + if (lock) dlm_lock_put(lock); /* either queue the ast or release it, if reserved */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 3f6c8d88f7a..64239b37e5d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -53,6 +53,23 @@ void dlm_print_one_lock_resource(struct dlm_lock_resource *res) spin_unlock(&res->spinlock); } +static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) +{ + int bit; + assert_spin_locked(&res->spinlock); + + mlog(ML_NOTICE, " refmap nodes: [ "); + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + printk("%u ", bit); + bit++; + } + printk("], inflight=%u\n", res->inflight_locks); +} + void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) { struct list_head *iter2; @@ -65,6 +82,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) res->owner, res->state); mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", res->last_used, list_empty(&res->purge) ? "no" : "yes"); + dlm_print_lockres_refmap(res); mlog(ML_NOTICE, " granted queue: \n"); list_for_each(iter2, &res->granted) { lock = list_entry(iter2, struct dlm_lock, list); @@ -72,8 +90,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), list_empty(&lock->ast_list) ? 'y' : 'n', lock->ast_pending ? 'y' : 'n', list_empty(&lock->bast_list) ? 'y' : 'n', @@ -87,8 +105,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), list_empty(&lock->ast_list) ? 'y' : 'n', lock->ast_pending ? 'y' : 'n', list_empty(&lock->bast_list) ? 'y' : 'n', @@ -102,8 +120,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), list_empty(&lock->ast_list) ? 'y' : 'n', lock->ast_pending ? 'y' : 'n', list_empty(&lock->bast_list) ? 'y' : 'n', diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 420a375a394..6087c4749fe 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -48,6 +48,36 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" +/* + * ocfs2 node maps are array of long int, which limits to send them freely + * across the wire due to endianness issues. To workaround this, we convert + * long ints to byte arrays. 
Following 3 routines are helper functions to + * set/test/copy bits within those array of bytes + */ +static inline void byte_set_bit(u8 nr, u8 map[]) +{ + map[nr >> 3] |= (1UL << (nr & 7)); +} + +static inline int byte_test_bit(u8 nr, u8 map[]) +{ + return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0; +} + +static inline void byte_copymap(u8 dmap[], unsigned long smap[], + unsigned int sz) +{ + unsigned int nn; + + if (!sz) + return; + + memset(dmap, 0, ((sz + 7) >> 3)); + for (nn = 0 ; nn < sz; nn++) + if (test_bit(nn, smap)) + byte_set_bit(nn, dmap); +} + static void dlm_free_pagevec(void **vec, int pages) { while (pages--) @@ -95,10 +125,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); #define DLM_DOMAIN_BACKOFF_MS 200 -static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data); -static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data); -static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data); -static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data); +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); @@ -125,10 +159,10 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, hlist_add_head(&res->hash_node, bucket); } -struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, - const char *name, - unsigned int len, - unsigned int hash) +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) { struct hlist_head *bucket; struct hlist_node *list; @@ -154,6 +188,37 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, return NULL; } +/* intended to be called by functions which do not care about lock + * resources which are being purged (most net _handler functions). + * this will return NULL for any lock resource which is found but + * currently in the process of dropping its mastery reference. + * use __dlm_lookup_lockres_full when you need the lock resource + * regardless (e.g. 
dlm_get_lock_resource) */ +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) +{ + struct dlm_lock_resource *res = NULL; + + mlog_entry("%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + res = __dlm_lookup_lockres_full(dlm, name, len, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + return NULL; + } + spin_unlock(&res->spinlock); + } + + return res; +} + struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len) @@ -330,43 +395,60 @@ static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) wake_up(&dlm_domain_events); } -static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) +static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) { - int i; + int i, num, n, ret = 0; struct dlm_lock_resource *res; + struct hlist_node *iter; + struct hlist_head *bucket; + int dropped; mlog(0, "Migrating locks from domain %s\n", dlm->name); -restart: + + num = 0; spin_lock(&dlm->spinlock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { - while (!hlist_empty(dlm_lockres_hash(dlm, i))) { - res = hlist_entry(dlm_lockres_hash(dlm, i)->first, - struct dlm_lock_resource, hash_node); - /* need reference when manually grabbing lockres */ +redo_bucket: + n = 0; + bucket = dlm_lockres_hash(dlm, i); + iter = bucket->first; + while (iter) { + n++; + res = hlist_entry(iter, struct dlm_lock_resource, + hash_node); dlm_lockres_get(res); - /* this should unhash the lockres - * and exit with dlm->spinlock */ - mlog(0, "purging res=%p\n", res); - if (dlm_lockres_is_dirty(dlm, res)) { - /* HACK! this should absolutely go. - * need to figure out why some empty - * lockreses are still marked dirty */ - mlog(ML_ERROR, "lockres %.*s dirty!\n", - res->lockname.len, res->lockname.name); - - spin_unlock(&dlm->spinlock); - dlm_kick_thread(dlm, res); - wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); - dlm_lockres_put(res); - goto restart; - } - dlm_purge_lockres(dlm, res); + /* migrate, if necessary. this will drop the dlm + * spinlock and retake it if it does migration. 
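A condensed view of the filtered lookup just introduced may help: the pattern is look up, peek at a teardown flag under the object's own lock, and report "not found" rather than hand out an object that is mid-purge. This is a sketch, not a drop-in replacement for the real __dlm_lookup_lockres():

/*
 * Callers that cannot cope with a lockres in mid-purge simply see
 * "not found". The caller holds dlm->spinlock, as with the real
 * helpers.
 */
static struct dlm_lock_resource *lookup_live(struct dlm_ctxt *dlm,
                                             const char *name,
                                             unsigned int len,
                                             unsigned int hash)
{
        struct dlm_lock_resource *res;

        res = __dlm_lookup_lockres_full(dlm, name, len, hash);
        if (!res)
                return NULL;

        spin_lock(&res->spinlock);
        if (res->state & DLM_LOCK_RES_DROPPING_REF) {
                spin_unlock(&res->spinlock);
                dlm_lockres_put(res);   /* drop the ref the lookup took */
                return NULL;            /* pretend it was never found */
        }
        spin_unlock(&res->spinlock);

        return res;
}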
*/ + dropped = dlm_empty_lockres(dlm, res); + + spin_lock(&res->spinlock); + __dlm_lockres_calc_usage(dlm, res); + iter = res->hash_node.next; + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + + cond_resched_lock(&dlm->spinlock); + + if (dropped) + goto redo_bucket; } + num += n; + mlog(0, "%s: touched %d lockreses in bucket %d " + "(tot=%d)\n", dlm->name, n, i, num); } spin_unlock(&dlm->spinlock); - + wake_up(&dlm->dlm_thread_wq); + + /* let the dlm thread take care of purging, keep scanning until + * nothing remains in the hash */ + if (num) { + mlog(0, "%s: %d lock resources in hash last pass\n", + dlm->name, num); + ret = -EAGAIN; + } mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); + return ret; } static int dlm_no_joining_node(struct dlm_ctxt *dlm) @@ -418,7 +500,8 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm) printk("\n"); } -static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; unsigned int node; @@ -571,7 +654,9 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) /* We changed dlm state, notify the thread */ dlm_kick_thread(dlm, NULL); - dlm_migrate_all_locks(dlm); + while (dlm_migrate_all_locks(dlm)) { + mlog(0, "%s: more migration to do\n", dlm->name); + } dlm_mark_domain_leaving(dlm); dlm_leave_domain(dlm); dlm_complete_dlm_shutdown(dlm); @@ -580,11 +665,13 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) } EXPORT_SYMBOL_GPL(dlm_unregister_domain); -static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_query_join_request *query; enum dlm_query_join_response response; struct dlm_ctxt *dlm = NULL; + u8 nodenum; query = (struct dlm_query_join_request *) msg->buf; @@ -608,6 +695,28 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm_domain_lock); dlm = __dlm_lookup_domain_full(query->domain, query->name_len); + if (!dlm) + goto unlock_respond; + + /* + * There is a small window where the joining node may not see the + * node(s) that just left but still part of the cluster. DISALLOW + * join request if joining node has different node map. + */ + nodenum=0; + while (nodenum < O2NM_MAX_NODES) { + if (test_bit(nodenum, dlm->domain_map)) { + if (!byte_test_bit(nodenum, query->node_map)) { + mlog(0, "disallow join as node %u does not " + "have node %u in its nodemap\n", + query->node_idx, nodenum); + response = JOIN_DISALLOW; + goto unlock_respond; + } + } + nodenum++; + } + /* Once the dlm ctxt is marked as leaving then we don't want * to be put in someone's domain map. * Also, explicitly disallow joining at certain troublesome @@ -626,15 +735,15 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) /* Disallow parallel joins. 
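The membership check in dlm_query_join_handler() above distills to a single predicate: every node we count as part of the domain must also appear in the joiner's byte-array view of the cluster. A sketch with a hypothetical helper name (the patch open-codes the loop in the handler):

/*
 * Refuse the join when the joiner's byte-array node map (built by
 * byte_copymap() from its live_nodes_map) is missing any node still
 * in our domain_map.
 */
static int joiner_view_is_current(unsigned long *domain_map, u8 *node_map)
{
        unsigned int nodenum;

        for (nodenum = 0; nodenum < O2NM_MAX_NODES; nodenum++)
                if (test_bit(nodenum, domain_map) &&
                    !byte_test_bit(nodenum, node_map))
                        return 0;       /* joiner has a stale cluster view */
        return 1;
}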
*/ response = JOIN_DISALLOW; } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { - mlog(ML_NOTICE, "node %u trying to join, but recovery " + mlog(0, "node %u trying to join, but recovery " "is ongoing.\n", bit); response = JOIN_DISALLOW; } else if (test_bit(bit, dlm->recovery_map)) { - mlog(ML_NOTICE, "node %u trying to join, but it " + mlog(0, "node %u trying to join, but it " "still needs recovery.\n", bit); response = JOIN_DISALLOW; } else if (test_bit(bit, dlm->domain_map)) { - mlog(ML_NOTICE, "node %u trying to join, but it " + mlog(0, "node %u trying to join, but it " "is still in the domain! needs recovery?\n", bit); response = JOIN_DISALLOW; @@ -649,6 +758,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) spin_unlock(&dlm->spinlock); } +unlock_respond: spin_unlock(&dlm_domain_lock); respond: @@ -657,7 +767,8 @@ respond: return response; } -static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_assert_joined *assert; struct dlm_ctxt *dlm = NULL; @@ -694,7 +805,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) return 0; } -static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_cancel_join *cancel; struct dlm_ctxt *dlm = NULL; @@ -796,6 +908,9 @@ static int dlm_request_join(struct dlm_ctxt *dlm, join_msg.name_len = strlen(dlm->name); memcpy(join_msg.domain, dlm->name, join_msg.name_len); + /* copy live node map to join message */ + byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); + status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, sizeof(join_msg), node, &retval); if (status < 0 && status != -ENOPROTOOPT) { @@ -920,7 +1035,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) mlog_entry("%p", dlm); - ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL); + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (!ctxt) { status = -ENOMEM; mlog_errno(status); @@ -1036,98 +1151,106 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, sizeof(struct dlm_master_request), dlm_master_request_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, sizeof(struct dlm_assert_master), dlm_assert_master_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, dlm_assert_master_post_handler, + &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, sizeof(struct dlm_create_lock), dlm_create_lock_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, DLM_CONVERT_LOCK_MAX_LEN, dlm_convert_lock_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, DLM_UNLOCK_LOCK_MAX_LEN, dlm_unlock_lock_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, DLM_PROXY_AST_MAX_LEN, dlm_proxy_ast_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, 
&dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, sizeof(struct dlm_exit_domain), dlm_exit_domain_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key, + sizeof(struct dlm_deref_lockres), + dlm_deref_lockres_handler, + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, sizeof(struct dlm_migrate_request), dlm_migrate_request_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, DLM_MIG_LOCKRES_MAX_LEN, dlm_mig_lockres_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, sizeof(struct dlm_master_requery), dlm_master_requery_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, sizeof(struct dlm_lock_request), dlm_request_all_locks_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, sizeof(struct dlm_reco_data_done), dlm_reco_data_done_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, sizeof(struct dlm_begin_reco), dlm_begin_reco_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, sizeof(struct dlm_finalize_reco), dlm_finalize_reco_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; @@ -1141,6 +1264,8 @@ bail: static int dlm_join_domain(struct dlm_ctxt *dlm) { int status; + unsigned int backoff; + unsigned int total_backoff = 0; BUG_ON(!dlm); @@ -1172,18 +1297,27 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) } do { - unsigned int backoff; status = dlm_try_to_join_domain(dlm); /* If we're racing another node to the join, then we * need to back off temporarily and let them * complete. */ +#define DLM_JOIN_TIMEOUT_MSECS 90000 if (status == -EAGAIN) { if (signal_pending(current)) { status = -ERESTARTSYS; goto bail; } + if (total_backoff > + msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + status = -ERESTARTSYS; + mlog(ML_NOTICE, "Timed out joining dlm domain " + "%s after %u msecs\n", dlm->name, + jiffies_to_msecs(total_backoff)); + goto bail; + } + /* * <chip> After you! * <dale> No, after you! 
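Putting this hunk together with the backoff computation that follows, the join loop now behaves roughly as sketched below. One observation: the hunk appears to accumulate total_backoff in milliseconds yet compares it against msecs_to_jiffies() and prints it through jiffies_to_msecs(), so the effective cap drifts with HZ; the sketch keeps one unit throughout for clarity.

#define JOIN_TIMEOUT_MS         90000   /* mirrors DLM_JOIN_TIMEOUT_MSECS */
#define BACKOFF_UNIT_MS         200     /* mirrors DLM_DOMAIN_BACKOFF_MS */

static int join_with_bounded_backoff(struct dlm_ctxt *dlm)
{
        unsigned int delay_ms, total_ms = 0;
        int status;

        do {
                status = dlm_try_to_join_domain(dlm);
                if (status != -EAGAIN)
                        break;
                if (signal_pending(current))
                        return -ERESTARTSYS;
                if (total_ms > JOIN_TIMEOUT_MS)
                        return -ERESTARTSYS;    /* stop retrying forever */
                /* cheap pseudo-random 0..3 units, as in the patch */
                delay_ms = (unsigned int)(jiffies & 0x3) * BACKOFF_UNIT_MS;
                total_ms += delay_ms;
                msleep(delay_ms);
        } while (1);

        return status;
}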
@@ -1193,6 +1327,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) */ backoff = (unsigned int)(jiffies & 0x3); backoff *= DLM_DOMAIN_BACKOFF_MS; + total_backoff += backoff; mlog(0, "backoff %d\n", backoff); msleep(backoff); } @@ -1223,7 +1358,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, int i; struct dlm_ctxt *dlm = NULL; - dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL); + dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); if (!dlm) { mlog_errno(-ENOMEM); goto leave; @@ -1421,21 +1556,21 @@ static int dlm_register_net_handlers(void) status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, sizeof(struct dlm_query_join_request), dlm_query_join_handler, - NULL, &dlm_join_handlers); + NULL, NULL, &dlm_join_handlers); if (status) goto bail; status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, sizeof(struct dlm_assert_joined), dlm_assert_joined_handler, - NULL, &dlm_join_handlers); + NULL, NULL, &dlm_join_handlers); if (status) goto bail; status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, sizeof(struct dlm_cancel_join), dlm_cancel_join_handler, - NULL, &dlm_join_handlers); + NULL, NULL, &dlm_join_handlers); bail: if (status < 0) diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 941acf14e61..b7f0ba97a1a 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -176,7 +176,7 @@ static ssize_t dlmfs_file_read(struct file *filp, int bytes_left; ssize_t readlen; char *lvb_buf; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", inode->i_ino, count, *ppos); @@ -220,7 +220,7 @@ static ssize_t dlmfs_file_write(struct file *filp, int bytes_left; ssize_t writelen; char *lvb_buf; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", inode->i_ino, count, *ppos); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 42a1b91979b..52578d907d9 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -163,6 +163,10 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, kick_thread = 1; } } + /* reduce the inflight count, this may result in the lockres + * being purged below during calc_usage */ + if (lock->ml.node == dlm->node_num) + dlm_lockres_drop_inflight_ref(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -408,13 +412,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, struct dlm_lock *lock; int kernel_allocated = 0; - lock = kcalloc(1, sizeof(*lock), GFP_NOFS); + lock = kzalloc(sizeof(*lock), GFP_NOFS); if (!lock) return NULL; if (!lksb) { /* zero memory only if kernel-allocated */ - lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS); + lksb = kzalloc(sizeof(*lksb), GFP_NOFS); if (!lksb) { kfree(lock); return NULL; @@ -437,7 +441,8 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, * held on exit: none * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED */ -int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 856012b4fa4..77e4e6169a0 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -99,9 +99,10 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm, int idx); static void 
dlm_assert_master_worker(struct dlm_work_item *item, void *data); -static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, - unsigned int namelen, void *nodemap, - u32 flags); +static int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags); +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); static inline int dlm_mle_equal(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle, @@ -237,7 +238,8 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry **mle, char *name, unsigned int namelen); -static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to); +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to); static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, @@ -687,6 +689,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, INIT_LIST_HEAD(&res->purge); atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; + res->inflight_locks = 0; kref_init(&res->refs); @@ -700,6 +703,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; memset(res->lvb, 0, DLM_LVB_LEN); + memset(res->refmap, 0, sizeof(res->refmap)); } struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, @@ -722,6 +726,42 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, return res; } +void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line) +{ + if (!new_lockres) + assert_spin_locked(&res->spinlock); + + if (!test_bit(dlm->node_num, res->refmap)) { + BUG_ON(res->inflight_locks != 0); + dlm_lockres_set_refmap_bit(dlm->node_num, res); + } + res->inflight_locks++; + mlog(0, "%s:%.*s: inflight++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); +} + +void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + assert_spin_locked(&res->spinlock); + + BUG_ON(res->inflight_locks == 0); + res->inflight_locks--; + mlog(0, "%s:%.*s: inflight--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); + if (res->inflight_locks == 0) + dlm_lockres_clear_refmap_bit(dlm->node_num, res); + wake_up(&res->wq); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. @@ -752,6 +792,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, unsigned int hash; int tries = 0; int bit, wait_on_recovery = 0; + int drop_inflight_if_nonlocal = 0; BUG_ON(!lockid); @@ -761,9 +802,30 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, lookup: spin_lock(&dlm->spinlock); - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); + tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); if (tmpres) { + int dropping_ref = 0; + + spin_lock(&tmpres->spinlock); + if (tmpres->owner == dlm->node_num) { + BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); + dlm_lockres_grab_inflight_ref(dlm, tmpres); + } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) + dropping_ref = 1; + spin_unlock(&tmpres->spinlock); spin_unlock(&dlm->spinlock); + + /* wait until done messaging the master, drop our ref to allow + * the lockres to be purged, start over. 
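The two functions above define the inflight-reference contract; condensed, the lifecycle looks like the sketch below. The bare clear_bit() stands in for the patch's dlm_lockres_clear_refmap_bit() wrapper, which also logs its caller:

/*
 * Inflight-reference lifecycle, condensed from the helpers above:
 *
 *   dlm_get_lock_resource()               inflight++ (own refmap bit set)
 *     lock attached in dlmlock_master()   inflight--
 *     or mastery went elsewhere           inflight-- (drop_inflight_if_nonlocal)
 *
 * While inflight_locks > 0 this node's refmap bit stays set, which is
 * exactly what keeps dlm_thread from purging the lockres in the gap
 * between mastering it and queueing its first lock.
 */
static void drop_inflight(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
        assert_spin_locked(&res->spinlock);
        BUG_ON(res->inflight_locks == 0);

        res->inflight_locks--;
        if (res->inflight_locks == 0)
                clear_bit(dlm->node_num, res->refmap);
        wake_up(&res->wq);      /* a purge may be waiting on this */
}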
*/ + if (dropping_ref) { + spin_lock(&tmpres->spinlock); + __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + mlog(0, "found in hash!\n"); if (res) dlm_lockres_put(res); @@ -793,6 +855,7 @@ lookup: spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, dlm->node_num); __dlm_insert_lockres(dlm, res); + dlm_lockres_grab_inflight_ref(dlm, res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); /* lockres still marked IN_PROGRESS */ @@ -805,29 +868,40 @@ lookup: /* if we found a block, wait for lock to be mastered by another node */ blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); if (blocked) { + int mig; if (mle->type == DLM_MLE_MASTER) { mlog(ML_ERROR, "master entry for nonexistent lock!\n"); BUG(); - } else if (mle->type == DLM_MLE_MIGRATION) { - /* migration is in progress! */ - /* the good news is that we now know the - * "current" master (mle->master). */ - + } + mig = (mle->type == DLM_MLE_MIGRATION); + /* if there is a migration in progress, let the migration + * finish before continuing. we can wait for the absence + * of the MIGRATION mle: either the migrate finished or + * one of the nodes died and the mle was cleaned up. + * if there is a BLOCK here, but it already has a master + * set, we are too late. the master does not have a ref + * for us in the refmap. detach the mle and drop it. + * either way, go back to the top and start over. */ + if (mig || mle->master != O2NM_MAX_NODES) { + BUG_ON(mig && mle->master == dlm->node_num); + /* we arrived too late. the master does not + * have a ref for us. retry. */ + mlog(0, "%s:%.*s: late on %s\n", + dlm->name, namelen, lockid, + mig ? "MIGRATION" : "BLOCK"); spin_unlock(&dlm->master_lock); - assert_spin_locked(&dlm->spinlock); - - /* set the lockres owner and hash it */ - spin_lock(&res->spinlock); - dlm_set_lockres_owner(dlm, res, mle->master); - __dlm_insert_lockres(dlm, res); - spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); /* master is known, detach */ - dlm_mle_detach_hb_events(dlm, mle); + if (!mig) + dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); mle = NULL; - goto wake_waiters; + /* this is lame, but we cant wait on either + * the mle or lockres waitqueue here */ + if (mig) + msleep(100); + goto lookup; } } else { /* go ahead and try to master lock on this node */ @@ -858,6 +932,13 @@ lookup: /* finally add the lockres to its hash bucket */ __dlm_insert_lockres(dlm, res); + /* since this lockres is new it doesnt not require the spinlock */ + dlm_lockres_grab_inflight_ref_new(dlm, res); + + /* if this node does not become the master make sure to drop + * this inflight reference below */ + drop_inflight_if_nonlocal = 1; + /* get an extra ref on the mle in case this is a BLOCK * if so, the creator of the BLOCK may try to put the last * ref at this time in the assert master handler, so we @@ -910,7 +991,7 @@ redo_request: ret = -EINVAL; dlm_node_iter_init(mle->vote_map, &iter); while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { - ret = dlm_do_master_request(mle, nodenum); + ret = dlm_do_master_request(res, mle, nodenum); if (ret < 0) mlog_errno(ret); if (mle->master != O2NM_MAX_NODES) { @@ -960,6 +1041,8 @@ wait: wake_waiters: spin_lock(&res->spinlock); + if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) + dlm_lockres_drop_inflight_ref(dlm, res); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -998,7 +1081,7 @@ recheck: 
/* this will cause the master to re-assert across * the whole cluster, freeing up mles */ if (res->owner != dlm->node_num) { - ret = dlm_do_master_request(mle, res->owner); + ret = dlm_do_master_request(res, mle, res->owner); if (ret < 0) { /* give recovery a chance to run */ mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); @@ -1062,6 +1145,8 @@ recheck: * now tell other nodes that I am * mastering this. */ mle->master = dlm->node_num; + /* ref was grabbed in get_lock_resource + * will be dropped in dlmlock_master */ assert = 1; sleep = 0; } @@ -1087,7 +1172,8 @@ recheck: (atomic_read(&mle->woken) == 1), timeo); if (res->owner == O2NM_MAX_NODES) { - mlog(0, "waiting again\n"); + mlog(0, "%s:%.*s: waiting again\n", dlm->name, + res->lockname.len, res->lockname.name); goto recheck; } mlog(0, "done waiting, master is %u\n", res->owner); @@ -1100,8 +1186,7 @@ recheck: m = dlm->node_num; mlog(0, "about to master %.*s here, this=%u\n", res->lockname.len, res->lockname.name, m); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, mle->vote_map, 0); + ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); if (ret) { /* This is a failure in the network path, * not in the response to the assert_master @@ -1117,6 +1202,8 @@ recheck: /* set the lockres owner */ spin_lock(&res->spinlock); + /* mastery reference obtained either during + * assert_master_handler or in get_lock_resource */ dlm_change_lockres_owner(dlm, res, m); spin_unlock(&res->spinlock); @@ -1283,7 +1370,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, * */ -static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to) +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to) { struct dlm_ctxt *dlm = mle->dlm; struct dlm_master_request request; @@ -1339,6 +1427,9 @@ again: case DLM_MASTER_RESP_YES: set_bit(to, mle->response_map); mlog(0, "node %u is the master, response=YES\n", to); + mlog(0, "%s:%.*s: master node %u now knows I have a " + "reference\n", dlm->name, res->lockname.len, + res->lockname.name, to); mle->master = to; break; case DLM_MASTER_RESP_NO: @@ -1379,7 +1470,8 @@ out: * * if possible, TRIM THIS DOWN!!! 
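dlm_get_lock_resource() above parks on __dlm_wait_on_lockres_flags() while a purge drains. That helper is not part of this diff; judging by how callers invoke it with res->spinlock held, its shape is presumably the classic drop-the-lock-around-schedule wait, sketched here:

static void wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
{
        DECLARE_WAITQUEUE(wait, current);

        assert_spin_locked(&res->spinlock);
        add_wait_queue(&res->wq, &wait);
        while (res->state & flags) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock(&res->spinlock);
                schedule();             /* flag-clearer does the wake_up */
                spin_lock(&res->spinlock);
        }
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&res->wq, &wait);
}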
*/ -int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { u8 response = DLM_MASTER_RESP_MAYBE; struct dlm_ctxt *dlm = data; @@ -1417,10 +1509,11 @@ way_up_top: /* take care of the easy cases up front */ spin_lock(&res->spinlock); - if (res->state & DLM_LOCK_RES_RECOVERING) { + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_MIGRATING)) { spin_unlock(&res->spinlock); mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " - "being recovered\n"); + "being recovered/migrated\n"); response = DLM_MASTER_RESP_ERROR; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1428,8 +1521,10 @@ way_up_top: } if (res->owner == dlm->node_num) { + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, request->node_idx); + dlm_lockres_set_refmap_bit(request->node_idx, res); spin_unlock(&res->spinlock); - // mlog(0, "this node is the master\n"); response = DLM_MASTER_RESP_YES; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1477,7 +1572,6 @@ way_up_top: mlog(0, "node %u is master, but trying to migrate to " "node %u.\n", tmpmle->master, tmpmle->new_master); if (tmpmle->master == dlm->node_num) { - response = DLM_MASTER_RESP_YES; mlog(ML_ERROR, "no owner on lockres, but this " "node is trying to migrate it to %u?!\n", tmpmle->new_master); @@ -1494,6 +1588,10 @@ way_up_top: * go back and clean the mles on any * other nodes */ dispatch_assert = 1; + dlm_lockres_set_refmap_bit(request->node_idx, res); + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, + request->node_idx); } else response = DLM_MASTER_RESP_NO; } else { @@ -1607,17 +1705,24 @@ send_response: * can periodically run all locks owned by this node * and re-assert across the cluster... */ -static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, - unsigned int namelen, void *nodemap, - u32 flags) +int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags) { struct dlm_assert_master assert; int to, tmpret; struct dlm_node_iter iter; int ret = 0; int reassert; + const char *lockname = res->lockname.name; + unsigned int namelen = res->lockname.len; BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + again: reassert = 0; @@ -1647,6 +1752,7 @@ again: mlog(0, "link to %d went down!\n", to); /* any nonzero status return will do */ ret = tmpret; + r = 0; } else if (r < 0) { /* ok, something horribly messed. kill thyself. 
*/ mlog(ML_ERROR,"during assert master of %.*s to %u, " @@ -1661,17 +1767,39 @@ again: spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); BUG(); - } else if (r == EAGAIN) { + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT && + !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { + mlog(ML_ERROR, "%.*s: very strange, " + "master MLE but no lockres on %u\n", + namelen, lockname, to); + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT) { mlog(0, "%.*s: node %u create mles on other " "nodes and requests a re-assert\n", namelen, lockname, to); reassert = 1; } + if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { + mlog(0, "%.*s: node %u has a reference to this " + "lockres, set the bit in the refmap\n", + namelen, lockname, to); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(to, res); + spin_unlock(&res->spinlock); + } } if (reassert) goto again; + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + return ret; } @@ -1684,7 +1812,8 @@ again: * * if possible, TRIM THIS DOWN!!! */ -int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_master_list_entry *mle = NULL; @@ -1693,7 +1822,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) char *name; unsigned int namelen, hash; u32 flags; - int master_request = 0; + int master_request = 0, have_lockres_ref = 0; int ret = 0; if (!dlm_grab(dlm)) @@ -1851,6 +1980,7 @@ ok: spin_unlock(&mle->spinlock); if (res) { + int wake = 0; spin_lock(&res->spinlock); if (mle->type == DLM_MLE_MIGRATION) { mlog(0, "finishing off migration of lockres %.*s, " @@ -1858,12 +1988,16 @@ ok: res->lockname.len, res->lockname.name, dlm->node_num, mle->new_master); res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; dlm_change_lockres_owner(dlm, res, mle->new_master); BUG_ON(res->state & DLM_LOCK_RES_DIRTY); } else { dlm_change_lockres_owner(dlm, res, mle->master); } spin_unlock(&res->spinlock); + have_lockres_ref = 1; + if (wake) + wake_up(&res->wq); } /* master is known, detach if not already detached. @@ -1913,12 +2047,28 @@ ok: done: ret = 0; - if (res) - dlm_lockres_put(res); + if (res) { + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + *ret_data = (void *)res; + } dlm_put(dlm); if (master_request) { mlog(0, "need to tell master to reassert\n"); - ret = EAGAIN; // positive. negative would shoot down the node. + /* positive. negative would shoot down the node. 
*/ + ret |= DLM_ASSERT_RESPONSE_REASSERT; + if (!have_lockres_ref) { + mlog(ML_ERROR, "strange, got assert from %u, MASTER " + "mle present here for %s:%.*s, but no lockres!\n", + assert->node_idx, dlm->name, namelen, name); + } + } + if (have_lockres_ref) { + /* let the master know we have a reference to the lockres */ + ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: got assert from %u, need a ref\n", + dlm->name, namelen, name, assert->node_idx); } return ret; @@ -1929,17 +2079,31 @@ kill: __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); - dlm_lockres_put(res); + *ret_data = (void *)res; dlm_put(dlm); return -EINVAL; } +void dlm_assert_master_post_handler(int status, void *data, void *ret_data) +{ + struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; + + if (ret_data) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + } + return; +} + int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, int ignore_higher, u8 request_from, u32 flags) { struct dlm_work_item *item; - item = kcalloc(1, sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (!item) return -ENOMEM; @@ -2023,9 +2187,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) * even if one or more nodes die */ mlog(0, "worker about to master %.*s here, this=%u\n", res->lockname.len, res->lockname.name, dlm->node_num); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, - nodemap, flags); + ret = dlm_do_assert_master(dlm, res, nodemap, flags); if (ret < 0) { /* no need to restart, we are done */ if (!dlm_is_host_down(ret)) @@ -2097,14 +2259,180 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, return ret; } +/* + * DLM_DEREF_LOCKRES_MSG + */ + +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + struct dlm_deref_lockres deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + mlog(0, "%s:%.*s: sending deref to %d\n", + dlm->name, namelen, lockname, res->owner); + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, + &deref, sizeof(deref), res->owner, &r); + if (ret < 0) + mlog_errno(ret); + else if (r < 0) { + /* BAD. other node says I did not have a ref. 
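Note the shape change in the assert_master reply: instead of a bare positive EAGAIN, the handler now returns a small bitmask, so one round trip can say both "please re-assert" and "I hold a reference to this lockres". The flag values below are assumed; the real definitions live in dlmcommon.h, which this hunk does not show.

#define DLM_ASSERT_RESPONSE_REASSERT    0x00000001      /* assumed value */
#define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002      /* assumed value */

/* handler side: compose the reply (negative still means "fatal") */
ret = 0;
if (master_request)
        ret |= DLM_ASSERT_RESPONSE_REASSERT;
if (have_lockres_ref)
        ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;

/* sender side: both bits may arrive in the same reply */
if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
        spin_lock(&res->spinlock);
        dlm_lockres_set_refmap_bit(to, res);
        spin_unlock(&res->spinlock);
}
if (r & DLM_ASSERT_RESPONSE_REASSERT)
        reassert = 1;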
*/ + mlog(ML_ERROR,"while dropping ref on %s:%.*s " + "(master=%u) got %d.\n", dlm->name, namelen, + lockname, res->owner, r); + dlm_print_one_lock_resource(res); + BUG(); + } + return ret; +} + +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + struct dlm_work_item *item; + int cleared = 0; + int dispatch = 0; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + spin_unlock(&dlm->spinlock); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_SETREF_INPROG) + dispatch = 1; + else { + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + dlm_lockres_clear_refmap_bit(node, res); + cleared = 1; + } + } + spin_unlock(&res->spinlock); + + if (!dispatch) { + if (cleared) + dlm_lockres_calc_usage(dlm, res); + else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + __dlm_print_one_lock_resource(res); + } + ret = 0; + goto done; + } + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) { + ret = -ENOMEM; + mlog_errno(ret); + goto done; + } + + dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); + item->u.dl.deref_res = res; + item->u.dl.deref_node = node; + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + return 0; + +done: + if (res) + dlm_lockres_put(res); + dlm_put(dlm); + + return ret; +} + +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm; + struct dlm_lock_resource *res; + u8 node; + u8 cleared = 0; + + dlm = item->dlm; + res = item->u.dl.deref_res; + node = item->u.dl.deref_node; + + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + dlm_lockres_clear_refmap_bit(node, res); + cleared = 1; + } + spin_unlock(&res->spinlock); + + if (cleared) { + mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", + dlm->name, res->lockname.len, res->lockname.name, node); + dlm_lockres_calc_usage(dlm, res); + } else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + __dlm_print_one_lock_resource(res); + } + + dlm_lockres_put(res); +} + /* * DLM_MIGRATE_LOCKRES */ -int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, - u8 target) +static int dlm_migrate_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target) { struct dlm_master_list_entry *mle = NULL; struct dlm_master_list_entry *oldmle = NULL; 
@@ -2116,7 +2444,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct list_head *queue, *iter; int i; struct dlm_lock *lock; - int empty = 1; + int empty = 1, wake = 0; if (!dlm_grab(dlm)) return -EINVAL; @@ -2241,6 +2569,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, res->lockname.name, target); spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; spin_unlock(&res->spinlock); ret = -EINVAL; } @@ -2268,6 +2597,9 @@ fail: * the lockres */ + /* now that remote nodes are spinning on the MIGRATING flag, + * ensure that all assert_master work is flushed. */ + flush_workqueue(dlm->dlm_worker); /* get an extra reference on the mle. * otherwise the assert_master from the new @@ -2296,6 +2628,7 @@ fail: dlm_put_mle_inuse(mle); spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; spin_unlock(&res->spinlock); goto leave; } @@ -2322,7 +2655,8 @@ fail: res->owner == target) break; - mlog(0, "timed out during migration\n"); + mlog(0, "%s:%.*s: timed out during migration\n", + dlm->name, res->lockname.len, res->lockname.name); /* avoid hang during shutdown when migrating lockres * to a node which also goes down */ if (dlm_is_node_dead(dlm, target)) { @@ -2330,20 +2664,20 @@ fail: "target %u is no longer up, restarting\n", dlm->name, res->lockname.len, res->lockname.name, target); - ret = -ERESTARTSYS; + ret = -EINVAL; + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + goto leave; } - } - if (ret == -ERESTARTSYS) { - /* migration failed, detach and clean up mle */ - dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); - dlm_put_mle_inuse(mle); - spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_MIGRATING; - spin_unlock(&res->spinlock); - goto leave; - } - /* TODO: if node died: stop, clean up, return error */ + } else + mlog(0, "%s:%.*s: caught signal during migration\n", + dlm->name, res->lockname.len, res->lockname.name); } /* all done, set the owner, clear the flag */ @@ -2366,6 +2700,11 @@ leave: if (ret < 0) dlm_kick_thread(dlm, res); + /* wake up waiters if the MIGRATING flag got set + * but migration failed */ + if (wake) + wake_up(&res->wq); + /* TODO: cleanup */ if (mres) free_page((unsigned long)mres); @@ -2376,6 +2715,53 @@ leave: return ret; } +#define DLM_MIGRATION_RETRY_MS 100 + +/* Should be called only after beginning the domain leave process. + * There should not be any remaining locks on nonlocal lock resources, + * and there should be no local locks left on locally mastered resources. + * + * Called with the dlm spinlock held, may drop it to do migration, but + * will re-acquire before exit. + * + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + int ret; + int lock_dropped = 0; + + if (res->owner != dlm->node_num) { + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s:%.*s: this node is not master, " + "trying to free this but locks remain\n", + dlm->name, res->lockname.len, res->lockname.name); + } + goto leave; + } + + /* Wheee! Migrate lockres here! Will sleep so drop spinlock. 
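A small pattern repeats throughout this patch (in the convert handler earlier, and in each migration failure path here): record under res->spinlock that the MIGRATING flag was cleared, and issue the wake_up() once, after the lock is dropped. In this sketch, migration_failed is a placeholder for whichever branch cleared the flag:

int wake = 0;

spin_lock(&res->spinlock);
if (migration_failed) {                 /* placeholder condition */
        res->state &= ~DLM_LOCK_RES_MIGRATING;
        wake = 1;
}
spin_unlock(&res->spinlock);

if (wake)
        wake_up(&res->wq);      /* one wake shared by all failure paths */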
*/ + spin_unlock(&dlm->spinlock); + lock_dropped = 1; + while (1) { + ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); + if (ret >= 0) + break; + if (ret == -ENOTEMPTY) { + mlog(ML_ERROR, "lockres %.*s still has local locks!\n", + res->lockname.len, res->lockname.name); + BUG(); + } + + mlog(0, "lockres %.*s: migrate failed, " + "retrying\n", res->lockname.len, + res->lockname.name); + msleep(DLM_MIGRATION_RETRY_MS); + } + spin_lock(&dlm->spinlock); +leave: + return lock_dropped; +} + int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) { int ret; @@ -2405,7 +2791,8 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, return can_proceed; } -int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) { int ret; spin_lock(&res->spinlock); @@ -2434,8 +2821,15 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, __dlm_lockres_reserve_ast(res); spin_unlock(&res->spinlock); - /* now flush all the pending asts.. hang out for a bit */ + /* now flush all the pending asts */ dlm_kick_thread(dlm, res); + /* before waiting on DIRTY, block processes which may + * try to dirty the lockres before MIGRATING is set */ + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); + res->state |= DLM_LOCK_RES_BLOCK_DIRTY; + spin_unlock(&res->spinlock); + /* now wait on any pending asts and the DIRTY state */ wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); dlm_lockres_release_ast(dlm, res); @@ -2461,6 +2855,13 @@ again: mlog(0, "trying again...\n"); goto again; } + /* now that we are sure the MIGRATING state is there, drop + * the unneded state which blocked threads trying to DIRTY */ + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); + BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; + spin_unlock(&res->spinlock); /* did the target go down or die? */ spin_lock(&dlm->spinlock); @@ -2490,7 +2891,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, { struct list_head *iter, *iter2; struct list_head *queue = &res->granted; - int i; + int i, bit; struct dlm_lock *lock; assert_spin_locked(&res->spinlock); @@ -2508,12 +2909,28 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, BUG_ON(!list_empty(&lock->bast_list)); BUG_ON(lock->ast_pending); BUG_ON(lock->bast_pending); + dlm_lockres_clear_refmap_bit(lock->ml.node, res); list_del_init(&lock->list); dlm_lock_put(lock); } } queue++; } + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + /* do not clear the local node reference, if there is a + * process holding this, let it drop the ref itself */ + if (bit != dlm->node_num) { + mlog(0, "%s:%.*s: node %u had a ref to this " + "migrating lockres, clearing\n", dlm->name, + res->lockname.len, res->lockname.name, bit); + dlm_lockres_clear_refmap_bit(bit, res); + } + bit++; + } } /* for now this is not too intelligent. we will @@ -2601,6 +3018,16 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, mlog(0, "migrate request (node %u) returned %d!\n", nodenum, status); ret = status; + } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { + /* during the migration request we short-circuited + * the mastery of the lockres. 
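The BLOCK_DIRTY additions above are easy to misread as redundant with MIGRATING; they are not. The ordering, reconstructed from the hunks (comment-only sketch):

/*
 * Dirty-drain handshake in dlm_mark_lockres_migrating(), in order:
 *
 *   1. dlm_kick_thread()                  flush pending asts
 *   2. set DLM_LOCK_RES_BLOCK_DIRTY       new work may no longer dirty
 *                                         the lockres
 *   3. wait_event(!dlm_lockres_is_dirty)  drain the DIRTY state
 *   4. confirm MIGRATING is set, then
 *      clear BLOCK_DIRTY                  MIGRATING now does the blocking
 *
 * Without step 2, a racing thread could re-dirty the lockres after the
 * drain in step 3 but before MIGRATING became visible, and the
 * migration would ship a stale snapshot of the lock queues.
 */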
make sure we have + * a mastery ref for nodenum */ + mlog(0, "%s:%.*s: need ref for node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + nodenum); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(nodenum, res); + spin_unlock(&res->spinlock); } } @@ -2619,7 +3046,8 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, * we will have no mle in the list to start with. now we can add an mle for * the migration and this should be the only one found for those scanning the * list. */ -int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_lock_resource *res = NULL; @@ -2745,7 +3173,13 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it from the list so that only one * mle will be found */ list_del_init(&tmp->list); - __dlm_mle_detach_hb_events(dlm, mle); + /* this was obviously WRONG. mle is uninited here. should be tmp. */ + __dlm_mle_detach_hb_events(dlm, tmp); + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref for cleared out mle " + "during migration\n", dlm->name, namelen, name, + master, new_master); } spin_unlock(&tmp->spinlock); } @@ -2753,6 +3187,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* now add a migration mle to the tail of the list */ dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); mle->new_master = new_master; + /* the new master will be sending an assert master for this. + * at that point we will get the refmap reference */ mle->master = master; /* do this for consistency with other mle types */ set_bit(new_master, mle->maybe_map); @@ -2902,6 +3338,13 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, clear_bit(dlm->node_num, iter.node_map); spin_unlock(&dlm->spinlock); + /* ownership of the lockres is changing. account for the + * mastery reference here since old_master will briefly have + * a reference after the migration completes */ + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(old_master, res); + spin_unlock(&res->spinlock); + mlog(0, "now time to do a migrate request to other nodes\n"); ret = dlm_do_migrate_request(dlm, res, old_master, dlm->node_num, &iter); @@ -2914,8 +3357,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, res->lockname.len, res->lockname.name); /* this call now finishes out the nodemap * even if one or more nodes die */ - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, iter.node_map, + ret = dlm_do_assert_master(dlm, res, iter.node_map, DLM_ASSERT_MASTER_FINISH_MIGRATION); if (ret < 0) { /* no longer need to retry. all living nodes contacted. 
*/ @@ -2927,8 +3369,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, set_bit(old_master, iter.node_map); mlog(0, "doing assert master of %.*s back to %u\n", res->lockname.len, res->lockname.name, old_master); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, iter.node_map, + ret = dlm_do_assert_master(dlm, res, iter.node_map, DLM_ASSERT_MASTER_FINISH_MIGRATION); if (ret < 0) { mlog(0, "assert master to original master failed " diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index fb3e2b0817f..6d4a83d5015 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -163,9 +163,6 @@ void dlm_dispatch_work(struct work_struct *work) dlm_workfunc_t *workfunc; int tot=0; - if (!dlm_joined(dlm)) - return; - spin_lock(&dlm->work_lock); list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); @@ -757,7 +754,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) } BUG_ON(num == dead_node); - ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS); + ndata = kzalloc(sizeof(*ndata), GFP_NOFS); if (!ndata) { dlm_destroy_recovery_area(dlm, dead_node); return -ENOMEM; @@ -821,7 +818,8 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, } -int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; @@ -842,7 +840,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) } BUG_ON(lr->dead_node != dlm->reco.dead_node); - item = kcalloc(1, sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (!item) { dlm_put(dlm); return -ENOMEM; @@ -978,7 +976,8 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) } -int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; @@ -1129,6 +1128,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, if (total_locks == mres_total_locks) mres->flags |= DLM_MRES_ALL_DONE; + mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + orig_flags & DLM_MRES_MIGRATION ? 
"migrate" : "recovery", + send_to); + /* send it */ ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, sz, send_to, &status); @@ -1213,6 +1217,34 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock, return 0; } +static void dlm_add_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres) +{ + struct dlm_lock dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.ml.cookie = 0; + dummy.ml.type = LKM_IVMODE; + dummy.ml.convert_type = LKM_IVMODE; + dummy.ml.highest_blocked = LKM_IVMODE; + dummy.lksb = NULL; + dummy.ml.node = dlm->node_num; + dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST); +} + +static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lock *ml, + u8 *nodenum) +{ + if (unlikely(ml->cookie == 0 && + ml->type == LKM_IVMODE && + ml->convert_type == LKM_IVMODE && + ml->highest_blocked == LKM_IVMODE && + ml->list == DLM_BLOCKED_LIST)) { + *nodenum = ml->node; + return 1; + } + return 0; +} int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_migratable_lockres *mres, @@ -1260,6 +1292,14 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, goto error; } } + if (total_locks == 0) { + /* send a dummy lock to indicate a mastery reference only */ + mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n", + dlm->name, res->lockname.len, res->lockname.name, + send_to, flags & DLM_MRES_RECOVERY ? "recovery" : + "migration"); + dlm_add_dummy_lock(dlm, mres); + } /* flush any remaining locks */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); if (ret < 0) @@ -1293,7 +1333,8 @@ error: * do we spin? returning an error only delays the problem really */ -int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_migratable_lockres *mres = @@ -1323,7 +1364,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) ret = -ENOMEM; buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); - item = kcalloc(1, sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (!buf || !item) goto leave; @@ -1382,17 +1423,21 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; spin_unlock(&res->spinlock); + wake_up(&res->wq); /* add an extra ref for just-allocated lockres * otherwise the lockres will be purged immediately */ dlm_lockres_get(res); - } /* at this point we have allocated everything we need, * and we have a hashed lockres with an extra ref and * the proper res->state flags. */ ret = 0; + spin_lock(&res->spinlock); + /* drop this either when master requery finds a different master + * or when a lock is added by the recovery worker */ + dlm_lockres_grab_inflight_ref(dlm, res); if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { /* migration cannot have an unknown master */ BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); @@ -1400,10 +1445,11 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) "unknown owner.. 
will need to requery: " "%.*s\n", mres->lockname_len, mres->lockname); } else { - spin_lock(&res->spinlock); + /* take a reference now to pin the lockres, drop it + * when locks are added in the worker */ dlm_change_lockres_owner(dlm, res, dlm->node_num); - spin_unlock(&res->spinlock); } + spin_unlock(&res->spinlock); /* queue up work for dlm_mig_lockres_worker */ dlm_grab(dlm); /* get an extra ref for the work item */ @@ -1459,6 +1505,9 @@ again: "this node will take it.\n", res->lockname.len, res->lockname.name); } else { + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); mlog(0, "master needs to respond to sender " "that node %u still owns %.*s\n", real_master, res->lockname.len, @@ -1578,7 +1627,8 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, /* this function cannot error, so unless the sending * or receiving of the message failed, the owner can * be trusted */ -int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; @@ -1660,21 +1710,38 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, { struct dlm_migratable_lock *ml; struct list_head *queue; + struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; - int i, bad; + int i, j, bad; struct list_head *iter; struct dlm_lock *lock = NULL; + u8 from = O2NM_MAX_NODES; + unsigned int added = 0; mlog(0, "running %d locks for this lockres\n", mres->num_locks); for (i=0; i<mres->num_locks; i++) { ml = &(mres->ml[i]); + + if (dlm_is_dummy_lock(dlm, ml, &from)) { + /* placeholder, just need to set the refmap bit */ + BUG_ON(mres->num_locks != 1); + mlog(0, "%s:%.*s: dummy lock for %u\n", + dlm->name, mres->lockname_len, mres->lockname, + from); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(from, res); + spin_unlock(&res->spinlock); + added++; + break; + } BUG_ON(ml->highest_blocked != LKM_IVMODE); newlock = NULL; lksb = NULL; queue = dlm_list_num_to_pointer(res, ml->list); + tmpq = NULL; /* if the lock is for the local node it needs to * be moved to the proper location within the queue. 
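dlm_add_dummy_lock() and dlm_is_dummy_lock() above encode "mastery reference only" in-band: a lockres with no locks left still needs its refmap bit set on the new master, and sending one impossible lock record avoids inventing a new message type. The receiver's test, restated with a hypothetical name:

/*
 * An otherwise impossible lock record (cookie 0, every mode LKM_IVMODE,
 * parked on the blocked list) means "no locks to migrate, but set my
 * refmap bit on arrival".
 */
static int mig_lock_is_refmap_only(const struct dlm_migratable_lock *ml,
                                   u8 *nodenum)
{
        if (ml->cookie == 0 &&
            ml->type == LKM_IVMODE &&
            ml->convert_type == LKM_IVMODE &&
            ml->highest_blocked == LKM_IVMODE &&
            ml->list == DLM_BLOCKED_LIST) {
                *nodenum = ml->node;    /* node that wants the reference */
                return 1;
        }
        return 0;
}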
@@ -1684,11 +1751,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 			BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
 
 			spin_lock(&res->spinlock);
-			list_for_each(iter, queue) {
-				lock = list_entry (iter, struct dlm_lock, list);
-				if (lock->ml.cookie != ml->cookie)
-					lock = NULL;
-				else
+			for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
+				tmpq = dlm_list_idx_to_ptr(res, j);
+				list_for_each(iter, tmpq) {
+					lock = list_entry (iter, struct dlm_lock, list);
+					if (lock->ml.cookie != ml->cookie)
+						lock = NULL;
+					else
+						break;
+				}
+				if (lock)
 					break;
 			}
@@ -1698,12 +1770,20 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 				u64 c = ml->cookie;
 				mlog(ML_ERROR, "could not find local lock "
 					       "with cookie %u:%llu!\n",
-					       dlm_get_lock_cookie_node(c),
-					       dlm_get_lock_cookie_seq(c));
+					       dlm_get_lock_cookie_node(be64_to_cpu(c)),
+					       dlm_get_lock_cookie_seq(be64_to_cpu(c)));
+				__dlm_print_one_lock_resource(res);
 				BUG();
 			}
 			BUG_ON(lock->ml.node != ml->node);
 
+			if (tmpq != queue) {
+				mlog(0, "lock was on %u instead of %u for %.*s\n",
+				     j, ml->list, res->lockname.len, res->lockname.name);
+				spin_unlock(&res->spinlock);
+				continue;
+			}
+
 			/* see NOTE above about why we do not update
 			 * to match the master here */
@@ -1711,6 +1791,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 			/* do not alter lock refcount.  switching lists. */
 			list_move_tail(&lock->list, queue);
 			spin_unlock(&res->spinlock);
+			added++;
 
 			mlog(0, "just reordered a local lock!\n");
 			continue;
@@ -1799,14 +1880,14 @@ skip_lvb:
 			mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
 			     "exists on this lockres!\n", dlm->name,
 			     res->lockname.len, res->lockname.name,
-			     dlm_get_lock_cookie_node(c),
-			     dlm_get_lock_cookie_seq(c));
+			     dlm_get_lock_cookie_node(be64_to_cpu(c)),
+			     dlm_get_lock_cookie_seq(be64_to_cpu(c)));
 			mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
 			     "node=%u, cookie=%u:%llu, queue=%d\n",
 			     ml->type, ml->convert_type, ml->node,
-			     dlm_get_lock_cookie_node(ml->cookie),
-			     dlm_get_lock_cookie_seq(ml->cookie),
+			     dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)),
+			     dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)),
 			     ml->list);
 
 			__dlm_print_one_lock_resource(res);
@@ -1817,12 +1898,22 @@ skip_lvb:
 		if (!bad) {
 			dlm_lock_get(newlock);
 			list_add_tail(&newlock->list, queue);
+			mlog(0, "%s:%.*s: added lock for node %u, "
+			     "setting refmap bit\n", dlm->name,
+			     res->lockname.len, res->lockname.name, ml->node);
+			dlm_lockres_set_refmap_bit(ml->node, res);
+			added++;
 		}
 		spin_unlock(&res->spinlock);
 	}
 	mlog(0, "done running all the locks\n");
 
 leave:
+	/* balance the ref taken when the work was queued */
+	spin_lock(&res->spinlock);
+	dlm_lockres_drop_inflight_ref(dlm, res);
+	spin_unlock(&res->spinlock);
+
 	if (ret < 0) {
 		mlog_errno(ret);
 		if (newlock)
@@ -1935,9 +2026,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 		if (res->owner == dead_node) {
 			list_del_init(&res->recovering);
 			spin_lock(&res->spinlock);
+			/* new_master has our reference from
+			 * the lock state sent during recovery */
 			dlm_change_lockres_owner(dlm, res, new_master);
 			res->state &= ~DLM_LOCK_RES_RECOVERING;
-			if (!__dlm_lockres_unused(res))
+			if (__dlm_lockres_has_locks(res))
 				__dlm_dirty_lockres(dlm, res);
 			spin_unlock(&res->spinlock);
 			wake_up(&res->wq);
@@ -1977,9 +2070,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 				dlm_lockres_put(res);
 			}
 			spin_lock(&res->spinlock);
+			/* new_master has our reference from
+			 * the lock state sent during recovery */
 			dlm_change_lockres_owner(dlm, res, new_master);
 			res->state &= ~DLM_LOCK_RES_RECOVERING;
-			if (!__dlm_lockres_unused(res))
+			if (__dlm_lockres_has_locks(res))
 				__dlm_dirty_lockres(dlm, res);
 			spin_unlock(&res->spinlock);
 			wake_up(&res->wq);
@@ -2048,6 +2143,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 {
 	struct list_head *iter, *tmpiter;
 	struct dlm_lock *lock;
+	unsigned int freed = 0;
 
 	/* this node is the lockres master:
 	 * 1) remove any stale locks for the dead node
@@ -2062,6 +2158,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 		if (lock->ml.node == dead_node) {
 			list_del_init(&lock->list);
 			dlm_lock_put(lock);
+			freed++;
 		}
 	}
 	list_for_each_safe(iter, tmpiter, &res->converting) {
@@ -2069,6 +2166,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 		if (lock->ml.node == dead_node) {
 			list_del_init(&lock->list);
 			dlm_lock_put(lock);
+			freed++;
 		}
 	}
 	list_for_each_safe(iter, tmpiter, &res->blocked) {
@@ -2076,9 +2174,23 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 		if (lock->ml.node == dead_node) {
 			list_del_init(&lock->list);
 			dlm_lock_put(lock);
+			freed++;
 		}
 	}
 
+	if (freed) {
+		mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
+		     "dropping ref from lockres\n", dlm->name,
+		     res->lockname.len, res->lockname.name, freed, dead_node);
+		BUG_ON(!test_bit(dead_node, res->refmap));
+		dlm_lockres_clear_refmap_bit(dead_node, res);
+	} else if (test_bit(dead_node, res->refmap)) {
+		mlog(0, "%s:%.*s: dead node %u had a ref, but had "
+		     "no locks and had not purged before dying\n", dlm->name,
+		     res->lockname.len, res->lockname.name, dead_node);
+		dlm_lockres_clear_refmap_bit(dead_node, res);
+	}
+
 	/* do not kick thread yet */
 	__dlm_dirty_lockres(dlm, res);
 }
@@ -2141,9 +2253,21 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 			spin_lock(&res->spinlock);
 			/* zero the lvb if necessary */
 			dlm_revalidate_lvb(dlm, res, dead_node);
-			if (res->owner == dead_node)
+			if (res->owner == dead_node) {
+				if (res->state & DLM_LOCK_RES_DROPPING_REF)
+					mlog(0, "%s:%.*s: owned by "
+					     "dead node %u, this node was "
+					     "dropping its ref when it died. "
+					     "continue, dropping the flag.\n",
+					     dlm->name, res->lockname.len,
+					     res->lockname.name, dead_node);
+
+				/* the wake_up for this will happen when the
+				 * RECOVERING flag is dropped later */
+				res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+
 				dlm_move_lockres_to_recovery_list(dlm, res);
-			else if (res->owner == dlm->node_num) {
+			} else if (res->owner == dlm->node_num) {
 				dlm_free_dead_locks(dlm, res, dead_node);
 				__dlm_lockres_calc_usage(dlm, res);
 			}
@@ -2480,7 +2604,8 @@ retry:
 	return ret;
 }
 
-int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+			   void **ret_data)
 {
 	struct dlm_ctxt *dlm = data;
 	struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
@@ -2608,7 +2733,8 @@ stage2:
 	return ret;
 }
 
-int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+			      void **ret_data)
 {
 	struct dlm_ctxt *dlm = data;
 	struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 0c822f3ffb0..8ffa0916eb8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -54,9 +54,6 @@
 #include "cluster/masklog.h"
 
 static int dlm_thread(void *data);
-static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
-				  struct dlm_lock_resource *lockres);
-
 static void dlm_flush_asts(struct dlm_ctxt *dlm);
 
 #define dlm_lock_is_remote(dlm, lock)   ((lock)->ml.node != (dlm)->node_num)
@@ -82,14 +79,33 @@ repeat:
 	current->state = TASK_RUNNING;
 }
 
-
-int __dlm_lockres_unused(struct dlm_lock_resource *res)
+int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
 {
 	if (list_empty(&res->granted) &&
 	    list_empty(&res->converting) &&
-	    list_empty(&res->blocked) &&
-	    list_empty(&res->dirty))
-		return 1;
+	    list_empty(&res->blocked))
+		return 0;
+	return 1;
+}
+
+/* "unused": the lockres has no locks, is not on the dirty list,
+ * has no inflight locks (in the gap between mastery and acquiring
+ * the first lock), and has no bits in its refmap.
+ * truly ready to be freed. */
+int __dlm_lockres_unused(struct dlm_lock_resource *res)
+{
+	if (!__dlm_lockres_has_locks(res) &&
+	    (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
+		/* try not to scan the bitmap unless the first two
+		 * conditions are already true */
+		int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+		if (bit >= O2NM_MAX_NODES) {
+			/* since the bit for dlm->node_num is not
+			 * set, inflight_locks better be zero */
+			BUG_ON(res->inflight_locks != 0);
+			return 1;
+		}
+	}
 	return 0;
 }
@@ -106,46 +122,21 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 	assert_spin_locked(&res->spinlock);
 
 	if (__dlm_lockres_unused(res)){
-		/* For now, just keep any resource we master */
-		if (res->owner == dlm->node_num)
-		{
-			if (!list_empty(&res->purge)) {
-				mlog(0, "we master %s:%.*s, but it is on "
-				     "the purge list.  Removing\n",
-				     dlm->name, res->lockname.len,
-				     res->lockname.name);
-				list_del_init(&res->purge);
-				dlm->purge_count--;
-			}
-			return;
-		}
-
 		if (list_empty(&res->purge)) {
-			mlog(0, "putting lockres %.*s from purge list\n",
-			     res->lockname.len, res->lockname.name);
+			mlog(0, "putting lockres %.*s:%p onto purge list\n",
+			     res->lockname.len, res->lockname.name, res);
 
 			res->last_used = jiffies;
+			dlm_lockres_get(res);
 			list_add_tail(&res->purge, &dlm->purge_list);
 			dlm->purge_count++;
-
-			/* if this node is not the owner, there is
-			 * no way to keep track of who the owner could be.
-			 * unhash it to avoid serious problems. */
-			if (res->owner != dlm->node_num) {
-				mlog(0, "%s:%.*s: doing immediate "
-				     "purge of lockres owned by %u\n",
-				     dlm->name, res->lockname.len,
-				     res->lockname.name, res->owner);
-
-				dlm_purge_lockres_now(dlm, res);
-			}
 		}
 	} else if (!list_empty(&res->purge)) {
-		mlog(0, "removing lockres %.*s from purge list, "
-		     "owner=%u\n", res->lockname.len, res->lockname.name,
-		     res->owner);
+		mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n",
+		     res->lockname.len, res->lockname.name, res, res->owner);
 		list_del_init(&res->purge);
+		dlm_lockres_put(res);
 		dlm->purge_count--;
 	}
 }
@@ -163,68 +154,65 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 	spin_unlock(&dlm->spinlock);
 }
 
-/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
- * to do migration, but will re-acquire before exit. */
-void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
+static int dlm_purge_lockres(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res)
 {
 	int master;
-	int ret;
-
-	spin_lock(&lockres->spinlock);
-	master = lockres->owner == dlm->node_num;
-	spin_unlock(&lockres->spinlock);
+	int ret = 0;
 
-	mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
-	     lockres->lockname.name, master);
-
-	/* Non master is the easy case -- no migration required, just
-	 * quit. */
+	spin_lock(&res->spinlock);
+	if (!__dlm_lockres_unused(res)) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "%s:%.*s: tried to purge but not unused\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		return -ENOTEMPTY;
+	}
+	master = (res->owner == dlm->node_num);
 	if (!master)
-		goto finish;
-
-	/* Wheee! Migrate lockres here! */
-	spin_unlock(&dlm->spinlock);
-again:
+		res->state |= DLM_LOCK_RES_DROPPING_REF;
+	spin_unlock(&res->spinlock);
 
-	ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
-	if (ret == -ENOTEMPTY) {
-		mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
-		     lockres->lockname.len, lockres->lockname.name);
+	mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
+	     res->lockname.name, master);
 
-		BUG();
-	} else if (ret < 0) {
-		mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
-		     lockres->lockname.len, lockres->lockname.name);
-		msleep(100);
-		goto again;
+	if (!master) {
+		spin_lock(&res->spinlock);
+		/* This ensures that clear refmap is sent after the set */
+		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+		spin_unlock(&res->spinlock);
+		/* drop spinlock to do messaging, retake below */
+		spin_unlock(&dlm->spinlock);
+		/* clear our bit from the master's refmap, ignore errors */
+		ret = dlm_drop_lockres_ref(dlm, res);
+		if (ret < 0) {
+			mlog_errno(ret);
+			if (!dlm_is_host_down(ret))
+				BUG();
+		}
+		mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
+		     dlm->name, res->lockname.len, res->lockname.name, ret);
+		spin_lock(&dlm->spinlock);
 	}
 
-	spin_lock(&dlm->spinlock);
-
-finish:
-	if (!list_empty(&lockres->purge)) {
-		list_del_init(&lockres->purge);
+	if (!list_empty(&res->purge)) {
+		mlog(0, "removing lockres %.*s:%p from purgelist, "
+		     "master = %d\n", res->lockname.len, res->lockname.name,
+		     res, master);
+		list_del_init(&res->purge);
+		dlm_lockres_put(res);
 		dlm->purge_count--;
 	}
-	__dlm_unhash_lockres(lockres);
-}
-
-/* make an unused lockres go away immediately.
- * as soon as the dlm spinlock is dropped, this lockres
- * will not be found. kfree still happens on last put. */
-static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
-				  struct dlm_lock_resource *lockres)
-{
-	assert_spin_locked(&dlm->spinlock);
-	assert_spin_locked(&lockres->spinlock);
+	__dlm_unhash_lockres(res);
 
-	BUG_ON(!__dlm_lockres_unused(lockres));
-
-	if (!list_empty(&lockres->purge)) {
-		list_del_init(&lockres->purge);
-		dlm->purge_count--;
+	/* lockres is not in the hash now. drop the flag and wake up
	 * any processes waiting in dlm_get_lock_resource. */
+	if (!master) {
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+		spin_unlock(&res->spinlock);
+		wake_up(&res->wq);
 	}
-	__dlm_unhash_lockres(lockres);
+	return 0;
 }
 
 static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -268,13 +256,17 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
 			break;
 		}
 
+		mlog(0, "removing lockres %.*s:%p from purgelist\n",
+		     lockres->lockname.len, lockres->lockname.name, lockres);
 		list_del_init(&lockres->purge);
+		dlm_lockres_put(lockres);
 		dlm->purge_count--;
 
 		/* This may drop and reacquire the dlm spinlock if it
 		 * has to do migration. */
 		mlog(0, "calling dlm_purge_lockres!\n");
-		dlm_purge_lockres(dlm, lockres);
+		if (dlm_purge_lockres(dlm, lockres))
+			BUG();
 		mlog(0, "DONE calling dlm_purge_lockres!\n");
 
 		/* Avoid adding any scheduling latencies */
@@ -467,12 +459,17 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 	assert_spin_locked(&res->spinlock);
 
 	/* don't shuffle secondary queues */
-	if ((res->owner == dlm->node_num) &&
-	    !(res->state & DLM_LOCK_RES_DIRTY)) {
-		/* ref for dirty_list */
-		dlm_lockres_get(res);
-		list_add_tail(&res->dirty, &dlm->dirty_list);
-		res->state |= DLM_LOCK_RES_DIRTY;
+	if ((res->owner == dlm->node_num)) {
+		if (res->state & (DLM_LOCK_RES_MIGRATING |
+				  DLM_LOCK_RES_BLOCK_DIRTY))
+			return;
+
+		if (list_empty(&res->dirty)) {
+			/* ref for dirty_list */
+			dlm_lockres_get(res);
+			list_add_tail(&res->dirty, &dlm->dirty_list);
+			res->state |= DLM_LOCK_RES_DIRTY;
+		}
 	}
 }
@@ -651,7 +648,7 @@ static int dlm_thread(void *data)
 		dlm_lockres_get(res);
 
 		spin_lock(&res->spinlock);
-		res->state &= ~DLM_LOCK_RES_DIRTY;
+		/* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */
 		list_del_init(&res->dirty);
 		spin_unlock(&res->spinlock);
 		spin_unlock(&dlm->spinlock);
@@ -675,10 +672,11 @@ static int dlm_thread(void *data)
 		/* it is now ok to move lockreses in these states
 		 * to the dirty list, assuming that they will only be
		 * dirty for a short while. */
+		BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
 		if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
-				  DLM_LOCK_RES_MIGRATING |
 				  DLM_LOCK_RES_RECOVERING)) {
 			/* move it to the tail and keep going */
+			res->state &= ~DLM_LOCK_RES_DIRTY;
 			spin_unlock(&res->spinlock);
 			mlog(0, "delaying list shuffling for in-"
 			     "progress lockres %.*s, state=%d\n",
@@ -699,6 +697,7 @@ static int dlm_thread(void *data)
 
 		/* called while holding lockres lock */
 		dlm_shuffle_lists(dlm, res);
+		res->state &= ~DLM_LOCK_RES_DIRTY;
 		spin_unlock(&res->spinlock);
 
 		dlm_lockres_calc_usage(dlm, res);
@@ -709,11 +708,8 @@ in_progress:
 		/* if the lock was in-progress, stick
 		 * it on the back of the list */
 		if (delay) {
-			/* ref for dirty_list */
-			dlm_lockres_get(res);
 			spin_lock(&res->spinlock);
-			list_add_tail(&res->dirty, &dlm->dirty_list);
-			res->state |= DLM_LOCK_RES_DIRTY;
+			__dlm_dirty_lockres(dlm, res);
 			spin_unlock(&res->spinlock);
 		}
 		dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 37be4b2e0d4..86ca085ef32 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -147,6 +147,10 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 		goto leave;
 	}
 
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		status = DLM_MIGRATING;
+		goto leave;
+	}
 
 	/* see above for what the spec says about
 	 * LKM_CANCEL and the lock queue state */
@@ -244,8 +248,8 @@ leave:
 		/* this should always be coupled with list removal */
 		BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
 		mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
-		     dlm_get_lock_cookie_node(lock->ml.cookie),
-		     dlm_get_lock_cookie_seq(lock->ml.cookie),
+		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
 		     atomic_read(&lock->lock_refs.refcount)-1);
 		dlm_lock_put(lock);
 	}
@@ -379,7 +383,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
  * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
  *          return value from dlmunlock_master
  */
-int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+			    void **ret_data)
 {
 	struct dlm_ctxt *dlm = data;
 	struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
@@ -502,8 +507,8 @@ not_found:
 	if (!found)
 		mlog(ML_ERROR, "failed to find lock to unlock! "
 			       "cookie=%u:%llu\n",
-			       dlm_get_lock_cookie_node(unlock->cookie),
-			       dlm_get_lock_cookie_seq(unlock->cookie));
+			       dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)),
+			       dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie)));
 	else
 		dlm_lock_put(lock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 69fba16efbd..e335541727f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -770,7 +770,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 			     int dlm_flags)
 {
 	int ret = 0;
-	enum dlm_status status;
+	enum dlm_status status = DLM_NORMAL;
 	unsigned long flags;
 
 	mlog_entry_void();
@@ -1138,6 +1138,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 {
 	int status, level;
 	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	BUG_ON(!inode);
@@ -1147,6 +1148,9 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
 
+	if (ocfs2_mount_local(osb))
+		return 0;
+
 	lockres = &OCFS2_I(inode)->ip_rw_lockres;
 
 	level = write ? LKM_EXMODE : LKM_PRMODE;
@@ -1164,6 +1168,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
 {
 	int level = write ? LKM_EXMODE : LKM_PRMODE;
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry_void();
@@ -1171,7 +1176,8 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
 
-	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
 
 	mlog_exit_void();
 }
@@ -1182,6 +1188,7 @@ int ocfs2_data_lock_full(struct inode *inode,
 {
 	int status = 0, level;
 	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	BUG_ON(!inode);
@@ -1201,6 +1208,9 @@ int ocfs2_data_lock_full(struct inode *inode,
 		goto out;
 	}
 
+	if (ocfs2_mount_local(osb))
+		goto out;
+
 	lockres = &OCFS2_I(inode)->ip_data_lockres;
 
 	level = write ? LKM_EXMODE : LKM_PRMODE;
@@ -1269,6 +1279,7 @@ void ocfs2_data_unlock(struct inode *inode,
 {
 	int level = write ? LKM_EXMODE : LKM_PRMODE;
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry_void();
@@ -1276,7 +1287,8 @@ void ocfs2_data_unlock(struct inode *inode,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
 
-	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
+	    !ocfs2_mount_local(osb))
 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
 
 	mlog_exit_void();
@@ -1467,8 +1479,9 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 {
 	int status = 0;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_lock_res *lockres;
+	struct ocfs2_lock_res *lockres = NULL;
 	struct ocfs2_dinode *fe;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry_void();
@@ -1483,10 +1496,12 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 	}
 	spin_unlock(&oi->ip_lock);
 
-	lockres = &oi->ip_meta_lockres;
+	if (!ocfs2_mount_local(osb)) {
+		lockres = &oi->ip_meta_lockres;
 
-	if (!ocfs2_should_refresh_lock_res(lockres))
-		goto bail;
+		if (!ocfs2_should_refresh_lock_res(lockres))
+			goto bail;
+	}
 
 	/* This will discard any caching information we might have had
	 * for the inode metadata. */
@@ -1496,7 +1511,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 	 * map (directories, bitmap files, etc) */
 	ocfs2_extent_map_trunc(inode, 0);
 
-	if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
+	if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) {
 		mlog(0, "Trusting LVB on inode %llu\n",
 		     (unsigned long long)oi->ip_blkno);
 		ocfs2_refresh_inode_from_lvb(inode);
@@ -1543,7 +1558,8 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 
 	status = 0;
 bail_refresh:
-	ocfs2_complete_lock_res_refresh(lockres, status);
+	if (lockres)
+		ocfs2_complete_lock_res_refresh(lockres, status);
 bail:
 	mlog_exit(status);
 	return status;
@@ -1585,7 +1601,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
 			 int arg_flags)
 {
 	int status, level, dlm_flags, acquired;
-	struct ocfs2_lock_res *lockres;
+	struct ocfs2_lock_res *lockres = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *local_bh = NULL;
@@ -1607,6 +1623,9 @@ int ocfs2_meta_lock_full(struct inode *inode,
 		goto bail;
 	}
 
+	if (ocfs2_mount_local(osb))
+		goto local;
+
 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
 		wait_event(osb->recovery_event,
 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
@@ -1636,6 +1655,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
 		wait_event(osb->recovery_event,
 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
 
+local:
 	/*
 	 * We only see this flag if we're being called from
 	 * ocfs2_read_locked_inode(). It means we're locking an inode
@@ -1644,7 +1664,8 @@ int ocfs2_meta_lock_full(struct inode *inode,
 	 */
 	if (inode->i_state & I_NEW) {
 		status = 0;
-		ocfs2_complete_lock_res_refresh(lockres, 0);
+		if (lockres)
+			ocfs2_complete_lock_res_refresh(lockres, 0);
 		goto bail;
 	}
@@ -1767,6 +1788,7 @@ void ocfs2_meta_unlock(struct inode *inode,
 {
 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry_void();
@@ -1774,7 +1796,8 @@ void ocfs2_meta_unlock(struct inode *inode,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     ex ? "EXMODE" : "PRMODE");
 
-	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
+	    !ocfs2_mount_local(osb))
 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
 
 	mlog_exit_void();
@@ -1783,7 +1806,7 @@ void ocfs2_meta_unlock(struct inode *inode,
 int ocfs2_super_lock(struct ocfs2_super *osb,
 		     int ex)
 {
-	int status;
+	int status = 0;
 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
 	struct buffer_head *bh;
@@ -1794,6 +1817,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
 	if (ocfs2_is_hard_readonly(osb))
 		return -EROFS;
 
+	if (ocfs2_mount_local(osb))
+		goto bail;
+
 	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1832,7 +1858,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
 
-	ocfs2_cluster_unlock(osb, lockres, level);
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, level);
 }
 
 int ocfs2_rename_lock(struct ocfs2_super *osb)
@@ -1843,6 +1870,9 @@ int ocfs2_rename_lock(struct ocfs2_super *osb)
 	if (ocfs2_is_hard_readonly(osb))
 		return -EROFS;
 
+	if (ocfs2_mount_local(osb))
+		return 0;
+
 	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
 	if (status < 0)
 		mlog_errno(status);
@@ -1854,7 +1884,8 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
 {
 	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
 
-	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
 }
 
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
@@ -1869,6 +1900,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 	if (ocfs2_is_hard_readonly(osb))
 		return -EROFS;
 
+	if (ocfs2_mount_local(osb))
+		return 0;
+
 	ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0);
 	if (ret < 0)
 		mlog_errno(ret);
@@ -1882,7 +1916,8 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
 	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
 	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
 
-	ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
 }
 
 /* Reference counting of the dlm debug structure. We want this because
@@ -2145,12 +2180,15 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
 
 int ocfs2_dlm_init(struct ocfs2_super *osb)
 {
-	int status;
+	int status = 0;
 	u32 dlm_key;
-	struct dlm_ctxt *dlm;
+	struct dlm_ctxt *dlm = NULL;
 
 	mlog_entry_void();
 
+	if (ocfs2_mount_local(osb))
+		goto local;
+
 	status = ocfs2_dlm_init_debug(osb);
 	if (status < 0) {
 		mlog_errno(status);
@@ -2178,11 +2216,12 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 		goto bail;
 	}
 
+	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
+
+local:
 	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
 	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
 
-	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
-
 	osb->dlm = dlm;
 
 	status = 0;
@@ -2679,6 +2718,15 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
 	inode = ocfs2_lock_res_inode(lockres);
 	mapping = inode->i_mapping;
 
+	/*
+	 * We need this before the filemap_fdatawrite() so that it can
+	 * transfer the dirty bit from the PTE to the
+	 * page. Unfortunately this means that even for EX->PR
+	 * downconverts, we'll lose our mappings and have to build
	 * them up again.
+	 */
+	unmap_mapping_range(mapping, 0, 0, 0);
+
 	if (filemap_fdatawrite(mapping)) {
 		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -2686,7 +2734,6 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
 	sync_mapping_buffers(mapping);
 	if (blocking == LKM_EXMODE) {
 		truncate_inode_pages(mapping, 0);
-		unmap_mapping_range(mapping, 0, 0, 0);
 	} else {
 		/* We only need to wait on the I/O if we're not also
 		 * truncating pages because truncate_inode_pages waits
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 06be6e774cf..56e1fefc120 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -60,14 +60,11 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
 
 	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0);
 
-	if (IS_ERR(inode)) {
-		mlog_errno(PTR_ERR(inode));
+	if (IS_ERR(inode))
 		return (void *)inode;
-	}
 
 	if (handle->ih_generation != inode->i_generation) {
 		iput(inode);
-		mlog_errno(-ESTALE);
 		return ERR_PTR(-ESTALE);
 	}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8786b3c490a..10953a508f2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -68,7 +68,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
-		   file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+		   file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
 
 	spin_lock(&oi->ip_lock);
@@ -98,8 +98,8 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
-		   file->f_dentry->d_name.len,
-		   file->f_dentry->d_name.name);
+		   file->f_path.dentry->d_name.len,
+		   file->f_path.dentry->d_name.name);
 
 	spin_lock(&oi->ip_lock);
 	if (!--oi->ip_open_count)
@@ -149,10 +149,29 @@ int ocfs2_should_update_atime(struct inode *inode,
 	    ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
 		return 0;
 
+	/*
+	 * We can be called with no vfsmnt structure - NFSD will
+	 * sometimes do this.
+	 *
+	 * Note that our action here is different than touch_atime() -
+	 * if we can't tell whether this is a noatime mount, then we
+	 * don't know whether to trust the value of s_atime_quantum.
+	 */
+	if (vfsmnt == NULL)
+		return 0;
+
 	if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
 	    ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
 		return 0;
 
+	if (vfsmnt->mnt_flags & MNT_RELATIME) {
+		if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
+		    (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
+			return 1;
+
+		return 0;
+	}
+
 	now = CURRENT_TIME;
 	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
 		return 0;
@@ -958,8 +977,6 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
 	}
 
 	ret = generic_permission(inode, mask, NULL);
-	if (ret)
-		mlog_errno(ret);
 
 	ocfs2_meta_unlock(inode, 0);
 out:
@@ -1131,13 +1148,13 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 {
 	int ret, rw_level, have_alloc_sem = 0;
 	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	int appending = filp->f_flags & O_APPEND ? 1 : 0;
 
 	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
 		   (unsigned int)nr_segs,
-		   filp->f_dentry->d_name.len,
-		   filp->f_dentry->d_name.name);
+		   filp->f_path.dentry->d_name.len,
+		   filp->f_path.dentry->d_name.name);
 
 	/* happy write of zero bytes */
 	if (iocb->ki_left == 0)
@@ -1159,7 +1176,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		goto out;
 	}
 
-	ret = ocfs2_prepare_inode_for_write(filp->f_dentry, &iocb->ki_pos,
+	ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,
 					    iocb->ki_left, appending);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -1207,12 +1224,12 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 				       unsigned int flags)
 {
 	int ret;
-	struct inode *inode = out->f_dentry->d_inode;
+	struct inode *inode = out->f_path.dentry->d_inode;
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
 		   (unsigned int)len,
-		   out->f_dentry->d_name.len,
-		   out->f_dentry->d_name.name);
+		   out->f_path.dentry->d_name.len,
+		   out->f_path.dentry->d_name.name);
 
 	inode_double_lock(inode, pipe->inode);
@@ -1222,7 +1239,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		goto out;
 	}
 
-	ret = ocfs2_prepare_inode_for_write(out->f_dentry, ppos, len, 0);
+	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
@@ -1247,12 +1264,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
 				      unsigned int flags)
 {
 	int ret = 0;
-	struct inode *inode = in->f_dentry->d_inode;
+	struct inode *inode = in->f_path.dentry->d_inode;
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
 		   (unsigned int)len,
-		   in->f_dentry->d_name.len,
-		   in->f_dentry->d_name.name);
+		   in->f_path.dentry->d_name.len,
+		   in->f_path.dentry->d_name.name);
 
 	/*
 	 * See the comment in ocfs2_file_aio_read()
@@ -1278,12 +1295,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 {
 	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
 	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = filp->f_path.dentry->d_inode;
 
 	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
 		   (unsigned int)nr_segs,
-		   filp->f_dentry->d_name.len,
-		   filp->f_dentry->d_name.name);
+		   filp->f_path.dentry->d_name.len,
+		   filp->f_path.dentry->d_name.name);
 
 	if (!inode) {
 		ret = -EINVAL;
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index cbfd45a97a6..8fc52d6d0ce 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -154,6 +154,9 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
 {
 	int status;
 
+	if (ocfs2_mount_local(osb))
+		return 0;
+
 	status = o2hb_register_callback(&osb->osb_hb_down);
 	if (status < 0) {
 		mlog_errno(status);
@@ -172,6 +175,9 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
 {
 	int status;
 
+	if (ocfs2_mount_local(osb))
+		return;
+
 	status = o2hb_unregister_callback(&osb->osb_hb_down);
 	if (status < 0)
 		mlog_errno(status);
@@ -186,6 +192,9 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
 	int ret;
 	char *argv[5], *envp[3];
 
+	if (ocfs2_mount_local(osb))
+		return;
+
 	if (!osb->uuid_str) {
		/* This can happen if we don't get far enough in mount... */
 		mlog(0, "No UUID with which to stop heartbeat!\n\n");
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 42e361f3054..28ab56f2b98 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -146,7 +146,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
 	if (is_bad_inode(inode)) {
 		iput(inode);
 		inode = ERR_PTR(-ESTALE);
-		mlog_errno(PTR_ERR(inode));
 		goto bail;
 	}
 
@@ -155,8 +154,7 @@ bail:
 		mlog(0, "returning inode with number %llu\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 		mlog_exit_ptr(inode);
-	} else
-		mlog_errno(PTR_ERR(inode));
+	}
 
 	return inode;
 }
@@ -247,7 +245,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	 * today. change if needed. */
 	if (!OCFS2_IS_VALID_DINODE(fe) ||
 	    !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
-		mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%llu, "
+		mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, "
 		     "signature = %.*s, flags = 0x%x\n",
 		     inode->i_ino,
 		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
@@ -423,7 +421,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	 * cluster lock before trusting anything anyway.
 	 */
 	can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
-		&& !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK);
+		&& !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
+		&& !ocfs2_mount_local(osb);
 
 	/*
 	 * To maintain backwards compatibility with older versions of
@@ -477,11 +476,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	    S_ISBLK(le16_to_cpu(fe->i_mode)))
 		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
 
-	if (ocfs2_populate_inode(inode, fe, 0) < 0) {
-		mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n",
-		     (unsigned long long)fe->i_blkno, inode->i_ino);
+	if (ocfs2_populate_inode(inode, fe, 0) < 0)
 		goto bail;
-	}
 
 	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 1d7f4ab1e5e..825cb0ae1b4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -144,8 +144,10 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 			ocfs2_abort(osb->sb, "Detected aborted journal");
 			handle = ERR_PTR(-EROFS);
 		}
-	} else
-		atomic_inc(&(osb->journal->j_num_trans));
+	} else {
+		if (!ocfs2_mount_local(osb))
+			atomic_inc(&(osb->journal->j_num_trans));
+	}
 
 	return handle;
 }
@@ -507,9 +509,23 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
 	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
 
-	status = ocfs2_journal_toggle_dirty(osb, 0);
-	if (status < 0)
-		mlog_errno(status);
+	if (ocfs2_mount_local(osb)) {
+		journal_lock_updates(journal->j_journal);
+		status = journal_flush(journal->j_journal);
+		journal_unlock_updates(journal->j_journal);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	if (status == 0) {
+		/*
+		 * Do not toggle if flush was unsuccessful otherwise
+		 * will leave dirty metadata in a "clean" journal
+		 */
+		status = ocfs2_journal_toggle_dirty(osb, 0);
+		if (status < 0)
+			mlog_errno(status);
+	}
 
 	/* Shutdown the kernel journal system */
 	journal_destroy(journal->j_journal);
@@ -549,7 +565,7 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
 	}
 }
 
-int ocfs2_journal_load(struct ocfs2_journal *journal)
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
 {
 	int status = 0;
 	struct ocfs2_super *osb;
@@ -576,14 +592,18 @@ int ocfs2_journal_load(struct ocfs2_journal *journal)
 	}
 
 	/* Launch the commit thread */
-	osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt");
-	if (IS_ERR(osb->commit_task)) {
-		status = PTR_ERR(osb->commit_task);
+	if (!local) {
+		osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
+					       "ocfs2cmt");
+		if (IS_ERR(osb->commit_task)) {
+			status = PTR_ERR(osb->commit_task);
+			osb->commit_task = NULL;
+			mlog(ML_ERROR, "unable to launch ocfs2commit thread, "
+			     "error=%d", status);
+			goto done;
+		}
+	} else
 		osb->commit_task = NULL;
-		mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d",
-		     status);
-		goto done;
-	}
 
 done:
 	mlog_exit(status);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 899112ad813..d026b4f2775 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -157,7 +157,7 @@ int    ocfs2_journal_init(struct ocfs2_journal *journal,
 void   ocfs2_journal_shutdown(struct ocfs2_super *osb);
 int    ocfs2_journal_wipe(struct ocfs2_journal *journal,
 			  int full);
-int    ocfs2_journal_load(struct ocfs2_journal *journal);
+int    ocfs2_journal_load(struct ocfs2_journal *journal, int local);
 int    ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
 void   ocfs2_recovery_thread(struct ocfs2_super *osb,
 			     int node_num);
@@ -174,6 +174,9 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
+	if (ocfs2_mount_local(osb))
+		return;
+
 	if (!ocfs2_inode_fully_checkpointed(inode)) {
 		/* WARNING: This only kicks off a single
 		 * checkpoint. If someone races you and adds more
@@ -303,8 +306,8 @@ int ocfs2_journal_dirty_data(handle_t *handle,
  * for the dinode, one for the new block. */
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 
-/* file update (nlink, etc) + dir entry block */
-#define OCFS2_LINK_CREDITS  (OCFS2_INODE_UPDATE_CREDITS + 1)
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block */
+#define OCFS2_LINK_CREDITS  (2*OCFS2_INODE_UPDATE_CREDITS + 1)
 
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
 * dir inode link */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 698d79a74ef..4dedd978910 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -776,7 +776,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 {
 	int status;
 
-	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 	if (!(*ac)) {
 		status = -ENOMEM;
 		mlog_errno(status);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 69f85ae392d..51b02044768 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -83,10 +83,12 @@ static struct vm_operations_struct ocfs2_file_vm_ops = {
 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int ret = 0, lock_level = 0;
+	struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
 
 	/* We don't want to support shared writable mappings yet. */
-	if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
-	    && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
+	if (!ocfs2_mount_local(osb) &&
+	    ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
+	    ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
 		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
 		/* This is -EINVAL because generic_file_readonly_mmap
		 * returns it in a similar situation. */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 21db45ddf14..f3d7803b4b4 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -587,9 +587,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	}
 
 	ocfs2_inode_set_new(osb, inode);
-	status = ocfs2_create_new_inode_locks(inode);
-	if (status < 0)
-		mlog_errno(status);
+	if (!ocfs2_mount_local(osb)) {
+		status = ocfs2_create_new_inode_locks(inode);
+		if (status < 0)
+			mlog_errno(status);
+	}
 
 	status = 0; /* error in ocfs2_create_new_inode_locks is not
 		     * critical */
@@ -930,14 +932,15 @@ static int ocfs2_unlink(struct inode *dir,
 		goto leave;
 	}
 
-	if (S_ISDIR(inode->i_mode)) {
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	if (S_ISDIR(inode->i_mode))
 		drop_nlink(dir);
-		status = ocfs2_mark_inode_dirty(handle, dir,
-						parent_node_bh);
-		if (status < 0) {
-			mlog_errno(status);
+
+	status = ocfs2_mark_inode_dirty(handle, dir, parent_node_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		if (S_ISDIR(inode->i_mode))
 			inc_nlink(dir);
-		}
 	}
 
 leave:
@@ -1066,6 +1069,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
 	struct buffer_head *orphan_entry_bh = NULL;
 	struct buffer_head *newfe_bh = NULL;
+	struct buffer_head *old_inode_bh = NULL;
 	struct buffer_head *insert_entry_bh = NULL;
 	struct ocfs2_super *osb = NULL;
 	u64 newfe_blkno;
@@ -1077,7 +1081,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
 	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
 						    // this is the 1st dirent bh
-	nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
+	nlink_t old_dir_nlink = old_dir->i_nlink;
 
 	/* At some point it might be nice to break this function up a
 	 * bit. */
@@ -1137,12 +1141,11 @@ static int ocfs2_rename(struct inode *old_dir,
 	}
 
 	/*
-	 * Though we don't require an inode meta data update if
-	 * old_inode is not a directory, we lock anyway here to ensure
-	 * the vote thread on other nodes won't have to concurrently
-	 * downconvert the inode and the dentry locks.
+	 * Aside from allowing a meta data update, the locking here
+	 * also ensures that the vote thread on other nodes won't have
+	 * to concurrently downconvert the inode and the dentry locks.
 	 */
-	status = ocfs2_meta_lock(old_inode, NULL, 1);
+	status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -1353,6 +1356,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	old_inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(old_inode);
+	ocfs2_mark_inode_dirty(handle, old_inode, old_inode_bh);
 
 	/* now that the name has been added to new_dir, remove the old name */
 	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
@@ -1382,27 +1386,22 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 	}
 	mark_inode_dirty(old_dir);
-	if (new_inode)
+	ocfs2_mark_inode_dirty(handle, old_dir, old_dir_bh);
+	if (new_inode) {
 		mark_inode_dirty(new_inode);
+		ocfs2_mark_inode_dirty(handle, new_inode, newfe_bh);
+	}
 
-	if (old_dir != new_dir)
-		if (new_dir_nlink != new_dir->i_nlink) {
-			if (!new_dir_bh) {
-				mlog(ML_ERROR, "need to change nlink for new "
-				     "dir %llu from %d to %d but bh is NULL\n",
-				     (unsigned long long)OCFS2_I(new_dir)->ip_blkno,
-				     (int)new_dir_nlink, new_dir->i_nlink);
-			} else {
-				struct ocfs2_dinode *fe;
-				status = ocfs2_journal_access(handle,
-							      new_dir,
-							      new_dir_bh,
-							      OCFS2_JOURNAL_ACCESS_WRITE);
-				fe = (struct ocfs2_dinode *) new_dir_bh->b_data;
-				fe->i_links_count = cpu_to_le16(new_dir->i_nlink);
-				status = ocfs2_journal_dirty(handle, new_dir_bh);
-			}
-		}
+	if (old_dir != new_dir) {
+		/* Keep the same times on both directories.*/
+		new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime;
+
+		/*
+		 * This will also pick up the i_nlink change from the
+		 * block above.
+		 */
+		ocfs2_mark_inode_dirty(handle, new_dir, new_dir_bh);
+	}
 
 	if (old_dir_nlink != old_dir->i_nlink) {
 		if (!old_dir_bh) {
@@ -1453,6 +1452,8 @@ bail:
 		iput(new_inode);
 	if (newfe_bh)
 		brelse(newfe_bh);
+	if (old_inode_bh)
+		brelse(old_inode_bh);
 	if (old_dir_bh)
 		brelse(old_dir_bh);
 	if (new_dir_bh)
@@ -1824,6 +1825,13 @@ static int __ocfs2_add_entry(handle_t *handle,
 			     (le16_to_cpu(de->rec_len) >= rec_len)) ||
 			    (le16_to_cpu(de->rec_len) >=
 			     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+				dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+				retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+				if (retval < 0) {
+					mlog_errno(retval);
+					goto bail;
+				}
+
 				status = ocfs2_journal_access(handle, dir,
 							      insert_bh,
 							      OCFS2_JOURNAL_ACCESS_WRITE);
 				/* By now the buffer is marked for journaling */
@@ -1846,7 +1854,6 @@ static int __ocfs2_add_entry(handle_t *handle,
 				de->name_len = namelen;
 				memcpy(de->name, name, namelen);
 
-				dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 				dir->i_version++;
 				status = ocfs2_journal_dirty(handle, insert_bh);
 				retval = 0;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index b767fd7da6e..db8e77cd35d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -349,6 +349,11 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
 	return ret;
 }
 
+static inline int ocfs2_mount_local(struct ocfs2_super *osb)
+{
+	return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
+}
+
 #define OCFS2_IS_VALID_DINODE(ptr)				\
 	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3330a5dc6be..e61e218f5e0 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -85,8 +85,8 @@
 #define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
 	OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
 
-#define OCFS2_FEATURE_COMPAT_SUPP	0
-#define OCFS2_FEATURE_INCOMPAT_SUPP	0
+#define OCFS2_FEATURE_COMPAT_SUPP	OCFS2_FEATURE_COMPAT_BACKUP_SB
+#define OCFS2_FEATURE_INCOMPAT_SUPP	OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	0
 
 /*
@@ -96,6 +96,32 @@
 */
 #define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV	0x0002
 
+/*
+ * tunefs sets this incompat flag before starting the resize and clears it
+ * at the end. This flag protects users from inadvertently mounting the fs
+ * after an aborted run without fsck-ing.
+ */
+#define OCFS2_FEATURE_INCOMPAT_RESIZE_INPROG	0x0004
+
+/* Used to denote a non-clustered volume */
+#define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT	0x0008
+
+/* Support for sparse allocation in b-trees */
+#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC	0x0010
+
+/*
+ * backup superblock flag is used to indicate that this volume
+ * has backup superblocks.
+ */
+#define OCFS2_FEATURE_COMPAT_BACKUP_SB		0x0001
+
+/* The byte offset of the first backup block will be 1G.
+ * The following will be 4G, 16G, 64G, 256G and 1T.
+ */
+#define OCFS2_BACKUP_SB_START			1 << 30
+
+/* the max backup superblock nums */
+#define OCFS2_MAX_BACKUP_SUPERBLOCKS	6
 
 /*
 * Flags on ocfs2_dinode.i_flags
@@ -554,6 +580,20 @@ static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
 
 	return size / sizeof(struct ocfs2_truncate_rec);
 }
+
+static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
+{
+	u64 offset = OCFS2_BACKUP_SB_START;
+
+	if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) {
+		offset <<= (2 * index);
+		offset >>= sb->s_blocksize_bits;
+		return offset;
+	}
+
+	return 0;
+
+}
 #else
 static inline int ocfs2_fast_symlink_chars(int blocksize)
 {
@@ -619,6 +659,19 @@ static inline int ocfs2_truncate_recs_per_inode(int blocksize)
 
 	return size / sizeof(struct ocfs2_truncate_rec);
 }
+
+static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
+{
+	uint64_t offset = OCFS2_BACKUP_SB_START;
+
+	if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) {
+		offset <<= (2 * index);
+		offset /= blocksize;
+		return offset;
+	}
+
+	return 0;
+}
 #endif  /* __KERNEL__ */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index aa6f5aadedc..2d3ac32cb74 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -175,7 +175,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 	struct buffer_head *bh = NULL;
 	struct ocfs2_slot_info *si;
 
-	si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+	si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL);
 	if (!si) {
 		status = -ENOMEM;
 		mlog_errno(status);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 000d71cca6c..6dbb1176275 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -488,7 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 	int status;
 	u32 slot;
 
-	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 	if (!(*ac)) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -530,7 +530,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 {
 	int status;
 
-	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 	if (!(*ac)) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -595,7 +595,7 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
-	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 	if (!(*ac)) {
 		status = -ENOMEM;
 		mlog_errno(status);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4bf39540e65..6e300a88a47 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -508,6 +508,27 @@ bail:
 	return status;
 }
 
+static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
+{
+	if (ocfs2_mount_local(osb)) {
+		if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+			mlog(ML_ERROR, "Cannot heartbeat on a locally "
+			     "mounted device.\n");
+			return -EINVAL;
+		}
+	}
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+		if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) {
+			mlog(ML_ERROR, "Heartbeat has to be started to mount "
+			     "a read-write clustered device.\n");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct dentry *root;
@@ -516,16 +537,24 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *inode = NULL;
 	struct ocfs2_super *osb = NULL;
 	struct buffer_head *bh = NULL;
+	char nodestr[8];
 
 	mlog_entry("%p, %p, %i", sb, data, silent);
 
-	/* for now we only have one cluster/node, make sure we see it
-	 * in the heartbeat universe */
-	if (!o2hb_check_local_node_heartbeating()) {
+	if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
 		status = -EINVAL;
 		goto read_super_error;
 	}
 
+	/* for now we only have one cluster/node, make sure we see it
+	 * in the heartbeat universe */
+	if (parsed_opt & OCFS2_MOUNT_HB_LOCAL) {
+		if (!o2hb_check_local_node_heartbeating()) {
+			status = -EINVAL;
+			goto read_super_error;
+		}
+	}
+
 	/* probe for superblock */
 	status = ocfs2_sb_probe(sb, &bh, &sector_size);
 	if (status < 0) {
@@ -541,11 +570,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	brelse(bh);
 	bh = NULL;
-
-	if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
-		status = -EINVAL;
-		goto read_super_error;
-	}
 	osb->s_mount_opt = parsed_opt;
 
 	sb->s_magic = OCFS2_SUPER_MAGIC;
@@ -588,21 +612,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (!ocfs2_is_hard_readonly(osb)) {
-		/* If this isn't a hard readonly mount, then we need
-		 * to make sure that heartbeat is in a valid state,
-		 * and that we mark ourselves soft readonly is -oro
-		 * was specified. */
-		if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
-			mlog(ML_ERROR, "No heartbeat for device (%s)\n",
-			     sb->s_id);
-			status = -EINVAL;
-			goto read_super_error;
-		}
-
 		if (sb->s_flags & MS_RDONLY)
 			ocfs2_set_ro_flag(osb, 0);
 	}
 
+	status = ocfs2_verify_heartbeat(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto read_super_error;
+	}
+
 	osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
 						 ocfs2_debugfs_root);
 	if (!osb->osb_debug_root) {
@@ -635,9 +654,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 
 	ocfs2_complete_mount_recovery(osb);
 
-	printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %d, slot %d) "
+	if (ocfs2_mount_local(osb))
+		snprintf(nodestr, sizeof(nodestr), "local");
+	else
+		snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+
+	printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
 	       "with %s data mode.\n",
-	       osb->dev_str, osb->node_num, osb->slot_num,
+	       osb->dev_str, nodestr, osb->slot_num,
 	       osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ?
 	       "writeback" : "ordered");
@@ -999,7 +1023,11 @@ static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
 	/* XXX hold a ref on the node while mounte? easy enough, if
	 * desirable. */
-	osb->node_num = o2nm_this_node();
+	if (ocfs2_mount_local(osb))
+		osb->node_num = 0;
+	else
+		osb->node_num = o2nm_this_node();
+
 	if (osb->node_num == O2NM_MAX_NODES) {
 		mlog(ML_ERROR, "could not find this host's node number\n");
 		status = -ENOENT;
@@ -1084,6 +1112,9 @@ static int ocfs2_mount_volume(struct super_block *sb)
 		goto leave;
 	}
 
+	if (ocfs2_mount_local(osb))
+		goto leave;
+
 	/* This should be sent *after* we recovered our journal as it
 	 * will cause other nodes to unmark us as needing
 	 * recovery. However, we need to send it *before* dropping the
@@ -1114,6 +1145,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 {
 	int tmp;
 	struct ocfs2_super *osb = NULL;
+	char nodestr[8];
 
 	mlog_entry("(0x%p)\n", sb);
@@ -1177,8 +1209,13 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
 
-	printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %d)\n",
-	       osb->dev_str, osb->node_num);
+	if (ocfs2_mount_local(osb))
+		snprintf(nodestr, sizeof(nodestr), "local");
+	else
+		snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+
+	printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
+	       osb->dev_str, nodestr);
 
 	ocfs2_delete_osb(osb);
 	kfree(osb);
@@ -1194,7 +1231,7 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
 
 	BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
 
-	osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
+	osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
 	if (osb->uuid_str == NULL)
 		return -ENOMEM;
@@ -1225,7 +1262,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	mlog_entry_void();
 
-	osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL);
+	osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
 	if (!osb) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -1350,7 +1387,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	 */
 
 	/* initialize our journal structure */
-	journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL);
+	journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
 	if (!journal) {
 		mlog(ML_ERROR, "unable to alloc journal\n");
 		status = -ENOMEM;
@@ -1536,6 +1573,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 {
 	int status = 0;
 	int dirty;
+	int local;
 	struct ocfs2_dinode *local_alloc = NULL; /* only used if we
 						  * recover
 						  * ourselves. */
@@ -1563,8 +1601,10 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 			 "recovering volume.\n");
 	}
 
+	local = ocfs2_mount_local(osb);
+
 	/* will play back anything left in the journal. */
-	ocfs2_journal_load(osb->journal);
+	ocfs2_journal_load(osb->journal, local);
 
 	if (dirty) {
		/* recover my local alloc if we didn't unmount cleanly. */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 957d6878b03..03b0191534d 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -158,8 +158,7 @@ static void *ocfs2_follow_link(struct dentry *dentry,
 	}
 
 	status = vfs_follow_link(nd, link);
-	if (status && status != -ENOENT)
-		mlog_errno(status);
+
 bail:
 	if (page) {
 		kunmap(page);
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index 5b4dca79990..f30e63b9910 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -479,7 +479,7 @@ static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response
 {
 	struct ocfs2_net_wait_ctxt *w;
 
-	w = kcalloc(1, sizeof(*w), GFP_NOFS);
+	w = kzalloc(sizeof(*w), GFP_NOFS);
 	if (!w) {
 		mlog_errno(-ENOMEM);
 		goto bail;
@@ -642,7 +642,7 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
 
 	BUG_ON(!ocfs2_is_valid_vote_request(type));
 
-	request = kcalloc(1, sizeof(*request), GFP_NOFS);
+	request = kzalloc(sizeof(*request), GFP_NOFS);
 	if (!request) {
 		mlog_errno(-ENOMEM);
 	} else {
@@ -887,7 +887,7 @@ static inline int ocfs2_translate_response(int response)
 
 static int ocfs2_handle_response_message(struct o2net_msg *msg,
 					 u32 len,
-					 void *data)
+					 void *data, void **ret_data)
 {
 	unsigned int response_id, node_num;
 	int response_status;
@@ -943,7 +943,7 @@ bail:
 
 static int ocfs2_handle_vote_message(struct o2net_msg *msg,
 				     u32 len,
-				     void *data)
+				     void *data, void **ret_data)
 {
 	int status;
 	struct ocfs2_super *osb = data;
@@ -1000,11 +1000,14 @@ int ocfs2_register_net_handlers(struct ocfs2_super *osb)
 {
 	int status = 0;
 
+	if (ocfs2_mount_local(osb))
+		return 0;
+
 	status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
 					osb->net_key,
 					sizeof(struct ocfs2_response_msg),
 					ocfs2_handle_response_message,
-					osb, &osb->osb_net_handlers);
+					osb, NULL, &osb->osb_net_handlers);
 	if (status) {
 		mlog_errno(status);
 		goto bail;
@@ -1014,7 +1017,7 @@ int ocfs2_register_net_handlers(struct ocfs2_super *osb)
 					osb->net_key,
 					sizeof(struct ocfs2_vote_msg),
 					ocfs2_handle_vote_message,
-					osb, &osb->osb_net_handlers);
+					osb, NULL, &osb->osb_net_handlers);
 	if (status) {
 		mlog_errno(status);
 		goto bail;
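
The backup superblock layout added to ocfs2_fs.h above places copies at geometrically growing byte offsets -- 1G << (2 * index), i.e. 1G, 4G, 16G, 64G, 256G and 1T -- which ocfs2_backup_super_blkno() then converts to a filesystem block number. A minimal standalone C sketch of that same arithmetic follows; the constants are mirrored from the hunk (parenthesized and widened to 64 bits for a userspace build), and the main() harness plus the 4KB block size are illustrative assumptions, not part of the patch:

#include <stdio.h>
#include <stdint.h>

/* Constants mirrored from the ocfs2_fs.h hunk above; parenthesized and
 * made unsigned 64-bit so the shifts are safe in this standalone sketch. */
#define BACKUP_SB_START			(1ULL << 30)	/* first backup at 1G */
#define MAX_BACKUP_SUPERBLOCKS		6

/* Same arithmetic as the patch's ocfs2_backup_super_blkno(): each backup
 * sits 4x further out than the previous one (1G, 4G, 16G, 64G, 256G, 1T),
 * and the byte offset is converted to a block number by shifting. */
static uint64_t backup_super_blkno(unsigned int blocksize_bits, int index)
{
	uint64_t offset = BACKUP_SB_START;

	if (index >= 0 && index < MAX_BACKUP_SUPERBLOCKS) {
		offset <<= (2 * index);		/* byte offset: 1G << 2i */
		offset >>= blocksize_bits;	/* bytes -> blocks */
		return offset;
	}

	return 0;
}

int main(void)
{
	const unsigned int blocksize_bits = 12;	/* assume 4KB blocks */
	int i;

	for (i = 0; i < MAX_BACKUP_SUPERBLOCKS; i++)
		printf("backup %d: %4lluG -> block %llu\n", i,
		       (unsigned long long)((BACKUP_SB_START << (2 * i)) >> 30),
		       (unsigned long long)backup_super_blkno(blocksize_bits, i));

	return 0;
}

With 4KB blocks this prints block numbers 262144, 1048576, 4194304, 16777216, 67108864 and 268435456, the last being the 1T copy. As the comment in the hunk suggests, the OCFS2_FEATURE_COMPAT_BACKUP_SB flag only records that such backups exist on the volume; writing them is left to the userspace tools.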