From 890871be854b5f5e43e7ba2475f706209906cc24 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 16:24:52 -0400 Subject: Btrfs: switch extent_map to a rw lock There are two main users of the extent_map tree. The first is regular file inodes, where it is evenly spread between readers and writers. The second is the chunk allocation tree, which maps blocks from logical addresses to phyiscal ones, and it is 99.99% reads. The mapping tree is a point of lock contention during heavy IO workloads, so this commit switches things to a rw lock. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 30c9365861e..72e9fa3c31f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -36,7 +36,7 @@ void extent_map_exit(void) void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) { tree->map.rb_node = NULL; - spin_lock_init(&tree->lock); + rwlock_init(&tree->lock); } /** @@ -222,7 +222,6 @@ int add_extent_mapping(struct extent_map_tree *tree, ret = -EEXIST; goto out; } - assert_spin_locked(&tree->lock); rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { ret = -EEXIST; @@ -285,7 +284,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *next = NULL; u64 end = range_end(start, len); - assert_spin_locked(&tree->lock); rb_node = __tree_search(&tree->map, start, &prev, &next); if (!rb_node && prev) { em = rb_entry(prev, struct extent_map, rb_node); @@ -331,7 +329,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) int ret = 0; WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - assert_spin_locked(&tree->lock); rb_erase(&em->rb_node, &tree->map); em->in_tree = 0; return ret; -- cgit v1.2.3-70-g09d2 From a1ed835e1ab5795f91b198d08c43e2f56848dcf3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 11 Sep 2009 12:27:37 -0400 Subject: Btrfs: Fix extent replacment race Data COW means that whenever we write to a file, we replace any old extent pointers with new ones. There was a window where a readpage might find the old extent pointers on disk and cache them in the extent_map tree in ram in the middle of a given write replacing them. Even though both the readpage and the write had their respective bytes in the file locked, the extent readpage inserts may cover more bytes than it had locked down. This commit closes the race by keeping the new extent pinned in the extent map tree until after the on-disk btree is properly setup with the new extent pointers. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent_map.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_map.h | 1 + fs/btrfs/file.c | 8 +++++--- fs/btrfs/inode.c | 25 ++++++++++++++++++++----- fs/btrfs/ioctl.c | 5 ++--- fs/btrfs/tree-log.c | 2 +- 7 files changed, 80 insertions(+), 13 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 38eeb6c49c8..1ceab8b4d6d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2292,7 +2292,7 @@ extern struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_block); + u64 inline_limit, u64 *hint_block, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 72e9fa3c31f..5bc7a0d325e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) return 0; } +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *em; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(em->start != start || !em); + + if (!em) + goto out; + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } + + free_extent_map(em); +out: + write_unlock(&tree->lock); + return ret; + +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 6216dfbcf9b..d3d442f4bbb 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -59,4 +59,5 @@ struct extent_map *alloc_extent_map(gfp_t mask); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ef66c3d989b..4123db9d514 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -177,10 +177,10 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } flags = em->flags; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - write_unlock(&em_tree->lock); if (em->start <= start && (!testend || em->start + em->len >= start + len)) { free_extent_map(em); + write_unlock(&em_tree->lock); break; } if (start < em->start) { @@ -190,6 +190,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, start = em->start + em->len; } free_extent_map(em); + write_unlock(&em_tree->lock); continue; } compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -269,7 +270,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_byte) + u64 inline_limit, u64 *hint_byte, int drop_cache) { u64 extent_end = 0; u64 search_start = start; @@ -294,7 +295,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, int ret; inline_limit = 0; - btrfs_drop_extent_cache(inode, start, end - 1, 0); + if (drop_cache) + btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 739a245e25d..233fe6f2612 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -232,7 +232,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, + &hint_byte, 1); BUG_ON(ret); if (isize > actual_end) @@ -241,7 +242,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, inline_len, compressed_size, compressed_pages); BUG_ON(ret); - btrfs_drop_extent_cache(inode, start, aligned_end, 0); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; } @@ -1455,9 +1456,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(!path); path->leave_spinning = 1; + + /* + * we may be replacing one extent in the tree with another. + * The new extent is pinned in the extent map, and we don't want + * to drop it from the cache until it is completely in the btree. + * + * So, tell btrfs_drop_extents to leave this extent in the cache. + * the caller is expected to unpin it and allow it to be merged + * with the others. + */ ret = btrfs_drop_extents(trans, root, inode, file_pos, file_pos + num_bytes, locked_end, - file_pos, &hint); + file_pos, &hint, 0); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1485,7 +1496,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, num_bytes); - btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; @@ -1596,6 +1606,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, + ordered_extent->len); BUG_ON(ret); } unlock_extent(io_tree, ordered_extent->file_offset, @@ -2940,7 +2953,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) cur_offset, cur_offset + hole_size, block_end, - cur_offset, &hint_byte); + cur_offset, &hint_byte, 1); if (err) break; err = btrfs_insert_file_extent(trans, root, @@ -5086,6 +5099,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset -1, 0); num_bytes -= ins.offset; cur_offset += ins.offset; alloc_hint = ins.objectid + ins.offset; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9f4db848db1..e2d8e90259b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -597,9 +597,8 @@ again: clear_page_dirty_for_io(page); btrfs_set_extent_delalloc(inode, page_start, page_end); - - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); unlock_page(page); page_cache_release(page); balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); @@ -977,7 +976,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, /* punch hole in destination first */ btrfs_drop_extents(trans, root, inode, off, off + len, - off + len, 0, &hint_byte); + off + len, 0, &hint_byte, 1); /* clone data */ key.objectid = src->i_ino; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d91b0de7c50..8661a7381b3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -534,7 +534,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint, 1); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || -- cgit v1.2.3-70-g09d2 From b917b7c3be50435fa8257591b964934e917f2d45 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 18 Sep 2009 16:07:03 -0400 Subject: Btrfs: search for an allocation hint while filling file COW The allocator has some nice knobs for sending hints about where to try and allocate new blocks, but when we're doing file allocations we're not sending any hint at all. This commit adds a simple extent map search to see if we can quickly and easily find a hint for the allocator. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_map.h | 2 ++ fs/btrfs/inode.c | 10 +++++++++- 3 files changed, 59 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 5bc7a0d325e..2c726b7b9fa 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -366,6 +366,54 @@ out: return em; } +/** + * search_extent_mapping - find a nearby extent map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. + * + * If one can't be found, any nearby extent may be returned + */ +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} + /** * remove_extent_mapping - removes an extent_map from the extent tree * @tree: extent tree to remove from diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index d3d442f4bbb..ab6d74b6e64 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -60,4 +60,6 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 941f1b71cd2..81ba6654c33 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -726,6 +726,15 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(disk_num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); + + read_lock(&BTRFS_I(inode)->extent_tree.lock); + em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, + start, num_bytes); + if (em) { + alloc_hint = em->block_start; + free_extent_map(em); + } + read_unlock(&BTRFS_I(inode)->extent_tree.lock); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { @@ -738,7 +747,6 @@ static noinline int cow_file_range(struct inode *inode, em = alloc_extent_map(GFP_NOFS); em->start = start; em->orig_start = em->start; - ram_size = ins.offset; em->len = ins.offset; -- cgit v1.2.3-70-g09d2