Diffstat (limited to 'fs/btrfs/extent-tree.c')
 fs/btrfs/extent-tree.c | 499
 1 file changed, 360 insertions(+), 139 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3efe1c3877b..a684086c3c8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache)
struct btrfs_caching_control *ctl;
spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_STARTED) {
- spin_unlock(&cache->lock);
- return NULL;
- }
-
- /* We're loading it the fast way, so we don't have a caching_ctl. */
if (!cache->caching_ctl) {
spin_unlock(&cache->lock);
return NULL;
@@ -491,7 +485,7 @@ next:
key.objectid);
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
- fs_info->tree_root->leafsize;
+ fs_info->tree_root->nodesize;
else
last = key.objectid + key.offset;
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
spin_unlock(&cache->lock);
if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+ mutex_lock(&caching_ctl->mutex);
ret = load_free_space_cache(fs_info, cache);
spin_lock(&cache->lock);
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_FINISHED;
cache->last_byte_to_unpin = (u64)-1;
+ caching_ctl->progress = (u64)-1;
} else {
if (load_cache_only) {
cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_NO;
} else {
cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
}
}
spin_unlock(&cache->lock);
+ mutex_unlock(&caching_ctl->mutex);
+
wake_up(&caching_ctl->wait);
if (ret == 1) {
put_caching_control(caching_ctl);
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->cached = BTRFS_CACHE_NO;
} else {
cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
}
spin_unlock(&cache->lock);
wake_up(&caching_ctl->wait);
@@ -710,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-/* simple helper to search for an existing extent at a given offset */
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
{
int ret;
struct btrfs_key key;
@@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
0, 0);
- if (ret > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == start &&
- key.type == BTRFS_METADATA_ITEM_KEY)
- ret = 0;
- }
btrfs_free_path(path);
return ret;
}
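After the rename the helper reports only data extents: the deleted fallback that also accepted a BTRFS_METADATA_ITEM_KEY match is gone, so callers see the raw btrfs_search_slot() convention (0 = the data extent item exists, > 0 = not found, < 0 = error). A minimal sketch of a hypothetical caller:

	ret = btrfs_lookup_data_extent(root, bytenr, num_bytes);
	if (ret < 0)
		return ret;		/* search failed */
	if (ret > 0)
		return 0;		/* no data extent item at this offset */
	/* ret == 0: the (bytenr, EXTENT_ITEM, num_bytes) item exists */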
@@ -765,7 +759,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
* different
*/
if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
- offset = root->leafsize;
+ offset = root->nodesize;
metadata = 0;
}
@@ -786,7 +780,6 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
ret = btrfs_search_slot(trans, root->fs_info->extent_root,
&key, path, 0, 0);
if (ret < 0)
@@ -799,16 +792,9 @@ again:
path->slots[0]);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY &&
- key.offset == root->leafsize)
+ key.offset == root->nodesize)
ret = 0;
}
- if (ret) {
- key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = root->leafsize;
- btrfs_release_path(path);
- goto again;
- }
}
if (ret == 0) {
@@ -1903,8 +1889,8 @@ static int btrfs_issue_discard(struct block_device *bdev,
return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
}
-static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes)
+int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes)
{
int ret;
u64 discarded_bytes = 0;
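Dropping the static keyword exports the discard helper, so callers outside this file no longer need the btrfs_error_discard_extent() wrapper (removed near the bottom of this diff); a direct call looks like this (illustrative only):

	u64 trimmed = 0;

	ret = btrfs_discard_extent(root, start, len, &trimmed);
	if (!ret)
		total_trimmed += trimmed;	/* hypothetical accounting */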
@@ -2651,7 +2637,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
num_heads = heads_to_leaves(root, num_heads);
if (num_heads > 1)
- num_bytes += (num_heads - 1) * root->leafsize;
+ num_bytes += (num_heads - 1) * root->nodesize;
num_bytes <<= 1;
global_rsv = &root->fs_info->global_block_rsv;
@@ -3073,10 +3059,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
u64, u64, u64, u64, u64, u64, int);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
@@ -3097,7 +3083,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
for (i = 0; i < nritems; i++) {
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, i);
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(buf, i,
struct btrfs_file_extent_item);
@@ -3117,7 +3103,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
- num_bytes = btrfs_level_size(root, level - 1);
+ num_bytes = root->nodesize;
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
1);
@@ -3153,9 +3139,11 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
- if (ret < 0)
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
goto fail;
- BUG_ON(ret); /* Corruption */
+ }
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -3163,11 +3151,9 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
fail:
- if (ret) {
+ if (ret)
btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
- return 0;
+ return ret;
}
@@ -3176,7 +3162,19 @@ next_block_group(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct rb_node *node;
+
spin_lock(&root->fs_info->block_group_cache_lock);
+
+ /* If our block group was removed, we need a full search. */
+ if (RB_EMPTY_NODE(&cache->cache_node)) {
+ const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ cache = btrfs_lookup_first_block_group(root->fs_info,
+ next_bytenr);
+ return cache;
+ }
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
if (node) {
@@ -3494,7 +3492,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
- ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+ ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
if (ret) {
kfree(found);
return ret;
@@ -3518,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->chunk_alloc = 0;
found->flush = 0;
init_waitqueue_head(&found->wait);
+ INIT_LIST_HEAD(&found->ro_bgs);
ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
info->space_info_kobj, "%s",
@@ -4343,11 +4342,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
}
static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info)
+ struct btrfs_fs_info *fs_info,
+ int flush_state)
{
u64 used;
spin_lock(&space_info->lock);
+ /*
+	 * We have run out of space and have not gotten any free space via
+	 * flush_space, so don't bother doing async reclaim.
+ */
+ if (flush_state > COMMIT_TRANS && space_info->full) {
+ spin_unlock(&space_info->lock);
+ return 0;
+ }
+
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
@@ -4380,11 +4389,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
flush_space(fs_info->fs_root, space_info, to_reclaim,
to_reclaim, flush_state);
flush_state++;
- if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+ if (!btrfs_need_do_async_reclaim(space_info, fs_info,
+ flush_state))
return;
} while (flush_state <= COMMIT_TRANS);
- if (btrfs_need_do_async_reclaim(space_info, fs_info))
+ if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
queue_work(system_unbound_wq, work);
}
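The bail-out added above compares flush_state against COMMIT_TRANS, the last rung of the flush ladder the worker walks. The ladder itself is defined earlier in extent-tree.c and is not part of this diff; the values below are an assumption reproduced for reference:

	enum {
		FLUSH_DELAYED_ITEMS_NR	= 1,	/* flush a few delayed items */
		FLUSH_DELAYED_ITEMS	= 2,	/* flush all delayed items */
		FLUSH_DELALLOC		= 3,	/* start delalloc writeback */
		FLUSH_DELALLOC_WAIT	= 4,	/* and wait for it to finish */
		ALLOC_CHUNK		= 5,	/* allocate a new chunk */
		COMMIT_TRANS		= 6,	/* commit the transaction */
	};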
@@ -4502,7 +4512,13 @@ again:
space_info->flush = 1;
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
- if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+ /*
+ * We will do the space reservation dance during log replay,
+ * which means we won't have fs_info->fs_root set, so don't do
+ * the async reclaim as we will panic.
+ */
+ if (!root->fs_info->log_root_recovering &&
+ need_do_async_reclaim(space_info, root->fs_info, used) &&
!work_busy(&root->fs_info->async_reclaim_work))
queue_work(system_unbound_wq,
&root->fs_info->async_reclaim_work);
@@ -4839,7 +4855,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
if (num_bytes * 3 > meta_used)
num_bytes = div64_u64(meta_used, 3);
- return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+ return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
}
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4988,7 +5004,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
- num_bytes = 3 * root->leafsize;
+ num_bytes = 3 * root->nodesize;
ret = btrfs_qgroup_reserve(root, num_bytes);
if (ret)
return ret;
@@ -5176,7 +5192,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (root->fs_info->quota_enabled) {
ret = btrfs_qgroup_reserve(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
@@ -5185,7 +5201,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (unlikely(ret)) {
if (root->fs_info->quota_enabled)
btrfs_qgroup_free(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
goto out_fail;
}
@@ -5301,7 +5317,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
btrfs_ino(inode), to_free, 0);
if (root->fs_info->quota_enabled) {
btrfs_qgroup_free(root, num_bytes +
- dropped * root->leafsize);
+ dropped * root->nodesize);
}
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
@@ -5433,6 +5449,19 @@ static int update_block_group(struct btrfs_root *root,
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
+ /*
+ * No longer have used bytes in this block group, queue
+ * it for deletion.
+ */
+ if (old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
}
btrfs_put_block_group(cache);
total -= num_bytes;
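The list_add_tail() above is the producer half of empty block group reclaim; the consumer, btrfs_delete_unused_bgs(), is added near the end of this diff and drains fs_info->unused_bgs. The hook that actually runs it is expected to live in the cleaner thread in disk-io.c, which is outside this file, so the sketch below is an assumption:

	/* Assumed hook, somewhere in the cleaner kthread loop (disk-io.c): */
	btrfs_run_delayed_iputs(root);			/* existing cleaner work */
	btrfs_delete_unused_bgs(root->fs_info);		/* new: drop empty block groups */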
@@ -5698,7 +5727,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
update_global_block_rsv(fs_info);
}
-static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
+ const bool return_free_space)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
@@ -5722,7 +5752,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
if (start < cache->last_byte_to_unpin) {
len = min(len, cache->last_byte_to_unpin - start);
- btrfs_add_free_space(cache, start, len);
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
}
start += len;
@@ -5786,7 +5817,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
- unpin_extent_range(root, start, end);
+ unpin_extent_range(root, start, end, true);
cond_resched();
}
@@ -6233,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
/*
@@ -6263,14 +6293,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ret;
}
-static u64 stripe_align(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache,
- u64 val, u64 num_bytes)
-{
- u64 ret = ALIGN(val, root->stripesize);
- return ret;
-}
-
/*
* when we wait for progress in the block group caching, its because
* our allocation attempt failed at least once. So, we must sleep
@@ -6464,7 +6486,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool have_caching_bg = false;
WARN_ON(num_bytes < root->sectorsize);
- btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+ ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -6751,8 +6773,7 @@ unclustered_alloc:
goto loop;
}
checks:
- search_start = stripe_align(root, block_group,
- offset, num_bytes);
+ search_start = ALIGN(offset, root->stripesize);
/* move on to the next group */
if (search_start + num_bytes >
@@ -7077,7 +7098,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->leafsize);
+ root->nodesize);
return -ENOMEM;
}
@@ -7086,7 +7107,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
ins, size);
if (ret) {
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->leafsize);
+ root->nodesize);
btrfs_free_path(path);
return ret;
}
@@ -7101,7 +7122,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
- num_bytes = root->leafsize;
+ num_bytes = root->nodesize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, key);
@@ -7131,14 +7152,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
}
- ret = update_block_group(root, ins->objectid, root->leafsize, 1);
+ ret = update_block_group(root, ins->objectid, root->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
BUG();
}
- trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
return ret;
}
@@ -7213,17 +7234,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
* EXTENT bits to differentiate dirty pages.
*/
- if (root->log_transid % 2 == 0)
+ if (buf->log_index == 0)
set_extent_dirty(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
else
set_extent_new(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
} else {
+ buf->log_index = -1;
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
@@ -7300,8 +7323,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
*
* returns the tree buffer or NULL.
*/
-struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u32 blocksize,
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size)
@@ -7311,18 +7334,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf;
u64 flags = 0;
int ret;
+ u32 blocksize = root->nodesize;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
+ if (btrfs_test_is_dummy_root(root)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
blocksize, level);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
}
-#endif
+
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
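Because blocksize is now taken from root->nodesize inside the function, callers drop the explicit size argument along with the rename. The call sites live outside this file, so the before/after shapes are illustrative only:

	/* old: */
	buf = btrfs_alloc_free_block(trans, root, root->nodesize, parent,
				     root_objectid, &disk_key, level,
				     hint, empty_size);
	/* new: */
	buf = btrfs_alloc_tree_block(trans, root, parent, root_objectid,
				     &disk_key, level, hint, empty_size);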
@@ -7417,7 +7440,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
eb = path->nodes[wc->level];
nritems = btrfs_header_nritems(eb);
- blocksize = btrfs_level_size(root, wc->level - 1);
+ blocksize = root->nodesize;
for (slot = path->slots[wc->level]; slot < nritems; slot++) {
if (nread >= wc->reada_count)
@@ -7464,10 +7487,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- ret = readahead_tree_block(root, bytenr, blocksize,
- generation);
- if (ret)
- break;
+ readahead_tree_block(root, bytenr, blocksize);
nread++;
}
wc->reada_slot = slot;
@@ -7626,7 +7646,6 @@ walk_down:
level = root_level;
while (level >= 0) {
if (path->nodes[level] == NULL) {
- int child_bsize = root->nodesize;
int parent_slot;
u64 child_gen;
u64 child_bytenr;
@@ -7638,8 +7657,7 @@ walk_down:
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- eb = read_tree_block(root, child_bytenr, child_bsize,
- child_gen);
+ eb = read_tree_block(root, child_bytenr, child_gen);
if (!eb || !extent_buffer_uptodate(eb)) {
ret = -EIO;
goto out;
@@ -7655,7 +7673,7 @@ walk_down:
ret = btrfs_qgroup_record_ref(trans, root->fs_info,
root->objectid,
child_bytenr,
- child_bsize,
+ root->nodesize,
BTRFS_QGROUP_OPER_SUB_SUBTREE,
0);
if (ret)
@@ -7806,9 +7824,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- blocksize = btrfs_level_size(root, level - 1);
+ blocksize = root->nodesize;
- next = btrfs_find_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_tree_block(root, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!next)
@@ -7870,7 +7888,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(root, bytenr, blocksize, generation);
+ next = read_tree_block(root, bytenr, generation);
if (!next || !extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
@@ -8507,6 +8525,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro = 1;
+ list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
ret = 0;
}
out:
@@ -8561,15 +8580,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
/*
* helper to account the unused space of all the readonly block group in the
- * list. takes mirrors into account.
+ * space_info. takes mirrors into account.
*/
-static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
struct btrfs_block_group_cache *block_group;
u64 free_bytes = 0;
int factor;
- list_for_each_entry(block_group, groups_list, list) {
+ /* It's df, we don't care if it's racey */
+ if (list_empty(&sinfo->ro_bgs))
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
spin_lock(&block_group->lock);
if (!block_group->ro) {
@@ -8590,26 +8614,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
spin_unlock(&block_group->lock);
}
-
- return free_bytes;
-}
-
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
- int i;
- u64 free_bytes = 0;
-
- spin_lock(&sinfo->lock);
-
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
- if (!list_empty(&sinfo->block_groups[i]))
- free_bytes += __btrfs_get_ro_block_group_free_space(
- &sinfo->block_groups[i]);
-
spin_unlock(&sinfo->lock);
return free_bytes;
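Only the list walk changes here; the per-group body between these two hunks is unchanged context that the diff elides. Roughly, it skips groups that raced out of read-only state and credits unused bytes weighted by the mirror factor. The sketch below reconstructs that body and should be read as an assumption, not as part of this patch:

	if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
				  BTRFS_BLOCK_GROUP_RAID10 |
				  BTRFS_BLOCK_GROUP_DUP))
		factor = 2;
	else
		factor = 1;

	free_bytes += (block_group->key.offset -
		       btrfs_block_group_used(&block_group->item)) * factor;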
@@ -8629,6 +8633,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
cache->bytes_super - btrfs_block_group_used(&cache->item);
sinfo->bytes_readonly -= num_bytes;
cache->ro = 0;
+ list_del_init(&cache->ro_list);
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
}
@@ -8853,12 +8858,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
up_write(&info->commit_root_sem);
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->unused_bgs)) {
+ block_group = list_first_entry(&info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache,
cache_node);
rb_erase(&block_group->cache_node,
&info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
spin_unlock(&info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
@@ -8987,8 +9003,10 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->new_bg_list);
+ INIT_LIST_HEAD(&cache->bg_list);
+ INIT_LIST_HEAD(&cache->ro_list);
btrfs_init_free_space_ctl(cache);
+ atomic_set(&cache->trimming, 0);
return cache;
}
@@ -9009,7 +9027,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
root = info->extent_root;
key.objectid = 0;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -9115,6 +9133,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
spin_lock(&info->block_group_cache_lock);
rb_erase(&cache->cache_node,
&info->block_group_cache_tree);
+ RB_CLEAR_NODE(&cache->cache_node);
spin_unlock(&info->block_group_cache_lock);
btrfs_put_block_group(cache);
goto error;
@@ -9128,8 +9147,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
set_avail_alloc_bits(root->fs_info, cache->flags);
- if (btrfs_chunk_readonly(root, cache->key.objectid))
+ if (btrfs_chunk_readonly(root, cache->key.objectid)) {
set_block_group_ro(cache, 1);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ /* Should always be true but just in case. */
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
}
list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -9170,12 +9199,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret = 0;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
- new_bg_list) {
- list_del_init(&block_group->new_bg_list);
-
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
if (ret)
- continue;
+ goto next;
spin_lock(&block_group->lock);
memcpy(&item, &block_group->item, sizeof(item));
@@ -9190,6 +9216,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
key.objectid, key.offset);
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+next:
+ list_del_init(&block_group->bg_list);
}
}
@@ -9247,6 +9275,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->block_group_cache_lock);
rb_erase(&cache->cache_node,
&root->fs_info->block_group_cache_tree);
+ RB_CLEAR_NODE(&cache->cache_node);
spin_unlock(&root->fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
return ret;
@@ -9259,7 +9288,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
- list_add_tail(&cache->new_bg_list, &trans->new_bgs);
+ list_add_tail(&cache->bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
@@ -9282,7 +9311,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 group_start)
+ struct btrfs_root *root, u64 group_start,
+ struct extent_map *em)
{
struct btrfs_path *path;
struct btrfs_block_group_cache *block_group;
@@ -9294,6 +9324,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int ret;
int index;
int factor;
+ struct btrfs_caching_control *caching_ctl = NULL;
+ bool remove_em;
root = root->fs_info->extent_root;
@@ -9378,6 +9410,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&root->fs_info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
if (root->fs_info->first_logical_byte == block_group->key.objectid)
root->fs_info->first_logical_byte = (u64)-1;
@@ -9400,12 +9433,37 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
kobject_put(kobj);
}
+ if (block_group->has_caching_ctl)
+ caching_ctl = get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_block_group_cache_done(block_group);
+ if (block_group->has_caching_ctl) {
+ down_write(&root->fs_info->commit_root_sem);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl,
+ &root->fs_info->caching_block_groups, list)
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ atomic_inc(&caching_ctl->count);
+ break;
+ }
+ }
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ up_write(&root->fs_info->commit_root_sem);
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ put_caching_control(caching_ctl);
+ put_caching_control(caching_ctl);
+ }
+ }
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
+ list_del_init(&block_group->ro_list);
block_group->space_info->total_bytes -= block_group->key.offset;
block_group->space_info->bytes_readonly -= block_group->key.offset;
block_group->space_info->disk_total -= block_group->key.offset * factor;
@@ -9413,7 +9471,70 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
memcpy(&key, &block_group->key, sizeof(key));
- btrfs_clear_space_info_full(root->fs_info);
+ lock_chunks(root);
+ if (!list_empty(&em->list)) {
+ /* We're in the transaction->pending_chunks list. */
+ free_extent_map(em);
+ }
+ spin_lock(&block_group->lock);
+ block_group->removed = 1;
+ /*
+ * At this point trimming can't start on this block group, because we
+ * removed the block group from the tree fs_info->block_group_cache_tree
+	 * so no one can find it anymore, and even if someone already got this
+ * block group before we removed it from the rbtree, they have already
+ * incremented block_group->trimming - if they didn't, they won't find
+ * any free space entries because we already removed them all when we
+ * called btrfs_remove_free_space_cache().
+ *
+ * And we must not remove the extent map from the fs_info->mapping_tree
+ * to prevent the same logical address range and physical device space
+ * ranges from being reused for a new block group. This is because our
+ * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+ * completely transactionless, so while it is trimming a range the
+ * currently running transaction might finish and a new one start,
+ * allowing for new block groups to be created that can reuse the same
+ * physical device locations unless we take this special care.
+ */
+ remove_em = (atomic_read(&block_group->trimming) == 0);
+ /*
+ * Make sure a trimmer task always sees the em in the pinned_chunks list
+ * if it sees block_group->removed == 1 (needs to lock block_group->lock
+ * before checking block_group->removed).
+ */
+ if (!remove_em) {
+ /*
+ * Our em might be in trans->transaction->pending_chunks which
+ * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+ * and so is the fs_info->pinned_chunks list.
+ *
+ * So at this point we must be holding the chunk_mutex to avoid
+ * any races with chunk allocation (more specifically at
+ * volumes.c:contains_pending_extent()), to ensure it always
+ * sees the em, either in the pending_chunks list or in the
+ * pinned_chunks list.
+ */
+ list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+ }
+ spin_unlock(&block_group->lock);
+
+ if (remove_em) {
+ struct extent_map_tree *em_tree;
+
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ /*
+ * The em might be in the pending_chunks list, so make sure the
+ * chunk mutex is locked, since remove_extent_mapping() will
+ * delete us from that list.
+ */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+
+ unlock_chunks(root);
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
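The comments above describe one half of a protocol shared with the trim ioctl: the remove path frees the chunk's extent_map only when no trimmer is active, and otherwise parks it on fs_info->pinned_chunks for the last trimmer to clean up. The trimmer side lives elsewhere (the fitrim path) and is sketched here only to illustrate the expected ordering; the two helpers are hypothetical:

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;	/* free space entries are already gone */
	}
	atomic_inc(&block_group->trimming);
	spin_unlock(&block_group->lock);

	ret = trim_block_group_free_space(block_group);	/* hypothetical */

	/* The last trimmer of a removed group inherits the pinned em. */
	if (atomic_dec_and_test(&block_group->trimming) &&
	    block_group->removed)
		free_pinned_chunk_em(block_group);	/* hypothetical */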
@@ -9430,6 +9551,110 @@ out:
return ret;
}
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (!fs_info->open)
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
+ u64 start, end;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ space_info = block_group->space_info;
+ list_del_init(&block_group->bg_list);
+ if (ret || btrfs_mixed_space_info(space_info)) {
+ btrfs_put_block_group(block_group);
+ continue;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't want to race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+ spin_lock(&block_group->lock);
+ if (block_group->reserved ||
+ btrfs_block_group_used(&block_group->item) ||
+ block_group->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&block_group->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&block_group->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = set_block_group_ro(block_group, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0) {
+ ret = 0;
+ goto next;
+ }
+
+ /*
+ * Want to do this before we do anything else so we can recover
+ * properly if we fail to join the transaction.
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_set_block_group_rw(root, block_group);
+ ret = PTR_ERR(trans);
+ goto next;
+ }
+
+ /*
+ * We could have pending pinned extents for this block group,
+ * just delete them, we don't care about them anymore.
+ */
+ start = block_group->key.objectid;
+ end = start + block_group->key.offset - 1;
+ ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+ if (ret) {
+ btrfs_set_block_group_rw(root, block_group);
+ goto end_trans;
+ }
+ ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+ if (ret) {
+ btrfs_set_block_group_rw(root, block_group);
+ goto end_trans;
+ }
+
+ /* Reset pinned so btrfs_put_block_group doesn't complain */
+ block_group->pinned = 0;
+
+ /*
+ * Btrfs_remove_chunk will abort the transaction if things go
+ * horribly wrong.
+ */
+ ret = btrfs_remove_chunk(trans, root,
+ block_group->key.objectid);
+end_trans:
+ btrfs_end_transaction(trans, root);
+next:
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
@@ -9470,13 +9695,7 @@ out:
int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
- return unpin_extent_range(root, start, end);
-}
-
-int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes)
-{
- return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
+ return unpin_extent_range(root, start, end, false);
}
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
@@ -9542,12 +9761,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
}
/*
- * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
- * they are used to prevent the some tasks writing data into the page cache
- * by nocow before the subvolume is snapshoted, but flush the data into
- * the disk after the snapshot creation.
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(); they are used to prevent some tasks from writing
+ * data into the page cache through nocow before the subvolume is snapshoted,
+ * but flush the data to disk after the snapshot creation, or to prevent
+ * operations while snapshoting is ongoing that would cause the snapshot to be
+ * inconsistent (writes followed by expanding truncates for example).
*/
-void btrfs_end_nocow_write(struct btrfs_root *root)
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
/*
@@ -9559,9 +9780,9 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
wake_up(&root->subv_writers->wait);
}
-int btrfs_start_nocow_write(struct btrfs_root *root)
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
{
- if (unlikely(atomic_read(&root->will_be_snapshoted)))
+ if (atomic_read(&root->will_be_snapshoted))
return 0;
percpu_counter_inc(&root->subv_writers->counter);
@@ -9569,8 +9790,8 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
* Make sure counter is updated before we check for snapshot creation.
*/
smp_mb();
- if (unlikely(atomic_read(&root->will_be_snapshoted))) {
- btrfs_end_nocow_write(root);
+ if (atomic_read(&root->will_be_snapshoted)) {
+ btrfs_end_write_no_snapshoting(root);
return 0;
}
return 1;
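Under the new names the pair still behaves like a try-lock: a zero return from btrfs_start_write_no_snapshoting() means a snapshot is pending and the caller must fall back to the ordinary COW path. A minimal usage sketch with hypothetical write helpers:

	if (!btrfs_start_write_no_snapshoting(root))
		return do_cow_write(inode, pos, len);	/* hypothetical */

	ret = do_nocow_write(inode, pos, len);		/* hypothetical */
	btrfs_end_write_no_snapshoting(root);
	return ret;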