Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--	fs/btrfs/extent-tree.c	583
1 file changed, 473 insertions, 110 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 813537f362f..47c1ba14108 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -491,7 +491,7 @@ next:
key.objectid);
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
- fs_info->tree_root->leafsize;
+ fs_info->tree_root->nodesize;
else
last = key.objectid + key.offset;
@@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
atomic_set(&caching_ctl->count, 1);
- btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+ btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+ caching_thread, NULL, NULL);
spin_lock(&cache->lock);
/*
@@ -709,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-/* simple helper to search for an existing extent at a given offset */
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
{
int ret;
struct btrfs_key key;
@@ -725,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
0, 0);
- if (ret > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == start &&
- key.type == BTRFS_METADATA_ITEM_KEY)
- ret = 0;
- }
btrfs_free_path(path);
return ret;
}
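
With the metadata-item fallback removed above, the helper's contract is simply that of btrfs_search_slot(): 0 means an exact EXTENT_ITEM match was found. A minimal caller sketch (hypothetical; not part of this patch):

	ret = btrfs_lookup_data_extent(root, start, len);
	if (ret < 0)
		return ret;		/* search error */
	exists = (ret == 0);		/* > 0 means no data extent at 'start' */
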
@@ -764,7 +759,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
* different
*/
if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
- offset = root->leafsize;
+ offset = root->nodesize;
metadata = 0;
}
@@ -785,7 +780,6 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
ret = btrfs_search_slot(trans, root->fs_info->extent_root,
&key, path, 0, 0);
if (ret < 0)
@@ -798,16 +792,9 @@ again:
path->slots[0]);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY &&
- key.offset == root->leafsize)
+ key.offset == root->nodesize)
ret = 0;
}
- if (ret) {
- key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = root->leafsize;
- btrfs_release_path(path);
- goto again;
- }
}
if (ret == 0) {
@@ -2650,7 +2637,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
num_heads = heads_to_leaves(root, num_heads);
if (num_heads > 1)
- num_bytes += (num_heads - 1) * root->leafsize;
+ num_bytes += (num_heads - 1) * root->nodesize;
num_bytes <<= 1;
global_rsv = &root->fs_info->global_block_rsv;
@@ -2749,8 +2736,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
async->sync = 0;
init_completion(&async->wait);
- btrfs_init_work(&async->work, delayed_ref_async_start,
- NULL, NULL);
+ btrfs_init_work(&async->work, btrfs_extent_refs_helper,
+ delayed_ref_async_start, NULL, NULL);
btrfs_queue_work(root->fs_info->extent_workers, &async->work);
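
Like the caching_thread hookup earlier, this call site gains a leading helper argument that identifies the work type with its own wrapper function. For reference, the updated prototype looks roughly like this (a sketch of fs/btrfs/async-thread.h as of this series; parameter names are an assumption):

	void btrfs_init_work(struct btrfs_work *work,
			     btrfs_work_func_t helper,	/* e.g. btrfs_extent_refs_helper */
			     btrfs_func_t func,		/* the actual work function */
			     btrfs_func_t ordered_func,
			     btrfs_func_t ordered_free);
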
@@ -3057,7 +3044,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc, int no_quota)
+ int full_backref, int inc)
{
u64 bytenr;
u64 num_bytes;
@@ -3072,10 +3059,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
u64, u64, u64, u64, u64, u64, int);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
@@ -3096,7 +3083,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
for (i = 0; i < nritems; i++) {
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, i);
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(buf, i,
struct btrfs_file_extent_item);
@@ -3111,15 +3098,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset, no_quota);
+ key.offset, 1);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
- num_bytes = btrfs_level_size(root, level - 1);
+ num_bytes = root->nodesize;
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
- no_quota);
+ 1);
if (ret)
goto fail;
}
@@ -3130,15 +3117,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int no_quota)
+ struct extent_buffer *buf, int full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int no_quota)
+ struct extent_buffer *buf, int full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3493,7 +3480,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
- ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+ ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
if (ret) {
kfree(found);
return ret;
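
The new third argument follows a tree-wide API change that lets callers pick the allocation context for the per-CPU data. The updated interface is effectively (from <linux/percpu_counter.h> in this cycle):

	int percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp);
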
@@ -3586,13 +3573,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
*/
static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
- /*
- * we add in the count of missing devices because we want
- * to make sure that any RAID levels on a degraded FS
- * continue to be honored.
- */
- u64 num_devices = root->fs_info->fs_devices->rw_devices +
- root->fs_info->fs_devices->missing_devices;
+ u64 num_devices = root->fs_info->fs_devices->rw_devices;
u64 target;
u64 tmp;
@@ -4348,11 +4329,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
}
static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info)
+ struct btrfs_fs_info *fs_info,
+ int flush_state)
{
u64 used;
spin_lock(&space_info->lock);
+ /*
+ * We have run out of space and flush_space has not freed anything,
+ * so don't bother doing async reclaim.
+ */
+ if (flush_state > COMMIT_TRANS && space_info->full) {
+ spin_unlock(&space_info->lock);
+ return 0;
+ }
+
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
@@ -4385,11 +4376,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
flush_space(fs_info->fs_root, space_info, to_reclaim,
to_reclaim, flush_state);
flush_state++;
- if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+ if (!btrfs_need_do_async_reclaim(space_info, fs_info,
+ flush_state))
return;
} while (flush_state <= COMMIT_TRANS);
- if (btrfs_need_do_async_reclaim(space_info, fs_info))
+ if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
queue_work(system_unbound_wq, work);
}
@@ -4507,7 +4499,13 @@ again:
space_info->flush = 1;
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
- if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+ /*
+ * We will do the space reservation dance during log replay,
+ * which means we won't have fs_info->fs_root set, so don't do
+ * the async reclaim as we will panic.
+ */
+ if (!root->fs_info->log_root_recovering &&
+ need_do_async_reclaim(space_info, root->fs_info, used) &&
!work_busy(&root->fs_info->async_reclaim_work))
queue_work(system_unbound_wq,
&root->fs_info->async_reclaim_work);
@@ -4844,7 +4842,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
if (num_bytes * 3 > meta_used)
num_bytes = div64_u64(meta_used, 3);
- return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+ return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
}
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
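
A worked example of the alignment above, assuming the common 16 KiB nodesize (the concrete value is an assumption):

	/* nodesize << 10 multiplies by 1024:
	 *	16384 << 10 == 16 MiB
	 * so the reserve is rounded to a 16 MiB boundary, the same result the
	 * old leafsize expression produced, since leafsize == nodesize on any
	 * real filesystem (the premise of this cleanup series). */
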
@@ -4993,7 +4991,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
- num_bytes = 3 * root->leafsize;
+ num_bytes = 3 * root->nodesize;
ret = btrfs_qgroup_reserve(root, num_bytes);
if (ret)
return ret;
@@ -5181,7 +5179,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (root->fs_info->quota_enabled) {
ret = btrfs_qgroup_reserve(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
@@ -5190,7 +5188,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (unlikely(ret)) {
if (root->fs_info->quota_enabled)
btrfs_qgroup_free(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
goto out_fail;
}
@@ -5306,7 +5304,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
btrfs_ino(inode), to_free, 0);
if (root->fs_info->quota_enabled) {
btrfs_qgroup_free(root, num_bytes +
- dropped * root->leafsize);
+ dropped * root->nodesize);
}
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
@@ -5427,6 +5425,20 @@ static int update_block_group(struct btrfs_root *root,
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
+
+ /*
+ * No longer have used bytes in this block group, queue
+ * it for deletion.
+ */
+ if (old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
btrfs_set_block_group_used(&cache->item, old_val);
cache->pinned += num_bytes;
cache->space_info->bytes_pinned += num_bytes;
@@ -6238,10 +6250,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
/*
@@ -6268,14 +6279,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ret;
}
-static u64 stripe_align(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache,
- u64 val, u64 num_bytes)
-{
- u64 ret = ALIGN(val, root->stripesize);
- return ret;
-}
-
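
stripe_align() ignored its cache and num_bytes arguments, so the open-coded ALIGN() below is an equivalent replacement. For reference, the kernel macro rounds up to a power-of-two boundary (simplified sketch):

	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))	/* simplified */
	/* e.g. ALIGN(4097, 4096) == 8192 */
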
/*
* when we wait for progress in the block group caching, it's because
* our allocation attempt failed at least once. So, we must sleep
@@ -6469,7 +6472,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool have_caching_bg = false;
WARN_ON(num_bytes < root->sectorsize);
- btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+ ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -6756,8 +6759,7 @@ unclustered_alloc:
goto loop;
}
checks:
- search_start = stripe_align(root, block_group,
- offset, num_bytes);
+ search_start = ALIGN(offset, root->stripesize);
/* move on to the next group */
if (search_start + num_bytes >
@@ -7082,7 +7084,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->leafsize);
+ root->nodesize);
return -ENOMEM;
}
@@ -7091,7 +7093,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
ins, size);
if (ret) {
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->leafsize);
+ root->nodesize);
btrfs_free_path(path);
return ret;
}
@@ -7106,7 +7108,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
- num_bytes = root->leafsize;
+ num_bytes = root->nodesize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, key);
@@ -7136,14 +7138,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
}
- ret = update_block_group(root, ins->objectid, root->leafsize, 1);
+ ret = update_block_group(root, ins->objectid, root->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
BUG();
}
- trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
return ret;
}
@@ -7218,17 +7220,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
* EXTENT bit to differentiate dirty pages.
*/
- if (root->log_transid % 2 == 0)
+ if (buf->log_index == 0)
set_extent_dirty(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
else
set_extent_new(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
} else {
+ buf->log_index = -1;
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
@@ -7305,8 +7309,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
*
* returns the tree buffer or NULL.
*/
-struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u32 blocksize,
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size)
@@ -7316,18 +7320,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf;
u64 flags = 0;
int ret;
+ u32 blocksize = root->nodesize;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
+ if (btrfs_test_is_dummy_root(root)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
blocksize, level);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
}
-#endif
+
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
@@ -7422,7 +7426,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
eb = path->nodes[wc->level];
nritems = btrfs_header_nritems(eb);
- blocksize = btrfs_level_size(root, wc->level - 1);
+ blocksize = root->nodesize;
for (slot = path->slots[wc->level]; slot < nritems; slot++) {
if (nread >= wc->reada_count)
@@ -7469,15 +7473,224 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- ret = readahead_tree_block(root, bytenr, blocksize,
- generation);
- if (ret)
- break;
+ readahead_tree_block(root, bytenr, blocksize);
nread++;
}
wc->reada_slot = slot;
}
+static int account_leaf_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+ int nr = btrfs_header_nritems(eb);
+ int i, extent_type, ret;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ u64 bytenr, num_bytes;
+
+ for (i = 0; i < nr; i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+
+ fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ /* filter out non qgroup-accountable extents */
+ extent_type = btrfs_file_extent_type(eb, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+
+ bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (!bytenr)
+ continue;
+
+ num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+ root->objectid,
+ bytenr, num_bytes,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have its slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root node's slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+ struct btrfs_path *path, int root_level)
+{
+ int level = 0;
+ int nr, slot;
+ struct extent_buffer *eb;
+
+ if (root_level == 0)
+ return 1;
+
+ while (level <= root_level) {
+ eb = path->nodes[level];
+ nr = btrfs_header_nritems(eb);
+ path->slots[level]++;
+ slot = path->slots[level];
+ if (slot >= nr || level == 0) {
+ /*
+ * Don't free the root - we will detect this
+ * condition after our loop and return a
+ * positive value for the caller to stop walking the tree.
+ */
+ if (level != root_level) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+
+ free_extent_buffer(eb);
+ path->nodes[level] = NULL;
+ path->slots[level] = 0;
+ }
+ } else {
+ /*
+ * We have a valid slot to walk back down
+ * from. Stop here so caller can process these
+ * new nodes.
+ */
+ break;
+ }
+
+ level++;
+ }
+
+ eb = path->nodes[root_level];
+ if (path->slots[root_level] >= btrfs_header_nritems(eb))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * root_eb is the subtree root and is locked before this function is called.
+ */
+static int account_shared_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *root_eb,
+ u64 root_gen,
+ int root_level)
+{
+ int ret = 0;
+ int level;
+ struct extent_buffer *eb = root_eb;
+ struct btrfs_path *path = NULL;
+
+ BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+ BUG_ON(root_eb == NULL);
+
+ if (!root->fs_info->quota_enabled)
+ return 0;
+
+ if (!extent_buffer_uptodate(root_eb)) {
+ ret = btrfs_read_buffer(root_eb, root_gen);
+ if (ret)
+ goto out;
+ }
+
+ if (root_level == 0) {
+ ret = account_leaf_items(trans, root, root_eb);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Walk down the tree. Missing extent blocks are filled in as
+ * we go. Metadata is accounted every time we read a new
+ * extent block.
+ *
+ * When we reach a leaf, we account for file extent items in it,
+ * walk back up the tree (adjusting slot pointers as we go)
+ * and restart the search process.
+ */
+ extent_buffer_get(root_eb); /* For path */
+ path->nodes[root_level] = root_eb;
+ path->slots[root_level] = 0;
+ path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+ level = root_level;
+ while (level >= 0) {
+ if (path->nodes[level] == NULL) {
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ /*
+ * We need to get child blockptr/gen from
+ * parent before we can read it.
+ */
+ eb = path->nodes[level + 1];
+ parent_slot = path->slots[level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+
+ eb = read_tree_block(root, child_bytenr, child_gen);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ path->nodes[level] = eb;
+ path->slots[level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+ root->objectid,
+ child_bytenr,
+ root->nodesize,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE,
+ 0);
+ if (ret)
+ goto out;
+
+ }
+
+ if (level == 0) {
+ ret = account_leaf_items(trans, root, path->nodes[level]);
+ if (ret)
+ goto out;
+
+ /* Nonzero return here means we completed our search */
+ ret = adjust_slots_upwards(root, path, root_level);
+ if (ret)
+ break;
+
+ /* Restart search with new slots */
+ goto walk_down;
+ }
+
+ level--;
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
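
On a small height-2 subtree the new walker proceeds as follows (illustrative sketch; the subtree root is seeded into the path directly and is not re-recorded here):

	/*
	 *	       [root, level 2]
	 *	      /               \
	 *	[A, level 1]      [B, level 1]
	 *	 /        \        /        \
	 *    [a1]      [a2]    [b1]      [b2]		(leaves)
	 *
	 * Read/record order: A, a1, a2, then adjust_slots_upwards() climbs
	 * back to the root and bumps its slot, then B, b1, b2. Every newly
	 * read node gets a SUB_SUBTREE qgroup record, and each leaf also
	 * accounts its file extent items via account_leaf_items().
	 */
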
/*
* helper to process tree block while walking down the tree.
*
@@ -7532,9 +7745,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
BUG_ON(!path->locks[level]);
- ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
+ ret = btrfs_inc_ref(trans, root, eb, 1);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
+ ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
eb->len, flag,
@@ -7581,6 +7794,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
int level = wc->level;
int reada = 0;
int ret = 0;
+ bool need_account = false;
generation = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
@@ -7596,9 +7810,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- blocksize = btrfs_level_size(root, level - 1);
+ blocksize = root->nodesize;
- next = btrfs_find_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_tree_block(root, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!next)
@@ -7626,6 +7840,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (wc->stage == DROP_REFERENCE) {
if (wc->refs[level - 1] > 1) {
+ need_account = true;
if (level == 1 &&
(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
goto skip;
@@ -7659,7 +7874,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(root, bytenr, blocksize, generation);
+ next = read_tree_block(root, bytenr, generation);
if (!next || !extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
@@ -7689,6 +7904,16 @@ skip:
parent = 0;
}
+ if (need_account) {
+ ret = account_shared_subtree(trans, root, next,
+ generation, level - 1);
+ if (ret) {
+ printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ "%d accounting shared subtree. Quota "
+ "is out of sync, rescan required.\n",
+ root->fs_info->sb->s_id, ret);
+ }
+ }
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
root->root_key.objectid, level - 1, 0, 0);
BUG_ON(ret); /* -ENOMEM */
@@ -7769,12 +7994,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
- ret = btrfs_dec_ref(trans, root, eb, 1,
- wc->for_reloc);
+ ret = btrfs_dec_ref(trans, root, eb, 1);
else
- ret = btrfs_dec_ref(trans, root, eb, 0,
- wc->for_reloc);
+ ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
+ ret = account_leaf_items(trans, root, eb);
+ if (ret) {
+ printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ "%d accounting leaf items. Quota "
+ "is out of sync, rescan required.\n",
+ root->fs_info->sb->s_id, ret);
+ }
}
/* make block locked assertion in clean_tree_block happy */
if (!path->locks[level] &&
@@ -7900,6 +8130,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int level;
bool root_dropped = false;
+ btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
+
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
@@ -8025,6 +8257,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
+ /*
+ * Qgroup update accounting is run from
+ * delayed ref handling. This usually works
+ * out because delayed refs are normally the
+ * only way qgroup updates are added. However,
+ * we may have added updates during our tree
+ * walk so run qgroups here to make sure we
+ * don't lose any updates.
+ */
+ ret = btrfs_delayed_qgroup_accounting(trans,
+ root->fs_info);
+ if (ret)
+ printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+ "running qgroup updates "
+ "during snapshot delete. "
+ "Quota is out of sync, "
+ "rescan required.\n", ret);
+
btrfs_end_transaction_throttle(trans, tree_root);
if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8078,6 +8328,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
root_dropped = true;
out_end_trans:
+ ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
+ if (ret)
+ printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+ "running qgroup updates "
+ "during snapshot delete. "
+ "Quota is out of sync, "
+ "rescan required.\n", ret);
+
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
@@ -8181,13 +8439,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (stripped)
return extended_to_chunk(stripped);
- /*
- * we add in the count of missing devices because we want
- * to make sure that any RAID levels on a degraded FS
- * continue to be honored.
- */
- num_devices = root->fs_info->fs_devices->rw_devices +
- root->fs_info->fs_devices->missing_devices;
+ num_devices = root->fs_info->fs_devices->rw_devices;
stripped = BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
@@ -8605,6 +8857,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
up_write(&info->commit_root_sem);
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->unused_bgs)) {
+ block_group = list_first_entry(&info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -8739,7 +9001,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->new_bg_list);
+ INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache);
return cache;
@@ -8761,7 +9023,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
root = info->extent_root;
key.objectid = 0;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -8880,8 +9142,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
set_avail_alloc_bits(root->fs_info, cache->flags);
- if (btrfs_chunk_readonly(root, cache->key.objectid))
+ if (btrfs_chunk_readonly(root, cache->key.objectid)) {
set_block_group_ro(cache, 1);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ /* Should always be true but just in case. */
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
}
list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -8922,10 +9194,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret = 0;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
- new_bg_list) {
- list_del_init(&block_group->new_bg_list);
-
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ list_del_init(&block_group->bg_list);
if (ret)
continue;
@@ -9011,7 +9281,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
- list_add_tail(&cache->new_bg_list, &trans->new_bgs);
+ list_add_tail(&cache->bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
@@ -9165,8 +9435,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
memcpy(&key, &block_group->key, sizeof(key));
- btrfs_clear_space_info_full(root->fs_info);
-
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -9182,6 +9450,101 @@ out:
return ret;
}
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (!fs_info->open)
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
+ u64 start, end;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ space_info = block_group->space_info;
+ list_del_init(&block_group->bg_list);
+ if (ret || btrfs_mixed_space_info(space_info)) {
+ btrfs_put_block_group(block_group);
+ continue;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't want to race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+ spin_lock(&block_group->lock);
+ if (block_group->reserved ||
+ btrfs_block_group_used(&block_group->item) ||
+ block_group->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&block_group->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&block_group->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = set_block_group_ro(block_group, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0) {
+ ret = 0;
+ goto next;
+ }
+
+ /*
+ * Want to do this before we do anything else so we can recover
+ * properly if we fail to join the transaction.
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_set_block_group_rw(root, block_group);
+ ret = PTR_ERR(trans);
+ goto next;
+ }
+
+ /*
+ * We could have pending pinned extents for this block group,
+ * just delete them, we don't care about them anymore.
+ */
+ start = block_group->key.objectid;
+ end = start + block_group->key.offset - 1;
+ clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+ clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+
+ /* Reset pinned so btrfs_put_block_group doesn't complain */
+ block_group->pinned = 0;
+
+ /*
+ * btrfs_remove_chunk() will abort the transaction if things go
+ * horribly wrong.
+ */
+ ret = btrfs_remove_chunk(trans, root,
+ block_group->key.objectid);
+ btrfs_end_transaction(trans, root);
+next:
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
@@ -9313,7 +9676,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
int btrfs_start_nocow_write(struct btrfs_root *root)
{
- if (unlikely(atomic_read(&root->will_be_snapshoted)))
+ if (atomic_read(&root->will_be_snapshoted))
return 0;
percpu_counter_inc(&root->subv_writers->counter);
@@ -9321,7 +9684,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
* Make sure counter is updated before we check for snapshot creation.
*/
smp_mb();
- if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+ if (atomic_read(&root->will_be_snapshoted)) {
btrfs_end_nocow_write(root);
return 0;
}
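
The smp_mb() above pairs with a matching barrier on the snapshot-creation side; a sketch of the intended ordering (the snapshot-side sequence is an assumption based on how will_be_snapshoted is typically raised):

	/*
	 * nocow writer			snapshot creation
	 * ------------			-----------------
	 * percpu_counter_inc(writers)	atomic_inc(&will_be_snapshoted)
	 * smp_mb()			smp_mb__after_atomic()
	 * read will_be_snapshoted	wait for writers == 0
	 *
	 * At least one side must observe the other's store, so a NOCOW
	 * write can never proceed unnoticed by snapshot creation.
	 */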