diff options
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- | fs/btrfs/extent-tree.c | 499 |
1 files changed, 360 insertions, 139 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3efe1c3877b..a684086c3c8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache) struct btrfs_caching_control *ctl; spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_STARTED) { - spin_unlock(&cache->lock); - return NULL; - } - - /* We're loading it the fast way, so we don't have a caching_ctl. */ if (!cache->caching_ctl) { spin_unlock(&cache->lock); return NULL; @@ -491,7 +485,7 @@ next: key.objectid); if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + - fs_info->tree_root->leafsize; + fs_info->tree_root->nodesize; else last = key.objectid + key.offset; @@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, spin_unlock(&cache->lock); if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + mutex_lock(&caching_ctl->mutex); ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); @@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; + caching_ctl->progress = (u64)-1; } else { if (load_cache_only) { cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_NO; } else { cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; } } spin_unlock(&cache->lock); + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); if (ret == 1) { put_caching_control(caching_ctl); @@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_NO; } else { cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; } spin_unlock(&cache->lock); wake_up(&caching_ctl->wait); @@ -710,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -/* simple helper to search for an existing extent at a given offset */ -int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +/* simple helper to search for an existing data extent at a given offset */ +int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len) { int ret; struct btrfs_key key; @@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 0, 0); - if (ret > 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == start && - key.type == BTRFS_METADATA_ITEM_KEY) - ret = 0; - } btrfs_free_path(path); return ret; } @@ -765,7 +759,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, * different */ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { - offset = root->leafsize; + offset = root->nodesize; metadata = 0; } @@ -786,7 +780,6 @@ search_again: else key.type = BTRFS_EXTENT_ITEM_KEY; -again: ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 0); if (ret < 0) @@ -799,16 +792,9 @@ again: path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == root->leafsize) + key.offset == root->nodesize) ret = 0; } - if (ret) { - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = root->leafsize; - btrfs_release_path(path); - goto again; - } } if (ret == 0) { @@ -1903,8 +1889,8 @@ static int btrfs_issue_discard(struct block_device *bdev, return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); } -static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) +int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) { int ret; u64 discarded_bytes = 0; @@ -2651,7 +2637,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); num_heads = heads_to_leaves(root, num_heads); if (num_heads > 1) - num_bytes += (num_heads - 1) * root->leafsize; + num_bytes += (num_heads - 1) * root->nodesize; num_bytes <<= 1; global_rsv = &root->fs_info->global_block_rsv; @@ -3073,10 +3059,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, int); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); @@ -3097,7 +3083,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, for (i = 0; i < nritems; i++) { if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item); @@ -3117,7 +3103,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); - num_bytes = btrfs_level_size(root, level - 1); + num_bytes = root->nodesize; ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, 1); @@ -3153,9 +3139,11 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); - if (ret < 0) + if (ret) { + if (ret > 0) + ret = -ENOENT; goto fail; - BUG_ON(ret); /* Corruption */ + } leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3163,11 +3151,9 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); fail: - if (ret) { + if (ret) btrfs_abort_transaction(trans, root, ret); - return ret; - } - return 0; + return ret; } @@ -3176,7 +3162,19 @@ next_block_group(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + + /* If our block group was removed, we need a full search. */ + if (RB_EMPTY_NODE(&cache->cache_node)) { + const u64 next_bytenr = cache->key.objectid + cache->key.offset; + + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + cache = btrfs_lookup_first_block_group(root->fs_info, + next_bytenr); + return cache; + } node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); if (node) { @@ -3494,7 +3492,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0); + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); if (ret) { kfree(found); return ret; @@ -3518,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + INIT_LIST_HEAD(&found->ro_bgs); ret = kobject_init_and_add(&found->kobj, &space_info_ktype, info->space_info_kobj, "%s", @@ -4343,11 +4342,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, } static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info) + struct btrfs_fs_info *fs_info, + int flush_state) { u64 used; spin_lock(&space_info->lock); + /* + * We run out of space and have not got any free space via flush_space, + * so don't bother doing async reclaim. + */ + if (flush_state > COMMIT_TRANS && space_info->full) { + spin_unlock(&space_info->lock); + return 0; + } + used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; @@ -4380,11 +4389,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) flush_space(fs_info->fs_root, space_info, to_reclaim, to_reclaim, flush_state); flush_state++; - if (!btrfs_need_do_async_reclaim(space_info, fs_info)) + if (!btrfs_need_do_async_reclaim(space_info, fs_info, + flush_state)) return; } while (flush_state <= COMMIT_TRANS); - if (btrfs_need_do_async_reclaim(space_info, fs_info)) + if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) queue_work(system_unbound_wq, work); } @@ -4502,7 +4512,13 @@ again: space_info->flush = 1; } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; - if (need_do_async_reclaim(space_info, root->fs_info, used) && + /* + * We will do the space reservation dance during log replay, + * which means we won't have fs_info->fs_root set, so don't do + * the async reclaim as we will panic. + */ + if (!root->fs_info->log_root_recovering && + need_do_async_reclaim(space_info, root->fs_info, used) && !work_busy(&root->fs_info->async_reclaim_work)) queue_work(system_unbound_wq, &root->fs_info->async_reclaim_work); @@ -4839,7 +4855,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) if (num_bytes * 3 > meta_used) num_bytes = div64_u64(meta_used, 3); - return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); + return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -4988,7 +5004,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ - num_bytes = 3 * root->leafsize; + num_bytes = 3 * root->nodesize; ret = btrfs_qgroup_reserve(root, num_bytes); if (ret) return ret; @@ -5176,7 +5192,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (root->fs_info->quota_enabled) { ret = btrfs_qgroup_reserve(root, num_bytes + - nr_extents * root->leafsize); + nr_extents * root->nodesize); if (ret) goto out_fail; } @@ -5185,7 +5201,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (unlikely(ret)) { if (root->fs_info->quota_enabled) btrfs_qgroup_free(root, num_bytes + - nr_extents * root->leafsize); + nr_extents * root->nodesize); goto out_fail; } @@ -5301,7 +5317,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) btrfs_ino(inode), to_free, 0); if (root->fs_info->quota_enabled) { btrfs_qgroup_free(root, num_bytes + - dropped * root->leafsize); + dropped * root->nodesize); } btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, @@ -5433,6 +5449,19 @@ static int update_block_group(struct btrfs_root *root, set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + /* + * No longer have used bytes in this block group, queue + * it for deletion. + */ + if (old_val == 0) { + spin_lock(&info->unused_bgs_lock); + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } } btrfs_put_block_group(cache); total -= num_bytes; @@ -5698,7 +5727,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } -static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, + const bool return_free_space) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; @@ -5722,7 +5752,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) if (start < cache->last_byte_to_unpin) { len = min(len, cache->last_byte_to_unpin - start); - btrfs_add_free_space(cache, start, len); + if (return_free_space) + btrfs_add_free_space(cache, start, len); } start += len; @@ -5786,7 +5817,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, GFP_NOFS); - unpin_extent_range(root, start, end); + unpin_extent_range(root, start, end, true); cond_resched(); } @@ -6233,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); /* @@ -6263,14 +6293,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ret; } -static u64 stripe_align(struct btrfs_root *root, - struct btrfs_block_group_cache *cache, - u64 val, u64 num_bytes) -{ - u64 ret = ALIGN(val, root->stripesize); - return ret; -} - /* * when we wait for progress in the block group caching, its because * our allocation attempt failed at least once. So, we must sleep @@ -6464,7 +6486,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool have_caching_bg = false; WARN_ON(num_bytes < root->sectorsize); - btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -6751,8 +6773,7 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, block_group, - offset, num_bytes); + search_start = ALIGN(offset, root->stripesize); /* move on to the next group */ if (search_start + num_bytes > @@ -7077,7 +7098,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { btrfs_free_and_pin_reserved_extent(root, ins->objectid, - root->leafsize); + root->nodesize); return -ENOMEM; } @@ -7086,7 +7107,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ins, size); if (ret) { btrfs_free_and_pin_reserved_extent(root, ins->objectid, - root->leafsize); + root->nodesize); btrfs_free_path(path); return ret; } @@ -7101,7 +7122,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - num_bytes = root->leafsize; + num_bytes = root->nodesize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, key); @@ -7131,14 +7152,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; } - ret = update_block_group(root, ins->objectid, root->leafsize, 1); + ret = update_block_group(root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); BUG(); } - trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); + trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); return ret; } @@ -7213,17 +7234,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different * EXENT bit to differentiate dirty pages. */ - if (root->log_transid % 2 == 0) + if (buf->log_index == 0) set_extent_dirty(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } else { + buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } @@ -7300,8 +7323,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, * * returns the tree buffer or NULL. */ -struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size) @@ -7311,18 +7334,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf; u64 flags = 0; int ret; + u32 blocksize = root->nodesize; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { + if (btrfs_test_is_dummy_root(root)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, blocksize, level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } -#endif + block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); @@ -7417,7 +7440,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, eb = path->nodes[wc->level]; nritems = btrfs_header_nritems(eb); - blocksize = btrfs_level_size(root, wc->level - 1); + blocksize = root->nodesize; for (slot = path->slots[wc->level]; slot < nritems; slot++) { if (nread >= wc->reada_count) @@ -7464,10 +7487,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - ret = readahead_tree_block(root, bytenr, blocksize, - generation); - if (ret) - break; + readahead_tree_block(root, bytenr, blocksize); nread++; } wc->reada_slot = slot; @@ -7626,7 +7646,6 @@ walk_down: level = root_level; while (level >= 0) { if (path->nodes[level] == NULL) { - int child_bsize = root->nodesize; int parent_slot; u64 child_gen; u64 child_bytenr; @@ -7638,8 +7657,7 @@ walk_down: child_bytenr = btrfs_node_blockptr(eb, parent_slot); child_gen = btrfs_node_ptr_generation(eb, parent_slot); - eb = read_tree_block(root, child_bytenr, child_bsize, - child_gen); + eb = read_tree_block(root, child_bytenr, child_gen); if (!eb || !extent_buffer_uptodate(eb)) { ret = -EIO; goto out; @@ -7655,7 +7673,7 @@ walk_down: ret = btrfs_qgroup_record_ref(trans, root->fs_info, root->objectid, child_bytenr, - child_bsize, + root->nodesize, BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); if (ret) @@ -7806,9 +7824,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - blocksize = btrfs_level_size(root, level - 1); + blocksize = root->nodesize; - next = btrfs_find_tree_block(root, bytenr, blocksize); + next = btrfs_find_tree_block(root, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!next) @@ -7870,7 +7888,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(root, bytenr, blocksize, generation); + next = read_tree_block(root, bytenr, generation); if (!next || !extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; @@ -8507,6 +8525,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; cache->ro = 1; + list_add_tail(&cache->ro_list, &sinfo->ro_bgs); ret = 0; } out: @@ -8561,15 +8580,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, /* * helper to account the unused space of all the readonly block group in the - * list. takes mirrors into account. + * space_info. takes mirrors into account. */ -static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) { struct btrfs_block_group_cache *block_group; u64 free_bytes = 0; int factor; - list_for_each_entry(block_group, groups_list, list) { + /* It's df, we don't care if it's racey */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { spin_lock(&block_group->lock); if (!block_group->ro) { @@ -8590,26 +8614,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) spin_unlock(&block_group->lock); } - - return free_bytes; -} - -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. - */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - int i; - u64 free_bytes = 0; - - spin_lock(&sinfo->lock); - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - if (!list_empty(&sinfo->block_groups[i])) - free_bytes += __btrfs_get_ro_block_group_free_space( - &sinfo->block_groups[i]); - spin_unlock(&sinfo->lock); return free_bytes; @@ -8629,6 +8633,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root, cache->bytes_super - btrfs_block_group_used(&cache->item); sinfo->bytes_readonly -= num_bytes; cache->ro = 0; + list_del_init(&cache->ro_list); spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); } @@ -8853,12 +8858,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } up_write(&info->commit_root_sem); + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->unused_bgs)) { + block_group = list_first_entry(&info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group_cache, cache_node); rb_erase(&block_group->cache_node, &info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); spin_unlock(&info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); @@ -8987,8 +9003,10 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) init_rwsem(&cache->data_rwsem); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); + INIT_LIST_HEAD(&cache->bg_list); + INIT_LIST_HEAD(&cache->ro_list); btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); return cache; } @@ -9009,7 +9027,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) root = info->extent_root; key.objectid = 0; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -9115,6 +9133,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) spin_lock(&info->block_group_cache_lock); rb_erase(&cache->cache_node, &info->block_group_cache_tree); + RB_CLEAR_NODE(&cache->cache_node); spin_unlock(&info->block_group_cache_lock); btrfs_put_block_group(cache); goto error; @@ -9128,8 +9147,18 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); set_avail_alloc_bits(root->fs_info, cache->flags); - if (btrfs_chunk_readonly(root, cache->key.objectid)) + if (btrfs_chunk_readonly(root, cache->key.objectid)) { set_block_group_ro(cache, 1); + } else if (btrfs_block_group_used(&cache->item) == 0) { + spin_lock(&info->unused_bgs_lock); + /* Should always be true but just in case. */ + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } } list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { @@ -9170,12 +9199,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret = 0; - list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, - new_bg_list) { - list_del_init(&block_group->new_bg_list); - + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { if (ret) - continue; + goto next; spin_lock(&block_group->lock); memcpy(&item, &block_group->item, sizeof(item)); @@ -9190,6 +9216,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, key.objectid, key.offset); if (ret) btrfs_abort_transaction(trans, extent_root, ret); +next: + list_del_init(&block_group->bg_list); } } @@ -9247,6 +9275,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&cache->cache_node, &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&cache->cache_node); spin_unlock(&root->fs_info->block_group_cache_lock); btrfs_put_block_group(cache); return ret; @@ -9259,7 +9288,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, __link_block_group(cache->space_info, cache); - list_add_tail(&cache->new_bg_list, &trans->new_bgs); + list_add_tail(&cache->bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); @@ -9282,7 +9311,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start) + struct btrfs_root *root, u64 group_start, + struct extent_map *em) { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; @@ -9294,6 +9324,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int ret; int index; int factor; + struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; root = root->fs_info->extent_root; @@ -9378,6 +9410,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); if (root->fs_info->first_logical_byte == block_group->key.objectid) root->fs_info->first_logical_byte = (u64)-1; @@ -9400,12 +9433,37 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, kobject_put(kobj); } + if (block_group->has_caching_ctl) + caching_ctl = get_caching_control(block_group); if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); + if (block_group->has_caching_ctl) { + down_write(&root->fs_info->commit_root_sem); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, + &root->fs_info->caching_block_groups, list) + if (ctl->block_group == block_group) { + caching_ctl = ctl; + atomic_inc(&caching_ctl->count); + break; + } + } + if (caching_ctl) + list_del_init(&caching_ctl->list); + up_write(&root->fs_info->commit_root_sem); + if (caching_ctl) { + /* Once for the caching bgs list and once for us. */ + put_caching_control(caching_ctl); + put_caching_control(caching_ctl); + } + } btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); + list_del_init(&block_group->ro_list); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; block_group->space_info->disk_total -= block_group->key.offset * factor; @@ -9413,7 +9471,70 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); - btrfs_clear_space_info_full(root->fs_info); + lock_chunks(root); + if (!list_empty(&em->list)) { + /* We're in the transaction->pending_chunks list. */ + free_extent_map(em); + } + spin_lock(&block_group->lock); + block_group->removed = 1; + /* + * At this point trimming can't start on this block group, because we + * removed the block group from the tree fs_info->block_group_cache_tree + * so no one can't find it anymore and even if someone already got this + * block group before we removed it from the rbtree, they have already + * incremented block_group->trimming - if they didn't, they won't find + * any free space entries because we already removed them all when we + * called btrfs_remove_free_space_cache(). + * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is because our + * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + */ + remove_em = (atomic_read(&block_group->trimming) == 0); + /* + * Make sure a trimmer task always sees the em in the pinned_chunks list + * if it sees block_group->removed == 1 (needs to lock block_group->lock + * before checking block_group->removed). + */ + if (!remove_em) { + /* + * Our em might be in trans->transaction->pending_chunks which + * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), + * and so is the fs_info->pinned_chunks list. + * + * So at this point we must be holding the chunk_mutex to avoid + * any races with chunk allocation (more specifically at + * volumes.c:contains_pending_extent()), to ensure it always + * sees the em, either in the pending_chunks list or in the + * pinned_chunks list. + */ + list_move_tail(&em->list, &root->fs_info->pinned_chunks); + } + spin_unlock(&block_group->lock); + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + /* + * The em might be in the pending_chunks list, so make sure the + * chunk mutex is locked, since remove_extent_mapping() will + * delete us from that list. + */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } + + unlock_chunks(root); btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -9430,6 +9551,110 @@ out: return ret; } +/* + * Process the unused_bgs list and remove any that don't have any allocated + * space inside of them. + */ +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!fs_info->open) + return; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->unused_bgs)) { + u64 start, end; + + block_group = list_first_entry(&fs_info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + space_info = block_group->space_info; + list_del_init(&block_group->bg_list); + if (ret || btrfs_mixed_space_info(space_info)) { + btrfs_put_block_group(block_group); + continue; + } + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + spin_lock(&block_group->lock); + if (block_group->reserved || + btrfs_block_group_used(&block_group->item) || + block_group->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&block_group->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + + /* We don't want to force the issue, only flip if it's ok. */ + ret = set_block_group_ro(block_group, 0); + up_write(&space_info->groups_sem); + if (ret < 0) { + ret = 0; + goto next; + } + + /* + * Want to do this before we do anything else so we can recover + * properly if we fail to join the transaction. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_set_block_group_rw(root, block_group); + ret = PTR_ERR(trans); + goto next; + } + + /* + * We could have pending pinned extents for this block group, + * just delete them, we don't care about them anymore. + */ + start = block_group->key.objectid; + end = start + block_group->key.offset - 1; + ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, + EXTENT_DIRTY, GFP_NOFS); + if (ret) { + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } + ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, + EXTENT_DIRTY, GFP_NOFS); + if (ret) { + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } + + /* Reset pinned so btrfs_put_block_group doesn't complain */ + block_group->pinned = 0; + + /* + * Btrfs_remove_chunk will abort the transaction if things go + * horribly wrong. + */ + ret = btrfs_remove_chunk(trans, root, + block_group->key.objectid); +end_trans: + btrfs_end_transaction(trans, root); +next: + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + int btrfs_init_space_info(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; @@ -9470,13 +9695,7 @@ out: int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { - return unpin_extent_range(root, start, end); -} - -int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) -{ - return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); + return unpin_extent_range(root, start, end, false); } int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) @@ -9542,12 +9761,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) } /* - * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), - * they are used to prevent the some tasks writing data into the page cache - * by nocow before the subvolume is snapshoted, but flush the data into - * the disk after the snapshot creation. + * btrfs_{start,end}_write_no_snapshoting() are similar to + * mnt_{want,drop}_write(), they are used to prevent some tasks from writing + * data into the page cache through nocow before the subvolume is snapshoted, + * but flush the data into disk after the snapshot creation, or to prevent + * operations while snapshoting is ongoing and that cause the snapshot to be + * inconsistent (writes followed by expanding truncates for example). */ -void btrfs_end_nocow_write(struct btrfs_root *root) +void btrfs_end_write_no_snapshoting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* @@ -9559,9 +9780,9 @@ void btrfs_end_nocow_write(struct btrfs_root *root) wake_up(&root->subv_writers->wait); } -int btrfs_start_nocow_write(struct btrfs_root *root) +int btrfs_start_write_no_snapshoting(struct btrfs_root *root) { - if (unlikely(atomic_read(&root->will_be_snapshoted))) + if (atomic_read(&root->will_be_snapshoted)) return 0; percpu_counter_inc(&root->subv_writers->counter); @@ -9569,8 +9790,8 @@ int btrfs_start_nocow_write(struct btrfs_root *root) * Make sure counter is updated before we check for snapshot creation. */ smp_mb(); - if (unlikely(atomic_read(&root->will_be_snapshoted))) { - btrfs_end_nocow_write(root); + if (atomic_read(&root->will_be_snapshoted)) { + btrfs_end_write_no_snapshoting(root); return 0; } return 1; |