From 53b381b3abeb86f12787a6c40fee9b2f71edc23b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 29 Jan 2013 18:40:14 -0500 Subject: Btrfs: RAID5 and RAID6 This builds on David Woodhouse's original Btrfs raid5/6 implementation. The code has changed quite a bit, blame Chris Mason for any bugs. Read/modify/write is done after the higher levels of the filesystem have prepared a given bio. This means the higher layers are not responsible for building full stripes, and they don't need to query for the topology of the extents that may get allocated during delayed allocation runs. It also means different files can easily share the same stripe. But, it does expose us to incorrect parity if we crash or lose power while doing a read/modify/write cycle. This will be addressed in a later commit. Scrub is unable to repair crc errors on raid5/6 chunks. Discard does not work on raid5/6 (yet) The stripe size is fixed at 64KiB per disk. This will be tunable in a later commit. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 88 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 29 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d133edfcd44..3345f68fc64 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -31,6 +31,7 @@ #include "print-tree.h" #include "transaction.h" #include "volumes.h" +#include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "math.h" @@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, *actual_bytes = discarded_bytes; + if (ret == -EOPNOTSUPP) + ret = 0; return ret; } @@ -3276,6 +3279,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; u64 target; + u64 tmp; /* * see if restripe for this chunk_type is in progress, if so @@ -3292,30 +3296,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) } spin_unlock(&root->fs_info->balance_lock); + /* First, mask out the RAID levels which aren't possible */ if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5); + if (num_devices < 3) + flags &= ~BTRFS_BLOCK_GROUP_RAID6; if (num_devices < 4) flags &= ~BTRFS_BLOCK_GROUP_RAID10; - if ((flags & BTRFS_BLOCK_GROUP_DUP) && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) { - flags &= ~BTRFS_BLOCK_GROUP_DUP; - } - - if ((flags & BTRFS_BLOCK_GROUP_RAID1) && - (flags & BTRFS_BLOCK_GROUP_RAID10)) { - flags &= ~BTRFS_BLOCK_GROUP_RAID1; - } + tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); + flags &= ~tmp; - if ((flags & BTRFS_BLOCK_GROUP_RAID0) && - ((flags & BTRFS_BLOCK_GROUP_RAID1) | - (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) { - flags &= ~BTRFS_BLOCK_GROUP_RAID0; - } + if (tmp & BTRFS_BLOCK_GROUP_RAID6) + tmp = BTRFS_BLOCK_GROUP_RAID6; + else if (tmp & BTRFS_BLOCK_GROUP_RAID5) + tmp = BTRFS_BLOCK_GROUP_RAID5; + else if (tmp & BTRFS_BLOCK_GROUP_RAID10) + tmp = BTRFS_BLOCK_GROUP_RAID10; + else if (tmp & BTRFS_BLOCK_GROUP_RAID1) + tmp = BTRFS_BLOCK_GROUP_RAID1; + else if (tmp & BTRFS_BLOCK_GROUP_RAID0) + tmp = BTRFS_BLOCK_GROUP_RAID0; - return extended_to_chunk(flags); + return extended_to_chunk(flags | tmp); } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) @@ -3333,6 +3339,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { u64 flags; + u64 ret; if (data) flags = BTRFS_BLOCK_GROUP_DATA; @@ -3341,7 +3348,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) else flags = BTRFS_BLOCK_GROUP_METADATA; - return get_alloc_profile(root, flags); + ret = get_alloc_profile(root, flags); + return ret; } /* @@ -3516,8 +3524,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) { u64 num_dev; - if (type & BTRFS_BLOCK_GROUP_RAID10 || - type & BTRFS_BLOCK_GROUP_RAID0) + if (type & (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) num_dev = root->fs_info->fs_devices->rw_devices; else if (type & BTRFS_BLOCK_GROUP_RAID1) num_dev = 2; @@ -3667,7 +3677,9 @@ static int can_overcommit(struct btrfs_root *root, /* * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. + * space is actually useable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math */ if (profile & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | @@ -5455,10 +5467,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ret; } -static u64 stripe_align(struct btrfs_root *root, u64 val) +static u64 stripe_align(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 val, u64 num_bytes) { - u64 mask = ((u64)root->stripesize - 1); - u64 ret = (val + mask) & ~mask; + u64 mask; + u64 ret; + mask = ((u64)root->stripesize - 1); + ret = (val + mask) & ~mask; return ret; } @@ -5519,9 +5535,12 @@ int __get_raid_index(u64 flags) index = 2; else if (flags & BTRFS_BLOCK_GROUP_RAID0) index = 3; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + index = 5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + index = 6; else - index = 4; - + index = 4; /* BTRFS_BLOCK_GROUP_SINGLE */ return index; } @@ -5665,6 +5684,8 @@ search: if (!block_group_bits(block_group, data)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10; /* @@ -5835,7 +5856,8 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, offset); + search_start = stripe_align(root, used_block_group, + offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > @@ -7203,6 +7225,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) root->fs_info->fs_devices->missing_devices; stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; if (num_devices == 1) { @@ -7754,7 +7777,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) btrfs_release_path(path); cache->flags = btrfs_block_group_flags(&cache->item); cache->sectorsize = root->sectorsize; - + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + found_key.objectid); btrfs_init_free_space_ctl(cache); /* @@ -7808,6 +7833,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) if (!(get_alloc_profile(root, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_DUP))) continue; /* @@ -7883,6 +7910,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; cache->sectorsize = root->sectorsize; cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + chunk_offset); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); -- cgit v1.2.3-70-g09d2 From 8de972b4faa4be9b2a3c53103e18d86092a5da45 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 4 Jan 2013 15:39:43 -0500 Subject: Btrfs: fix cluster alignment for mount -o ssd With the new raid56 code, we want to make sure we're properly aligning our allocation clusters with -o ssd Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3345f68fc64..f13402104c9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5715,6 +5715,7 @@ have_block_group: * lets look there */ if (last_ptr) { + unsigned long aligned_cluster; /* * the refill lock keeps out other * people trying to start a new cluster @@ -5781,11 +5782,15 @@ refill_cluster: goto unclustered_alloc; } + aligned_cluster = max_t(unsigned long, + empty_cluster + empty_size, + block_group->full_stripe_len); + /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, search_start, num_bytes, - empty_cluster + empty_size); + aligned_cluster); if (ret == 0) { /* * now pull our allocation out of this -- cgit v1.2.3-70-g09d2 From bb721703aa551e98dc5c7fb259cf90343408baf2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 29 Jan 2013 18:44:12 -0500 Subject: Btrfs: reduce CPU contention while waiting for delayed extent operations We batch up operations to the extent allocation tree, which allows us to deal with the recursive nature of using the extent allocation tree to allocate extents to the extent allocation tree. It also provides a mechanism to sort and collect extent operations, which makes it much more efficient to record extents that are close together. The delayed extent operations must all be finished before the running transaction commits, so we have code to make sure and run a few of the batched operations when closing our transaction handles. This creates a great deal of contention for the locks in the delayed extent operation tree, and also contention for the lock on the extent allocation tree itself. All the extra contention just slows down the operations and doesn't get things done any faster. This commit changes things to use a wait queue instead. As procs want to run the delayed operations, one of them races in and gets permission to hit the tree, and the others step back and wait for progress to be made. Signed-off-by: Chris Mason --- fs/btrfs/delayed-ref.h | 9 ++++++++ fs/btrfs/extent-tree.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/transaction.c | 6 ++++- 3 files changed, 70 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c9d703693df..23bdeb8502a 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -131,6 +131,15 @@ struct btrfs_delayed_ref_root { /* total number of head nodes ready for processing */ unsigned long num_heads_ready; + /* + * bumped when someone is making progress on the delayed + * refs, so that other procs know they are just adding to + * contention intead of helping + */ + atomic_t procs_running_refs; + atomic_t ref_seq; + wait_queue_head_t wait; + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f13402104c9..87b0e856b6d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2438,6 +2438,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, return ret; } +static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, + int count) +{ + int val = atomic_read(&delayed_refs->ref_seq); + + if (val < seq || val >= seq + count) + return 1; + return 0; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2472,6 +2482,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + + if (!run_all && !run_most) { + int old; + int seq = atomic_read(&delayed_refs->ref_seq); + +progress: + old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); + if (old) { + DEFINE_WAIT(__wait); + if (delayed_refs->num_entries < 16348) + return 0; + + prepare_to_wait(&delayed_refs->wait, &__wait, + TASK_UNINTERRUPTIBLE); + + old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); + if (old) { + schedule(); + finish_wait(&delayed_refs->wait, &__wait); + + if (!refs_newer(delayed_refs, seq, 256)) + goto progress; + else + return 0; + } else { + finish_wait(&delayed_refs->wait, &__wait); + goto again; + } + } + + } else { + atomic_inc(&delayed_refs->procs_running_refs); + } + again: loops = 0; spin_lock(&delayed_refs->lock); @@ -2480,10 +2528,6 @@ again: delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - if (count == 0) { - count = delayed_refs->num_entries * 2; - run_most = 1; - } while (1) { if (!(run_all || run_most) && delayed_refs->num_heads_ready < 64) @@ -2505,9 +2549,12 @@ again: if (ret < 0) { spin_unlock(&delayed_refs->lock); btrfs_abort_transaction(trans, root, ret); + atomic_dec(&delayed_refs->procs_running_refs); return ret; } + atomic_add(ret, &delayed_refs->ref_seq); + count -= min_t(unsigned long, ret, count); if (count == 0) @@ -2576,6 +2623,11 @@ again: goto again; } out: + atomic_dec(&delayed_refs->procs_running_refs); + smp_mb(); + if (waitqueue_active(&delayed_refs->wait)) + wake_up(&delayed_refs->wait); + spin_unlock(&delayed_refs->lock); assert_qgroups_uptodate(trans); return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a065dec0e33..1e7f176bd21 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -156,6 +156,9 @@ loop: spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); + atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); + atomic_set(&cur_trans->delayed_refs.ref_seq, 0); + init_waitqueue_head(&cur_trans->delayed_refs.wait); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &fs_info->trans_list); @@ -577,7 +580,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - while (count < 2) { + while (count < 1) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && @@ -589,6 +592,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } count++; } + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; -- cgit v1.2.3-70-g09d2 From 78a6184a3ff9041280ee56273c01e5679a831b39 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 21 Nov 2012 02:21:28 +0000 Subject: Btrfs: use slabs for delayed reference allocation The delayed reference allocation is in the fast path of the IO, so use slabs to improve the speed of the allocation. And besides that, it can do check for leaked objects when the module is removed. Signed-off-by: Miao Xie --- fs/btrfs/delayed-ref.c | 74 ++++++++++++++++++++++++++++++++++++++++++-------- fs/btrfs/delayed-ref.h | 37 ++++++++++++++++++++++++- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 14 +++++----- fs/btrfs/super.c | 9 +++++- 5 files changed, 115 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ae941177339..455894f1ca3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -23,6 +23,10 @@ #include "delayed-ref.h" #include "transaction.h" +struct kmem_cache *btrfs_delayed_ref_head_cachep; +struct kmem_cache *btrfs_delayed_tree_ref_cachep; +struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees * we queue up extent allocations and backref maintenance for @@ -511,7 +515,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, ref->extent_op->flags_to_set; existing_ref->extent_op->update_flags = 1; } - kfree(ref->extent_op); + btrfs_free_delayed_extent_op(ref->extent_op); } } /* @@ -592,7 +596,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(head_ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; @@ -653,7 +657,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(full_ref); + kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; @@ -714,7 +718,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(full_ref); + kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; @@ -738,13 +742,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; BUG_ON(extent_op && extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kfree(ref); + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); return -ENOMEM; } @@ -786,13 +790,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; BUG_ON(extent_op && !extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kfree(ref); + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); return -ENOMEM; } @@ -826,7 +830,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) return -ENOMEM; @@ -860,3 +864,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) return btrfs_delayed_node_to_head(ref); return NULL; } + +void btrfs_delayed_ref_exit(void) +{ + if (btrfs_delayed_ref_head_cachep) + kmem_cache_destroy(btrfs_delayed_ref_head_cachep); + if (btrfs_delayed_tree_ref_cachep) + kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); + if (btrfs_delayed_data_ref_cachep) + kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + if (btrfs_delayed_extent_op_cachep) + kmem_cache_destroy(btrfs_delayed_extent_op_cachep); +} + +int btrfs_delayed_ref_init(void) +{ + btrfs_delayed_ref_head_cachep = kmem_cache_create( + "btrfs_delayed_ref_head", + sizeof(struct btrfs_delayed_ref_head), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_ref_head_cachep) + goto fail; + + btrfs_delayed_tree_ref_cachep = kmem_cache_create( + "btrfs_delayed_tree_ref", + sizeof(struct btrfs_delayed_tree_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_tree_ref_cachep) + goto fail; + + btrfs_delayed_data_ref_cachep = kmem_cache_create( + "btrfs_delayed_data_ref", + sizeof(struct btrfs_delayed_data_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_data_ref_cachep) + goto fail; + + btrfs_delayed_extent_op_cachep = kmem_cache_create( + "btrfs_delayed_extent_op", + sizeof(struct btrfs_delayed_extent_op), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_extent_op_cachep) + goto fail; + + return 0; +fail: + btrfs_delayed_ref_exit(); + return -ENOMEM; +} diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c9d703693df..fe50392cdf7 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -141,12 +141,47 @@ struct btrfs_delayed_ref_root { u64 run_delayed_start; }; +extern struct kmem_cache *btrfs_delayed_ref_head_cachep; +extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; +extern struct kmem_cache *btrfs_delayed_data_ref_cachep; +extern struct kmem_cache *btrfs_delayed_extent_op_cachep; + +int btrfs_delayed_ref_init(void); +void btrfs_delayed_ref_exit(void); + +static inline struct btrfs_delayed_extent_op * +btrfs_alloc_delayed_extent_op(void) +{ + return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); +} + +static inline void +btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) +{ + if (op) + kmem_cache_free(btrfs_delayed_extent_op_cachep, op); +} + static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { WARN_ON(atomic_read(&ref->refs) == 0); if (atomic_dec_and_test(&ref->refs)) { WARN_ON(ref->in_tree); - kfree(ref); + switch (ref->type) { + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + break; + case 0: + kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); + break; + default: + BUG(); + } } } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65f03670a95..4438aac4947 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3614,7 +3614,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) delayed_refs->num_heads_ready--; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 61da9d0bb80..9bd87f0d73d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2285,7 +2285,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref = &locked_ref->node; if (extent_op && must_insert_reserved) { - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); extent_op = NULL; } @@ -2294,7 +2294,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_delayed_extent_op(trans, root, ref, extent_op); - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); if (ret) { list_del_init(&locked_ref->cluster); @@ -2338,7 +2338,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, must_insert_reserved); btrfs_put_delayed_ref(ref); - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); count++; if (ret) { @@ -2586,7 +2586,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op; int ret; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) return -ENOMEM; @@ -2598,7 +2598,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, num_bytes, extent_op); if (ret) - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); return ret; } @@ -5330,7 +5330,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->extent_op) { if (!head->must_insert_reserved) goto out; - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); head->extent_op = NULL; } @@ -6400,7 +6400,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + extent_op = btrfs_alloc_delayed_extent_op(); BUG_ON(!extent_op); /* -ENOMEM */ if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9601d..67b373bf3ff 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1684,10 +1684,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_delayed_inode; - err = btrfs_interface_init(); + err = btrfs_delayed_ref_init(); if (err) goto free_auto_defrag; + err = btrfs_interface_init(); + if (err) + goto free_delayed_ref; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; @@ -1699,6 +1703,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_delayed_ref: + btrfs_delayed_ref_exit(); free_auto_defrag: btrfs_auto_defrag_exit(); free_delayed_inode: @@ -1720,6 +1726,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); + btrfs_delayed_ref_exit(); btrfs_auto_defrag_exit(); btrfs_delayed_inode_exit(); ordered_data_exit(); -- cgit v1.2.3-70-g09d2 From 093486c453a55230ccdad4b48863b872fe68c46e Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 19 Dec 2012 08:10:10 +0000 Subject: Btrfs: make delayed ref lock logic more readable Locking and unlocking delayed ref mutex are in the different functions, and the name of lock functions is not uniform, so the readability is not so good, this patch optimizes the lock logic and makes it more readable. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/delayed-ref.c | 8 ++++++++ fs/btrfs/delayed-ref.h | 6 ++++++ fs/btrfs/extent-tree.c | 42 ++++++++++++++++++++++++------------------ 3 files changed, 38 insertions(+), 18 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 455894f1ca3..b7a0641ead7 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -426,6 +426,14 @@ again: return 1; } +void btrfs_release_ref_cluster(struct list_head *cluster) +{ + struct list_head *pos, *q; + + list_for_each_safe(pos, q, cluster) + list_del_init(pos); +} + /* * helper function to update an extent delayed ref in the * rbtree. existing and update must both have the same diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index fe50392cdf7..7939149f8f2 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -211,8 +211,14 @@ struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); +static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) +{ + mutex_unlock(&head->mutex); +} + int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); +void btrfs_release_ref_cluster(struct list_head *cluster); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9bd87f0d73d..b4cb8186035 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2143,7 +2143,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->num_bytes); } } - mutex_unlock(&head->mutex); return ret; } @@ -2258,7 +2257,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * process of being added. Don't run this ref yet. */ list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); + btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); @@ -2297,25 +2296,22 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - - printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); + printk(KERN_DEBUG + "btrfs: run_delayed_extent_op " + "returned %d\n", ret); spin_lock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(locked_ref); return ret; } goto next; } - - list_del_init(&locked_ref->cluster); - locked_ref = NULL; } ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (locked_ref) { + if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the * ref_mod on head @@ -2337,20 +2333,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); - btrfs_put_delayed_ref(ref); btrfs_free_delayed_extent_op(extent_op); - count++; - if (ret) { - if (locked_ref) { - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - } - printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); + btrfs_delayed_ref_unlock(locked_ref); + btrfs_put_delayed_ref(ref); + printk(KERN_DEBUG + "btrfs: run_one_delayed_ref returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; } + /* + * If this node is a head, that means all the refs in this head + * have been dealt with, and we will pick the next head to deal + * with, so we must unlock the head and drop it from the cluster + * list before we release it. + */ + if (btrfs_delayed_ref_is_head(ref)) { + list_del_init(&locked_ref->cluster); + btrfs_delayed_ref_unlock(locked_ref); + locked_ref = NULL; + } + btrfs_put_delayed_ref(ref); + count++; next: cond_resched(); spin_lock(&delayed_refs->lock); @@ -2500,6 +2505,7 @@ again: ret = run_clustered_refs(trans, root, &cluster); if (ret < 0) { + btrfs_release_ref_cluster(&cluster); spin_unlock(&delayed_refs->lock); btrfs_abort_transaction(trans, root, ret); return ret; -- cgit v1.2.3-70-g09d2 From da633a42170165cbf20a2d3886c7480ccc832ec3 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Dec 2012 11:19:09 +0000 Subject: Btrfs: flush all dirty inodes if writeback can not start We may try to flush some dirty pages when there is no enough space to reserve. But it is possible that this operation fails, in order to get enough space to reserve successfully, we will sync all the delalloc file. This operation is safe, we needn't worry about the case that the filesystem goes from r/w to r/o. because the filesystem should guarantee all the dirty pages have been written into the disk after it becomes readonly, so the sync operation will do nothing if the filesystem is already readonly. Though it may waste lots of time, as a corner case, we needn't care. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b4cb8186035..d5e60d25ca5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3695,12 +3695,15 @@ static int can_overcommit(struct btrfs_root *root, return 0; } -static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, - unsigned long nr_pages, - enum wb_reason reason) +static inline int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, + unsigned long nr_pages, + enum wb_reason reason) { - if (!writeback_in_progress(sb->s_bdi) && - down_read_trylock(&sb->s_umount)) { + /* the flusher is dealing with the dirty inodes now. */ + if (writeback_in_progress(sb->s_bdi)) + return 1; + + if (down_read_trylock(&sb->s_umount)) { writeback_inodes_sb_nr(sb, nr_pages, reason); up_read(&sb->s_umount); return 1; @@ -3709,6 +3712,28 @@ static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, return 0; } +void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, + unsigned long nr_pages) +{ + struct super_block *sb = root->fs_info->sb; + int started; + + /* If we can not start writeback, just sync all the delalloc file. */ + started = writeback_inodes_sb_nr_if_idle_safe(sb, nr_pages, + WB_REASON_FS_FREE_SPACE); + if (!started) { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0); + } +} + /* * shrink metadata reservation for delalloc */ @@ -3741,10 +3766,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, - nr_pages, - WB_REASON_FS_FREE_SPACE); - + btrfs_writeback_inodes_sb_nr(root, nr_pages); /* * We need to wait for the async pages to actually start before * we do anything. -- cgit v1.2.3-70-g09d2 From c6b305a89b1903d63652691ad5eb9f05aa0326b8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Dec 2012 09:16:16 -0500 Subject: Btrfs: don't re-enter when allocating a chunk If we start running low on metadata space we will try to allocate a chunk, which could then try to allocate a chunk to add the device entry. The thing is we allocate a chunk before we try really hard to make the allocation, so we should be able to find space for the device entry. Add a flag to the trans handle so we know we're currently allocating a chunk so we can just bail out if we try to allocate another chunk. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 7 +++++++ fs/btrfs/transaction.c | 1 + fs/btrfs/transaction.h | 1 + 3 files changed, 9 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d5e60d25ca5..c642fc2cfc2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3570,6 +3570,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; + space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, @@ -3612,6 +3616,8 @@ again: goto again; } + trans->allocating_chunk = true; + /* * If we have mixed data/metadata chunks we want to make sure we keep * allocating mixed chunks instead of individual chunks. @@ -3638,6 +3644,7 @@ again: check_system_chunk(trans, extent_root, flags); ret = btrfs_alloc_chunk(trans, extent_root, flags); + trans->allocating_chunk = false; if (ret < 0 && ret != -ENOSPC) goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 86bb105b398..24fde97cba8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -385,6 +385,7 @@ again: h->qgroup_reserved = qgroup_reserved; h->delayed_ref_elem.seq = 0; h->type = type; + h->allocating_chunk = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0e8aa1e6c28..69700f7b20a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -68,6 +68,7 @@ struct btrfs_trans_handle { struct btrfs_block_rsv *orig_rsv; short aborted; short adding_csums; + bool allocating_chunk; enum btrfs_trans_type type; /* * this root is only needed to validate that the root passed to -- cgit v1.2.3-70-g09d2 From 17b85495cf87ac9b2d9e0878f62fa9f3ae630a6e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:17 +0000 Subject: Btrfs: remove deprecated comments commit d53ba47484ed6245e640ee4bfe9d21e9bfc15765 (Btrfs: use commit root when loading free space cache) has remove the deadlock check, and the related comments can be removed as well. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c642fc2cfc2..a4a062a0a79 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -527,12 +527,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_FAST; spin_unlock(&cache->lock); - /* - * We can't do the read from on-disk cache during a commit since we need - * to have the normal tree locking. Also if we are currently trying to - * allocate blocks for the tree root we can't do the fast caching since - * we likely hold important locks. - */ if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { ret = load_free_space_cache(fs_info, cache); -- cgit v1.2.3-70-g09d2 From f6373bf3dc6b78cd2c8cfba0215104003ba4a1d0 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:18 +0000 Subject: Btrfs: kill unused arguments of cache_block_group Argument 'trans' and 'root' are not used any more. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a4a062a0a79..a9dc12d5ac1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -468,8 +468,6 @@ out: } static int cache_block_group(struct btrfs_block_group_cache *cache, - struct btrfs_trans_handle *trans, - struct btrfs_root *root, int load_cache_only) { DEFINE_WAIT(wait); @@ -4802,7 +4800,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. */ if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, trans, NULL, 1); + cache_block_group(cache, 1); byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -4917,7 +4915,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, * to one because the slow code to read in the free extents does check * the pinned extents. */ - cache_block_group(cache, trans, root, 1); + cache_block_group(cache, 1); pin_down_extent(root, cache, bytenr, num_bytes, 0); @@ -5707,8 +5705,7 @@ have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { found_uncached_bg = true; - ret = cache_block_group(block_group, trans, - orig_root, 0); + ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; } @@ -6244,7 +6241,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, trans, NULL, 0); + cache_block_group(block_group, 0); caching_ctl = get_caching_control(block_group); if (!caching_ctl) { @@ -8187,7 +8184,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) if (end - start >= range->minlen) { if (!block_group_cache_done(cache)) { - ret = cache_block_group(cache, NULL, root, 0); + ret = cache_block_group(cache, 0); if (!ret) wait_block_group_cache_done(cache); } -- cgit v1.2.3-70-g09d2 From c53d613e5293ef248e42195bb0595502b646a5a6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:19 +0000 Subject: Btrfs: kill unused argument of update_block_group Argument 'trans' is not used any more. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a9dc12d5ac1..a162c7cb8fb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -72,8 +72,7 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -4762,8 +4761,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; @@ -5312,7 +5310,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = update_block_group(trans, root, bytenr, num_bytes, 0); + ret = update_block_group(root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -6134,7 +6132,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -6198,7 +6196,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, -- cgit v1.2.3-70-g09d2 From dcfac4156fa102c1bab0e4e31df37e47278292f6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:20 +0000 Subject: Btrfs: kill unused argument of btrfs_pin_extent_for_log_replay Argument 'trans' is not used any more. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent-tree.c | 3 +-- fs/btrfs/tree-log.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 411c8d97074..22f012d41fd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2939,8 +2939,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a162c7cb8fb..825f23b13b5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4898,8 +4898,7 @@ int btrfs_pin_extent(struct btrfs_root *root, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { struct btrfs_block_group_cache *cache; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7de720d22b7..deb7f66356b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_pin_extent_for_log_replay(wc->trans, - log->fs_info->extent_root, + btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, eb->start, eb->len); if (btrfs_buffer_uptodate(eb, gen, 0)) { -- cgit v1.2.3-70-g09d2 From a1897fddd28daf6b23d05a30dc2a18836f77f8e3 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:23 +0000 Subject: Btrfs: record first logical byte in memory This'd save us a rbtree search which may become expensive in large filesystem. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 19 ++++++++++++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 22f012d41fd..9ee099f3f83 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1250,6 +1250,7 @@ struct btrfs_fs_info { /* block group cache stuff */ spinlock_t block_group_cache_lock; + u64 first_logical_byte; struct rb_root block_group_cache_tree; /* keep track of unallocated space */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1db8a993882..04f98e3ffd9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2130,6 +2130,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; + fs_info->first_logical_byte = (u64)-1; extent_io_tree_init(&fs_info->freed_extents[0], fs_info->btree_inode->i_mapping); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 825f23b13b5..82400b2b251 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -161,6 +161,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, rb_link_node(&block_group->cache_node, parent, p); rb_insert_color(&block_group->cache_node, &info->block_group_cache_tree); + + if (info->first_logical_byte > block_group->key.objectid) + info->first_logical_byte = block_group->key.objectid; + spin_unlock(&info->block_group_cache_lock); return 0; @@ -202,8 +206,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, break; } } - if (ret) + if (ret) { btrfs_get_block_group(ret); + if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) + info->first_logical_byte = ret->key.objectid; + } spin_unlock(&info->block_group_cache_lock); return ret; @@ -4848,6 +4855,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) struct btrfs_block_group_cache *cache; u64 bytenr; + spin_lock(&root->fs_info->block_group_cache_lock); + bytenr = root->fs_info->first_logical_byte; + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (bytenr < (u64)-1) + return bytenr; + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); if (!cache) return 0; @@ -8059,6 +8073,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + + if (root->fs_info->first_logical_byte == block_group->key.objectid) + root->fs_info->first_logical_byte = (u64)-1; spin_unlock(&root->fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); -- cgit v1.2.3-70-g09d2 From e6ec716f0ddbe51741ef261d0804f0c28038dda4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 17 Jan 2013 05:38:51 +0000 Subject: Btrfs: make raid attr array more readable The current code of raid attr arry is hard to understand and it is easy to introduce some problem if we modify the array. So I changed it and made it more readable. Cc: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 10 +++++++++- fs/btrfs/extent-tree.c | 22 +++++++++------------- fs/btrfs/volumes.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 59 insertions(+), 20 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9ee099f3f83..541ce9a9949 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -953,7 +953,15 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE -#define BTRFS_NR_RAID_TYPES 5 + +enum btrfs_raid_types { + BTRFS_RAID_RAID10, + BTRFS_RAID_RAID1, + BTRFS_RAID_DUP, + BTRFS_RAID_RAID0, + BTRFS_RAID_SINGLE, + BTRFS_NR_RAID_TYPES +}; #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ BTRFS_BLOCK_GROUP_SYSTEM | \ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 82400b2b251..174c4d5c692 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5545,20 +5545,16 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) int __get_raid_index(u64 flags) { - int index; - if (flags & BTRFS_BLOCK_GROUP_RAID10) - index = 0; + return BTRFS_RAID_RAID10; else if (flags & BTRFS_BLOCK_GROUP_RAID1) - index = 1; + return BTRFS_RAID_RAID1; else if (flags & BTRFS_BLOCK_GROUP_DUP) - index = 2; + return BTRFS_RAID_DUP; else if (flags & BTRFS_BLOCK_GROUP_RAID0) - index = 3; + return BTRFS_RAID_RAID0; else - index = 4; - - return index; + return BTRFS_RAID_SINGLE; } static int get_block_group_index(struct btrfs_block_group_cache *cache) @@ -7518,16 +7514,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) index = get_block_group_index(block_group); } - if (index == 0) { + if (index == BTRFS_RAID_RAID10) { dev_min = 4; /* Divide by 2 */ min_free >>= 1; - } else if (index == 1) { + } else if (index == BTRFS_RAID_RAID1) { dev_min = 2; - } else if (index == 2) { + } else if (index == BTRFS_RAID_DUP) { /* Multiply by 2 */ min_free <<= 1; - } else if (index == 3) { + } else if (index == BTRFS_RAID_RAID0) { dev_min = fs_devices->rw_devices; do_div(min_free, dev_min); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5349e17d886..1ce581d58d8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3562,13 +3562,48 @@ static int btrfs_cmp_device_info(const void *a, const void *b) } struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { - { 2, 1, 0, 4, 2, 2 /* raid10 */ }, - { 1, 1, 2, 2, 2, 2 /* raid1 */ }, - { 1, 2, 1, 1, 1, 2 /* dup */ }, - { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 1, 1, 1, 1 /* single */ }, + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 1, + }, }; - + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, struct map_lookup **map_ret, -- cgit v1.2.3-70-g09d2 From 963d678b0f7649300e3a67f2513ca9d830c6e303 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:10:51 +0000 Subject: Btrfs: use percpu counter for fs_info->delalloc_bytes fs_info->delalloc_bytes is accessed very frequently, so use percpu counter instead of the u64 variant for it to reduce the lock contention. This patch also fixed the problem that we access the variant without the lock protection.At worst, we would not flush the delalloc inodes, and just return ENOSPC error when we still have some free space in the fs. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 7 ++++--- fs/btrfs/disk-io.c | 18 ++++++++++++++---- fs/btrfs/extent-tree.c | 6 ++++-- fs/btrfs/inode.c | 6 ++++-- 4 files changed, 26 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4c476281b66..50def99f537 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1392,6 +1392,7 @@ struct btrfs_fs_info { */ struct list_head ordered_extents; + spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. It is possible for * this list to be empty even when there is still dirty data=ordered @@ -1452,7 +1453,10 @@ struct btrfs_fs_info { /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; s32 dirty_metadata_batch; + s32 delalloc_batch; + struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -1468,9 +1472,6 @@ struct btrfs_fs_info { struct reloc_control *reloc_ctl; - spinlock_t delalloc_lock; - u64 delalloc_bytes; - /* data_alloc_cluster is only used in ssd mode */ struct btrfs_free_cluster data_alloc_cluster; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 34ace168eeb..2c9498aefe8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2010,10 +2010,16 @@ int open_ctree(struct super_block *sb, fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * (1 + ilog2(nr_cpu_ids)); + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + if (ret) { + err = ret; + goto fail_dirty_metadata_bytes; + } + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_dirty_metadata_bytes; + goto fail_delalloc_bytes; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2269,6 +2275,7 @@ int open_ctree(struct super_block *sb, sectorsize = btrfs_super_sectorsize(disk_super); stripesize = btrfs_super_stripesize(disk_super); fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); /* * mixed block groups end up with duplicate but slightly offset @@ -2731,6 +2738,8 @@ fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); +fail_delalloc_bytes: + percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: percpu_counter_destroy(&fs_info->dirty_metadata_bytes); fail_bdi: @@ -3362,9 +3371,9 @@ int close_ctree(struct btrfs_root *root) btrfs_free_qgroup_config(root->fs_info); - if (fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - (unsigned long long)fs_info->delalloc_bytes); + if (percpu_counter_sum(&fs_info->delalloc_bytes)) { + printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", + percpu_counter_sum(&fs_info->delalloc_bytes)); } free_extent_buffer(fs_info->extent_root->node); @@ -3412,6 +3421,7 @@ int close_ctree(struct btrfs_root *root) btrfs_mapping_tree_free(&fs_info->mapping_tree); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 174c4d5c692..115d1646bf5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3760,7 +3760,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, space_info = block_rsv->space_info; smp_mb(); - delalloc_bytes = root->fs_info->delalloc_bytes; + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); if (delalloc_bytes == 0) { if (trans) return; @@ -3799,7 +3800,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, break; } smp_mb(); - delalloc_bytes = root->fs_info->delalloc_bytes; + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fc8aa8bf80a..24c0b7805fe 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1516,7 +1516,8 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += len; - root->fs_info->delalloc_bytes += len; + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, + root->fs_info->delalloc_batch); if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_add_tail(&BTRFS_I(inode)->delalloc_inodes, &root->fs_info->delalloc_inodes); @@ -1557,7 +1558,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_free_reserved_data_space(inode, len); spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->delalloc_bytes -= len; + __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, + root->fs_info->delalloc_batch); BTRFS_I(inode)->delalloc_bytes -= len; if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && -- cgit v1.2.3-70-g09d2 From de98ced9e743656d108de41841797def0f5cb951 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:13:12 +0000 Subject: Btrfs: use seqlock to protect fs_info->avail_{data, metadata, system}_alloc_bits There is no lock to protect fs_info->avail_{data, metadata, system}_alloc_bits, it may introduce some problem, such as the wrong profile information, so we add a seqlock to protect them. Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 22 ++++++++++++++------ fs/btrfs/volumes.c | 56 +++++++++++++++++++++++++++----------------------- 4 files changed, 49 insertions(+), 32 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 50def99f537..5f5c30aaef3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1483,6 +1483,8 @@ struct btrfs_fs_info { struct rb_root defrag_inodes; atomic_t defrag_running; + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; /* * these three are in extended format (availability of single * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 11f6dbcb119..00b6742fdde 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2040,6 +2040,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->tree_mod_seq_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); + seqlock_init(&fs_info->profiles_lock); init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 115d1646bf5..faff98f720d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3227,12 +3227,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 extra_flags = chunk_to_extended(flags) & BTRFS_EXTENDED_PROFILE_MASK; + write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); } /* @@ -3324,12 +3326,18 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { - if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits; + unsigned seq; + + do { + seq = read_seqbegin(&root->fs_info->profiles_lock); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); return btrfs_reduce_alloc_profile(root, flags); } @@ -7967,12 +7975,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 extra_flags = chunk_to_extended(flags) & BTRFS_EXTENDED_PROFILE_MASK; + write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits &= ~extra_flags; + write_sequnlock(&fs_info->profiles_lock); } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1ce581d58d8..8c9ea4cd66b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1430,14 +1430,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) u64 devid; u64 num_devices; u8 *dev_uuid; + unsigned seq; int ret = 0; bool clear_super = false; mutex_lock(&uuid_mutex); - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; + do { + seq = read_seqbegin(&root->fs_info->profiles_lock); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); num_devices = root->fs_info->fs_devices->num_devices; btrfs_dev_replace_lock(&root->fs_info->dev_replace); @@ -3043,6 +3048,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, int mixed = 0; int ret; u64 num_devices; + unsigned seq; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3126,22 +3132,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl, /* allow to reduce meta or sys integrity only if force set */ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_system_alloc_bits & allowed) && - !(bctl->sys.target & allowed)) || - ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); - } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); - ret = -EINVAL; - goto out; + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + printk(KERN_INFO "btrfs: force reducing metadata " + "integrity\n"); + } else { + printk(KERN_ERR "btrfs: balance will reduce metadata " + "integrity, use force if you want this\n"); + ret = -EINVAL; + goto out; + } } - } + } while (read_seqretry(&fs_info->profiles_lock, seq)); if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { int num_tolerated_disk_barrier_failures; @@ -3980,10 +3990,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, if (ret) return ret; - alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - fs_info->avail_metadata_alloc_bits; - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - + alloc_profile = btrfs_get_alloc_profile(extent_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, &stripe_size, chunk_offset, alloc_profile); if (ret) @@ -3991,10 +3998,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; - alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - fs_info->avail_system_alloc_bits; - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - + alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, &sys_chunk_size, &sys_stripe_size, sys_chunk_offset, alloc_profile); -- cgit v1.2.3-70-g09d2 From 96f1bb57771f71bf1d55d5031a1cf47908494330 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 30 Jan 2013 17:02:51 -0500 Subject: Btrfs: do not overcommit if we don't have enough space for global rsv Because of how little we allocate chunks now we can get really tight on metadata space before we will allocate a new chunk. This resulted in being unable to add device extents when allocating a new metadata chunk as we did not have enough space. This is because we were allowed to overcommit too much metadata without actually making sure we had enough space to make allocations. The idea behind overcommit is that we are allowed to say "sure you can have that reservation" when most of the free space is occupied by reservations, not actual allocations. But in this case where a majority of the total space is in use by actual allocations we can screw ourselves by not being able to make real allocations when it matters. So make sure we have enough real space for our global reserve, and if not then don't allow overcommitting. Thanks, Reported-and-tested-by: Jim Schutt Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index faff98f720d..e035731b360 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3672,13 +3672,30 @@ static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 profile = btrfs_get_alloc_profile(root, 0); + u64 rsv_size = 0; u64 avail; u64 used; used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + space_info->bytes_pinned + space_info->bytes_readonly; + + spin_lock(&global_rsv->lock); + rsv_size = global_rsv->size; + spin_unlock(&global_rsv->lock); + + /* + * We only want to allow over committing if we have lots of actual space + * free, but if we don't have enough space to handle the global reserve + * space then we could end up having a real enospc problem when trying + * to allocate a chunk or some other such important allocation. + */ + rsv_size <<= 1; + if (used + rsv_size >= space_info->total_bytes) + return 0; + + used += space_info->bytes_may_use; spin_lock(&root->fs_info->free_chunk_lock); avail = root->fs_info->free_chunk_space; -- cgit v1.2.3-70-g09d2 From 1971e917c8c99ff190264305803fbafcbbac1422 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 31 Jan 2013 00:55:00 +0000 Subject: btrfs: remove unnecessary DEFINE_WAIT() declarations No point in DEFINE_WAIT(wait) if it's not used! Signed-off-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e035731b360..3158817cd5a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5541,7 +5541,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); caching_ctl = get_caching_control(cache); if (!caching_ctl) @@ -5558,7 +5557,6 @@ static noinline int wait_block_group_cache_done(struct btrfs_block_group_cache *cache) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); caching_ctl = get_caching_control(cache); if (!caching_ctl) -- cgit v1.2.3-70-g09d2 From 70afa3998c9baed4186df38988246de1abdab56d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 6 Feb 2013 13:53:19 -0500 Subject: Btrfs: rework the overcommit logic to be based on the total size People have been complaining about random ENOSPC errors that will clear up after a umount or just a given amount of time. Chris was able to reproduce this with stress.sh and lots of processes and so was I. Basically the overcommit stuff would really let us get out of hand, in my tests I saw up to 30 gigs of outstanding reservations with only 2 gigs total of metadata space. This usually worked out fine but with so much outstanding reservation the flushing stuff short circuits to make sure we don't hang forever flushing when we really need ENOSPC. Plus we allocate chunks in order to alleviate the pressure, but this doesn't actually help us since we only use the non-allocated area in our over commit logic. So instead of basing overcommit on the amount of non-allocated space, instead just do it based on how much total space we have, and then limit it to the non-allocated space in case we are short on space to spill over into. This allows us to have the same performance as well as no longer giving random ENOSPC. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3158817cd5a..81aa7cf3ae8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3677,6 +3677,7 @@ static int can_overcommit(struct btrfs_root *root, u64 rsv_size = 0; u64 avail; u64 used; + u64 to_add; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly; @@ -3710,17 +3711,25 @@ static int can_overcommit(struct btrfs_root *root, BTRFS_BLOCK_GROUP_RAID10)) avail >>= 1; + to_add = space_info->total_bytes; + /* * If we aren't flushing all things, let us overcommit up to * 1/2th of the space. If we can flush, don't let us overcommit * too much, let it overcommit up to 1/8 of the space. */ if (flush == BTRFS_RESERVE_FLUSH_ALL) - avail >>= 3; + to_add >>= 3; else - avail >>= 1; + to_add >>= 1; + + /* + * Limit the overcommit to the amount of free space we could possibly + * allocate for chunks. + */ + to_add = min(avail, to_add); - if (used + bytes < space_info->total_bytes + avail) + if (used + bytes < space_info->total_bytes + to_add) return 1; return 0; } -- cgit v1.2.3-70-g09d2 From 5d80366e9b5e56b3ffc1923b4995e83bbbf605e3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 7 Feb 2013 16:06:02 -0500 Subject: Btrfs: steal from global reserve if we are cleaning up orphans Sometimes xfstest 83 will fail to remount the scratch device because we've gotten ourselves so full that we cannot cleanup the orphan items. In this case check to see if we're doing the orphan cleanup and if we are allow us to steal our reservation from the global block rsv. With this patch I've not been able to reproduce the failed mount problem. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/extent-tree.c | 11 +++++++++++ fs/btrfs/inode.c | 5 ----- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7e2cffd2a5d..f1cc247f317 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1237,6 +1237,11 @@ struct seq_list { u64 seq; }; +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + /* fs_info */ struct reloc_control; struct btrfs_device; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 81aa7cf3ae8..1818dd90c27 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -102,6 +102,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int reserve); +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -4099,6 +4101,15 @@ again: goto again; out: + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + struct btrfs_block_rsv *global_rsv = + &root->fs_info->global_block_rsv; + + if (block_rsv != global_rsv && + !block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } if (flushing) { spin_lock(&space_info->lock); space_info->flush = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 348b7bb920e..cf26778085e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2218,11 +2218,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) } } -enum btrfs_orphan_cleanup_state { - ORPHAN_CLEANUP_STARTED = 1, - ORPHAN_CLEANUP_DONE = 2, -}; - /* * This is called in transaction commit time. If there are no orphan * files in the subvolume, it removes orphan item and frees block_rsv -- cgit v1.2.3-70-g09d2 From 0934856d4697e63c14056375e26e3bd6e8ebd34b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 7 Feb 2013 10:12:07 +0000 Subject: Btrfs: fix deadlock due to unsubmitted The deadlock problem happened when running fsstress(a test program in LTP). Steps to reproduce: # mkfs.btrfs -b 100M # mount # /fsstress -p 3 -n 10000000 -d The reason is: btrfs_direct_IO() |->do_direct_IO() |->get_page() |->get_blocks() | |->btrfs_delalloc_resereve_space() | |->btrfs_add_ordered_extent() ------- Add a new ordered extent |->dio_send_cur_page(page0) -------------- We didn't submit bio here |->get_page() |->get_blocks() |->btrfs_delalloc_resereve_space() |->flush_space() |->btrfs_start_ordered_extent() |->wait_event() ---------- Wait the completion of the ordered extent that is mentioned above But because we didn't submit the bio that is mentioned above, the ordered extent can not complete, we would wait for its completion forever. There are two methods which can fix this deadlock problem: 1. submit the bio before we invoke get_blocks() 2. reserve the space before we do dio Though the 1st is the simplest way, we need modify the code of VFS, and it is likely to break contiguous requests, and introduce performance regression for the other filesystems. So we have to choose the 2nd way. Signed-off-by: Miao Xie Cc: Josef Bacik Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 3 +- fs/btrfs/inode.c | 81 +++++++++++++++++++++++++------------------------- 2 files changed, 43 insertions(+), 41 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1818dd90c27..51e1151a0a0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4748,7 +4748,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + if (num_bytes) + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 16925807a9e..d11f38d8696 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6059,16 +6059,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 len = bh_result->b_size; struct btrfs_trans_handle *trans; int unlock_bits = EXTENT_LOCKED; - int ret; + int ret = 0; if (create) { - ret = btrfs_delalloc_reserve_space(inode, len); - if (ret) - return ret; + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; - } else { + } else len = min_t(u64, len, root->sectorsize); - } lockstart = start; lockend = start + len - 1; @@ -6080,14 +6079,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) return -ENOTBLK; - if (create) { - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_DELALLOC, NULL, - &cached_state, GFP_NOFS); - if (ret) - goto unlock_err; - } - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -6119,7 +6110,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (!create && (em->block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { free_extent_map(em); - ret = 0; goto unlock_err; } @@ -6217,6 +6207,11 @@ unlock: */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); + + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockstart + len - 1, EXTENT_DELALLOC, NULL, + &cached_state, GFP_NOFS); + BUG_ON(ret); } /* @@ -6225,24 +6220,9 @@ unlock: * aren't using if there is any left over space. */ if (lockstart < lockend) { - if (create && len < lockend - lockstart) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, - unlock_bits | EXTENT_DEFRAG, 1, 0, - &cached_state, GFP_NOFS); - /* - * Beside unlock, we also need to cleanup reserved space - * for the left range by attaching EXTENT_DO_ACCOUNTING. - */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, - lockstart + len, lockend, - unlock_bits | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); - } else { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state, GFP_NOFS); - } + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, unlock_bits, 1, 0, + &cached_state, GFP_NOFS); } else { free_extent_state(cached_state); } @@ -6252,9 +6232,6 @@ unlock: return 0; unlock_err: - if (create) - unlock_bits |= EXTENT_DO_ACCOUNTING; - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); return ret; @@ -6692,15 +6669,39 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = 0; + ssize_t ret; if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, offset, nr_segs)) return 0; - return __blockdev_direct_IO(rw, iocb, inode, - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); + if (rw & WRITE) { + count = iov_length(iov, nr_segs); + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + return ret; + } + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, 0); + + if (rw & WRITE) { + if (ret < 0 && ret != -EIOCBQUEUED) + btrfs_delalloc_release_space(inode, count); + else if (ret > 0 && (size_t)ret < count) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + btrfs_delalloc_release_space(inode, + count - (size_t)ret); + } + btrfs_delalloc_release_metadata(inode, 0); + } + + return ret; } #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) -- cgit v1.2.3-70-g09d2 From b069e0c3450ae388d7a9c94ded6d938a465de262 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 8 Feb 2013 21:28:17 +0000 Subject: btrfs: put some enospc messages under enospc_debug The warning in use_block_rsv is not useful for users and may fill the logs unnecessarily. Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 51e1151a0a0..88831fab85d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6408,12 +6408,14 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret && !block_rsv->failfast) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - /*DEFAULT_RATELIMIT_BURST*/ 2); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", - ret); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "btrfs: block rsv returned %d\n", ret); + } ret = reserve_metadata_bytes(root, block_rsv, blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) { @@ -7730,11 +7732,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0) { - WARN_ON(1); - dump_space_info(space_info, 0, 0); + if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { + if (space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { + WARN_ON(1); + dump_space_info(space_info, 0, 0); + } } list_del(&space_info->list); kfree(space_info); -- cgit v1.2.3-70-g09d2 From 24542bf7ea5e4fdfdb5157ff544c093fa4dcb536 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 16 Nov 2012 00:04:43 +0000 Subject: btrfs: limit fallocate extent reservation to 256MB Very large fallocate requests are cpu bound and result in extents with a repeating pattern of ever decreasing size: $ time fallocate -l 1T file real 0m13.039s ( an excerpt of the extents from btrfs-debug-tree: ) prealloc data disk byte 1536292564992 nr 397312 prealloc data disk byte 1536292962304 nr 196608 prealloc data disk byte 1536293158912 nr 98304 prealloc data disk byte 1536293257216 nr 49152 prealloc data disk byte 1536293306368 nr 24576 prealloc data disk byte 1536293330944 nr 12288 prealloc data disk byte 1536293343232 nr 8192 prealloc data disk byte 1536293351424 nr 4096 prealloc data disk byte 1536293355520 nr 4096 prealloc data disk byte 1536293359616 nr 4096 The excessive cpu use comes from __btrfs_prealloc_file_range() trying to allocate the entire remaining size after each extent is allocated. btrfs_reserve_extent() repeatedly cuts this requested size in half until it gets down to the size that the allocators can return. We limit the problem for now by capping each reservation at 256 meg. The small extents come from a masking bug when decreasing the requested reservation size. The high 32bits are cleared and the remaining low bits might happen to reserve a small size. Fix this by using round_down() which properly casts the mask. After these fixes huge fallocate requests are fast and result in nice large extents: $ time fallocate -l 1T file real 0m0.082s prealloc data disk byte 1112425889792 nr 268435456 prealloc data disk byte 1112694325248 nr 268435456 prealloc data disk byte 1112962760704 nr 268435456 Reported-by: Eric Sandeen Signed-off-by: Zach Brown Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/inode.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b3ecca447dd..d2b3a5e9a62 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6143,7 +6143,7 @@ again: if (ret == -ENOSPC) { if (!final_tried) { num_bytes = num_bytes >> 1; - num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); if (num_bytes == min_alloc_size) final_tried = true; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4e6a11c2cfd..3bc62b181ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7894,8 +7894,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, &ins, 1); + ret = btrfs_reserve_extent(trans, root, + min(num_bytes, 256ULL * 1024 * 1024), + min_size, 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); -- cgit v1.2.3-70-g09d2 From a81cb9a2d939676dee6911993a37fa7e818ab72e Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Thu, 21 Feb 2013 21:15:14 +0000 Subject: clear chunk_alloc flag on retryable failure I've experienced filesystem freezes with permanent spikes in the active process count for quite a while, particularly on filesystems whose available raw space has already been fully allocated to chunks. While looking into this, I found a pretty obvious error in do_chunk_alloc: it sets space_info->chunk_alloc, but if btrfs_alloc_chunk returns an error other than ENOSPC, it returns leaving that flag set, which causes any other threads waiting for space_info->chunk_alloc to become zero to spin indefinitely. I haven't double-checked that this patch fixes the failure I've observed fully (it's not exactly trivial to trigger), but it surely is a bug and the fix is trivial, so... Please put it in :-) What I saw in that function also happens to explain why in some cases I see filesystems allocate a huge number of chunks that remain unused (leading to the scenario above, of not having more chunks to allocate). It happens for data and metadata, but not necessarily both. I'm guessing some thread sets the force_alloc flag on the corresponding space_info, and then several threads trying to get disk space end up attempting to allocate a new chunk concurrently. All of them will see the force_alloc flag and bump their local copy of force up to the level they see first, and they won't clear it even if another thread succeeds in allocating a chunk, thus clearing the force flag. Then each thread that observed the force flag will, on its turn, force the allocation of a new chunk. And any threads that come in while it does that will see the force flag still set and pick it up, and so on. This sounds like a problem to me, but... what should the correct behavior be? Clear force_flag once we copy it to a local force? Reset force to the incoming value on every loop? Set the flag to our incoming force if we have it at first, clear our local flag, and move it from the space_info when we determined that we are the thread that's going to perform the allocation? btrfs: clear chunk_alloc flag on retryable failure From: Alexandre Oliva If btrfs_alloc_chunk fails with e.g. ENOMEM, we exit do_chunk_alloc without clearing chunk_alloc in space_info. As a result, any further calls to do_chunk_alloc on that filesystem will start busy-waiting for chunk_alloc to be cleared, but it never will be. This patch adjusts do_chunk_alloc so that it clears this flag in case of an error. Signed-off-by: Alexandre Oliva Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d2b3a5e9a62..8520354f086 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3715,19 +3715,19 @@ again: ret = btrfs_alloc_chunk(trans, extent_root, flags); trans->allocating_chunk = false; - if (ret < 0 && ret != -ENOSPC) - goto out; spin_lock(&space_info->lock); + if (ret < 0 && ret != -ENOSPC) + goto out; if (ret) space_info->full = 1; else ret = 1; space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); -out: mutex_unlock(&fs_info->chunk_mutex); return ret; } -- cgit v1.2.3-70-g09d2 From fda2832febb1928da0625b2c5d15559b29d7e740 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Feb 2013 08:10:22 +0000 Subject: btrfs: cleanup for open-coded alignment Though most of the btrfs codes are using ALIGN macro for page alignment, there are still some codes using open-coded alignment like the following: ------ u64 mask = ((u64)root->stripesize - 1); u64 ret = (val + mask) & ~mask; ------ Or even hidden one: ------ num_bytes = (end - start + blocksize) & ~(blocksize - 1); ------ Sometimes these open-coded alignment is not so easy to understand for newbie like me. This commit changes the open-coded alignment to the ALIGN macro for a better readability. Also there is a previous patch from David Sterba with similar changes, but the patch is for 3.2 kernel and seems not merged. http://www.spinics.net/lists/linux-btrfs/msg12747.html Cc: David Sterba Signed-off-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 9 +++------ fs/btrfs/extent_io.c | 8 ++++---- fs/btrfs/file.c | 3 +-- fs/btrfs/inode.c | 37 +++++++++++++++---------------------- fs/btrfs/tree-log.c | 3 +-- fs/btrfs/volumes.c | 3 +-- 6 files changed, 25 insertions(+), 38 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8520354f086..5681a91ed40 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3431,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) int ret = 0, committed = 0, alloc_chunk = 1; /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes = ALIGN(bytes, root->sectorsize); if (root == root->fs_info->tree_root || BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { @@ -3526,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) struct btrfs_space_info *data_sinfo; /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes = ALIGN(bytes, root->sectorsize); data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); @@ -5607,10 +5607,7 @@ static u64 stripe_align(struct btrfs_root *root, struct btrfs_block_group_cache *cache, u64 val, u64 num_bytes) { - u64 mask; - u64 ret; - mask = ((u64)root->stripesize - 1); - ret = (val + mask) & ~mask; + u64 ret = ALIGN(val, root->stripesize); return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 66f999b97cb..597ab8966c8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2686,7 +2686,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) { disk_io_size = em->block_len; sector = em->block_start >> 9; @@ -2977,7 +2977,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; block_start = em->block_start; @@ -3664,7 +3664,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, u64 end = start + PAGE_CACHE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; - start += (offset + blocksize - 1) & ~(blocksize - 1); + start += ALIGN(offset, blocksize); if (start > end) return 0; @@ -3783,7 +3783,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, len = last - offset; if (len == 0) break; - len = (len + sectorsize - 1) & ~(sectorsize - 1); + len = ALIGN(len, sectorsize); em = get_extent(inode, NULL, 0, offset, len, 0); if (IS_ERR_OR_NULL(em)) return em; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6e6dd8cdad9..83c790d8403 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -510,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, loff_t isize = i_size_read(inode); start_pos = pos & ~((u64)root->sectorsize - 1); - num_bytes = (write_bytes + pos - start_pos + - root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index be09654e11b..9ef7a5b1b77 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -233,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, u64 isize = i_size_read(inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; - u64 aligned_end = (end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; @@ -391,7 +390,7 @@ again: * a compressed extent to 128k. */ total_compressed = min(total_compressed, max_uncompressed); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); total_in = 0; ret = 0; @@ -490,15 +489,13 @@ cont: * up to a block size boundary so the allocator does sane * things */ - total_compressed = (total_compressed + blocksize - 1) & - ~(blocksize - 1); + total_compressed = ALIGN(total_compressed, blocksize); /* * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk */ - total_in = (total_in + PAGE_CACHE_SIZE - 1) & - ~(PAGE_CACHE_SIZE - 1); + total_in = ALIGN(total_in, PAGE_CACHE_SIZE); if (total_compressed >= total_in) { will_compress = 0; } else { @@ -856,7 +853,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, BUG_ON(btrfs_is_free_space_inode(inode)); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; @@ -4015,7 +4012,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 mask = root->sectorsize - 1; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4039,7 +4035,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * extent just the way it is. */ if (root->ref_cows || root == root->fs_info->tree_root) - btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); + btrfs_drop_extent_cache(inode, ALIGN(new_size, + root->sectorsize), (u64)-1, 0); /* * This function is also used to drop the items in the log tree before @@ -4118,10 +4115,9 @@ search_again: if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = new_size - - found_key.offset + root->sectorsize - 1; - extent_num_bytes = extent_num_bytes & - ~((u64)root->sectorsize - 1); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + root->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - @@ -4357,9 +4353,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 mask = root->sectorsize - 1; - u64 hole_start = (oldsize + mask) & ~mask; - u64 block_end = (size + mask) & ~mask; + u64 hole_start = ALIGN(oldsize, root->sectorsize); + u64 block_end = ALIGN(size, root->sectorsize); u64 last_byte; u64 cur_offset; u64 hole_size; @@ -4392,7 +4387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) break; } last_byte = min(extent_map_end(em), block_end); - last_byte = (last_byte + mask) & ~mask; + last_byte = ALIGN(last_byte , root->sectorsize); if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; hole_size = last_byte - cur_offset; @@ -6111,8 +6106,7 @@ again: } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, item); - extent_end = (extent_start + size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + extent_end = ALIGN(extent_start + size, root->sectorsize); } if (start >= extent_end) { @@ -6184,8 +6178,7 @@ again: copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); em->start = extent_start + extent_offset; - em->len = (copy_size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; if (compress_type) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1a79087c457..e8b7a68e1b3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -484,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_key *key) { int found_type; - u64 mask = root->sectorsize - 1; u64 extent_end; u64 start = key->offset; u64 saved_nbytes; @@ -501,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = start + btrfs_file_extent_num_bytes(eb, item); else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size = btrfs_file_extent_inline_len(eb, item); - extent_end = (start + size + mask) & ~mask; + extent_end = ALIGN(start + size, root->sectorsize); } else { ret = 0; goto out; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 538c5cfa005..db72e0cc6f8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4556,8 +4556,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = 1; stripe_index = 0; stripe_nr_orig = stripe_nr; - stripe_nr_end = (offset + *length + map->stripe_len - 1) & - (~(map->stripe_len - 1)); + stripe_nr_end = ALIGN(offset + *length, map->stripe_len); do_div(stripe_nr_end, map->stripe_len); stripe_end_offset = stripe_nr_end * map->stripe_len - (offset + *length); -- cgit v1.2.3-70-g09d2 From d5c1207017cd8387b4d3224dd7ab6cf5cd7f1c9a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Feb 2013 10:04:33 +0000 Subject: Btrfs: fix wrong reserved space in qgroup during snap/subv creation There are two problems in the space reservation of the snapshot/ subvolume creation. - don't reserve the space for the root item insertion - the space which is reserved in the qgroup is different with the free space reservation. we need reserve free space for 7 items, but in qgroup reservation, we need reserve space only for 3 items. So we implement new metadata reservation functions for the snapshot/subvolume creation. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 9 +++++-- fs/btrfs/extent-tree.c | 65 ++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/ioctl.c | 62 ++++++++++++++++++++++++++++++++--------------- fs/btrfs/transaction.c | 4 +--- fs/btrfs/transaction.h | 1 + 5 files changed, 105 insertions(+), 36 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ae8dcc40680..0d82922179d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3106,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, + u64 *qgroup_reserved); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5681a91ed40..7cb9d734f6e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4560,19 +4560,60 @@ void btrfs_orphan_release_metadata(struct inode *inode) btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reseravtion mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int items, + u64 *qgroup_reserved) { - struct btrfs_root *root = pending->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); - struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; - /* - * two for root back/forward refs, two for directory entries, - * one for root of the snapshot and one for parent inode. - */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); - dst_rsv->space_info = src_rsv->space_info; - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + u64 num_bytes; + int ret; + + if (root->fs_info->quota_enabled) { + /* One for parent inode, two for dir entries */ + num_bytes = 3 * root->leafsize; + ret = btrfs_qgroup_reserve(root, num_bytes); + if (ret) + return ret; + } else { + num_bytes = 0; + } + + *qgroup_reserved = num_bytes; + + num_bytes = btrfs_calc_trans_metadata_size(root, items); + rsv->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + if (*qgroup_reserved) + btrfs_qgroup_free(root, *qgroup_reserved); + } + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved) +{ + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); } /** diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8dcd4ff0c3a..56d92549389 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -363,7 +363,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return 0; } -static noinline int create_subvol(struct btrfs_root *root, +static noinline int create_subvol(struct inode *dir, struct dentry *dentry, char *name, int namelen, u64 *async_transid, @@ -374,32 +374,39 @@ static noinline int create_subvol(struct btrfs_root *root, struct btrfs_root_item root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; - struct dentry *parent = dentry->d_parent; - struct inode *dir; + struct btrfs_block_rsv block_rsv; struct timespec cur_time = CURRENT_TIME; int ret; int err; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; + u64 qgroup_reserved; uuid_le new_uuid; ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) return ret; - dir = parent->d_inode; - + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* - * 1 - inode item - * 2 - refs - * 1 - root item - * 2 - dir items + * The same as the snapshot creation, please see the comment + * of create_snapshot(). */ - trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) - return PTR_ERR(trans); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 7, &qgroup_reserved); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit); if (ret) @@ -515,6 +522,8 @@ static noinline int create_subvol(struct btrfs_root *root, BUG_ON(ret); fail: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -526,7 +535,8 @@ fail: if (!ret) d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); - +out: + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); return ret; } @@ -549,21 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * 1 - parent dir inode + * 2 - dir entries + * 1 - root item + * 2 - root ref/backref + * 1 - root of snapshot + */ + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, 7, + &pending_snapshot->qgroup_reserved); + if (ret) + goto out; + pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; pending_snapshot->dir = dir; pending_snapshot->inherit = inherit; - trans = btrfs_start_transaction(root->fs_info->extent_root, 6); + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } - ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); - BUG_ON(ret); - spin_lock(&root->fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); @@ -600,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, d_instantiate(dentry, inode); ret = 0; fail: + btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, + pending_snapshot->qgroup_reserved); +out: kfree(pending_snapshot); return ret; } @@ -733,8 +757,8 @@ static noinline int btrfs_mksubvol(struct path *parent, error = create_snapshot(snap_src, dir, dentry, name, namelen, async_transid, readonly, inherit); } else { - error = create_subvol(BTRFS_I(dir)->root, dentry, - name, namelen, async_transid, inherit); + error = create_subvol(dir, dentry, name, namelen, + async_transid, inherit); } if (!error) fsnotify_mkdir(dir, dentry); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 71de435a291..f11c2e0a374 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1082,7 +1082,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = pending->error = -ENOMEM; - goto path_alloc_fail; + return ret; } new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); @@ -1279,8 +1279,6 @@ no_free_objectid: kfree(new_root_item); root_item_alloc_fail: btrfs_free_path(path); -path_alloc_fail: - btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5f67fba07ab..3c8e0d25c8e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -90,6 +90,7 @@ struct btrfs_pending_snapshot { struct btrfs_qgroup_inherit *inherit; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; + u64 qgroup_reserved; /* extra metadata reseration for relocation */ int error; bool readonly; -- cgit v1.2.3-70-g09d2 From a9870c0e031527fbfa382019f30d2e9b98124a0d Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Fri, 1 Mar 2013 11:33:01 +0000 Subject: Btrfs: don't call btrfs_qgroup_free if just btrfs_qgroup_reserve fails commit eb6b88d92c6df083dd09a8c471011e3788dfd7c6 leads into another bug. If it is just because qgroup_reserve fails, the function btrfs_qgroup_free should not be called, otherwise, it will cause the wrong quota accounting. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7cb9d734f6e..075854e7a04 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4772,9 +4772,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) * ret != 0 here means the qgroup reservation failed, we go straight to * the shared error handling then. */ - if (ret == 0) + if (ret == 0) { ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + if (ret && root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + nr_extents * root->leafsize); + } + } if (ret) { u64 to_free = 0; @@ -4805,10 +4810,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) btrfs_ino(inode), to_free, 0); } - if (root->fs_info->quota_enabled) { - btrfs_qgroup_free(root, num_bytes + - nr_extents * root->leafsize); - } if (delalloc_lock) mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; -- cgit v1.2.3-70-g09d2 From 88e081bf82ffcb9be3ad33d04c829051d66da564 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Fri, 1 Mar 2013 11:36:01 +0000 Subject: Btrfs: cleanup to make the function btrfs_delalloc_reserve_metadata more logic The original code is a little confusing and not clear, The right way to deal with the kernel code like this: [...] if (ret) goto out; [...] So i move the common clean_up code to the place labeled with out_fail, this will be easier to maintain. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 82 +++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 44 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 075854e7a04..aaee2b7fee7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4721,6 +4721,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; + u64 to_free = 0; + unsigned dropped; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc @@ -4764,55 +4766,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - if (root->fs_info->quota_enabled) + if (root->fs_info->quota_enabled) { ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); + if (ret) + goto out_fail; + } - /* - * ret != 0 here means the qgroup reservation failed, we go straight to - * the shared error handling then. - */ - if (ret == 0) { - ret = reserve_metadata_bytes(root, block_rsv, - to_reserve, flush); - if (ret && root->fs_info->quota_enabled) { + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + if (unlikely(ret)) { + if (root->fs_info->quota_enabled) btrfs_qgroup_free(root, num_bytes + nr_extents * root->leafsize); - } - } - - if (ret) { - u64 to_free = 0; - unsigned dropped; - - spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); - /* - * If the inodes csum_bytes is the same as the original - * csum_bytes then we know we haven't raced with any free()ers - * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. - */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) - calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (dropped) - to_free += btrfs_calc_trans_metadata_size(root, dropped); - - if (to_free) { - btrfs_block_rsv_release(root, block_rsv, to_free); - trace_btrfs_space_reservation(root->fs_info, - "delalloc", - btrfs_ino(inode), - to_free, 0); - } - if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; + goto out_fail; } spin_lock(&BTRFS_I(inode)->lock); @@ -4833,6 +4799,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; + +out_fail: + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); + /* + * If the inodes csum_bytes is the same as the original + * csum_bytes then we know we haven't raced with any free()ers + * so we can just reduce our inodes csum bytes and carry on. + * Otherwise we have to do the normal free thing to account for + * the case that the free side didn't free up its reserve + * because of this outstanding reservation. + */ + if (BTRFS_I(inode)->csum_bytes == csum_bytes) + calc_csum_metadata_size(inode, num_bytes, 0); + else + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped) + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + if (to_free) { + btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); + } + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + return ret; } /** -- cgit v1.2.3-70-g09d2