Diffstat (limited to 'fs/btrfs')
46 files changed, 3713 insertions, 1318 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index f341a98031d..6d1d0b93b1a 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -16,4 +16,4 @@ btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ - tests/extent-io-tests.o tests/inode-tests.o + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index ff9b3995d45..9a0124a9585 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -79,13 +79,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, const char *name; char *value = NULL; - if (acl) { - ret = posix_acl_valid(acl); - if (ret < 0) - return ret; - ret = 0; - } - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 10db21fa092..e25564bfcb4 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -900,7 +900,11 @@ again: goto out; BUG_ON(ret == 0); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (trans && likely(trans->type != __TRANS_DUMMY)) { +#else if (trans) { +#endif /* * look if there are updates for this ref queued and lock the * head @@ -984,11 +988,12 @@ again: goto out; } if (ref->count && ref->parent) { - if (extent_item_pos && !ref->inode_list) { + if (extent_item_pos && !ref->inode_list && + ref->level == 0) { u32 bsz; struct extent_buffer *eb; bsz = btrfs_level_size(fs_info->extent_root, - info_level); + ref->level); eb = read_tree_block(fs_info->extent_root, ref->parent, bsz, 0); if (!eb || !extent_buffer_uptodate(eb)) { @@ -1404,9 +1409,10 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, * returns <0 on error */ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - struct btrfs_extent_inline_ref **out_eiref, - int *out_type) + struct btrfs_key *key, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) { unsigned long end; u64 flags; @@ -1416,19 +1422,26 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, /* first call */ flags = btrfs_extent_flags(eb, ei); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - info = (struct btrfs_tree_block_info *)(ei + 1); - *out_eiref = - (struct btrfs_extent_inline_ref *)(info + 1); + if (key->type == BTRFS_METADATA_ITEM_KEY) { + /* a skinny metadata extent */ + *out_eiref = + (struct btrfs_extent_inline_ref *)(ei + 1); + } else { + WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY); + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } } else { *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); } *ptr = (unsigned long)*out_eiref; - if ((void *)*ptr >= (void *)ei + item_size) + if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size) return -ENOENT; } end = (unsigned long)ei + item_size; - *out_eiref = (struct btrfs_extent_inline_ref *)*ptr; + *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); *ptr += btrfs_extent_inline_ref_size(*out_type); @@ -1447,8 +1460,8 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, * <0 on error. 
*/ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - u64 *out_root, u8 *out_level) + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level) { int ret; int type; @@ -1459,8 +1472,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 1; while (1) { - ret = __get_extent_inline_ref(ptr, eb, ei, item_size, - &eiref, &type); + ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size, + &eiref, &type); if (ret < 0) return ret; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index a910b27a8ad..86fc20fec28 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -40,8 +40,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, u64 *flags); int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - u64 *out_root, u8 *out_level); + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level); int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, @@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots); + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c9a24444ec9..4794923c410 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -279,9 +279,11 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end); + #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 0e8388e72d8..ce92ae30250 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1093,6 +1093,7 @@ leaf_item_out_of_bounce_error: next_stack = btrfsic_stack_frame_alloc(); if (NULL == next_stack) { + sf->error = -1; btrfsic_release_block_ctx( &sf-> next_block_ctx); @@ -1190,8 +1191,10 @@ continue_with_current_node_stack_frame: sf->next_block_ctx.datav[0]; next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) + if (NULL == next_stack) { + sf->error = -1; goto one_stack_frame_backwards; + } next_stack->i = -1; next_stack->block = sf->next_block; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d43c544d3b6..92371c41422 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -887,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -1; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, len, pages, @@ -923,7 +923,7 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -ENOMEM; + return PTR_ERR(workspace); ret = 
btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, disk_start, @@ -945,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -ENOMEM; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, dest_page, start_byte, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1bcfcdb23cf..aeab453b8e2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -224,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) static void add_root_to_dirty_list(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - if (root->track_dirty && list_empty(&root->dirty_list)) { + if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && + list_empty(&root->dirty_list)) { list_add(&root->dirty_list, &root->fs_info->dirty_cowonly_roots); } @@ -246,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int level; struct btrfs_disk_key disk_key; - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); if (level == 0) @@ -354,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) } /* - * Increment the upper half of tree_mod_seq, set lower half zero. - * - * Must be called with fs_info->tree_mod_seq_lock held. - */ -static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info) -{ - u64 seq = atomic64_read(&fs_info->tree_mod_seq); - seq &= 0xffffffff00000000ull; - seq += 1ull << 32; - atomic64_set(&fs_info->tree_mod_seq, seq); - return seq; -} - -/* - * Increment the lower half of tree_mod_seq. - * - * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers - * are generated should not technically require a spin lock here. (Rationale: - * incrementing the minor while incrementing the major seq number is between its - * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it - * just returns a unique sequence number as usual.) We have decided to leave - * that requirement in here and rethink it once we notice it really imposes a - * problem on some workload. + * Pull a new tree mod seq number for our operation. */ -static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info) +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic64_inc_return(&fs_info->tree_mod_seq); } /* - * return the last minor in the previous major tree_mod_seq number - */ -u64 btrfs_tree_mod_seq_prev(u64 seq) -{ - return (seq & 0xffffffff00000000ull) - 1ull; -} - -/* * This adds a new blocker to the tree mod log's blocker list if the @elem * passed does not already have a sequence number set. 
So when a caller expects * to record tree modifications, it should ensure to set elem->seq to zero @@ -402,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq) u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { - u64 seq; - tree_mod_log_write_lock(fs_info); spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq_major(fs_info); + elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); tree_mod_log_write_unlock(fs_info); - return seq; + return elem->seq; } void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -487,9 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) BUG_ON(!tm); - spin_lock(&fs_info->tree_mod_seq_lock); - tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); - spin_unlock(&fs_info->tree_mod_seq_lock); + tm->seq = btrfs_inc_tree_mod_seq(fs_info); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; @@ -997,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. */ - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && buf != root->node && buf != root->commit_root && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) || btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) return 1; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) return 1; #endif @@ -1146,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(buf); - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); @@ -1193,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) return ret; @@ -1538,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif /* ensure we can see the force_cow */ smp_rmb(); @@ -1556,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && - !root->force_cow) + !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; return 1; } @@ -5125,7 +5097,17 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) return ret; btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); - if (ret < 0) + /* + * We might have had an item with the previous key in the tree right + * before we released our path. 
And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) return 0; return 1; } @@ -5736,6 +5718,24 @@ again: ret = 0; goto done; } + /* + * So the above check misses one case: + * - after releasing the path above, someone has removed the item that + * used to be at the very end of the block, and balance between leafs + * gets another one with bigger key.offset to replace it. + * + * This one should be returned as well, or we can get leaf corruption + * later(esp. in __btrfs_drop_extents()). + * + * And a bit more explanation about this check, + * with ret > 0, the key isn't found, the path points to the slot + * where it should be inserted, so the path->slots[0] item must be the + * bigger one. + */ + if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { + ret = 0; + goto done; + } while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ba6b88528dc..b7e2c1c1ef3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include <asm/kmap_types.h> #include <linux/pagemap.h> #include <linux/btrfs.h> +#include <linux/workqueue.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -756,6 +757,12 @@ struct btrfs_dir_item { #define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) +/* + * Internal in-memory flag that a subvolume has been marked for deletion but + * still visible as a directory + */ +#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) + struct btrfs_root_item { struct btrfs_inode_item inode; __le64 generation; @@ -840,7 +847,10 @@ struct btrfs_disk_balance_args { /* BTRFS_BALANCE_ARGS_* */ __le64 flags; - __le64 unused[8]; + /* BTRFS_BALANCE_ARGS_LIMIT value */ + __le64 limit; + + __le64 unused[7]; } __attribute__ ((__packed__)); /* @@ -1113,6 +1123,12 @@ struct btrfs_qgroup_limit_item { __le64 rsv_excl; } __attribute__ ((__packed__)); +/* For raid type sysfs entries */ +struct raid_kobject { + int raid_type; + struct kobject kobj; +}; + struct btrfs_space_info { spinlock_t lock; @@ -1163,7 +1179,7 @@ struct btrfs_space_info { wait_queue_head_t wait; struct kobject kobj; - struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES]; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; #define BTRFS_BLOCK_RSV_GLOBAL 1 @@ -1313,6 +1329,8 @@ struct btrfs_stripe_hash_table { #define BTRFS_STRIPE_HASH_TABLE_BITS 11 +void btrfs_init_async_reclaim_work(struct work_struct *work); + /* fs_info */ struct reloc_control; struct btrfs_device; @@ -1534,6 +1552,9 @@ struct btrfs_fs_info { */ struct btrfs_workqueue *fixup_workers; struct btrfs_workqueue *delayed_workers; + + /* the extent workers do delayed refs on the extent allocation tree */ + struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; int thread_pool_size; @@ -1636,7 +1657,10 @@ struct btrfs_fs_info { /* holds configuration and tracking. 
Protected by qgroup_lock */ struct rb_root qgroup_tree; + struct rb_root qgroup_op_tree; spinlock_t qgroup_lock; + spinlock_t qgroup_op_lock; + atomic_t qgroup_op_seq; /* * used to avoid frequently calling ulist_alloc()/ulist_free() @@ -1688,6 +1712,9 @@ struct btrfs_fs_info { struct semaphore uuid_tree_rescan_sem; unsigned int update_uuid_tree_gen:1; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; }; struct btrfs_subvolume_writers { @@ -1696,6 +1723,26 @@ struct btrfs_subvolume_writers { }; /* + * The state of btrfs root + */ +/* + * btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ +#define BTRFS_ROOT_IN_TRANS_SETUP 0 +#define BTRFS_ROOT_REF_COWS 1 +#define BTRFS_ROOT_TRACK_DIRTY 2 +#define BTRFS_ROOT_IN_RADIX 3 +#define BTRFS_ROOT_DUMMY_ROOT 4 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 +#define BTRFS_ROOT_DEFRAG_RUNNING 6 +#define BTRFS_ROOT_FORCE_COW 7 +#define BTRFS_ROOT_MULTI_LOG_TASKS 8 + +/* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ @@ -1706,6 +1753,7 @@ struct btrfs_root { struct btrfs_root *log_root; struct btrfs_root *reloc_root; + unsigned long state; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; @@ -1740,7 +1788,6 @@ struct btrfs_root { /* Just be updated when the commit succeeds. */ int last_log_commit; pid_t log_start_pid; - bool log_multiple_pids; u64 objectid; u64 last_trans; @@ -1760,23 +1807,13 @@ struct btrfs_root { u64 highest_objectid; - /* btrfs_record_root_in_trans is a multi-step process, - * and it can race with the balancing code. But the - * race is very small, and only the first time the root - * is added to each transaction. 
So in_trans_setup - * is used to tell us when more checks are required - */ - unsigned long in_trans_setup; - int ref_cows; - int track_dirty; - int in_radix; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - int dummy_root; + u64 alloc_bytenr; #endif + u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; - int defrag_running; char *name; /* the dirty list is only used by non-reference counted roots */ @@ -1790,7 +1827,6 @@ struct btrfs_root { spinlock_t orphan_lock; atomic_t orphan_inodes; struct btrfs_block_rsv *orphan_block_rsv; - int orphan_item_inserted; int orphan_cleanup_state; spinlock_t inode_lock; @@ -1808,8 +1844,6 @@ struct btrfs_root { */ dev_t anon_dev; - int force_cow; - spinlock_t root_item_lock; atomic_t refs; @@ -2788,6 +2822,11 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } +static inline bool btrfs_root_dead(struct btrfs_root *root) +{ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); @@ -2897,6 +2936,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, cpu->vend = le64_to_cpu(disk->vend); cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); } static inline void @@ -2914,6 +2954,7 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, disk->vend = cpu_to_le64(cpu->vend); disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); } /* struct btrfs_super_block */ @@ -3236,6 +3277,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, @@ -3275,9 +3318,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref, int no_quota); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref, int no_quota); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, @@ -3285,7 +3328,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow); + u64 owner, u64 offset, int no_quota); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, @@ -3297,7 +3340,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct 
btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow); + u64 root_objectid, u64 owner, u64 offset, int no_quota); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3385,7 +3428,6 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); - int btrfs_start_nocow_write(struct btrfs_root *root); void btrfs_end_nocow_write(struct btrfs_root *root); /* ctree.c */ @@ -3561,7 +3603,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); -u64 btrfs_tree_mod_seq_prev(u64 seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ @@ -3708,6 +3749,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em); + /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; @@ -4069,52 +4116,6 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); -/* qgroup.c */ -struct qgroup_update { - struct list_head list; - struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op; -}; - -int btrfs_quota_enable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_quota_disable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); -void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - char *name); -int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid); -int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - struct btrfs_qgroup_limit *limit); -int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); -void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); -struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_run_qgroups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, - struct btrfs_qgroup_inherit *inherit); -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 
num_bytes); -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); - -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || @@ -4131,6 +4132,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); #endif #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 33e561a8401..da775bfdebc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -149,8 +149,8 @@ again: spin_lock(&root->inode_lock); ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); if (ret == -EEXIST) { - kmem_cache_free(delayed_node_cache, node); spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); radix_tree_preload_end(); goto again; } @@ -267,14 +267,17 @@ static void __btrfs_release_delayed_node( mutex_unlock(&delayed_node->mutex); if (atomic_dec_and_test(&delayed_node->refs)) { + bool free = false; struct btrfs_root *root = delayed_node->root; spin_lock(&root->inode_lock); if (atomic_read(&delayed_node->refs) == 0) { radix_tree_delete(&root->delayed_nodes_tree, delayed_node->inode_id); - kmem_cache_free(delayed_node_cache, delayed_node); + free = true; } spin_unlock(&root->inode_lock); + if (free) + kmem_cache_free(delayed_node_cache, delayed_node); } } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 31299646024..6d16bea94e1 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; + if (ref1->no_quota > ref2->no_quota) + return 1; + if (ref1->no_quota < ref2->no_quota) + return -1; /* merging of sequenced refs is not allowed */ if (compare_seq) { if (ref1->seq < ref2->seq) @@ -635,7 +639,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, - int action, int for_cow) + int action, int no_quota) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; @@ -645,6 +649,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ @@ -655,9 +661,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - - if (need_ref_seq(for_cow, ref_root)) - seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -697,7 +701,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, - u64 offset, int action, int for_cow) + u64 offset, int action, int no_quota) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; @@ -709,6 +713,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; + if 
(is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); + /* first set the basic ref node struct up */ atomic_set(&ref->refs, 1); ref->bytenr = bytenr; @@ -717,9 +724,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - - if (need_ref_seq(for_cow, ref_root)) - seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -762,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow) + int no_quota) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; + BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -793,10 +801,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, - for_cow); + no_quota); spin_unlock(&delayed_refs->lock); - if (need_ref_seq(for_cow, ref_root)) - btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } @@ -810,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow) + int no_quota) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; + BUG_ON(extent_op && !extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) @@ -841,10 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, - action, for_cow); + action, no_quota); spin_unlock(&delayed_refs->lock); - if (need_ref_seq(for_cow, ref_root)) - btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 4ba9b93022f..a764e2340d4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; + unsigned int no_quota:1; /* is this node still in the rbtree? */ unsigned int is_head:1; unsigned int in_tree:1; @@ -196,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow); + int no_quota); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow); + int no_quota); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -231,25 +232,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); /* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. 
for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ - if (for_cow) - return 0; - - if (rootid == BTRFS_FS_TREE_OBJECTID) - return 1; - - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) - return 1; - - return 0; -} - -/* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9f2290509ac..2af6e66fe78 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -313,7 +313,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (btrfs_fs_incompat(fs_info, RAID56)) { btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); - return -EINVAL; + return -EOPNOTSUPP; } switch (args->start.cont_reading_from_srcdev_mode) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 983314932af..8bb4aa19898 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -49,6 +49,7 @@ #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" +#include "qgroup.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -1109,6 +1110,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return alloc_test_extent_buffer(root->fs_info, bytenr, + blocksize); +#endif return alloc_extent_buffer(root->fs_info, bytenr, blocksize); } @@ -1201,10 +1207,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->nodesize = nodesize; root->leafsize = leafsize; root->stripesize = stripesize; - root->ref_cows = 0; - root->track_dirty = 0; - root->in_radix = 0; - root->orphan_item_inserted = 0; + root->state = 0; root->orphan_cleanup_state = 0; root->objectid = objectid; @@ -1265,7 +1268,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, else root->defrag_trans_start = 0; init_completion(&root->kobj_unregister); - root->defrag_running = 0; root->root_key.objectid = objectid; root->anon_dev = 0; @@ -1290,7 +1292,8 @@ struct btrfs_root *btrfs_alloc_dummy_root(void) if (!root) return ERR_PTR(-ENOMEM); __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); - root->dummy_root = 1; + set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); + root->alloc_bytenr = 0; return root; } @@ -1341,8 +1344,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); root->commit_root = btrfs_root_node(root); - root->track_dirty = 1; - + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); root->root_item.flags = 0; root->root_item.byte_limit = 0; @@ -1371,6 +1373,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, fail: if (leaf) { btrfs_tree_unlock(leaf); + free_extent_buffer(root->commit_root); free_extent_buffer(leaf); } kfree(root); @@ -1396,13 +1399,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + /* + * DON'T set REF_COWS for log trees + * * log trees do not get reference counted because they go away * before a real commit is actually done. They do store pointers * to file data extents, and those reference counts still get * updated (along with back refs to the log tree). 
*/ - root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, BTRFS_TREE_LOG_OBJECTID, NULL, @@ -1536,7 +1541,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, return root; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - root->ref_cows = 1; + set_bit(BTRFS_ROOT_REF_COWS, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1606,7 +1611,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, (unsigned long)root->root_key.objectid, root); if (ret == 0) - root->in_radix = 1; + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); @@ -1662,7 +1667,7 @@ again: if (ret < 0) goto fail; if (ret == 0) - root->orphan_item_inserted = 1; + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { @@ -2064,6 +2069,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); + btrfs_destroy_workqueue(fs_info->extent_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2090,7 +2096,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_root_extent_buffers(info->chunk_root); } -static void del_fs_roots(struct btrfs_fs_info *fs_info) +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_root *gang[8]; @@ -2101,7 +2107,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root, root_list); list_del(&gang[0]->root_list); - if (gang[0]->in_radix) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { btrfs_drop_and_free_fs_root(fs_info, gang[0]); } else { free_extent_buffer(gang[0]->node); @@ -2221,6 +2227,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); @@ -2246,6 +2253,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); + atomic_set(&fs_info->qgroup_op_seq, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; @@ -2291,6 +2299,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); + btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2354,6 +2363,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_lock); mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_op_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->quota_enabled = 0; @@ -2577,6 +2587,10 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("readahead", flags, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + fs_info->extent_workers = + btrfs_alloc_workqueue("extent-refs", flags, + min_t(u64, fs_devices->num_devices, + max_active), 8); if (!(fs_info->workers && fs_info->delalloc_workers && 
fs_info->submit_workers && fs_info->flush_workers && @@ -2586,6 +2600,7 @@ int open_ctree(struct super_block *sb, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->fixup_workers && fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { err = -ENOMEM; goto fail_sb_buffer; } @@ -2693,7 +2708,7 @@ retry_root_backup: ret = PTR_ERR(extent_root); goto recovery_tree_root; } - extent_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state); fs_info->extent_root = extent_root; location.objectid = BTRFS_DEV_TREE_OBJECTID; @@ -2702,7 +2717,7 @@ retry_root_backup: ret = PTR_ERR(dev_root); goto recovery_tree_root; } - dev_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state); fs_info->dev_root = dev_root; btrfs_init_devices_late(fs_info); @@ -2712,13 +2727,13 @@ retry_root_backup: ret = PTR_ERR(csum_root); goto recovery_tree_root; } - csum_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state); fs_info->csum_root = csum_root; location.objectid = BTRFS_QUOTA_TREE_OBJECTID; quota_root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(quota_root)) { - quota_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state); fs_info->quota_enabled = 1; fs_info->pending_quota_state = 1; fs_info->quota_root = quota_root; @@ -2733,7 +2748,7 @@ retry_root_backup: create_uuid_tree = true; check_uuid_tree = false; } else { - uuid_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state); fs_info->uuid_root = uuid_root; create_uuid_tree = false; check_uuid_tree = @@ -2966,7 +2981,7 @@ fail_qgroup: fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); btrfs_cleanup_transaction(fs_info->tree_root); - del_fs_roots(fs_info); + btrfs_free_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); @@ -3501,8 +3516,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log(NULL, root); - __btrfs_remove_free_space_cache(root->free_ino_pinned); - __btrfs_remove_free_space_cache(root->free_ino_ctl); + if (root->free_ino_pinned) + __btrfs_remove_free_space_cache(root->free_ino_pinned); + if (root->free_ino_ctl) + __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); } @@ -3533,28 +3550,51 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; - int i; - int ret; + int i = 0; + int err = 0; + unsigned int ret = 0; + int index; while (1) { + index = srcu_read_lock(&fs_info->subvol_srcu); ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); - if (!ret) + if (!ret) { + srcu_read_unlock(&fs_info->subvol_srcu, index); break; - + } root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { - int err; + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_fs_root(gang[i]); + } + srcu_read_unlock(&fs_info->subvol_srcu, index); + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; root_objectid = gang[i]->root_key.objectid; err = btrfs_orphan_cleanup(gang[i]); if (err) - return err; + break; + btrfs_put_fs_root(gang[i]); } root_objectid++; } - return 0; + + /* release the uncleaned roots due to error */ +
for (; i < ret; i++) { + if (gang[i]) + btrfs_put_fs_root(gang[i]); + } + return err; } int btrfs_commit_super(struct btrfs_root *root) @@ -3603,6 +3643,8 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); + cancel_work_sync(&fs_info->async_reclaim_work); + if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) @@ -3627,12 +3669,17 @@ int close_ctree(struct btrfs_root *root) btrfs_sysfs_remove_one(fs_info); - del_fs_roots(fs_info); + btrfs_free_fs_roots(fs_info); btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); + /* + * we must make sure there is not any read request to + * submit after we stopping all workers. + */ + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); free_root_pointers(fs_info, 1); @@ -3709,6 +3756,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, buf->len, root->fs_info->dirty_metadata_batch); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + btrfs_print_leaf(root, buf); + ASSERT(0); + } +#endif } static void __btrfs_btree_balance_dirty(struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 53059df350f..23ce3ceba0a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, int btrfs_init_fs_root(struct btrfs_root *root); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5590af92094..fafb3e53ecd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -26,16 +26,16 @@ #include <linux/ratelimit.h> #include <linux/percpu_counter.h> #include "hash.h" -#include "ctree.h" +#include "tree-log.h" #include "disk-io.h" #include "print-tree.h" -#include "transaction.h" #include "volumes.h" #include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "math.h" #include "sysfs.h" +#include "qgroup.h" #undef SCRAMBLE_DELAYED_REFS @@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op); + struct btrfs_delayed_extent_op *extra_op, + int no_quota); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins); + int level, struct btrfs_key *ins, + int no_quota); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); @@ -1271,7 +1273,7 @@ fail: static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop) + int refs_to_drop, int *last_ref) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -1307,6 +1309,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle 
*trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); + *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1764,7 +1767,8 @@ void update_inline_extent_backref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int *last_ref) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1808,6 +1812,7 @@ void update_inline_extent_backref(struct btrfs_root *root, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { + *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1839,7 +1844,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); update_inline_extent_backref(root, path, iref, - refs_to_add, extent_op); + refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { setup_inline_extent_backref(root, path, iref, parent, root_objectid, owner, offset, @@ -1872,17 +1877,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data) + int refs_to_drop, int is_data, int *last_ref) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) { update_inline_extent_backref(root, path, iref, - -refs_to_drop, NULL); + -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, + last_ref); } else { + *last_ref = 1; ret = btrfs_del_item(trans, root, path); } return ret; @@ -1946,7 +1953,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow) + u64 root_objectid, u64 owner, u64 offset, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1958,12 +1966,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } return ret; } @@ -1973,31 +1981,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, + int no_quota, struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_extent_item *item; + struct btrfs_key key; u64 refs; int ret; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) + no_quota = 1; + path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, - path, 
bytenr, num_bytes, parent, + ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, + bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if (ret != -EAGAIN) + if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) goto out; + /* + * Ok we were able to insert an inline extent and it appears to be a new + * reference, deal with the qgroup accounting. + */ + if (!ret && !no_quota) { + ASSERT(root->fs_info->quota_enabled); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) + type = BTRFS_QGROUP_OPER_ADD_SHARED; + btrfs_release_path(path); + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + goto out; + } + + /* + * Ok we had -EAGAIN which means we didn't have space to insert and + * inline extent ref, so just update the reference count and add a + * normal backref. + */ leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); + if (refs) + type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -2005,9 +2046,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + if (ret) + goto out; + } + path->reada = 1; path->leave_spinning = 1; - /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, root_objectid, @@ -2041,8 +2088,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { if (extent_op) @@ -2056,13 +2102,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + node->no_quota, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + extent_op, node->no_quota); } else { BUG(); } @@ -2199,8 +2245,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; ins.objectid = node->bytenr; if (skinny_metadata) { @@ -2218,15 +2263,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins); + ref->level, &ins, + node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, node->no_quota, + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, extent_op, + node->no_quota); } else { BUG(); } @@ -2574,42 
+2622,6 @@ static u64 find_middle(struct rb_root *root) } #endif -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct qgroup_update *qgroup_update; - int ret = 0; - - if (list_empty(&trans->qgroup_ref_list) != - !trans->delayed_ref_elem.seq) { - /* list without seq or seq without list */ - btrfs_err(fs_info, - "qgroup accounting update error, list is%s empty, seq is %#x.%x", - list_empty(&trans->qgroup_ref_list) ? "" : " not", - (u32)(trans->delayed_ref_elem.seq >> 32), - (u32)trans->delayed_ref_elem.seq); - BUG(); - } - - if (!trans->delayed_ref_elem.seq) - return 0; - - while (!list_empty(&trans->qgroup_ref_list)) { - qgroup_update = list_first_entry(&trans->qgroup_ref_list, - struct qgroup_update, list); - list_del(&qgroup_update->list); - if (!ret) - ret = btrfs_qgroup_account_ref( - trans, fs_info, qgroup_update->node, - qgroup_update->extent_op); - kfree(qgroup_update); - } - - btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); - - return ret; -} - static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) { u64 num_bytes; @@ -2662,15 +2674,94 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, u64 num_entries = atomic_read(&trans->transaction->delayed_refs.num_entries); u64 avg_runtime; + u64 val; smp_mb(); avg_runtime = fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; if (num_entries * avg_runtime >= NSEC_PER_SEC) return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; return btrfs_check_space_for_delayed_refs(trans, root); } +struct async_delayed_refs { + struct btrfs_root *root; + int count; + int error; + int sync; + struct completion wait; + struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ + struct async_delayed_refs *async; + struct btrfs_trans_handle *trans; + int ret; + + async = container_of(work, struct async_delayed_refs, work); + + trans = btrfs_join_transaction(async->root); + if (IS_ERR(trans)) { + async->error = PTR_ERR(trans); + goto done; + } + + /* + * trans->sync means that when we call end_transaciton, we won't + * wait on delayed refs + */ + trans->sync = true; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); + if (ret) + async->error = ret; + + ret = btrfs_end_transaction(trans, async->root); + if (ret && !async->error) + async->error = ret; +done: + if (async->sync) + complete(&async->wait); + else + kfree(async); +} + +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait) +{ + struct async_delayed_refs *async; + int ret; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->root = root->fs_info->tree_root; + async->count = count; + async->error = 0; + if (wait) + async->sync = 1; + else + async->sync = 0; + init_completion(&async->wait); + + btrfs_init_work(&async->work, delayed_ref_async_start, + NULL, NULL); + + btrfs_queue_work(root->fs_info->extent_workers, &async->work); + + if (wait) { + wait_for_completion(&async->wait); + ret = async->error; + kfree(async); + return ret; + } + return 0; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. 
count can be @@ -2698,8 +2789,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - delayed_refs = &trans->transaction->delayed_refs; if (count == 0) { count = atomic_read(&delayed_refs->num_entries) * 2; @@ -2758,6 +2847,9 @@ again: goto again; } out: + ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); + if (ret) + return ret; assert_qgroups_uptodate(trans); return 0; } @@ -2964,7 +3056,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int for_cow) + int full_backref, int inc, int no_quota) { u64 bytenr; u64 num_bytes; @@ -2979,11 +3071,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, int); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!root->ref_cows && level == 0) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) return 0; if (inc) @@ -3014,7 +3110,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, for_cow); + key.offset, no_quota); if (ret) goto fail; } else { @@ -3022,7 +3118,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, - for_cow); + no_quota); if (ret) goto fail; } @@ -3033,15 +3129,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -3401,10 +3497,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, return ret; } - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&found->block_groups[i]); - kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype); - } init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; @@ -4204,6 +4298,104 @@ static int flush_space(struct btrfs_root *root, return ret; } + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, + struct btrfs_space_info *space_info) +{ + u64 used; + u64 expected; + u64 to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, + 16 * 1024 * 1024); + spin_lock(&space_info->lock); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) { + to_reclaim = 0; 
+ goto out; + } + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (can_overcommit(root, space_info, 1024 * 1024, + BTRFS_RESERVE_FLUSH_ALL)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); +out: + spin_unlock(&space_info->lock); + + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info, u64 used) +{ + return (used >= div_factor_fine(space_info->total_bytes, 98) && + !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info) +{ + u64 used; + + spin_lock(&space_info->lock); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (need_do_async_reclaim(space_info, fs_info, used)) { + spin_unlock(&space_info->lock); + return 1; + } + spin_unlock(&space_info->lock); + + return 0; +} + +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) + return; + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + flush_state++; + if (!btrfs_need_do_async_reclaim(space_info, fs_info)) + return; + } while (flush_state <= COMMIT_TRANS); + + if (btrfs_need_do_async_reclaim(space_info, fs_info)) + queue_work(system_unbound_wq, work); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -4311,8 +4503,13 @@ again: if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + if (need_do_async_reclaim(space_info, root->fs_info, used) && + !work_busy(&root->fs_info->async_reclaim_work)) + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); } - spin_unlock(&space_info->lock); if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) @@ -4369,7 +4566,7 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (root->ref_cows) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) block_rsv = trans->block_rsv; if (root == root->fs_info->csum_root && trans->adding_csums) @@ -5621,7 +5818,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int no_quota) { struct btrfs_key key; struct btrfs_path *path; @@ -5637,9 +5835,14 @@ 
static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int num_to_del = 1; u32 item_size; u64 refs; + int last_ref = 0; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); + if (!info->quota_enabled || !is_fstree(root_objectid)) + no_quota = 1; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -5687,7 +5890,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, - is_data); + is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5806,7 +6009,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { btrfs_err(info, "trying to drop %d refs but we only have %Lu " - "for bytenr %Lu\n", refs_to_drop, refs, bytenr); + "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5814,6 +6017,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs -= refs_to_drop; if (refs > 0) { + type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -5829,7 +6033,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, - is_data); + is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5850,6 +6054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -5872,6 +6077,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } + btrfs_release_path(path); + + /* Deal with the quota accounting */ + if (!ret && last_ref && !no_quota) { + int mod_seq = 0; + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && + type == BTRFS_QGROUP_OPER_SUB_SHARED) + mod_seq = 1; + + ret = btrfs_qgroup_record_ref(trans, info, root_objectid, + bytenr, num_bytes, type, + mod_seq); + } out: btrfs_free_path(path); return ret; @@ -6008,11 +6227,15 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow) + u64 owner, u64 offset, int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); /* @@ -6028,13 +6251,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + BTRFS_DROP_DELAYED_REF, NULL, no_quota); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, BTRFS_DROP_DELAYED_REF, - NULL, for_cow); + NULL, no_quota); } return ret; } @@ -6514,8 +6737,14 @@ loop: loop++; if (loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); - trans = btrfs_join_transaction(root); if 
(IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; @@ -6532,7 +6761,8 @@ loop: root, ret); else ret = 0; - btrfs_end_transaction(trans, root); + if (!exist) + btrfs_end_transaction(trans, root); if (ret) goto out; } @@ -6733,6 +6963,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); + /* Always set parent to 0 here since its exclusive anyway. */ + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, ins->offset, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -6747,7 +6984,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) + int level, struct btrfs_key *ins, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -6757,6 +6995,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; u32 size = sizeof(*extent_item) + sizeof(*iref); + u64 num_bytes = ins->offset; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6790,6 +7029,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + num_bytes = root->leafsize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, key); @@ -6811,6 +7051,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, num_bytes, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + } + ret = update_block_group(root, ins->objectid, root->leafsize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -6994,6 +7242,15 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { + buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, + blocksize, level); + if (!IS_ERR(buf)) + root->alloc_bytenr += blocksize; + return buf; + } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); @@ -7735,7 +7992,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } - if (root->in_radix) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { btrfs_drop_and_free_fs_root(tree_root->fs_info, root); } else { free_extent_buffer(root->node); @@ -8327,8 +8584,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) list_del(&space_info->list); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; - kobj = &space_info->block_group_kobjs[i]; - if (kobj->parent) { + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { kobject_del(kobj); kobject_put(kobj); } @@ -8352,17 +8610,26 @@ static void __link_block_group(struct btrfs_space_info *space_info, up_write(&space_info->groups_sem); if (first) { - struct kobject *kobj = 
&space_info->block_group_kobjs[index]; + struct raid_kobject *rkobj; int ret; - kobject_get(&space_info->kobj); /* put in release */ - ret = kobject_add(kobj, &space_info->kobj, "%s", - get_raid_name(index)); + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) + goto out_err; + rkobj->raid_type = index; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); if (ret) { - pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); - kobject_put(&space_info->kobj); + kobject_put(&rkobj->kobj); + goto out_err; } + space_info->block_group_kobjs[index] = &rkobj->kobj; } + + return; +out_err: + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); } static struct btrfs_block_group_cache * @@ -8611,7 +8878,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) @@ -8697,6 +8964,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_key key; struct inode *inode; + struct kobject *kobj = NULL; int ret; int index; int factor; @@ -8796,11 +9064,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, */ list_del_init(&block_group->list); if (list_empty(&block_group->space_info->block_groups[index])) { - kobject_del(&block_group->space_info->block_group_kobjs[index]); - kobject_put(&block_group->space_info->block_group_kobjs[index]); + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; clear_avail_alloc_bits(root->fs_info, block_group->flags); } up_write(&block_group->space_info->groups_sem); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3955e475cee..f25a9092b94 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1693,6 +1693,7 @@ again: * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); + cached_state = NULL; if (!loops) { max_bytes = PAGE_CACHE_SIZE; loops = 1; @@ -2367,6 +2368,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) if (!uptodate) { ClearPageUptodate(page); SetPageError(page); + ret = ret < 0 ? ret : -EIO; + mapping_set_error(page->mapping, ret); } return 0; } @@ -3098,143 +3101,130 @@ static noinline void update_nr_written(struct page *page, } /* - * the writepage semantics are similar to regular writepage. extent - * records are inserted to lock ranges in the tree, and as dirty areas - * are found, they are marked writeback. Then the lock bits are removed - * and the end_io handler clears the writeback ranges + * helper for __extent_writepage, doing all of the delayed allocation setup. + * + * This returns 1 if our fill_delalloc function did all the work required + * to write the page (copy into inline extent). In this case the IO has + * been started and the page is already unlocked. 
+ * + * This returns 0 if all went well (page still locked) + * This returns < 0 if there were errors (page still locked) */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) +static noinline_for_stack int writepage_delalloc(struct inode *inode, + struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd, + u64 delalloc_start, + unsigned long *nr_written) +{ + struct extent_io_tree *tree = epd->tree; + u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1; + u64 nr_delalloc; + u64 delalloc_to_write = 0; + u64 delalloc_end = 0; + int ret; + int page_started = 0; + + if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) + return 0; + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + nr_written); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + /* fill_delalloc should be return < 0 for error + * but just in case, we use > 0 here meaning the + * IO is started, so we don't want to return > 0 + * unless things are going well. + */ + ret = ret < 0 ? ret : -EIO; + goto done; + } + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= *nr_written; + return 1; + } + + ret = 0; + +done: + return ret; +} + +/* + * helper for __extent_writepage. This calls the writepage start hooks, + * and does the loop to map the page into extents and bios. 
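As a side note, the accounting inside writepage_delalloc() above can be read in isolation: the delalloc range is inclusive (delalloc_end is "one less than the total length"), so the number of pages it covers is (end - start + page size) >> page shift, and nr_to_write is only raised to that count, capped at 8192 once the range gets large. A rough userspace sketch under those assumptions (simplified names, not the kernel code):

#include <stdio.h>
#include <stdint.h>

#define PGSHIFT 12
#define PGSIZE  (1ULL << PGSHIFT)

static uint64_t pages_in_inclusive_range(uint64_t start, uint64_t end)
{
        return (end - start + PGSIZE) >> PGSHIFT;
}

static uint64_t bump_nr_to_write(uint64_t nr_to_write, uint64_t delalloc_to_write)
{
        uint64_t thresh = 8192;

        if (nr_to_write >= delalloc_to_write)
                return nr_to_write;
        if (delalloc_to_write < thresh * 2)
                thresh = delalloc_to_write;
        return delalloc_to_write < thresh ? delalloc_to_write : thresh;
}

int main(void)
{
        /* one 4 KiB page: [0, 4095] */
        printf("%llu\n", (unsigned long long)pages_in_inclusive_range(0, PGSIZE - 1));
        /* a 128 MiB delalloc run is capped at 8192 pages of writeback */
        printf("%llu\n", (unsigned long long)bump_nr_to_write(16, 128ULL * 1024 * 1024 / PGSIZE));
        return 0;
}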
+ * + * We return 1 if the IO is started and the page is unlocked, + * 0 if all went well (page still locked) + * < 0 if there were errors (page still locked) + */ +static noinline_for_stack int __extent_writepage_io(struct inode *inode, + struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd, + loff_t i_size, + unsigned long nr_written, + int write_flags, int *nr_ret) { - struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; struct extent_io_tree *tree = epd->tree; u64 start = page_offset(page); - u64 delalloc_start; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; u64 extent_offset; - u64 last_byte = i_size_read(inode); u64 block_start; u64 iosize; sector_t sector; struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; - int ret; - int nr = 0; size_t pg_offset = 0; size_t blocksize; - loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; - u64 nr_delalloc; - u64 delalloc_end; - int page_started; - int compressed; - int write_flags; - unsigned long nr_written = 0; - bool fill_delalloc = true; - - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; - else - write_flags = WRITE; - - trace___extent_writepage(page, inode, wbc); - - WARN_ON(!PageLocked(page)); - - ClearPageError(page); - - pg_offset = i_size & (PAGE_CACHE_SIZE - 1); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); - unlock_page(page); - return 0; - } - - if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, - PAGE_CACHE_SIZE - pg_offset); - kunmap_atomic(userpage); - flush_dcache_page(page); - } - pg_offset = 0; - - set_page_extent_mapped(page); - - if (!tree->ops || !tree->ops->fill_delalloc) - fill_delalloc = false; - - delalloc_start = start; - delalloc_end = 0; - page_started = 0; - if (!epd->extent_locked && fill_delalloc) { - u64 delalloc_to_write = 0; - /* - * make sure the wbc mapping index is at least updated - * to this page. - */ - update_nr_written(page, wbc, 0); - - while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, - page, - &delalloc_start, - &delalloc_end, - 128 * 1024 * 1024); - if (nr_delalloc == 0) { - delalloc_start = delalloc_end + 1; - continue; - } - ret = tree->ops->fill_delalloc(inode, page, - delalloc_start, - delalloc_end, - &page_started, - &nr_written); - /* File system has been set read-only */ - if (ret) { - SetPageError(page); - goto done; - } - /* - * delalloc_end is already one less than the total - * length, so we don't subtract one from - * PAGE_CACHE_SIZE - */ - delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; - delalloc_start = delalloc_end + 1; - } - if (wbc->nr_to_write < delalloc_to_write) { - int thresh = 8192; - - if (delalloc_to_write < thresh * 2) - thresh = delalloc_to_write; - wbc->nr_to_write = min_t(u64, delalloc_to_write, - thresh); - } + int ret = 0; + int nr = 0; + bool compressed; - /* did the fill delalloc function already unlock and start - * the IO? - */ - if (page_started) { - ret = 0; - /* - * we've unlocked the page, so we can't update - * the mapping's writeback index, just update - * nr_to_write. 
- */ - wbc->nr_to_write -= nr_written; - goto done_unlocked; - } - } if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); @@ -3244,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, wbc->pages_skipped++; else redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); - ret = 0; + ret = 1; goto done_unlocked; } } @@ -3258,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, update_nr_written(page, wbc, nr_written + 1); end = page_end; - if (last_byte <= start) { + if (i_size <= start) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -3268,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, blocksize = inode->i_sb->s_blocksize; while (cur <= end) { - if (cur >= last_byte) { + u64 em_end; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); @@ -3278,13 +3270,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, end - cur + 1, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); + ret = PTR_ERR_OR_ZERO(em); break; } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + em_end = extent_map_end(em); + BUG_ON(em_end <= cur); BUG_ON(end < cur); - iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; @@ -3320,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset += iosize; continue; } - /* leave this out until we have a page_mkwrite call */ - if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0, NULL)) { - cur = cur + iosize; - pg_offset += iosize; - continue; - } if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, @@ -3337,7 +3324,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret) { SetPageError(page); } else { - unsigned long max_nr = end_index + 1; + unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { @@ -3359,17 +3346,94 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, nr++; } done: + *nr_ret = nr; + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. 
Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + u64 start = page_offset(page); + u64 page_end = start + PAGE_CACHE_SIZE - 1; + int ret; + int nr = 0; + size_t pg_offset = 0; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC; + else + write_flags = WRITE; + + trace___extent_writepage(page, inode, wbc); + + WARN_ON(!PageLocked(page)); + + ClearPageError(page); + + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage); + flush_dcache_page(page); + } + + pg_offset = 0; + + set_page_extent_mapped(page); + + ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + + ret = __extent_writepage_io(inode, page, wbc, epd, + i_size, nr_written, write_flags, &nr); + if (ret == 1) + goto done_unlocked; + +done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ set_page_writeback(page); end_page_writeback(page); } + if (PageError(page)) { + ret = ret < 0 ? ret : -EIO; + end_extent_writepage(page, ret, start, page_end); + } unlock_page(page); + return ret; done_unlocked: - - /* drop our reference on any cached states */ - free_extent_state(cached_state); return 0; } @@ -3385,9 +3449,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } -static int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, - struct extent_page_data *epd) +static noinline_for_stack int +lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) { unsigned long i, num_pages; int flush = 0; @@ -3458,7 +3523,7 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, static void end_extent_buffer_writeback(struct extent_buffer *eb) { clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } @@ -3492,7 +3557,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) bio_put(bio); } -static int write_one_eb(struct extent_buffer *eb, +static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct btrfs_fs_info *fs_info, struct writeback_control *wbc, struct extent_page_data *epd) @@ -3690,6 +3755,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, struct inode *inode = mapping->host; int ret = 0; int done = 0; + int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; @@ -3776,8 +3842,8 @@ retry: unlock_page(page); ret = 0; } - if (ret) - done = 1; + if (!err && ret < 0) + err = ret; /* * the filesystem may choose to bump up nr_to_write. 
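The extent_write_cache_pages() change further down switches from aborting the scan on any non-zero per-page result to remembering only the first real error and finishing the pass; the same error also gates the wrap-around retry. A minimal sketch of that pattern, with process() as a stand-in for the per-page work (illustrative only):

#include <stdio.h>

static int process(int i)
{
        return (i == 3) ? -5 /* pretend -EIO */ : 0;
}

static int write_all(int nr)
{
        int err = 0;

        for (int i = 0; i < nr; i++) {
                int ret = process(i);

                /* keep the first failure, but keep writing the rest */
                if (!err && ret < 0)
                        err = ret;
        }
        return err;     /* 0 only if every page succeeded */
}

int main(void)
{
        printf("%d\n", write_all(8));   /* prints -5 */
        return 0;
}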
@@ -3789,7 +3855,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!scanned && !done && !err) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -3799,7 +3865,7 @@ retry: goto retry; } btrfs_add_delayed_iput(inode); - return ret; + return err; } static void flush_epd_write_bio(struct extent_page_data *epd) @@ -4510,7 +4576,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } -static void mark_extent_buffer_accessed(struct extent_buffer *eb) +static void mark_extent_buffer_accessed(struct extent_buffer *eb, + struct page *accessed) { unsigned long num_pages, i; @@ -4519,7 +4586,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); - mark_page_accessed(p); + if (p != accessed) + mark_page_accessed(p); } } @@ -4533,7 +4601,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_extent_buffer_accessed(eb); + mark_extent_buffer_accessed(eb, NULL); return eb; } rcu_read_unlock(); @@ -4541,6 +4609,53 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) +{ + struct extent_buffer *eb, *exists = NULL; + int ret; + + eb = find_extent_buffer(fs_info, start); + if (eb) + return eb; + eb = alloc_dummy_extent_buffer(start, len); + if (!eb) + return NULL; + eb->fs_info = fs_info; +again: + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; + } + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + /* + * We will free dummy extent buffer's if they come into + * free_extent_buffer with a ref count of 2, but if we are using this we + * want the buffers to stay in memory until we're done with them, so + * bump the ref count again. 
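alloc_test_extent_buffer() above uses the classic lookup-or-insert dance: try to insert, and on -EEXIST look up the winner; if the winner has already gone away, retry the insert. A toy, single-threaded sketch of that control flow over a fixed-size table (the kernel version uses a radix tree under fs_info->buffer_lock; this is only a model):

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define SLOTS 8
static uint64_t table[SLOTS];           /* 0 means "empty" */

static int lookup(uint64_t key)
{
        for (int i = 0; i < SLOTS; i++)
                if (table[i] == key)
                        return i;
        return -1;
}

static int insert(uint64_t key)
{
        if (lookup(key) >= 0)
                return -EEXIST;
        for (int i = 0; i < SLOTS; i++)
                if (!table[i]) {
                        table[i] = key;
                        return 0;
                }
        return -ENOMEM;
}

static int get_or_insert(uint64_t key)
{
        int ret;
again:
        ret = insert(key);
        if (ret == -EEXIST) {
                int slot = lookup(key);

                if (slot >= 0)
                        return slot;    /* someone else won the race, use theirs */
                goto again;             /* winner vanished meanwhile, try again */
        }
        return ret < 0 ? ret : lookup(key);
}

int main(void)
{
        printf("%d\n", get_or_insert(4096));    /* inserts at slot 0 */
        printf("%d\n", get_or_insert(4096));    /* finds the same entry */
        return 0;
}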
+ */ + atomic_inc(&eb->refs); + return eb; +free_eb: + btrfs_release_extent_buffer(eb); + return exists; +} +#endif + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { @@ -4581,7 +4696,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, spin_unlock(&mapping->private_lock); unlock_page(p); page_cache_release(p); - mark_extent_buffer_accessed(exists); + mark_extent_buffer_accessed(exists, p); goto free_eb; } @@ -4596,7 +4711,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); WARN_ON(PageDirty(p)); - mark_page_accessed(p); eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c488b45237b..8b63f2d4651 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -350,5 +350,7 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes); +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); #endif #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 127555b29f5..f46cfe45d68 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -281,10 +281,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, found: csum += count * csum_size; nblocks -= count; + bio_index += count; while (count--) { disk_bytenr += bvec->bv_len; offset += bvec->bv_len; - bio_index++; bvec++; } } @@ -750,7 +750,7 @@ again: int slot = path->slots[0] + 1; /* we didn't find a csum item, insert one */ nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems - 1) { + if (!nritems || (path->slots[0] >= nritems - 1)) { ret = btrfs_next_leaf(root, path); if (ret == 1) found_next = 1; @@ -885,3 +885,79 @@ out: fail_unlock: goto out; } + +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_key key; + u64 extent_start, extent_end; + u64 bytenr; + u8 type = btrfs_file_extent_type(leaf, fi); + int compress_type = btrfs_file_extent_compression(leaf, fi); + + em->bdev = root->fs_info->fs_devices->latest_bdev; + btrfs_item_key_to_cpu(leaf, &key, slot); + extent_start = key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, slot, fi); + extent_end = ALIGN(extent_start + size, root->sectorsize); + } + + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, fi); + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + return; + } + if (compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + em->block_start = bytenr; + 
em->block_len = em->orig_block_len; + } else { + bytenr += btrfs_file_extent_offset(leaf, fi); + em->block_start = bytenr; + em->block_len = em->len; + if (type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + em->block_start = EXTENT_MAP_INLINE; + em->start = extent_start; + em->len = extent_end - extent_start; + /* + * Initialize orig_start and block_len with the same values + * as in inode.c:btrfs_get_extent(). + */ + em->orig_start = EXTENT_MAP_HOLE; + em->block_len = (u64)-1; + if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } + } else { + btrfs_err(root->fs_info, + "unknown file extent item type %d, inode %llu, offset %llu, root %llu", + type, btrfs_ino(inode), extent_start, + root->root_key.objectid); + } +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 17e7393c50f..1f2b99cb55e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -40,6 +40,7 @@ #include "tree-log.h" #include "locking.h" #include "volumes.h" +#include "qgroup.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -470,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) for (i = 0; i < num_pages; i++) { /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty - * clear it here + * clear it here. There should be no need to mark the pages + * accessed as prepare_pages should have marked them accessed + * in prepare_pages via find_or_create_page() */ ClearPageChecked(pages[i]); unlock_page(pages[i]); - mark_page_accessed(pages[i]); page_cache_release(pages[i]); } } @@ -714,7 +716,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int recow; int ret; int modify_tree = -1; - int update_refs = (root->ref_cows || root == root->fs_info->tree_root); + int update_refs; int found = 0; int leafs_visited = 0; @@ -724,6 +726,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; + update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -780,6 +784,18 @@ next_slot: extent_end = search_start; } + /* + * Don't skip extent items representing 0 byte lengths. They + * used to be created (bug) if while punching holes we hit + * -ENOSPC condition. So if we find one here, just ensure we + * delete it, otherwise we would insert a new file extent item + * with the same key (offset) as that 0 bytes length file + * extent item in the call to setup_items_for_insert() later + * in this function. 
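A condensed userspace model of the REG/PREALLOC branch of btrfs_extent_item_to_extent_map() above, showing how the on-disk file extent item fields translate into the in-memory mapping: a zero disk_bytenr means a hole, a compressed extent is addressed as a whole, and a plain extent skips the unused prefix via the item's offset. Field names are simplified and the inline branch is omitted; this is not the kernel function.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MAP_HOLE ((uint64_t)-2)

struct file_extent {            /* subset of btrfs_file_extent_item */
        uint64_t disk_bytenr;   /* 0 means a hole */
        uint64_t disk_num_bytes;
        uint64_t offset;        /* offset into the (possibly shared) extent */
        uint64_t num_bytes;
        bool compressed;
};

struct mapping {
        uint64_t block_start;
        uint64_t block_len;
};

static struct mapping to_mapping(const struct file_extent *fi)
{
        struct mapping em;

        if (fi->disk_bytenr == 0) {
                em.block_start = MAP_HOLE;
                em.block_len = 0;
        } else if (fi->compressed) {
                /* compressed extents are read as a whole from disk */
                em.block_start = fi->disk_bytenr;
                em.block_len = fi->disk_num_bytes;
        } else {
                /* plain extents: skip the part of the extent we don't use */
                em.block_start = fi->disk_bytenr + fi->offset;
                em.block_len = fi->num_bytes;
        }
        return em;
}

int main(void)
{
        struct file_extent fi = { 1 << 20, 131072, 4096, 8192, false };
        struct mapping em = to_mapping(&fi);

        printf("block_start=%llu block_len=%llu\n",
               (unsigned long long)em.block_start,
               (unsigned long long)em.block_len);       /* 1052672 8192 */
        return 0;
}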
+ */ + if (extent_end == key.offset && extent_end >= search_start) + goto delete_extent_item; + if (extent_end <= search_start) { path->slots[0]++; goto next_slot; @@ -835,7 +851,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 0); + start - extent_offset, 1); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -893,6 +909,7 @@ next_slot: * | ------ extent ------ | */ if (start <= key.offset && end >= extent_end) { +delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; del_nr = 1; @@ -1191,7 +1208,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset, 1); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1994,8 +2011,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!full_sync) { ret = btrfs_wait_ordered_range(inode, start, end - start + 1); - if (ret) + if (ret) { + btrfs_end_transaction(trans, root); goto out; + } } ret = btrfs_commit_transaction(trans, root); } else { @@ -2153,6 +2172,37 @@ out: return 0; } +/* + * Find a hole extent on given inode and change start/len to the end of hole + * extent.(hole/vacuum extent whose em->start <= start && + * em->start + em->len > start) + * When a hole extent is found, return 1 and modify start/len. + */ +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +{ + struct extent_map *em; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + return ret; + } + + /* Hole or vacuum extent(only exists in no-hole mode) */ + if (em->block_start == EXTENT_MAP_HOLE) { + ret = 1; + *len = em->start + em->len > *start + *len ? 
+ 0 : *start + *len - em->start - em->len; + *start = em->start + em->len; + } + free_extent_map(em); + return ret; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2160,25 +2210,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) struct btrfs_path *path; struct btrfs_block_rsv *rsv; struct btrfs_trans_handle *trans; - u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); - u64 lockend = round_down(offset + len, - BTRFS_I(inode)->root->sectorsize) - 1; - u64 cur_offset = lockstart; + u64 lockstart; + u64 lockend; + u64 tail_start; + u64 tail_len; + u64 orig_start = offset; + u64 cur_offset; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; int ret = 0; int err = 0; int rsv_count; - bool same_page = ((offset >> PAGE_CACHE_SHIFT) == - ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); - u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + u64 ino_size; ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) return ret; mutex_lock(&inode->i_mutex); + ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + /* Already in a large hole */ + ret = 0; + goto out_only_mutex; + } + + lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); + lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; + same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + /* * We needn't truncate any page which is beyond the end of the file * because we are sure there is no data there. 
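The start/len adjustment in find_first_non_hole() above, pulled out into a standalone helper for clarity: given a hole mapping that covers *start, advance the range past the hole, collapsing len to zero when the hole swallows the whole range. A simplified sketch, not the kernel function:

#include <stdio.h>
#include <stdint.h>

static int skip_leading_hole(uint64_t hole_start, uint64_t hole_len,
                             uint64_t *start, uint64_t *len)
{
        uint64_t hole_end = hole_start + hole_len;
        uint64_t range_end = *start + *len;

        if (hole_start > *start || hole_end <= *start)
                return 0;               /* range does not begin inside this hole */

        *len = hole_end > range_end ? 0 : range_end - hole_end;
        *start = hole_end;
        return 1;
}

int main(void)
{
        uint64_t start = 8192, len = 65536;

        /* a hole [0, 32768) swallows the first 24 KiB of the range */
        skip_leading_hole(0, 32768, &start, &len);
        printf("start=%llu len=%llu\n",
               (unsigned long long)start, (unsigned long long)len);
        /* prints start=32768 len=40960 */
        return 0;
}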
@@ -2190,8 +2257,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (same_page && len < PAGE_CACHE_SIZE) { if (offset < ino_size) ret = btrfs_truncate_page(inode, offset, len, 0); - mutex_unlock(&inode->i_mutex); - return ret; + goto out_only_mutex; } /* zero back part of the first page */ @@ -2203,12 +2269,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } } - /* zero the front end of the last page */ - if (offset + len < ino_size) { - ret = btrfs_truncate_page(inode, offset + len, 0, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + /* Check the aligned pages after the first unaligned page, + * if offset != orig_start, which means the first unaligned page + * including serveral following pages are already in holes, + * the extra check can be skipped */ + if (offset == orig_start) { + /* after truncate page, check hole again */ + len = offset + len - lockstart; + offset = lockstart; + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + ret = 0; + goto out_only_mutex; + } + lockstart = offset; + } + + /* Check the tail unaligned part is in a hole */ + tail_start = lockend + 1; + tail_len = offset + len - tail_start; + if (tail_len) { + ret = find_first_non_hole(inode, &tail_start, &tail_len); + if (unlikely(ret < 0)) + goto out_only_mutex; + if (!ret) { + /* zero the front end of the last page */ + if (tail_start + tail_len < ino_size) { + ret = btrfs_truncate_page(inode, + tail_start + tail_len, 0, 1); + if (ret) + goto out_only_mutex; + } } } @@ -2234,9 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if ((!ordered || (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && - !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_UPTODATE, 0, - cached_state)) { + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; @@ -2284,6 +2375,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) BUG_ON(ret); trans->block_rsv = rsv; + cur_offset = lockstart; + len = lockend - cur_offset; while (cur_offset < lockend) { ret = __btrfs_drop_extents(trans, root, inode, path, cur_offset, lockend + 1, @@ -2324,6 +2417,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) rsv, min_size); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; + + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } } if (ret) { @@ -2332,7 +2433,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } trans->block_rsv = &root->fs_info->trans_block_rsv; - if (cur_offset < ino_size) { + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). 
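Hole punching in btrfs_punch_hole() above only drops whole sectors: the range is narrowed to [round_up(offset), round_down(offset + len) - 1] and any unaligned head or tail is zeroed in place instead of being removed. A small sketch of that bookkeeping, assuming 4 KiB sectors (names illustrative only):

#include <stdio.h>
#include <stdint.h>

#define SECTORSIZE 4096ULL

static uint64_t round_up_sz(uint64_t x, uint64_t sz)   { return (x + sz - 1) / sz * sz; }
static uint64_t round_down_sz(uint64_t x, uint64_t sz) { return x / sz * sz; }

int main(void)
{
        uint64_t offset = 1000, len = 10000;    /* punch [1000, 11000) */
        uint64_t lockstart = round_up_sz(offset, SECTORSIZE);
        uint64_t lockend = round_down_sz(offset + len, SECTORSIZE) - 1;

        printf("drop sectors [%llu, %llu]\n",
               (unsigned long long)lockstart, (unsigned long long)lockend);
        printf("zero head [%llu, %llu), zero tail [%llu, %llu)\n",
               (unsigned long long)offset, (unsigned long long)lockstart,
               (unsigned long long)(lockend + 1),
               (unsigned long long)(offset + len));
        return 0;
}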
+ */ + if (cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_end); if (ret) { err = ret; @@ -2357,6 +2463,7 @@ out_free: out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); +out_only_mutex: mutex_unlock(&inode->i_mutex); if (ret && !err) err = ret; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 73f3de7a083..372b05ff194 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -831,7 +831,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, if (!matched) { __btrfs_remove_free_space_cache(ctl); - btrfs_err(fs_info, "block group %llu has wrong amount of free space", + btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->key.objectid); ret = -1; } @@ -843,7 +843,7 @@ out: spin_unlock(&block_group->lock); ret = 0; - btrfs_err(fs_info, "failed to load free space cache for block group %llu", + btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now", block_group->key.objectid); } @@ -851,90 +851,44 @@ out: return ret; } -/** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle - * @path - the path to use - * @offset - the offset for the key we'll insert - * - * This function writes out a free space cache struct to disk for quick recovery - * on mount. This will return 0 if it was successfull in writing the cache out, - * and -1 if it was not. - */ -static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, - struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 offset) +static noinline_for_stack +int write_cache_extent_entries(struct io_ctl *io_ctl, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + int *entries, int *bitmaps, + struct list_head *bitmap_list) { - struct btrfs_free_space_header *header; - struct extent_buffer *leaf; - struct rb_node *node; - struct list_head *pos, *n; - struct extent_state *cached_state = NULL; - struct btrfs_free_cluster *cluster = NULL; - struct extent_io_tree *unpin = NULL; - struct io_ctl io_ctl; - struct list_head bitmap_list; - struct btrfs_key key; - u64 start, extent_start, extent_end, len; - int entries = 0; - int bitmaps = 0; int ret; - int err = -1; - - INIT_LIST_HEAD(&bitmap_list); - - if (!i_size_read(inode)) - return -1; - - ret = io_ctl_init(&io_ctl, inode, root); - if (ret) - return -1; + struct btrfs_free_cluster *cluster = NULL; + struct rb_node *node = rb_first(&ctl->free_space_offset); /* Get the cluster for this block_group if it exists */ - if (block_group && !list_empty(&block_group->cluster_list)) + if (block_group && !list_empty(&block_group->cluster_list)) { cluster = list_entry(block_group->cluster_list.next, struct btrfs_free_cluster, block_group_list); + } - /* Lock all pages first so we can lock the extent safely. 
*/ - io_ctl_prepare_pages(&io_ctl, inode, 0); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state); - - node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); cluster = NULL; } - /* Make sure we can fit our crcs into the first page */ - if (io_ctl.check_crcs && - (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) - goto out_nospc; - - io_ctl_set_generation(&io_ctl, trans->transid); - /* Write out the extent entries */ while (node) { struct btrfs_free_space *e; e = rb_entry(node, struct btrfs_free_space, offset_index); - entries++; + *entries += 1; - ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, + ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, e->bitmap); if (ret) - goto out_nospc; + goto fail; if (e->bitmap) { - list_add_tail(&e->list, &bitmap_list); - bitmaps++; + list_add_tail(&e->list, bitmap_list); + *bitmaps += 1; } node = rb_next(node); if (!node && cluster) { @@ -942,13 +896,84 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, cluster = NULL; } } + return 0; +fail: + return -ENOSPC; +} + +static noinline_for_stack int +update_cache_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, u64 offset, + int entries, int bitmaps) +{ + struct btrfs_key key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); + goto fail; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + ASSERT(path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != offset) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); + btrfs_release_path(path); + goto fail; + } + } + + BTRFS_I(inode)->generation = trans->transid; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + return 0; + +fail: + return -1; +} + +static noinline_for_stack int +add_ioctl_entries(struct btrfs_root *root, + struct inode *inode, + struct btrfs_block_group_cache *block_group, + struct io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list, + int *entries) +{ + u64 start, extent_start, extent_end, len; + struct list_head *pos, *n; + struct extent_io_tree *unpin = NULL; + int ret; /* * We want to add any pinned extents to our free space cache * so we don't leak the space - */ - - /* + * * We shouldn't have switched the pinned extents yet so this is the * right one */ @@ -977,8 +1002,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, block_group->key.offset, extent_end + 1); len = extent_end - extent_start; - entries++; - ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); + *entries += 1; + ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); if (ret) goto out_nospc; @@ -986,74 
+1011,129 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* Write out the bitmaps */ - list_for_each_safe(pos, n, &bitmap_list) { + list_for_each_safe(pos, n, bitmap_list) { struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); + ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); if (ret) goto out_nospc; list_del_init(&entry->list); } /* Zero out the rest of the pages just to make sure */ - io_ctl_zero_remaining_pages(&io_ctl); + io_ctl_zero_remaining_pages(io_ctl); - ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, - 0, i_size_read(inode), &cached_state); - io_ctl_drop_pages(&io_ctl); + ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages, + 0, i_size_read(inode), cached_state); + io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); + i_size_read(inode) - 1, cached_state, GFP_NOFS); if (ret) - goto out; + goto fail; ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, GFP_NOFS); - goto out; + goto fail; } + return 0; - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; - key.type = 0; +fail: + return -1; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); - goto out; - } - leaf = path->nodes[0]; - if (ret > 0) { - struct btrfs_key found_key; - ASSERT(path->slots[0]); - path->slots[0]--; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || - found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL, GFP_NOFS); - btrfs_release_path(path); - goto out; - } +out_nospc: + return -ENOSPC; +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + list_for_each_safe(pos, n, bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); } + io_ctl_drop_pages(io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, cached_state, + GFP_NOFS); +} - BTRFS_I(inode)->generation = trans->transid; - header = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_free_space_header); - btrfs_set_free_space_entries(leaf, header, entries); - btrfs_set_free_space_bitmaps(leaf, header, bitmaps); - btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. 
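The rewritten __btrfs_write_out_cache() below drives per-stage helpers whose return values steer the caller: -ENOSPC unwinds through a dedicated cleanup path, while any other failure falls through to the generic error exit. A compact sketch of that control flow with stubbed stages (illustrative only, not the kernel code):

#include <stdio.h>
#include <errno.h>

static int write_entries(void)   { return 0; }
static int add_entries(void)     { return -ENOSPC; }   /* pretend we ran out of room */
static int update_item(void)     { return 0; }
static void cleanup_enospc(void) { printf("unlock pages, drop bitmap list\n"); }

static int write_out_cache(void)
{
        int err = -1;
        int ret;

        ret = write_entries();
        if (ret)
                goto out_nospc;
        ret = add_entries();
        if (ret == -ENOSPC)
                goto out_nospc;
        else if (ret)
                goto out;
        err = update_item();
out:
        return err;
out_nospc:
        cleanup_enospc();
        goto out;
}

int main(void)
{
        printf("%d\n", write_out_cache());      /* cleanup message, then -1 */
        return 0;
}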
+ */ +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 offset) +{ + struct extent_state *cached_state = NULL; + struct io_ctl io_ctl; + struct list_head bitmap_list; + int entries = 0; + int bitmaps = 0; + int ret; + int err = -1; + + INIT_LIST_HEAD(&bitmap_list); + + if (!i_size_read(inode)) + return -1; + + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return -1; + + /* Lock all pages first so we can lock the extent safely. */ + io_ctl_prepare_pages(&io_ctl, inode, 0); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + 0, &cached_state); + + + /* Make sure we can fit our crcs into the first page */ + if (io_ctl.check_crcs && + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) + goto out_nospc; + + io_ctl_set_generation(&io_ctl, trans->transid); + + ret = write_cache_extent_entries(&io_ctl, ctl, + block_group, &entries, &bitmaps, + &bitmap_list); + if (ret) + goto out_nospc; + + ret = add_ioctl_entries(root, inode, block_group, &io_ctl, + &cached_state, &bitmap_list, &entries); + + if (ret == -ENOSPC) + goto out_nospc; + else if (ret) + goto out; + + err = update_cache_item(trans, root, inode, path, offset, + entries, bitmaps); - err = 0; out: io_ctl_free(&io_ctl); if (err) { @@ -1064,14 +1144,8 @@ out: return err; out_nospc: - list_for_each_safe(pos, n, &bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); - list_del_init(&entry->list); - } - io_ctl_drop_pages(&io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); + + cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); goto out; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 86935f5ae29..888fbe19079 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -174,7 +174,7 @@ static void start_caching(struct btrfs_root *root) BTRFS_LAST_FREE_OBJECTID - objectid + 1); } - tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", root->root_key.objectid); if (IS_ERR(tsk)) { btrfs_warn(root->fs_info, "failed to start inode caching task"); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c8386f1961f..8925f66a141 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -125,7 +125,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * the btree. 
The caller should have done a btrfs_drop_extents so that * no overlapping inline items exist in the btree */ -static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, +static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, int extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, @@ -2678,6 +2678,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = NULL; goto out_unlock; } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -2947,14 +2948,15 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, root->orphan_block_rsv = NULL; spin_unlock(&root->orphan_lock); - if (root->orphan_item_inserted && + if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret) btrfs_abort_transaction(trans, root, ret); else - root->orphan_item_inserted = 0; + clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state); } if (block_rsv) { @@ -3271,7 +3273,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) btrfs_block_rsv_release(root, root->orphan_block_rsv, (u64)-1); - if (root->orphan_block_rsv || root->orphan_item_inserted) { + if (root->orphan_block_rsv || + test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans, root); @@ -3473,7 +3476,7 @@ cache_acl: ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(root->fs_info, - "error loading props for ino %llu (root %llu): %d\n", + "error loading props for ino %llu (root %llu): %d", btrfs_ino(inode), root->root_key.objectid, ret); } @@ -3998,7 +4001,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * not block aligned since we will be keeping the last block of the * extent just the way it is. 
*/ - if (root->ref_cows || root == root->fs_info->tree_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root) btrfs_drop_extent_cache(inode, ALIGN(new_size, root->sectorsize), (u64)-1, 0); @@ -4091,7 +4095,9 @@ search_again: extent_num_bytes); num_dec = (orig_num_bytes - extent_num_bytes); - if (root->ref_cows && extent_start != 0) + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state) && + extent_start != 0) inode_sub_bytes(inode, num_dec); btrfs_mark_buffer_dirty(leaf); } else { @@ -4105,7 +4111,8 @@ search_again: num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { found_extent = 1; - if (root->ref_cows) + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) inode_sub_bytes(inode, num_dec); } } @@ -4120,10 +4127,9 @@ search_again: btrfs_file_extent_other_encoding(leaf, fi) == 0) { u32 size = new_size - found_key.offset; - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); - } /* * update the ram bytes to properly reflect @@ -4133,7 +4139,8 @@ search_again: size = btrfs_file_extent_calc_inline_size(size); btrfs_truncate_item(root, path, size, 1); - } else if (root->ref_cows) { + } else if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) { inode_sub_bytes(inode, item_end + 1 - found_key.offset); } @@ -4155,8 +4162,9 @@ delete: } else { break; } - if (found_extent && (root->ref_cows || - root == root->fs_info->tree_root)) { + if (found_extent && + (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, @@ -5168,8 +5176,7 @@ static int btrfs_dentry_delete(const struct dentry *dentry) static void btrfs_dentry_release(struct dentry *dentry) { - if (dentry->d_fsdata) - kfree(dentry->d_fsdata); + kfree(dentry->d_fsdata); } static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, @@ -5553,6 +5560,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; + int nitems = name ? 2 : 1; unsigned long ptr; int ret; @@ -5572,7 +5580,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ inode->i_ino = objectid; - if (dir) { + if (dir && name) { trace_btrfs_inode_request(dir); ret = btrfs_set_inode_index(dir, index); @@ -5581,6 +5589,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, iput(inode); return ERR_PTR(ret); } + } else if (dir) { + *index = 0; } /* * index_cnt is ignored for everything but a dir, @@ -5605,21 +5615,24 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; - /* - * Start new inodes with an inode_ref. This is slightly more - * efficient for small numbers of hard links since they will - * be packed into one item. Extended refs will kick in if we - * add more hard links than can fit in the ref item. - */ - key[1].objectid = objectid; - btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); - key[1].offset = ref_objectid; - sizes[0] = sizeof(struct btrfs_inode_item); - sizes[1] = name_len + sizeof(*ref); + + if (name) { + /* + * Start new inodes with an inode_ref. This is slightly more + * efficient for small numbers of hard links since they will + * be packed into one item. Extended refs will kick in if we + * add more hard links than can fit in the ref item. 
+ */ + key[1].objectid = objectid; + btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[1] = name_len + sizeof(*ref); + } path->leave_spinning = 1; - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) goto fail; @@ -5632,12 +5645,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); - ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, - struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); - ptr = (unsigned long)(ref + 1); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + if (name) { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + } btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); @@ -5673,7 +5688,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, return inode; fail: - if (dir) + if (dir && name) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); iput(inode); @@ -5958,6 +5973,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, err = btrfs_update_inode(trans, root, inode); if (err) goto fail; + if (inode->i_nlink == 1) { + /* + * If new hard link count is 1, it's a file created + * with open(2) O_TMPFILE flag. + */ + err = btrfs_orphan_del(trans, inode); + if (err) + goto fail; + } d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -6086,16 +6110,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); - if (ret) { - char *kaddr = kmap_atomic(page); - unsigned long copy_size = min_t(u64, - PAGE_CACHE_SIZE - pg_offset, - max_size - extent_offset); - memset(kaddr + pg_offset, 0, copy_size); - kunmap_atomic(kaddr); - } kfree(tmp); - return 0; + return ret; } /* @@ -6113,7 +6129,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, { int ret; int err = 0; - u64 bytenr; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); @@ -6127,7 +6142,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; - int compress_type; + const bool new_inline = !page || create; again: read_lock(&em_tree->lock); @@ -6201,7 +6216,6 @@ again: found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - compress_type = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + @@ -6236,32 +6250,10 @@ next: goto not_found_em; } - em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); + btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); + if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - em->start = extent_start; - em->len = extent_end - extent_start; - 
em->orig_start = extent_start - - btrfs_file_extent_offset(leaf, item); - em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, - item); - bytenr = btrfs_file_extent_disk_bytenr(leaf, item); - if (bytenr == 0) { - em->block_start = EXTENT_MAP_HOLE; - goto insert; - } - if (compress_type != BTRFS_COMPRESS_NONE) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - em->block_start = bytenr; - em->block_len = em->orig_block_len; - } else { - bytenr += btrfs_file_extent_offset(leaf, item); - em->block_start = bytenr; - em->block_len = em->len; - if (found_type == BTRFS_FILE_EXTENT_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); - } goto insert; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { unsigned long ptr; @@ -6270,12 +6262,8 @@ next: size_t extent_offset; size_t copy_size; - em->block_start = EXTENT_MAP_INLINE; - if (!page || create) { - em->start = extent_start; - em->len = extent_end - extent_start; + if (new_inline) goto out; - } size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; @@ -6285,10 +6273,6 @@ next: em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; - if (compress_type) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != @@ -6296,7 +6280,10 @@ next: ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + err = ret; + goto out; + } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6332,8 +6319,6 @@ next: set_extent_uptodate(io_tree, em->start, extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; - } else { - WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); } not_found: em->start = start; @@ -6717,6 +6702,76 @@ out: return ret; } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) +{ + struct radix_tree_root *root = &inode->i_mapping->page_tree; + int found = false; + void **pagep = NULL; + struct page *page = NULL; + int start_idx; + int end_idx; + + start_idx = start >> PAGE_CACHE_SHIFT; + + /* + * end is the last byte in the last page. end == start is legal + */ + end_idx = end >> PAGE_CACHE_SHIFT; + + rcu_read_lock(); + + /* Most of the code in this while loop is lifted from + * find_get_page. It's been modified to begin searching from a + * page and return just the first page found in that range. If the + * found idx is less than or equal to the end idx then we know that + * a page exists. If no pages are found or if those pages are + * outside of the range then we're fine (yay!) */ + while (page == NULL && + radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page)) + break; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + page = NULL; + continue; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + page = NULL; + break; /* TODO: Is this relevant for this use case? */ + } + + if (!page_cache_get_speculative(page)) { + page = NULL; + continue; + } + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. 
+ */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + page = NULL; + } + } + + if (page) { + if (page->index <= end_idx) + found = true; + page_cache_release(page); + } + + rcu_read_unlock(); + return found; +} + static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct extent_state **cached_state, int writing) { @@ -6741,10 +6796,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * invalidate needs to happen so that reads after a write do not * get stale data. */ - if (!ordered && (!writing || - !test_range_bit(&BTRFS_I(inode)->io_tree, - lockstart, lockend, EXTENT_UPTODATE, 0, - *cached_state))) + if (!ordered && + (!writing || + !btrfs_page_exists_in_range(inode, lockstart, lockend))) break; unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, @@ -7126,7 +7180,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) * before atomic variable goto zero, we must make sure * dip->errors is perceived to be set. */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); } /* if there are more bios still pending for this dio, just exit */ @@ -7306,7 +7360,7 @@ out_err: * before atomic variable goto zero, we must * make sure dip->errors is perceived to be set. */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); if (atomic_dec_and_test(&dip->pending_bios)) bio_io_error(dip->orig_bio); @@ -7438,7 +7492,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, return 0; atomic_inc(&inode->i_dio_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * The generic stuff only does filemap_write_and_wait_range, which @@ -7981,7 +8035,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, err = btrfs_subvol_inherit_props(trans, new_root, parent_root); if (err) btrfs_err(new_root->fs_info, - "error inheriting subvolume %llu properties: %d\n", + "error inheriting subvolume %llu properties: %d", new_root->root_key.objectid, err); err = btrfs_update_inode(trans, new_root, inode); @@ -8300,7 +8354,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. 
*/ - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); } else { ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, @@ -8878,6 +8932,66 @@ static int btrfs_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } +static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + u64 objectid; + u64 index; + int ret = 0; + + /* + * 5 units required for adding orphan entry + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + goto out; + + inode = btrfs_new_inode(trans, root, dir, NULL, 0, + btrfs_ino(dir), objectid, mode, &index); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + goto out; + } + + ret = btrfs_init_inode_security(trans, inode, dir, NULL); + if (ret) + goto out; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out; + + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; + + d_tmpfile(dentry, inode); + mark_inode_dirty(inode); + +out: + btrfs_end_transaction(trans, root); + if (ret) + iput(inode); + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); + + return ret; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -8898,6 +9012,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, + .tmpfile = btrfs_tmpfile, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2f6d7b13b5b..82c18ba12e3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -58,6 +58,7 @@ #include "dev-replace.h" #include "props.h" #include "sysfs.h" +#include "qgroup.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -638,11 +639,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_trans_handle *trans; int ret; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; atomic_inc(&root->will_be_snapshoted); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); btrfs_wait_nocow_write(root); ret = btrfs_start_delalloc_inodes(root, 0); @@ -711,6 +712,35 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; + /* + * If orphan cleanup did remove any orphans, it means the tree was + * modified and therefore the commit root is not the same as the + * current root anymore. This is a problem, because send uses the + * commit root and therefore can see inode items that don't exist + * in the current root anymore, and for example make calls to + * btrfs_iget, which will do tree lookups based on the current root + * and not on the commit root. Those lookups will fail, returning a + * -ESTALE error, and making send fail with that error. 
So make sure + * a send does not see any orphans we have just removed, and that it + * will see the same inodes regardless of whether a transaction + * commit happened before it started (meaning that the commit root + * will be the same as the current root) or not. + */ + if (readonly && pending_snapshot->snap->node != + pending_snapshot->snap->commit_root) { + trans = btrfs_join_transaction(pending_snapshot->snap); + if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto fail; + } + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, + pending_snapshot->snap); + if (ret) + goto fail; + } + } + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -1502,11 +1532,12 @@ static noinline int btrfs_ioctl_resize(struct file *file, sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { - char *end; sizestr = devstr + 1; *devstr = '\0'; devstr = vol_args->name; - devid = simple_strtoull(devstr, &end, 10); + ret = kstrtoull(devstr, 10, &devid); + if (ret) + goto out_free; if (!devid) { ret = -EINVAL; goto out_free; @@ -1562,7 +1593,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = old_size - new_size; } else if (mod > 0) { if (new_size > ULLONG_MAX - old_size) { - ret = -EINVAL; + ret = -ERANGE; goto out_free; } new_size = old_size + new_size; @@ -2219,6 +2250,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; + u64 root_flags; u64 qgroup_reserved; int namelen; int ret; @@ -2240,6 +2272,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out; + err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); if (err == -EINTR) goto out_drop_write; @@ -2301,6 +2334,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } mutex_lock(&inode->i_mutex); + + /* + * Don't allow to delete a subvolume with send in progress. This is + * inside the i_mutex so the error handling that has to drop the bit + * again is not run concurrently. 
+ */ + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + if (dest->send_in_progress == 0) { + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } else { + spin_unlock(&dest->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to delete subvolume %llu during send", + dest->root_key.objectid); + err = -EPERM; + goto out_dput; + } + err = d_invalidate(dentry); if (err) goto out_unlock; @@ -2346,7 +2400,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.drop_level = 0; btrfs_set_root_refs(&dest->root_item, 0); - if (!xchg(&dest->orphan_item_inserted, 1)) { + if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, dest->root_key.objectid); @@ -2389,11 +2443,19 @@ out_release: out_up_write: up_write(&root->fs_info->subvol_sem); out_unlock: + if (err) { + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } mutex_unlock(&inode->i_mutex); if (!err) { shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); + ASSERT(dest->send_in_progress == 0); /* the last ref */ if (dest->cache_inode) { @@ -2557,9 +2619,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); if (!fi_args) return -ENOMEM; @@ -2574,6 +2633,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) } mutex_unlock(&fs_devices->device_list_mutex); + fi_args->nodesize = root->fs_info->super_copy->nodesize; + fi_args->sectorsize = root->fs_info->super_copy->sectorsize; + fi_args->clone_alignment = root->fs_info->super_copy->sectorsize; + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; @@ -2589,9 +2652,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) int ret = 0; char *s_uuid = NULL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - di_args = memdup_user(arg, sizeof(*di_args)); if (IS_ERR(di_args)) return PTR_ERR(di_args); @@ -2669,10 +2729,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); ordered = btrfs_lookup_first_ordered_extent(inode, off + len - 1); - if (!ordered && + if ((!ordered || + ordered->file_offset + ordered->len <= off || + ordered->file_offset >= off + len) && !test_range_bit(&BTRFS_I(inode)->io_tree, off, - off + len - 1, EXTENT_DELALLOC, 0, NULL)) + off + len - 1, EXTENT_DELALLOC, 0, NULL)) { + if (ordered) + btrfs_put_ordered_extent(ordered); break; + } unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); @@ -2912,6 +2977,126 @@ out: return ret; } +/* Helper to check and see if this root currently has a ref on the given disk + * bytenr. If it does then we need to update the quota for this root. This + * doesn't do anything if quotas aren't enabled. 
+ */ +static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 disko) +{ + struct seq_list tree_mod_seq_elem = {}; + struct ulist *roots; + struct ulist_iterator uiter; + struct ulist_node *root_node = NULL; + int ret; + + if (!root->fs_info->quota_enabled) + return 1; + + btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + ret = btrfs_find_all_roots(trans, root->fs_info, disko, + tree_mod_seq_elem.seq, &roots); + if (ret < 0) + goto out; + ret = 0; + ULIST_ITER_INIT(&uiter); + while ((root_node = ulist_next(roots, &uiter))) { + if (root_node->val == root->objectid) { + ret = 1; + break; + } + } + ulist_free(roots); +out: + btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + return ret; +} + +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, + struct inode *inode, + u64 endoff, + const u64 destoff, + const u64 olen) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* + * We round up to the block size at eof when determining which + * extents to clone above, but shouldn't round up the file size. + */ + if (endoff > destoff + olen) + endoff = destoff + olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_end_transaction(trans, root); +out: + return ret; +} + +static void clone_update_extent_map(struct inode *inode, + const struct btrfs_trans_handle *trans, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const u64 hole_offset, + const u64 hole_len) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + return; + } + + if (fi) { + btrfs_extent_item_to_extent_map(inode, path, fi, false, em); + em->generation = -1; + if (btrfs_file_extent_type(path->nodes[0], fi) == + BTRFS_FILE_EXTENT_INLINE) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + em->start = hole_offset; + em->len = hole_len; + em->ram_bytes = em->len; + em->orig_start = hole_offset; + em->block_start = EXTENT_MAP_HOLE; + em->block_len = 0; + em->orig_block_len = 0; + em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = trans->transid; + } + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + + if (unlikely(ret)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -2924,7 +3109,8 @@ out: * @destoff: Offset within @inode to start clone */ static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff) + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path = NULL; @@ -2935,7 +3121,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u32 nritems; int slot; int ret; - u64 len = olen_aligned; + int no_quota; + const u64 len = olen_aligned; + u64 last_disko = 0; + u64 last_dest_end = destoff; 
ret = -ENOMEM; buf = vmalloc(btrfs_level_size(root, 0)); @@ -2952,7 +3141,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; + key.offset = off; while (1) { /* @@ -2964,9 +3153,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode, 0, 0); if (ret < 0) goto out; + /* + * First search, if no extent item that starts at offset off was + * found but the previous item is an extent item, it's possible + * it might overlap our target range, therefore process it. + */ + if (key.offset == off && ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } nritems = btrfs_header_nritems(path->nodes[0]); process_slot: + no_quota = 1; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -2991,7 +3192,7 @@ process_slot: u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; - u64 endoff; + u64 drop_start; extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -3012,10 +3213,16 @@ process_slot: extent); } - if (key.offset + datal <= off || - key.offset >= off + len - 1) { + /* + * The first search might have left us at an extent + * item that ends before our target range's start, can + * happen if we have holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { path->slots[0]++; goto process_slot; + } else if (key.offset >= off + len) { + break; } size = btrfs_item_size_nr(leaf, slot); @@ -3034,6 +3241,18 @@ process_slot: new_key.offset = destoff; /* + * Deal with a hole that doesn't have an extent item + * that represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning + * range or at the beginning (fully overlaps it or + * partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + /* * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -3051,18 +3270,18 @@ process_slot: * | ------------- extent ------------- | */ - /* substract range b */ + /* subtract range b */ if (key.offset + datal > off + len) datal = off + len - key.offset; - /* substract range a */ + /* subtract range a */ if (off > key.offset) { datao += off - key.offset; datal -= off - key.offset; } ret = btrfs_drop_extents(trans, root, inode, - new_key.offset, + drop_start, new_key.offset + datal, 1); if (ret) { @@ -3099,6 +3318,28 @@ process_slot: datao); btrfs_set_file_extent_num_bytes(leaf, extent, datal); + + /* + * We need to look up the roots that point at + * this bytenr and see if the new root does. If + * it does not we need to make sure we update + * quotas appropriately. 
+ */ + if (disko && root != BTRFS_I(src)->root && + disko != last_disko) { + no_quota = check_ref(trans, root, + disko); + if (no_quota < 0) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + ret = no_quota; + goto out; + } + } + if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, @@ -3106,7 +3347,7 @@ process_slot: root->root_key.objectid, btrfs_ino(inode), new_key.offset - datao, - 0); + no_quota); if (ret) { btrfs_abort_transaction(trans, root, @@ -3141,7 +3382,7 @@ process_slot: aligned_end = ALIGN(new_key.offset + datal, root->sectorsize); ret = btrfs_drop_extents(trans, root, inode, - new_key.offset, + drop_start, aligned_end, 1); if (ret) { @@ -3174,40 +3415,69 @@ process_slot: btrfs_item_ptr_offset(leaf, slot), size); inode_add_bytes(inode, datal); + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); } + /* If we have an implicit hole (NO_HOLES feature). */ + if (drop_start < new_key.offset) + clone_update_extent_map(inode, trans, + path, NULL, drop_start, + new_key.offset - drop_start); + + clone_update_extent_map(inode, trans, path, + extent, 0, 0); + btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - - /* - * we round up to the block size at eof when - * determining which extents to clone above, - * but shouldn't round up the file size - */ - endoff = new_key.offset + datal; - if (endoff > destoff+olen) - endoff = destoff+olen; - if (endoff > inode->i_size) - btrfs_i_size_write(inode, endoff); - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); + last_dest_end = new_key.offset + datal; + ret = clone_finish_inode_update(trans, inode, + last_dest_end, + destoff, olen); + if (ret) goto out; - } - ret = btrfs_end_transaction(trans, root); + if (new_key.offset + datal >= destoff + len) + break; } btrfs_release_path(path); key.offset++; } ret = 0; + if (last_dest_end < destoff + len) { + /* + * We have an implicit hole (NO_HOLES feature is enabled) that + * fully or partially overlaps our cloning range at its end. + */ + btrfs_release_path(path); + + /* + * 1 - remove extent(s) + * 1 - inode update + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_drop_extents(trans, root, inode, + last_dest_end, destoff + len, 1); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = clone_finish_inode_update(trans, inode, destoff + len, + destoff, olen); + if (ret) + goto out; + clone_update_extent_map(inode, trans, path, NULL, last_dest_end, + destoff + len - last_dest_end); + } + out: - btrfs_release_path(path); btrfs_free_path(path); vfree(buf); return ret; @@ -3319,15 +3589,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_unlock; } - /* truncate page cache pages from target inode range */ - truncate_inode_pages_range(&inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len) - 1); + /* + * Lock the target range too. Right after we replace the file extent + * items in the fs tree (which now point to the cloned data), we might + * have a worker replace them with extent items relative to a write + * operation that was issued before this clone operation (i.e. confront + * with inode.c:btrfs_finish_ordered_io). 
+ */ + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - lock_extent_range(src, off, len); + lock_extent_range(src, lock_start, lock_len); + } else { + lock_extent_range(src, off, len); + lock_extent_range(inode, destoff, len); + } ret = btrfs_clone(src, inode, off, olen, len, destoff); - unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_end = max_t(u64, off, destoff) + len - 1; + + unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); + } else { + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, + destoff + len - 1); + } + /* + * Truncate page cache pages so that future reads will see the cloned + * data immediately and not the previous data. + */ + truncate_inode_pages_range(&inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); out_unlock: if (!same_inode) { if (inode < src) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b47f669aca7..dfad8514f0d 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -143,7 +143,7 @@ static int lzo_compress_pages(struct list_head *ws, if (ret != LZO_E_OK) { printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); - ret = -1; + ret = -EIO; goto out; } @@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws, kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; - ret = -1; + ret = -E2BIG; goto out; } @@ -208,7 +208,7 @@ static int lzo_compress_pages(struct list_head *ws, /* we're making it bigger, give up */ if (tot_in > 8192 && tot_in < tot_out) { - ret = -1; + ret = -E2BIG; goto out; } @@ -335,7 +335,7 @@ cont: break; if (page_in_index + 1 >= total_pages_in) { - ret = -1; + ret = -EIO; goto done; } @@ -358,7 +358,7 @@ cont: kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { printk(KERN_WARNING "BTRFS: decompress failed\n"); - ret = -1; + ret = -EIO; break; } @@ -402,12 +402,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { printk(KERN_WARNING "BTRFS: decompress failed!\n"); - ret = -1; + ret = -EIO; goto out; } if (out_len < start_byte) { - ret = -1; + ret = -EIO; goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a94b05f7286..e12441c7cf1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " - "%llu\n", offset); + "%llu", offset); } /* diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2cf905877aa..cf5aead95a7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -32,6 +32,7 @@ #include "ulist.h" #include "backref.h" #include "extent_io.h" +#include "qgroup.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? 
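The same-inode branch of the btrfs_ioctl_clone() hunk above takes one extent lock that covers both the source and destination ranges instead of two overlapping locks. Below is a minimal userspace sketch of that range arithmetic only; it is not kernel code, and covering_lock_range and the struct name are invented here for illustration.

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start; uint64_t len; };

/* One lock range spanning [off, off+len) and [destoff, destoff+len). */
static struct range covering_lock_range(uint64_t off, uint64_t destoff,
					uint64_t len)
{
	struct range r;

	r.start = off < destoff ? off : destoff;		/* min(off, destoff) */
	r.len = (off > destoff ? off : destoff) + len - r.start;
	return r;
}

int main(void)
{
	/* e.g. clone 4 KiB from offset 0 to offset 8192 within one file */
	struct range r = covering_lock_range(0, 8192, 4096);

	/* prints: lock [0, 12287] -- covers both [0,4095] and [8192,12287] */
	printf("lock [%llu, %llu]\n",
	       (unsigned long long)r.start,
	       (unsigned long long)(r.start + r.len - 1));
	return 0;
}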
@@ -84,8 +85,8 @@ struct btrfs_qgroup { /* * temp variables for accounting operations */ - u64 tag; - u64 refcnt; + u64 old_refcnt; + u64 new_refcnt; }; /* @@ -98,6 +99,9 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; +#define ptr_to_u64(x) ((u64)(uintptr_t)x) +#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x) + static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); @@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info, return -ENOENT; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl) +{ + struct btrfs_qgroup *qgroup; + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) + return -EINVAL; + if (qgroup->rfer != rfer || qgroup->excl != excl) + return -EINVAL; + return 0; +} +#endif + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, "a_root->state))) + return 0; +#endif path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -669,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroup->qgroupid; @@ -1174,33 +1201,198 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } +static int comp_oper(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->seq < oper2->seq) + return -1; + if (oper1->seq > oper2->seq) + return -1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + if (oper1->type < oper2->type) + return -1; + if (oper1->type > oper2->type) + return 1; + return 0; +} + +static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + p = &fs_info->qgroup_op_tree.rb_node; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct btrfs_qgroup_operation, n); + cmp = comp_oper(cur, oper); + if (cmp < 0) { + p = &(*p)->rb_right; + } else if (cmp) { + p = &(*p)->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + rb_link_node(&oper->n, parent, p); + rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} /* - * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts - * the modification into a list that's later used by btrfs_end_transaction to - * pass the recorded modifications on to btrfs_qgroup_account_ref. + * Record a quota operation for processing later on. + * @trans: the transaction we are adding the delayed op to. + * @fs_info: the fs_info for this fs. + * @ref_root: the root of the reference we are acting on, + * @bytenr: the bytenr we are acting on. + * @num_bytes: the number of bytes in the reference. + * @type: the type of operation this is. 
+ * @mod_seq: do we need to get a sequence number for looking up roots. + * + * We just add it to our trans qgroup_ref_list and carry on and process these + * operations in order at some later point. If the reference root isn't a fs + * root then we don't bother with doing anything. + * + * MUST BE HOLDING THE REF LOCK. */ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, int mod_seq) { - struct qgroup_update *u; + struct btrfs_qgroup_operation *oper; + int ret; + + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + return 0; - BUG_ON(!trans->delayed_ref_elem.seq); - u = kmalloc(sizeof(*u), GFP_NOFS); - if (!u) + oper = kmalloc(sizeof(*oper), GFP_NOFS); + if (!oper) return -ENOMEM; - u->node = node; - u->extent_op = extent_op; - list_add_tail(&u->list, &trans->qgroup_ref_list); + oper->ref_root = ref_root; + oper->bytenr = bytenr; + oper->num_bytes = num_bytes; + oper->type = type; + oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); + INIT_LIST_HEAD(&oper->elem.list); + oper->elem.seq = 0; + ret = insert_qgroup_oper(fs_info, oper); + if (ret) { + /* Shouldn't happen so have an assert for developers */ + ASSERT(0); + kfree(oper); + return ret; + } + list_add_tail(&oper->list, &trans->qgroup_ref_list); + + if (mod_seq) + btrfs_get_tree_mod_seq(fs_info, &oper->elem); return 0; } -static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq) +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. 
+ */ +static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct btrfs_qgroup *qgroup; + struct ulist *tmp; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int sign = 0; + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) + goto out; + qgroup = find_qgroup_rb(fs_info, oper->ref_root); + if (!qgroup) + goto out; + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + sign = 1; + break; + case BTRFS_QGROUP_OPER_SUB_EXCL: + sign = -1; + break; + default: + ASSERT(0); + } + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); + qgroup->excl += sign * oper->num_bytes; + qgroup->excl_cmpr += sign * oper->num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + qgroup->excl += sign * oper->num_bytes; + if (sign < 0) + WARN_ON(qgroup->excl < oper->num_bytes); + qgroup->excl_cmpr += sign * oper->num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(tmp); + return ret; +} + +/* + * Walk all of the roots that pointed to our bytenr and adjust their refcnts as + * properly. + */ +static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, + u64 root_to_skip, struct ulist *tmp, + struct ulist *roots, struct ulist *qgroups, + u64 seq, int *old_roots, int rescan) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1211,256 +1403,549 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { + /* We don't count our current root here */ + if (unode->val == root_to_skip) + continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; + /* + * We could have a pending removal of this same ref so we may + * not have actually found our ref root when doing + * btrfs_find_all_roots, so we need to keep track of how many + * old roots we find in case we removed ours and added a + * different one at the same time. I don't think this could + * happen in practice but that sort of thinking leads to pain + * and suffering and to the dark side. 
+ */ + (*old_roots)++; ulist_reinit(tmp); - /* XXX id not needed */ - ret = ulist_add(tmp, qg->qgroupid, - (u64)(uintptr_t)qg, GFP_ATOMIC); + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC); if (ret < 0) return ret; ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->refcnt < seq) - qg->refcnt = seq + 1; + qg = u64_to_ptr(tmp_unode->aux); + /* + * We use this sequence number to keep from having to + * run the whole list and 0 out the refcnt every time. + * We basically use sequnce as the known 0 count and + * then add 1 everytime we see a qgroup. This is how we + * get how many of the roots actually point up to the + * upper level qgroups in order to determine exclusive + * counts. + * + * For rescan we want to set old_refcnt to seq so our + * exclusive calculations end up correct. + */ + if (rescan) + qg->old_refcnt = seq; + else if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; else - ++qg->refcnt; + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; ret = ulist_add(tmp, glist->group->qgroupid, - (u64)(uintptr_t)glist->group, + ptr_to_u64(glist->group), GFP_ATOMIC); if (ret < 0) return ret; } } } + return 0; +} + +/* + * We need to walk forward in our operation tree and account for any roots that + * were deleted after we made this operation. + */ +static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct ulist *tmp, + struct ulist *qgroups, u64 seq, + int *old_roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct btrfs_qgroup_operation *tmp_oper; + struct rb_node *n; + int ret; + + ulist_reinit(tmp); + /* + * We only walk forward in the tree since we're only interested in + * removals that happened _after_ our operation. + */ + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + return 0; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + while (tmp_oper->bytenr == oper->bytenr) { + /* + * If it's not a removal we don't care, additions work out + * properly with our refcnt tracking. + */ + if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && + tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) + goto next; + qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); + if (!qg) + goto next; + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret) { + if (ret < 0) + return ret; + /* + * We only want to increase old_roots if this qgroup is + * not already in the list of qgroups. If it is already + * there then that means it must have been re-added or + * the delete will be discarded because we had an + * existing ref that we haven't looked up yet. In this + * case we don't want to increase old_roots. So if ret + * == 1 then we know that this is the first time we've + * seen this qgroup and we can bump the old_roots. 
+ */ + (*old_roots)++; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + } +next: + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&tmp_oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + break; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + } + + /* Ok now process the qgroups we found */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + } + } return 0; } -static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq, int sgn, u64 num_bytes, - struct btrfs_qgroup *qgroup) +/* Add refcnt for the newly added reference. */ +static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct btrfs_qgroup *qgroup, + struct ulist *tmp, struct ulist *qgroups, + u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; int ret; ulist_reinit(tmp); - ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); + ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); if (ret < 0) return ret; - ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(tmp, &uiter))) { - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; - if (qg->refcnt < seq) { - /* not visited by step 1 */ - qg->rfer += sgn * num_bytes; - qg->rfer_cmpr += sgn * num_bytes; - if (roots->nnodes == 0) { - qg->excl += sgn * num_bytes; - qg->excl_cmpr += sgn * num_bytes; - } - qgroup_dirty(fs_info, qg); - } - WARN_ON(qg->tag >= seq); - qg->tag = seq; + struct btrfs_qgroup_list *glist; + qg = u64_to_ptr(unode->aux); + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + } else { + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + } list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); if (ret < 0) return ret; } } - return 0; } -static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq, int sgn, u64 num_bytes) +/* + * This adjusts the counters for all referenced qgroups if need be. 
+ */ +static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, + u64 root_to_skip, u64 num_bytes, + struct ulist *qgroups, u64 seq, + int old_roots, int new_roots, int rescan) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; - int ret; + u64 cur_new_count, cur_old_count; ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; + while ((unode = ulist_next(qgroups, &uiter))) { + bool dirty = false; - ulist_reinit(tmp); - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); - if (ret < 0) - return ret; + qg = u64_to_ptr(unode->aux); + /* + * Wasn't referenced before but is now, add to the reference + * counters. + */ + if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; + dirty = true; + } - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { - struct btrfs_qgroup_list *glist; + /* + * Was referenced before but isn't now, subtract from the + * reference counters. + */ + if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + qg->rfer -= num_bytes; + qg->rfer_cmpr -= num_bytes; + dirty = true; + } - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->tag == seq) - continue; + if (qg->old_refcnt < seq) + cur_old_count = 0; + else + cur_old_count = qg->old_refcnt - seq; + if (qg->new_refcnt < seq) + cur_new_count = 0; + else + cur_new_count = qg->new_refcnt - seq; - if (qg->refcnt - seq == roots->nnodes) { - qg->excl -= sgn * num_bytes; - qg->excl_cmpr -= sgn * num_bytes; - qgroup_dirty(fs_info, qg); - } + /* + * If our refcount was the same as the roots previously but our + * new count isn't the same as the number of roots now then we + * went from having a exclusive reference on this range to not. + */ + if (old_roots && cur_old_count == old_roots && + (cur_new_count != new_roots || new_roots == 0)) { + WARN_ON(cur_new_count != new_roots && new_roots == 0); + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - if (ret < 0) - return ret; - } + /* + * If we didn't reference all the roots before but now we do we + * have an exclusive reference to this range. + */ + if ((!old_roots || (old_roots && cur_old_count != old_roots)) + && cur_new_count == new_roots) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; } - } + if (dirty) + qgroup_dirty(fs_info, qg); + } return 0; } /* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. + * If we removed a data extent and there were other references for that bytenr + * then we need to lookup all referenced roots to make sure we still don't + * reference this bytenr. If we do then we can just discard this operation. 
*/ -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) +static int check_existing_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) { - struct btrfs_root *quota_root; - u64 ref_root; - struct btrfs_qgroup *qgroup; struct ulist *roots = NULL; - u64 seq; + struct ulist_node *unode; + struct ulist_iterator uiter; int ret = 0; - int sgn; - if (!fs_info->quota_enabled) - return 0; - - BUG_ON(!fs_info->quota_root); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + oper->elem.seq, &roots); + if (ret < 0) + return ret; + ret = 0; - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - struct btrfs_delayed_tree_ref *ref; - ref = btrfs_delayed_node_to_tree_ref(node); - ref_root = ref->root; - } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || - node->type == BTRFS_SHARED_DATA_REF_KEY) { - struct btrfs_delayed_data_ref *ref; - ref = btrfs_delayed_node_to_data_ref(node); - ref_root = ref->root; - } else { - BUG(); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + if (unode->val == oper->ref_root) { + ret = 1; + break; + } } + ulist_free(roots); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); - if (!is_fstree(ref_root)) { - /* - * non-fs-trees are not being accounted - */ - return 0; - } + return ret; +} - switch (node->action) { - case BTRFS_ADD_DELAYED_REF: - case BTRFS_ADD_DELAYED_EXTENT: - sgn = 1; - seq = btrfs_tree_mod_seq_prev(node->seq); - break; - case BTRFS_DROP_DELAYED_REF: - sgn = -1; - seq = node->seq; - break; - case BTRFS_UPDATE_DELAYED_HEAD: - return 0; - default: - BUG(); - } +/* + * If we share a reference across multiple roots then we may need to adjust + * various qgroups referenced and exclusive counters. The basic premise is this + * + * 1) We have seq to represent a 0 count. Instead of looping through all of the + * qgroups and resetting their refcount to 0 we just constantly bump this + * sequence number to act as the base reference count. This means that if + * anybody is equal to or below this sequence they were never referenced. We + * jack this sequence up by the number of roots we found each time in order to + * make sure we don't have any overlap. + * + * 2) We first search all the roots that reference the area _except_ the root + * we're acting on currently. This makes up the old_refcnt of all the qgroups + * before. + * + * 3) We walk all of the qgroups referenced by the root we are currently acting + * on, and will either adjust old_refcnt in the case of a removal or the + * new_refcnt in the case of an addition. + * + * 4) Finally we walk all the qgroups that are referenced by this range + * including the root we are acting on currently. We will adjust the counters + * based on the number of roots we had and will have after this operation. + * + * Take this example as an illustration + * + * [qgroup 1/0] + * / | \ + * [qg 0/0] [qg 0/1] [qg 0/2] + * \ | / + * [ extent ] + * + * Say we are adding a reference that is covered by qg 0/0. The first step + * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with + * old_roots being 2. Because it is adding new_roots will be 1. We then go + * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's + * new_refcnt, bringing it to 3. 
We then walk through all of the qgroups, we + * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a + * reference and thus must add the size to the referenced bytes. Everything + * else is the same so nothing else changes. + */ +static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist *qgroups, *tmp; + struct btrfs_qgroup *qgroup; + struct seq_list elem = {}; + u64 seq; + int old_roots = 0; + int new_roots = 0; + int ret = 0; - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); + if (oper->elem.seq) { + ret = check_existing_refs(trans, fs_info, oper); + if (ret < 0) + return ret; + if (ret) return 0; - } } - mutex_unlock(&fs_info->qgroup_rescan_lock); - /* - * the delayed ref sequence number we pass depends on the direction of - * the operation. for add operations, we pass - * tree_mod_log_prev_seq(node->seq) to skip - * the delayed ref's current sequence number, because we need the state - * of the tree before the add operation. for delete operations, we pass - * (node->seq) to include the delayed ref's current sequence number, - * because we need the state of the tree after the delete operation. - */ - ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots); - if (ret < 0) - return ret; - - spin_lock(&fs_info->qgroup_lock); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + return -ENOMEM; - quota_root = fs_info->quota_root; - if (!quota_root) - goto unlock; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; - qgroup = find_qgroup_rb(fs_info, ref_root); + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, + &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) { + ulist_free(qgroups); + ulist_free(tmp); + return ret; + } + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, oper->ref_root); if (!qgroup) - goto unlock; + goto out; + seq = fs_info->qgroup_seq; /* - * step 1: for each old ref, visit all nodes once and inc refcnt + * So roots is the list of all the roots currently pointing at the + * bytenr, including the ref we are adding if we are adding, or not if + * we are removing a ref. So we pass in the ref_root to skip that root + * in our calculations. We set old_refcnt and new_refcnt cause who the + * hell knows what everything looked like before, and it doesn't matter + * except... */ - ulist_reinit(fs_info->qgroup_ulist); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, + seq, &old_roots, 0); + if (ret < 0) + goto out; - ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, - seq); - if (ret) - goto unlock; + /* + * Now adjust the refcounts of the qgroups that care about this + * reference, either the old_count in the case of removal or new_count + * in the case of an addition. + */ + ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, + seq); + if (ret < 0) + goto out; /* - * step 2: walk from the new root + * ...in the case of removals.
If we had a removal before we got around + * to processing this operation then we need to find that guy and count + * his references as if they really existed so we don't end up screwing + * up the exclusive counts. Then whenever we go to process the delete + * everything will be grand and we can account for whatever exclusive + * changes need to be made there. We also have to pass in old_roots so + * we have an accurate count of the roots as it pertains to this + * operation's view of the world. */ - ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, - seq, sgn, node->num_bytes, qgroup); - if (ret) - goto unlock; + ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, + &old_roots); + if (ret < 0) + goto out; /* - * step 3: walk again from old refs + * We are adding our root, need to adjust up the number of roots, + * otherwise old_roots is the number of roots we want. */ - ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, - seq, sgn, node->num_bytes); - if (ret) - goto unlock; + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + new_roots = old_roots + 1; + } else { + new_roots = old_roots; + old_roots++; + } + fs_info->qgroup_seq += old_roots + 1; -unlock: + + + /* + * And now the magic happens, bless Arne for having a pretty elegant + * solution for this. + */ + qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, + qgroups, seq, old_roots, new_roots, 0); +out: spin_unlock(&fs_info->qgroup_lock); + ulist_free(qgroups); ulist_free(roots); + ulist_free(tmp); + return ret; +} + +/* + * btrfs_qgroup_account is called for every queued qgroup operation. Depending + * on the operation type it either does the simple exclusive accounting, or + * searches all roots referencing the extent and accounts the space to the + * different qgroups accordingly. + */ +static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + BUG_ON(!fs_info->quota_root); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + return 0; + } + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + ASSERT(is_fstree(oper->ref_root)); + + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + case BTRFS_QGROUP_OPER_SUB_EXCL: + ret = qgroup_excl_accounting(fs_info, oper); + break; + case BTRFS_QGROUP_OPER_ADD_SHARED: + case BTRFS_QGROUP_OPER_SUB_SHARED: + ret = qgroup_shared_accounting(trans, fs_info, oper); + break; + default: + ASSERT(0); + } + return ret; +} + +/* + * Needs to be called every time we run delayed refs, even if there is an error, + * in order to clean up outstanding operations.
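For illustration only, not from the patch: a tiny standalone sketch of the sequence trick used above. fs_info->qgroup_seq only ever moves forward and is bumped past the largest refcount handed out (the old_roots + 1 above), so any per-qgroup count at or below the current sequence reads as zero and stale counts from earlier operations never have to be cleared. The helper name effective_count() is invented for this sketch.

#include <stdio.h>

/* A refcount at or below the current sequence is treated as zero. */
static unsigned long long effective_count(unsigned long long refcnt,
					  unsigned long long seq)
{
	return refcnt <= seq ? 0 : refcnt - seq;
}

int main(void)
{
	unsigned long long seq = 0;
	unsigned long long refcnt;

	/* Operation 1: three roots reference the extent, so this qgroup is
	 * counted three past the current sequence. */
	refcnt = seq + 3;
	printf("op 1: %llu\n", effective_count(refcnt, seq));	/* 3 */

	/* Advance the sequence past the largest count handed out. */
	seq += 3 + 1;

	/* Operation 2: the stale count from operation 1 now reads as zero
	 * without anybody walking the qgroups to reset it. */
	printf("op 2: %llu\n", effective_count(refcnt, seq));	/* 0 */
	return 0;
}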
+ */ +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup_operation *oper; + int ret = 0; + while (!list_empty(&trans->qgroup_ref_list)) { + oper = list_first_entry(&trans->qgroup_ref_list, + struct btrfs_qgroup_operation, list); + list_del_init(&oper->list); + if (!ret || !trans->aborted) + ret = btrfs_qgroup_account(trans, fs_info, oper); + spin_lock(&fs_info->qgroup_op_lock); + rb_erase(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); + kfree(oper); + } return ret; } @@ -1629,8 +2114,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; - dstgroup->rfer = srcgroup->rfer - level_size; - dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; + + /* + * We call inherit after we clone the root in order to make sure + * our counts don't go crazy, so at this point the only + * difference between the two roots should be the root node. + */ + dstgroup->rfer = srcgroup->rfer; + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; + dstgroup->excl = level_size; + dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; qgroup_dirty(fs_info, dstgroup); @@ -1734,7 +2227,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + (s64)qg->rfer + num_bytes > @@ -1766,7 +2259,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); qg->reserved += num_bytes; } @@ -1812,7 +2305,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); qg->reserved -= num_bytes; @@ -1848,15 +2341,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *tmp, - struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, struct ulist *qgroups, + struct ulist *tmp, struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; struct seq_list tree_mod_seq_elem = {}; + u64 num_bytes; u64 seq; + int new_roots; int slot; int ret; @@ -1897,8 +2390,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { - u64 num_bytes; - btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) @@ -1908,76 +2399,34 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, else num_bytes = found.offset; - ret = btrfs_find_all_roots(trans, fs_info, found.objectid, - tree_mod_seq_elem.seq, &roots); + ulist_reinit(qgroups); + ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, + &roots); if (ret < 0) goto out; spin_lock(&fs_info->qgroup_lock); seq = fs_info->qgroup_seq; 
fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); - if (ret) { + new_roots = 0; + ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, + seq, &new_roots, 1); + if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); goto out; } - /* - * step2 of btrfs_qgroup_account_ref works from a single root, - * we're doing all at once here. - */ - ulist_reinit(tmp); - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - struct btrfs_qgroup *qg; - - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; - - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, - GFP_ATOMIC); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - } - - /* this loop is similar to step 2 of btrfs_qgroup_account_ref */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; - qg->rfer += num_bytes; - qg->rfer_cmpr += num_bytes; - WARN_ON(qg->tag >= seq); - if (qg->refcnt - seq == roots->nnodes) { - qg->excl += num_bytes; - qg->excl_cmpr += num_bytes; - } - qgroup_dirty(fs_info, qg); - - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - } + ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, + seq, 0, new_roots, 1); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; } - spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); - ret = 0; } - out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -1990,13 +2439,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL; + struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; path = btrfs_alloc_path(); if (!path) goto out; + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + goto out; tmp = ulist_alloc(GFP_NOFS); if (!tmp) goto out; @@ -2015,7 +2467,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - tmp, scratch_leaf); + qgroups, tmp, scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2025,6 +2477,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); + ulist_free(qgroups); ulist_free(tmp); btrfs_free_path(path); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h new file mode 100644 index 00000000000..5952ff1fbd7 --- /dev/null +++ b/fs/btrfs/qgroup.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_QGROUP__ +#define __BTRFS_QGROUP__ + +/* + * A description of the operations, all of these operations only happen when we + * are adding the 1st reference for that subvolume in the case of adding space + * or on the last reference delete in the case of subtraction. The only + * exception is the last one, which is added for confusion. + * + * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only + * one pointing at the bytes we are adding. This is called on the first + * allocation. + * + * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be + * shared between subvols. This is called on the creation of a ref that already + * has refs from a different subvolume, so basically reflink. + * + * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only + * one referencing the range. + * + * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares + * refs with other subvolumes. + */ +enum btrfs_qgroup_operation_type { + BTRFS_QGROUP_OPER_ADD_EXCL, + BTRFS_QGROUP_OPER_ADD_SHARED, + BTRFS_QGROUP_OPER_SUB_EXCL, + BTRFS_QGROUP_OPER_SUB_SHARED, +}; + +struct btrfs_qgroup_operation { + u64 ref_root; + u64 bytenr; + u64 num_bytes; + u64 seq; + enum btrfs_qgroup_operation_type type; + struct seq_list elem; + struct rb_node n; + struct list_head list; +}; + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + char *name); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, + int mod_seq); +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit); +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free(struct btrfs_root *root, u64
num_bytes); + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); +#endif + +#endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7f92ab1daa8..65245a07275 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -337,7 +337,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) if (bnode->root) fs_info = bnode->root->fs_info; btrfs_panic(fs_info, errno, "Inconsistency in backref cache " - "found at offset %llu\n", bytenr); + "found at offset %llu", bytenr); } /* @@ -528,7 +528,7 @@ static int should_ignore_root(struct btrfs_root *root) { struct btrfs_root *reloc_root; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; reloc_root = root->reloc_root; @@ -610,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc, root = read_fs_root(rc->extent_root->fs_info, root_objectid); BUG_ON(IS_ERR(root)); - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && generation != btrfs_root_generation(&root->root_item)) return NULL; @@ -887,7 +887,7 @@ again: goto out; } - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) cur->cowonly = 1; if (btrfs_root_level(&root->root_item) == cur->level) { @@ -954,7 +954,8 @@ again: upper->bytenr = eb->start; upper->owner = btrfs_header_owner(eb); upper->level = lower->level + 1; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) upper->cowonly = 1; /* @@ -1258,7 +1259,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (rb_node) { btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " "for start=%llu while inserting into relocation " - "tree\n", node->bytenr); + "tree", node->bytenr); kfree(node); return -EEXIST; } @@ -2441,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, next = walk_up_backref(next, edges, &index); root = next->root; BUG_ON(!root); - BUG_ON(!root->ref_cows); + BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { record_reloc_root_in_trans(trans, root); @@ -2506,7 +2507,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, BUG_ON(!root); /* no other choice for non-references counted tree */ - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return root; if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) @@ -2893,14 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, goto out; } - if (!root || root->ref_cows) { + if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = reserve_metadata_space(trans, rc, node); if (ret) goto out; } if (root) { - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { BUG_ON(node->new_bytenr); BUG_ON(!list_empty(&node->list)); btrfs_record_root_in_trans(trans, root); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 38bb47e7d6b..360a728a639 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -306,7 +306,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) break; } - root->orphan_item_inserted = 1; + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); if (err) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0be77993378..ac80188eec8 100644 --- 
a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -588,8 +588,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { - ret = tree_backref_for_extent(&ptr, eb, ei, item_size, - &ref_root, &ref_level); + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " @@ -717,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) out: if (page) put_page(page); - if (inode) - iput(inode); + + iput(inode); if (ret < 0) return ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fd38b505347..6528aa66218 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -360,10 +360,13 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) /* * First time the inline_buf does not suffice */ - if (p->buf == p->inline_buf) + if (p->buf == p->inline_buf) { tmp_buf = kmalloc(len, GFP_NOFS); - else + if (tmp_buf) + memcpy(tmp_buf, p->buf, old_buf_len); + } else { tmp_buf = krealloc(p->buf, len, GFP_NOFS); + } if (!tmp_buf) return -ENOMEM; p->buf = tmp_buf; @@ -972,7 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; - const int buf_len = PATH_MAX; + int buf_len; u32 name_len; u32 data_len; u32 cur; @@ -982,6 +985,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; + if (found_key->type == BTRFS_XATTR_ITEM_KEY) + buf_len = BTRFS_MAX_XATTR_SIZE(root); + else + buf_len = PATH_MAX; + buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -1003,12 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - /* - * Path too long - */ - if (name_len + data_len > buf_len) { - ret = -ENAMETOOLONG; - goto out; + if (type == BTRFS_FT_XATTR) { + if (name_len > XATTR_NAME_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + if (name_len + data_len > buf_len) { + ret = -E2BIG; + goto out; + } + } else { + /* + * Path too long + */ + if (name_len + data_len > buf_len) { + ret = -ENAMETOOLONG; + goto out; + } } read_extent_buffer(eb, buf, (unsigned long)(di + 1), @@ -1346,7 +1365,7 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -EIO; btrfs_err(sctx->send_root->fs_info, "did not find backref in " "send_root. 
inode=%llu, offset=%llu, " - "disk_byte=%llu found extent=%llu\n", + "disk_byte=%llu found extent=%llu", ino, data_offset, disk_byte, found_key.objectid); goto out; } @@ -1625,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root, goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.type == BTRFS_ROOT_ITEM_KEY) { + ret = -ENOENT; + goto out; + } *found_inode = key.objectid; *found_type = btrfs_dir_type(path->nodes[0], di); @@ -1690,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; btrfs_release_path(path); - ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, - NULL, NULL); - if (ret < 0) - goto out; + if (dir_gen) { + ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, + NULL, NULL, NULL); + if (ret < 0) + goto out; + } *dir = parent_dir; @@ -1709,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root, int ret; struct fs_path *tmp_name; u64 tmp_dir; - u64 tmp_dir_gen; tmp_name = fs_path_alloc(); if (!tmp_name) return -ENOMEM; - ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); + ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); if (ret < 0) goto out; @@ -2026,7 +2050,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, { int ret; int nce_ret; - struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; /* @@ -2052,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, } } - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in @@ -2131,7 +2150,6 @@ out_cache: name_cache_clean_unused(sctx); out: - btrfs_free_path(path); return ret; } @@ -2942,7 +2960,9 @@ static void free_waiting_dir_move(struct send_ctx *sctx, static int add_pending_dir_move(struct send_ctx *sctx, u64 ino, u64 ino_gen, - u64 parent_ino) + u64 parent_ino, + struct list_head *new_refs, + struct list_head *deleted_refs) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2974,12 +2994,12 @@ static int add_pending_dir_move(struct send_ctx *sctx, } } - list_for_each_entry(cur, &sctx->deleted_refs, list) { + list_for_each_entry(cur, deleted_refs, list) { ret = dup_ref(cur, &pm->update_refs); if (ret < 0) goto out; } - list_for_each_entry(cur, &sctx->new_refs, list) { + list_for_each_entry(cur, new_refs, list) { ret = dup_ref(cur, &pm->update_refs); if (ret < 0) goto out; @@ -3022,6 +3042,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, return NULL; } +static int path_loop(struct send_ctx *sctx, struct fs_path *name, + u64 ino, u64 gen, u64 *ancestor_ino) +{ + int ret = 0; + u64 parent_inode = 0; + u64 parent_gen = 0; + u64 start_ino = ino; + + *ancestor_ino = 0; + while (ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino)) + break; + if (is_waiting_for_move(sctx, ino)) { + if (*ancestor_ino == 0) + *ancestor_ino = ino; + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret > 0) { + ret = 0; + break; + } + } + if (ret < 0) + break; + if (parent_inode == start_ino) { + ret = 1; + if (*ancestor_ino == 0) + *ancestor_ino = ino; + break; + } + ino = parent_inode; + gen = parent_gen; + } + return ret; +} + static int apply_dir_move(struct send_ctx *sctx, struct 
pending_dir_move *pm) { struct fs_path *from_path = NULL; @@ -3033,6 +3095,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; int ret; + u64 ancestor = 0; name = fs_path_alloc(); from_path = fs_path_alloc(); @@ -3051,34 +3114,33 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) if (ret < 0) goto out; - if (parent_ino == sctx->cur_ino) { - /* child only renamed, not moved */ - ASSERT(parent_gen == sctx->cur_inode_gen); - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, - from_path); - if (ret < 0) - goto out; - ret = fs_path_add_path(from_path, name); - if (ret < 0) - goto out; - } else { - /* child moved and maybe renamed too */ - sctx->send_progress = pm->ino; - ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + if (ret < 0) + goto out; + + sctx->send_progress = sctx->cur_ino + 1; + ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); + if (ret) { + LIST_HEAD(deleted_refs); + ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); + ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, + &pm->update_refs, &deleted_refs); if (ret < 0) goto out; - } - - fs_path_free(name); - name = NULL; - - to_path = fs_path_alloc(); - if (!to_path) { - ret = -ENOMEM; + if (rmdir_ino) { + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + dm->rmdir_ino = rmdir_ino; + } goto out; } - - sctx->send_progress = sctx->cur_ino + 1; + fs_path_reset(name); + to_path = name; + name = NULL; ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); if (ret < 0) goto out; @@ -3202,127 +3264,74 @@ out: static int wait_for_parent_move(struct send_ctx *sctx, struct recorded_ref *parent_ref) { - int ret; + int ret = 0; u64 ino = parent_ref->dir; u64 parent_ino_before, parent_ino_after; - u64 old_gen; struct fs_path *path_before = NULL; struct fs_path *path_after = NULL; int len1, len2; - int register_upper_dirs; - u64 gen; - - if (is_waiting_for_move(sctx, ino)) - return 1; - - if (parent_ref->dir <= sctx->cur_ino) - return 0; - - ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, - NULL, NULL, NULL, NULL); - if (ret == -ENOENT) - return 0; - else if (ret < 0) - return ret; - - if (parent_ref->dir_gen != old_gen) - return 0; - - path_before = fs_path_alloc(); - if (!path_before) - return -ENOMEM; - - ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, - NULL, path_before); - if (ret == -ENOENT) { - ret = 0; - goto out; - } else if (ret < 0) { - goto out; - } path_after = fs_path_alloc(); - if (!path_after) { + path_before = fs_path_alloc(); + if (!path_after || !path_before) { ret = -ENOMEM; goto out; } - ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - &gen, path_after); - if (ret == -ENOENT) { - ret = 0; - goto out; - } else if (ret < 0) { - goto out; - } - - len1 = fs_path_len(path_before); - len2 = fs_path_len(path_after); - if (parent_ino_before != parent_ino_after || len1 != len2 || - memcmp(path_before->start, path_after->start, len1)) { - ret = 1; - goto out; - } - ret = 0; - /* - * Ok, our new most direct ancestor has a higher inode number but - * wasn't moved/renamed. 
So maybe some of the new ancestors higher in - * the hierarchy have an higher inode number too *and* were renamed - * or moved - in this case we need to wait for the ancestor's rename - * or move operation before we can do the move/rename for the current - * inode. + * Our current directory inode may not yet be renamed/moved because some + * ancestor (immediate or not) has to be renamed/moved first. So find if + * such ancestor exists and make sure our own rename/move happens after + * that ancestor is processed. */ - register_upper_dirs = 0; - ino = parent_ino_after; -again: - while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) { - u64 parent_gen; + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + if (is_waiting_for_move(sctx, ino)) { + ret = 1; + break; + } fs_path_reset(path_before); fs_path_reset(path_after); ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - &parent_gen, path_after); + NULL, path_after); if (ret < 0) goto out; ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, NULL, path_before); - if (ret == -ENOENT) { - ret = 0; - break; - } else if (ret < 0) { + if (ret < 0 && ret != -ENOENT) { goto out; + } else if (ret == -ENOENT) { + ret = 1; + break; } len1 = fs_path_len(path_before); len2 = fs_path_len(path_after); - if (parent_ino_before != parent_ino_after || len1 != len2 || - memcmp(path_before->start, path_after->start, len1)) { + if (ino > sctx->cur_ino && + (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { ret = 1; - if (register_upper_dirs) { - break; - } else { - register_upper_dirs = 1; - ino = parent_ref->dir; - gen = parent_ref->dir_gen; - goto again; - } - } else if (register_upper_dirs) { - ret = add_pending_dir_move(sctx, ino, gen, - parent_ino_after); - if (ret < 0 && ret != -EEXIST) - goto out; + break; } - ino = parent_ino_after; - gen = parent_gen; } out: fs_path_free(path_before); fs_path_free(path_after); + if (ret == 1) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + ino, + &sctx->new_refs, + &sctx->deleted_refs); + if (!ret) + ret = 1; + } + return ret; } @@ -3483,10 +3492,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (ret) { - ret = add_pending_dir_move(sctx, - sctx->cur_ino, - sctx->cur_inode_gen, - cur->dir); *pending_move = 1; } else { ret = send_rename(sctx, valid_path, @@ -5487,7 +5492,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) */ if (root->send_in_progress < 0) btrfs_err(root->fs_info, - "send_in_progres unbalanced %d root %llu\n", + "send_in_progres unbalanced %d root %llu", root->send_in_progress, root->root_key.objectid); spin_unlock(&root->root_item_lock); } @@ -5515,7 +5520,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) /* * The subvolume must remain read-only during send, protect against - * making it RW. + * making it RW. This also protects against deletion. 
*/ spin_lock(&send_root->root_item_lock); send_root->send_in_progress++; @@ -5575,6 +5580,15 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } sctx->send_root = send_root; + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started + */ + if (btrfs_root_dead(sctx->send_root)) { + ret = -EPERM; + goto out; + } + sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; @@ -5664,7 +5678,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) spin_lock(&sctx->parent_root->root_item_lock); sctx->parent_root->send_in_progress++; - if (!btrfs_root_readonly(sctx->parent_root)) { + if (!btrfs_root_readonly(sctx->parent_root) || + btrfs_root_dead(sctx->parent_root)) { spin_unlock(&sctx->parent_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9601d25a460..4662d92a4b7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -511,7 +511,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } else if (compress) { if (!btrfs_test_opt(root, COMPRESS)) btrfs_info(root->fs_info, - "btrfs: use %s compression\n", + "btrfs: use %s compression", compress_type); } break; @@ -580,8 +580,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } break; case Opt_acl: +#ifdef CONFIG_BTRFS_FS_POSIX_ACL root->fs_info->sb->s_flags |= MS_POSIXACL; break; +#else + btrfs_err(root->fs_info, + "support for ACL not compiled in!"); + ret = -EINVAL; + goto out; +#endif case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; @@ -1413,6 +1420,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * this also happens on 'umount -rf' or on shutdown, when * the filesystem is busy. 
*/ + cancel_work_sync(&fs_info->async_reclaim_work); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); @@ -1894,6 +1902,9 @@ static int btrfs_run_sanity_tests(void) if (ret) goto out; ret = btrfs_test_inodes(); + if (ret) + goto out; + ret = btrfs_test_qgroups(); out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c5eb2143dc6..df39458f148 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -254,6 +254,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj, BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) +#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); @@ -266,7 +267,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, { struct btrfs_space_info *sinfo = to_space_info(kobj->parent); struct btrfs_block_group_cache *block_group; - int index = kobj - sinfo->block_group_kobjs; + int index = to_raid_kobj(kobj)->raid_type; u64 val = 0; down_read(&sinfo->groups_sem); @@ -288,7 +289,7 @@ static struct attribute *raid_attributes[] = { static void release_raid_kobj(struct kobject *kobj) { - kobject_put(kobj->parent); + kfree(to_raid_kobj(kobj)); } struct kobj_type btrfs_raid_ktype = { @@ -374,11 +375,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_root *root = fs_info->fs_root; int ret; - if (len >= BTRFS_LABEL_SIZE) { - pr_err("BTRFS: unable to set label with more than %d bytes\n", - BTRFS_LABEL_SIZE - 1); + if (len >= BTRFS_LABEL_SIZE) return -EINVAL; - } trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) @@ -396,8 +394,48 @@ static ssize_t btrfs_label_store(struct kobject *kobj, } BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); +static ssize_t btrfs_no_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + return -EPERM; +} + +static ssize_t btrfs_nodesize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); +} + +BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); + +static ssize_t btrfs_sectorsize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); + +static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); + static struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), + BTRFS_ATTR_PTR(nodesize), + BTRFS_ATTR_PTR(sectorsize), + BTRFS_ATTR_PTR(clone_alignment), NULL, }; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 757ef00a75a..a5dcacb5df9 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -21,6 +21,9 @@ #include <linux/magic.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../qgroup.h" static struct vfsmount *test_mnt = NULL; @@ -72,3 
+75,97 @@ void btrfs_destroy_test_fs(void) kern_unmount(test_mnt); unregister_filesystem(&test_type); } + +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) +{ + struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), + GFP_NOFS); + + if (!fs_info) + return fs_info; + fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), + GFP_NOFS); + if (!fs_info->fs_devices) { + kfree(fs_info); + return NULL; + } + fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), + GFP_NOFS); + if (!fs_info->super_copy) { + kfree(fs_info->fs_devices); + kfree(fs_info); + return NULL; + } + + if (init_srcu_struct(&fs_info->subvol_srcu)) { + kfree(fs_info->fs_devices); + kfree(fs_info->super_copy); + kfree(fs_info); + return NULL; + } + + spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->qgroup_lock); + spin_lock_init(&fs_info->qgroup_op_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + mutex_init(&fs_info->qgroup_rescan_lock); + rwlock_init(&fs_info->tree_mod_log_lock); + fs_info->running_transaction = NULL; + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_ulist = NULL; + atomic64_set(&fs_info->tree_mod_seq, 0); + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + return fs_info; +} + +static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +{ + struct radix_tree_iter iter; + void **slot; + + spin_lock(&fs_info->buffer_lock); +restart: + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot(slot); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + goto restart; + continue; + } + spin_unlock(&fs_info->buffer_lock); + free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); + } + spin_unlock(&fs_info->buffer_lock); + + btrfs_free_qgroup_config(fs_info); + btrfs_free_fs_roots(fs_info); + cleanup_srcu_struct(&fs_info->subvol_srcu); + kfree(fs_info->super_copy); + kfree(fs_info->fs_devices); + kfree(fs_info); +} + +void btrfs_free_dummy_root(struct btrfs_root *root) +{ + if (!root) + return; + if (root->node) + free_extent_buffer(root->node); + if (root->fs_info) + btrfs_free_dummy_fs_info(root->fs_info); + kfree(root); +} + diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 312560a9123..fd395422448 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -23,13 +23,18 @@ #define test_msg(fmt, ...) 
pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) +struct btrfs_root; + int btrfs_test_free_space_cache(void); int btrfs_test_extent_buffer_operations(void); int btrfs_test_extent_io(void); int btrfs_test_inodes(void); +int btrfs_test_qgroups(void); int btrfs_init_test_fs(void); void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_root(struct btrfs_root *root); #else static inline int btrfs_test_free_space_cache(void) { @@ -54,6 +59,10 @@ static inline int btrfs_test_inodes(void) { return 0; } +static inline int btrfs_test_qgroups(void) +{ + return 0; +} #endif #endif diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 397d1f99a8e..3ae0f5b8bb8 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -23,33 +23,6 @@ #include "../extent_io.h" #include "../volumes.h" -static struct btrfs_fs_info *alloc_dummy_fs_info(void) -{ - struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), - GFP_NOFS); - if (!fs_info) - return fs_info; - fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), - GFP_NOFS); - if (!fs_info->fs_devices) { - kfree(fs_info); - return NULL; - } - return fs_info; -} -static void free_dummy_root(struct btrfs_root *root) -{ - if (!root) - return; - if (root->fs_info) { - kfree(root->fs_info->fs_devices); - kfree(root->fs_info); - } - if (root->node) - free_extent_buffer(root->node); - kfree(root); -} - static void insert_extent(struct btrfs_root *root, u64 start, u64 len, u64 ram_bytes, u64 offset, u64 disk_bytenr, u64 disk_len, u32 type, u8 compression, int slot) @@ -276,7 +249,7 @@ static noinline int test_btrfs_get_extent(void) * We do this since btrfs_get_extent wants to assign em->bdev to * root->fs_info->fs_devices->latest_bdev. */ - root->fs_info = alloc_dummy_fs_info(); + root->fs_info = btrfs_alloc_dummy_fs_info(); if (!root->fs_info) { test_msg("Couldn't allocate dummy fs info\n"); goto out; @@ -837,7 +810,7 @@ out: if (!IS_ERR(em)) free_extent_map(em); iput(inode); - free_dummy_root(root); + btrfs_free_dummy_root(root); return ret; } @@ -864,7 +837,7 @@ static int test_hole_first(void) goto out; } - root->fs_info = alloc_dummy_fs_info(); + root->fs_info = btrfs_alloc_dummy_fs_info(); if (!root->fs_info) { test_msg("Couldn't allocate dummy fs info\n"); goto out; @@ -934,7 +907,7 @@ out: if (!IS_ERR(em)) free_extent_map(em); iput(inode); - free_dummy_root(root); + btrfs_free_dummy_root(root); return ret; } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c new file mode 100644 index 00000000000..fa691b754aa --- /dev/null +++ b/fs/btrfs/tests/qgroup-tests.c @@ -0,0 +1,468 @@ +/* + * Copyright (C) 2013 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../transaction.h" +#include "../disk-io.h" +#include "../qgroup.h" + +static void init_dummy_trans(struct btrfs_trans_handle *trans) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + INIT_LIST_HEAD(&trans->qgroup_ref_list); + trans->type = __TRANS_DUMMY; +} + +static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_tree_block_info *block_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); + int ret; + + init_dummy_trans(&trans); + + ins.objectid = bytenr; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); + if (ret) { + test_msg("Couldn't insert ref %d\n", ret); + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, 1); + btrfs_set_extent_generation(leaf, item, 1); + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(item + 1); + btrfs_set_tree_block_level(leaf, block_info, 1); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_free_path(path); + return 0; +} + +static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs + 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); + if (ret) + test_msg("Failed to insert backref\n"); + btrfs_free_path(path); + return ret; +} + +static int remove_extent_item(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_trans_handle trans; + struct btrfs_key key; + struct btrfs_path *path; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + 
if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + path->leave_spinning = 1; + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Didn't find our key %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return 0; +} + +static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs - 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Couldn't find backref %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return ret; +} + +static int test_no_shared_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup basic add\n"); + ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_item(root, 4096, 4096); + if (ret) + return -EINVAL; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_EXCL, 0); + if (ret) { + test_msg("Couldn't remove space from the qgroup %d\n", ret); + return -EINVAL; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +/* + * Add a ref for two different roots to make sure the shared value comes out + * right, also remove one of the roots and make sure the exclusive count is + * adjusted properly. 
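For orientation, these are the referenced/exclusive byte counts the test below checks with btrfs_verify_qgroup_counts() after each step: with only qgroup 5 referencing the 4096 byte extent, qgroup 5 shows rfer=4096 excl=4096; once qgroup 256 adds a shared ref, both 5 and 256 show rfer=4096 excl=0; after 256 drops its ref again, 256 returns to rfer=0 excl=0 and 5 goes back to rfer=4096 excl=4096.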
+ */ +static int test_multiple_refs(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup multiple refs test\n"); + + /* We have 5 created already from the previous test */ + ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = add_tree_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +int btrfs_test_qgroups(void) +{ + struct btrfs_root *root; + struct btrfs_root *tmp_root; + int ret = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + return PTR_ERR(root); + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; + goto out; + } + + /* + * Can't use bytenr 0, some things freak out + * *cough*backref walking code*cough* + */ + root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + root->alloc_bytenr += 8192; + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 5; + root->fs_info->fs_root = tmp_root; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = 
PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 256; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + /* We are using this root as our extent root */ + root->fs_info->extent_root = root; + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to add the root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + root->fs_info->quota_enabled = 1; + + test_msg("Running qgroup tests\n"); + ret = test_no_shared_qgroup(root); + if (ret) + goto out; + ret = test_multiple_refs(root); +out: + btrfs_free_dummy_root(root); + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7579f6d0b85..9630f10f8e1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -31,6 +31,7 @@ #include "inode-map.h" #include "volumes.h" #include "dev-replace.h" +#include "qgroup.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -241,18 +242,19 @@ loop: static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (root->ref_cows && root->last_trans < trans->transid) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); /* - * see below for in_trans_setup usage rules + * see below for IN_TRANS_SETUP usage rules * we have the reloc mutex held now, so there * is only one writer in this function */ - root->in_trans_setup = 1; + set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); - /* make sure readers find in_trans_setup before + /* make sure readers find IN_TRANS_SETUP before * they find our root->last_trans update */ smp_wmb(); @@ -279,7 +281,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * But, we have to set root->last_trans before we * init the relocation root, otherwise, we trip over warnings * in ctree.c. The solution used here is to flag ourselves - * with root->in_trans_setup. When this is 1, we're still + * with root IN_TRANS_SETUP. When this is 1, we're still * fixing up the reloc trees and everyone must wait.
* * When this is zero, they can trust root->last_trans and fly @@ -288,8 +290,8 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * done before we pop in the zero below */ btrfs_init_reloc_root(trans, root); - smp_wmb(); - root->in_trans_setup = 0; + smp_mb__before_atomic(); + clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); } return 0; } @@ -298,16 +300,16 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; /* - * see record_root_in_trans for comments about in_trans_setup usage + * see record_root_in_trans for comments about IN_TRANS_SETUP usage * and barriers */ smp_rmb(); if (root->last_trans == trans->transid && - !root->in_trans_setup) + !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) return 0; mutex_lock(&root->fs_info->reloc_mutex); @@ -365,7 +367,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type) static inline bool need_reserve_reloc_root(struct btrfs_root *root) { if (!root->fs_info->reloc_ctl || - !root->ref_cows || + !test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root) return false; @@ -695,6 +697,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; + int must_run_delayed_refs = 0; if (trans->use_count > 1) { trans->use_count--; @@ -702,14 +705,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } - /* - * do the qgroup accounting as early as possible - */ - err = btrfs_delayed_refs_qgroup_accounting(trans, info); - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + trans->delayed_ref_updates = 0; + if (!trans->sync) { + must_run_delayed_refs = + btrfs_should_throttle_delayed_refs(trans, root); + cur = max_t(unsigned long, cur, 32); + + /* + * don't make the caller wait if they are from a NOLOCK + * or ATTACH transaction, it will deadlock with commit + */ + if (must_run_delayed_refs == 1 && + (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) + must_run_delayed_refs = 2; + } + if (trans->qgroup_reserved) { /* * the same root has to be passed here between start_transaction @@ -719,16 +735,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->qgroup_reserved = 0; } - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, root); - - trans->delayed_ref_updates = 0; - if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) { - cur = max_t(unsigned long, cur, 32); - trans->delayed_ref_updates = 0; - btrfs_run_delayed_refs(trans, root, cur); - } - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -778,6 +784,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (must_run_delayed_refs) { + btrfs_async_run_delayed_refs(root, cur, + must_run_delayed_refs == 1); + } return err; } @@ -1049,8 +1059,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_save_ino_cache(root, trans); /* see comments in should_cow_block() */ - root->force_cow = 0; - smp_wmb(); + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); 
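
The smp_mb__after_atomic() on the next line pairs with the clear_bit() just above: the kernel's plain bitops are not implied memory barriers, so the explicit barrier keeps the ordering the old smp_wmb() gave the former force_cow/in_trans_setup stores. The same publish/observe pattern runs through record_root_in_trans() earlier in this series: set the IN_TRANS_SETUP bit, barrier, update last_trans, and readers check last_trans before re-testing the bit. The following is only a rough user-space analogue using C11 fences; the flag and variable names are invented for illustration and are not kernel APIs.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-ins only; these are not the kernel's structures. */
    static atomic_bool in_setup;            /* ~ BTRFS_ROOT_IN_TRANS_SETUP */
    static atomic_ulong last_trans;

    static void record_root(unsigned long transid)
    {
            atomic_store_explicit(&in_setup, true, memory_order_relaxed);
            /* ~ smp_wmb(): readers must see the flag before the new last_trans */
            atomic_thread_fence(memory_order_release);
            atomic_store_explicit(&last_trans, transid, memory_order_relaxed);

            /* ... reloc-root setup would happen here ... */

            /* ~ smp_mb__before_atomic(): setup is visible before the flag clears */
            atomic_thread_fence(memory_order_release);
            atomic_store_explicit(&in_setup, false, memory_order_relaxed);
    }

    static bool already_recorded(unsigned long transid)
    {
            if (atomic_load_explicit(&last_trans, memory_order_relaxed) != transid)
                    return false;
            /* ~ smp_rmb(): only trust the flag after seeing a current last_trans */
            atomic_thread_fence(memory_order_acquire);
            return !atomic_load_explicit(&in_setup, memory_order_relaxed);
    }

    int main(void)
    {
            record_root(100);
            printf("recorded for 100: %d\n", already_recorded(100));
            printf("recorded for 101: %d\n", already_recorded(101));
            return 0;
    }
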
+ smp_mb__after_atomic(); if (root->commit_root != root->node) { list_add_tail(&root->dirty_list, @@ -1081,7 +1091,7 @@ int btrfs_defrag_root(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - if (xchg(&root->defrag_running, 1)) + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) return 0; while (1) { @@ -1104,7 +1114,7 @@ int btrfs_defrag_root(struct btrfs_root *root) break; } } - root->defrag_running = 0; + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); return ret; } @@ -1168,12 +1178,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto no_free_objectid; } - pending->error = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (pending->error) - goto no_free_objectid; - key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; @@ -1270,8 +1274,24 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } + /* + * We need to flush delayed refs in order to make sure all of our quota + * operations have been done before we call btrfs_qgroup_inherit. + */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + pending->error = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (pending->error) + goto no_free_objectid; + /* see comments in should_cow_block() */ - root->force_cow = 1; + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); btrfs_set_root_node(new_root_item, tmp); @@ -1598,12 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, * them now so that they hinder processing of more delayed refs * as little as possible. */ - if (ret) { - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - return ret; - } - - ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); if (ret) return ret; @@ -1984,19 +1998,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); - /* - * Make sure root is not involved in send, - * if we fail with first root, we return - * directly rather than continue. 
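
In the create_pending_snapshot() hunk above, btrfs_qgroup_inherit() now runs only after a full btrfs_run_delayed_refs(trans, root, (unsigned long)-1) pass, so every queued quota operation is folded into the source qgroup before its counts are copied into the snapshot's new qgroup. A toy illustration of why that ordering matters follows; all structures and numbers are made up for the example.

    #include <stdio.h>

    struct qgroup { long long rfer; };

    /* Toy stand-ins: a parent qgroup plus deltas queued by earlier operations. */
    static const long long pending[] = { 4096, 8192, -4096 };

    static void flush_pending(struct qgroup *qg)
    {
            for (unsigned i = 0; i < sizeof(pending) / sizeof(pending[0]); i++)
                    qg->rfer += pending[i];
    }

    static struct qgroup inherit(const struct qgroup *parent)
    {
            return *parent;  /* the snapshot's qgroup starts from the parent's counts */
    }

    int main(void)
    {
            struct qgroup parent = { .rfer = 16384 };

            struct qgroup stale = inherit(&parent); /* copy before the queue is drained */
            flush_pending(&parent);
            struct qgroup fresh = inherit(&parent); /* copy after, as the patch now does */

            printf("stale child rfer: %lld\n", stale.rfer);
            printf("fresh child rfer: %lld\n", fresh.rfer);
            return 0;
    }

Copying before the flush would seed the snapshot with whatever the parent happened to show mid-update, which is the stale case the first printf demonstrates.
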
- */ - spin_lock(&root->root_item_lock); - if (root->send_in_progress) { - spin_unlock(&fs_info->trans_lock); - spin_unlock(&root->root_item_lock); - return 0; - } - spin_unlock(&root->root_item_lock); - list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b57b924e8e0..7dd558ed071 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -69,6 +69,7 @@ struct btrfs_transaction { #define __TRANS_ATTACH (1U << 10) #define __TRANS_JOIN (1U << 11) #define __TRANS_JOIN_NOLOCK (1U << 12) +#define __TRANS_DUMMY (1U << 13) #define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 76928ca9774..a63719cc957 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -49,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } - if (root->ref_cows == 0) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; if (btrfs_test_opt(root, SSD)) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e2f45fc0261..9e1f2cd5e67 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -20,13 +20,11 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/list_sort.h> -#include "ctree.h" -#include "transaction.h" +#include "tree-log.h" #include "disk-io.h" #include "locking.h" #include "print-tree.h" #include "backref.h" -#include "tree-log.h" #include "hash.h" /* magic values for the inode_only field in btrfs_log_inode: @@ -144,17 +142,15 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; goto out; } - if (!root->log_start_pid) { root->log_start_pid = current->pid; - root->log_multiple_pids = false; + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } else if (root->log_start_pid != current->pid) { - root->log_multiple_pids = true; + set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } atomic_inc(&root->log_batch); @@ -181,7 +177,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ret) goto out; } - root->log_multiple_pids = false; + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); @@ -2500,7 +2496,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { + if (!btrfs_test_opt(root, SSD) && + test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); @@ -2511,8 +2508,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2533,8 +2529,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); - 
ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); mutex_unlock(&root->log_mutex); goto out; } @@ -2577,8 +2572,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); + if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); @@ -2622,8 +2617,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); @@ -2637,8 +2631,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2667,8 +2660,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; } @@ -2886,7 +2878,7 @@ fail: out_unlock: mutex_unlock(&BTRFS_I(dir)->log_mutex); if (ret == -ENOSPC) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0) btrfs_abort_transaction(trans, root, ret); @@ -2919,7 +2911,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); if (ret == -ENOSPC) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) btrfs_abort_transaction(trans, root, ret); @@ -4130,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * make sure any commits to the log are forced * to be full commits */ - root->fs_info->last_trans_log_full_commit = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 1; break; } @@ -4177,6 +4168,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } + /* + * The prev transaction commit doesn't complete, we need do + * full commit by ourselves. 
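
The tree-log.c changes above, together with the tree-log.h hunk below, funnel every read and write of last_trans_log_full_commit through btrfs_set_log_full_commit() and btrfs_need_log_full_commit(), so the ACCESS_ONCE() discipline lives in one place instead of being repeated at each call site. A user-space sketch of what the macro buys follows; it reuses the kernel's ACCESS_ONCE definition (a GNU C volatile cast), but the flag variable and helper names are invented for illustration.

    #include <stdio.h>

    /* Same idea as the kernel macro: force a single access through a volatile
     * lvalue so the compiler cannot cache or re-read the variable. */
    #define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

    static unsigned long last_full_commit;  /* illustrative stand-in */

    static void set_log_full_commit(unsigned long transid)
    {
            ACCESS_ONCE(last_full_commit) = transid;
    }

    static int need_log_full_commit(unsigned long transid)
    {
            return ACCESS_ONCE(last_full_commit) == transid;
    }

    int main(void)
    {
            set_log_full_commit(42);
            printf("full commit needed for 42: %d\n", need_log_full_commit(42));
            printf("full commit needed for 43: %d\n", need_log_full_commit(43));
            return 0;
    }
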
+ */ if (root->fs_info->last_trans_log_full_commit > root->fs_info->last_trans_committed) { ret = 1; @@ -4246,7 +4241,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 1; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 91b145fce33..7f5b41bd537 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,9 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +#include "ctree.h" +#include "transaction.h" + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 @@ -35,6 +38,19 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) INIT_LIST_HEAD(&ctx->list); } +static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid; +} + +static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + return ACCESS_ONCE(fs_info->last_trans_log_full_commit) == + trans->transid; +} + int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 49d7fab7336..ffeed6d6326 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1452,6 +1452,22 @@ out: return ret; } +/* + * Function to update ctime/mtime for a given device path. + * Mainly used for ctime/mtime based probe like libblkid. + */ +static void update_dev_time(char *path_name) +{ + struct file *filp; + + filp = filp_open(path_name, O_RDWR, 0); + if (!filp) + return; + file_update_time(filp); + filp_close(filp, NULL); + return; +} + static int btrfs_rm_dev_item(struct btrfs_root *root, struct btrfs_device *device) { @@ -1674,11 +1690,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) struct btrfs_fs_devices *fs_devices; fs_devices = root->fs_info->fs_devices; while (fs_devices) { - if (fs_devices->seed == cur_devices) + if (fs_devices->seed == cur_devices) { + fs_devices->seed = cur_devices->seed; break; + } fs_devices = fs_devices->seed; } - fs_devices->seed = cur_devices->seed; cur_devices->seed = NULL; lock_chunks(root); __btrfs_close_devices(cur_devices); @@ -1694,20 +1711,55 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * remove it from the devices list and zero out the old super */ if (clear_super && disk_super) { + u64 bytenr; + int i; + /* make sure this device isn't detected as part of * the FS anymore */ memset(&disk_super->magic, 0, sizeof(disk_super->magic)); set_buffer_dirty(bh); sync_dirty_buffer(bh); + + /* clear the mirror copies of super block on the disk + * being removed, 0th copy is been taken care above and + * the below would take of the rest + */ + for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + i_size_read(bdev->bd_inode)) + break; + + brelse(bh); + bh = __bread(bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + if (btrfs_super_bytenr(disk_super) != bytenr || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + continue; + } + memset(&disk_super->magic, 0, + sizeof(disk_super->magic)); + set_buffer_dirty(bh); + 
sync_dirty_buffer(bh); + } } ret = 0; - /* Notify udev that device has changed */ - if (bdev) + if (bdev) { + /* Notify udev that device has changed */ btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); + } + error_brelse: brelse(bh); if (bdev) @@ -1883,7 +1935,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) fs_devices->seeding = 0; fs_devices->num_devices = 0; fs_devices->open_devices = 0; - fs_devices->total_devices = 0; fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); @@ -2146,6 +2197,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_commit_transaction(trans, root); } + /* Update ctime/mtime for libblkid */ + update_dev_time(device_path); return ret; error_trans: @@ -2922,6 +2975,16 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* + * limited by count, must be the last filter + */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { + if (bargs->limit == 0) + return 0; + else + bargs->limit--; + } + return 1; } @@ -2944,6 +3007,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int ret; int enospc_errors = 0; bool counting = true; + u64 limit_data = bctl->data.limit; + u64 limit_meta = bctl->meta.limit; + u64 limit_sys = bctl->sys.limit; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -2982,6 +3048,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) memset(&bctl->stat, 0, sizeof(bctl->stat)); spin_unlock(&fs_info->balance_lock); again: + if (!counting) { + bctl->data.limit = limit_data; + bctl->meta.limit = limit_meta; + bctl->sys.limit = limit_sys; + } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -3881,7 +3952,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root, u8 *ptr; array_size = btrfs_super_sys_array_size(super_copy); - if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + if (array_size + item_size + sizeof(disk_key) + > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) return -EFBIG; ptr = super_copy->sys_chunk_array + array_size; @@ -3986,6 +4058,16 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } +#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ + - sizeof(struct btrfs_item) \ + - sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ + - 2 * sizeof(struct btrfs_disk_key) \ + - 2 * sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 start, u64 type) @@ -4035,6 +4117,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) @@ -4042,11 +4126,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, else max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * 
max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; } else { - btrfs_err(info, "invalid chunk type 0x%llx requested\n", + btrfs_err(info, "invalid chunk type 0x%llx requested", type); BUG_ON(1); } @@ -4294,7 +4382,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, if (em->start != chunk_offset || em->len != chunk_size) { btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" - " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, + " %Lu-%Lu, found %Lu-%Lu", chunk_offset, chunk_size, em->start, em->len); free_extent_map(em); return -EINVAL; @@ -4496,14 +4584,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * and exit, so return 1 so the callers don't try to use other copies. */ if (!em) { - btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical, + btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, logical+len); return 1; } if (em->start > logical || em->start + em->len < logical) { btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " - "%Lu-%Lu\n", logical, logical+len, em->start, + "%Lu-%Lu", logical, logical+len, em->start, em->start + em->len); free_extent_map(em); return 1; @@ -4684,7 +4772,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (em->start > logical || em->start + em->len < logical) { btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " - "found %Lu-%Lu\n", logical, em->start, + "found %Lu-%Lu", logical, em->start, em->start + em->len); free_extent_map(em); return -EINVAL; @@ -6058,10 +6146,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) - device->dev_root = fs_info->dev_root; - mutex_unlock(&fs_devices->device_list_mutex); + while (fs_devices) { + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->dev_root = fs_info->dev_root; + mutex_unlock(&fs_devices->device_list_mutex); + + fs_devices = fs_devices->seed; + } } static void __btrfs_reset_dev_stats(struct btrfs_device *dev) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 80754f9dd3d..1a15bbeb65e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -255,6 +255,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) /* * Profile changing flags. 
When SOFT is set we won't relocate chunk if diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8e57191950c..4f196314c0c 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -98,7 +98,7 @@ static int zlib_compress_pages(struct list_head *ws, if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { printk(KERN_WARNING "BTRFS: deflateInit failed\n"); - ret = -1; + ret = -EIO; goto out; } @@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws, out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { - ret = -1; + ret = -ENOMEM; goto out; } cpage_out = kmap(out_page); @@ -128,7 +128,7 @@ static int zlib_compress_pages(struct list_head *ws, printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); zlib_deflateEnd(&workspace->def_strm); - ret = -1; + ret = -EIO; goto out; } @@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws, if (workspace->def_strm.total_in > 8192 && workspace->def_strm.total_in < workspace->def_strm.total_out) { - ret = -1; + ret = -EIO; goto out; } /* we need another page for writing out. Test this @@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws, kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; - ret = -1; + ret = -E2BIG; goto out; } out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { - ret = -1; + ret = -ENOMEM; goto out; } cpage_out = kmap(out_page); @@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws, zlib_deflateEnd(&workspace->def_strm); if (ret != Z_STREAM_END) { - ret = -1; + ret = -EIO; goto out; } if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { - ret = -1; + ret = -E2BIG; goto out; } @@ -253,7 +253,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "BTRFS: inflateInit failed\n"); - return -1; + return -EIO; } while (workspace->inf_strm.total_in < srclen) { ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); @@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } } if (ret != Z_STREAM_END) - ret = -1; + ret = -EIO; else ret = 0; done: @@ -337,7 +337,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "BTRFS: inflateInit failed\n"); - return -1; + return -EIO; } while (bytes_left > 0) { @@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, total_out = workspace->inf_strm.total_out; if (total_out == buf_start) { - ret = -1; + ret = -EIO; break; } @@ -382,7 +382,7 @@ next: } if (ret != Z_STREAM_END && bytes_left != 0) - ret = -1; + ret = -EIO; else ret = 0; |
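
The zlib.c hunk above stops collapsing every failure into -1 and instead returns distinct errno values: -ENOMEM for allocation failures, -E2BIG when the output will not fit or will not shrink, and -EIO for stream errors, which lets callers such as btrfs_compress_pages() propagate something meaningful. Here is a minimal user-space sketch of the same mapping around the real zlib API; the function and buffer names are mine, and the level-3 setting simply mirrors the zlib_deflateInit(..., 3) call shown in the diff. Build with -lz.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    /*
     * Compress src into dst and translate zlib status codes into negative errno
     * values, mirroring the convention the patch introduces.
     */
    static int compress_buf(const unsigned char *src, size_t src_len,
                            unsigned char *dst, size_t *dst_len)
    {
            z_stream strm;
            int ret;

            memset(&strm, 0, sizeof(strm));
            ret = deflateInit(&strm, 3);
            if (ret == Z_MEM_ERROR)
                    return -ENOMEM;
            if (ret != Z_OK)
                    return -EIO;

            strm.next_in = (unsigned char *)src;
            strm.avail_in = (uInt)src_len;
            strm.next_out = dst;
            strm.avail_out = (uInt)*dst_len;

            ret = deflate(&strm, Z_FINISH);
            deflateEnd(&strm);

            if (ret == Z_BUF_ERROR || (ret == Z_OK && strm.avail_out == 0))
                    return -E2BIG;          /* ran out of output space */
            if (ret != Z_STREAM_END)
                    return -EIO;            /* stream-level failure */
            if (strm.total_out >= src_len)
                    return -E2BIG;          /* compression did not help */

            *dst_len = strm.total_out;
            return 0;
    }

    int main(void)
    {
            unsigned char in[8192], out[8192];
            size_t out_len = sizeof(out);
            int ret;

            memset(in, 'a', sizeof(in));    /* highly compressible input */
            ret = compress_buf(in, sizeof(in), out, &out_len);
            if (ret)
                    printf("compress_buf failed: %d\n", ret);
            else
                    printf("compressed %zu -> %zu bytes\n", sizeof(in), out_len);
            return 0;
    }
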