diff options
Diffstat (limited to 'fs/btrfs')
53 files changed, 4355 insertions, 1577 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index f341a98031d..6d1d0b93b1a 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -16,4 +16,4 @@ btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ - tests/extent-io-tests.o tests/inode-tests.o + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index ff9b3995d45..9a0124a9585 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -79,13 +79,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, const char *name; char *value = NULL; - if (acl) { - ret = posix_acl_valid(acl); - if (ret < 0) - return ret; - ret = 0; - } - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 10db21fa092..e25564bfcb4 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -900,7 +900,11 @@ again: goto out; BUG_ON(ret == 0); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (trans && likely(trans->type != __TRANS_DUMMY)) { +#else if (trans) { +#endif /* * look if there are updates for this ref queued and lock the * head @@ -984,11 +988,12 @@ again: goto out; } if (ref->count && ref->parent) { - if (extent_item_pos && !ref->inode_list) { + if (extent_item_pos && !ref->inode_list && + ref->level == 0) { u32 bsz; struct extent_buffer *eb; bsz = btrfs_level_size(fs_info->extent_root, - info_level); + ref->level); eb = read_tree_block(fs_info->extent_root, ref->parent, bsz, 0); if (!eb || !extent_buffer_uptodate(eb)) { @@ -1404,9 +1409,10 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, * returns <0 on error */ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - struct btrfs_extent_inline_ref **out_eiref, - int *out_type) + struct btrfs_key *key, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) { unsigned long end; u64 flags; @@ -1416,19 +1422,26 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, /* first call */ flags = btrfs_extent_flags(eb, ei); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - info = (struct btrfs_tree_block_info *)(ei + 1); - *out_eiref = - (struct btrfs_extent_inline_ref *)(info + 1); + if (key->type == BTRFS_METADATA_ITEM_KEY) { + /* a skinny metadata extent */ + *out_eiref = + (struct btrfs_extent_inline_ref *)(ei + 1); + } else { + WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY); + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } } else { *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); } *ptr = (unsigned long)*out_eiref; - if ((void *)*ptr >= (void *)ei + item_size) + if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size) return -ENOENT; } end = (unsigned long)ei + item_size; - *out_eiref = (struct btrfs_extent_inline_ref *)*ptr; + *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); *ptr += btrfs_extent_inline_ref_size(*out_type); @@ -1447,8 +1460,8 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, * <0 on error. */ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - u64 *out_root, u8 *out_level) + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level) { int ret; int type; @@ -1459,8 +1472,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 1; while (1) { - ret = __get_extent_inline_ref(ptr, eb, ei, item_size, - &eiref, &type); + ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size, + &eiref, &type); if (ret < 0) return ret; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index a910b27a8ad..86fc20fec28 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -40,8 +40,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, u64 *flags); int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - u64 *out_root, u8 *out_level); + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level); int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, @@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots); + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c9a24444ec9..4794923c410 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -279,9 +279,11 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end); + #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 0e8388e72d8..ce92ae30250 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1093,6 +1093,7 @@ leaf_item_out_of_bounce_error: next_stack = btrfsic_stack_frame_alloc(); if (NULL == next_stack) { + sf->error = -1; btrfsic_release_block_ctx( &sf-> next_block_ctx); @@ -1190,8 +1191,10 @@ continue_with_current_node_stack_frame: sf->next_block_ctx.datav[0]; next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) + if (NULL == next_stack) { + sf->error = -1; goto one_stack_frame_backwards; + } next_stack->i = -1; next_stack->block = sf->next_block; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d43c544d3b6..1daea0b4718 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace) spin_lock(workspace_lock); if (*num_workspace < num_online_cpus()) { - list_add_tail(workspace, idle_workspace); + list_add(workspace, idle_workspace); (*num_workspace)++; spin_unlock(workspace_lock); goto wake; @@ -887,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -1; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, len, pages, @@ -923,7 +923,7 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -ENOMEM; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, disk_start, @@ -945,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -ENOMEM; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, dest_page, start_byte, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1bcfcdb23cf..aeab453b8e2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -224,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) static void add_root_to_dirty_list(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - if (root->track_dirty && list_empty(&root->dirty_list)) { + if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && + list_empty(&root->dirty_list)) { list_add(&root->dirty_list, &root->fs_info->dirty_cowonly_roots); } @@ -246,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int level; struct btrfs_disk_key disk_key; - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); if (level == 0) @@ -354,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) } /* - * Increment the upper half of tree_mod_seq, set lower half zero. - * - * Must be called with fs_info->tree_mod_seq_lock held. - */ -static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info) -{ - u64 seq = atomic64_read(&fs_info->tree_mod_seq); - seq &= 0xffffffff00000000ull; - seq += 1ull << 32; - atomic64_set(&fs_info->tree_mod_seq, seq); - return seq; -} - -/* - * Increment the lower half of tree_mod_seq. - * - * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers - * are generated should not technically require a spin lock here. (Rationale: - * incrementing the minor while incrementing the major seq number is between its - * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it - * just returns a unique sequence number as usual.) We have decided to leave - * that requirement in here and rethink it once we notice it really imposes a - * problem on some workload. + * Pull a new tree mod seq number for our operation. */ -static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info) +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic64_inc_return(&fs_info->tree_mod_seq); } /* - * return the last minor in the previous major tree_mod_seq number - */ -u64 btrfs_tree_mod_seq_prev(u64 seq) -{ - return (seq & 0xffffffff00000000ull) - 1ull; -} - -/* * This adds a new blocker to the tree mod log's blocker list if the @elem * passed does not already have a sequence number set. So when a caller expects * to record tree modifications, it should ensure to set elem->seq to zero @@ -402,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq) u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { - u64 seq; - tree_mod_log_write_lock(fs_info); spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq_major(fs_info); + elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); tree_mod_log_write_unlock(fs_info); - return seq; + return elem->seq; } void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -487,9 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) BUG_ON(!tm); - spin_lock(&fs_info->tree_mod_seq_lock); - tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); - spin_unlock(&fs_info->tree_mod_seq_lock); + tm->seq = btrfs_inc_tree_mod_seq(fs_info); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; @@ -997,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. */ - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && buf != root->node && buf != root->commit_root && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) || btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) return 1; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) return 1; #endif @@ -1146,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(buf); - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); @@ -1193,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) return ret; @@ -1538,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif /* ensure we can see the force_cow */ smp_rmb(); @@ -1556,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && - !root->force_cow) + !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; return 1; } @@ -5125,7 +5097,17 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) return ret; btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); - if (ret < 0) + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) return 0; return 1; } @@ -5736,6 +5718,24 @@ again: ret = 0; goto done; } + /* + * So the above check misses one case: + * - after releasing the path above, someone has removed the item that + * used to be at the very end of the block, and balance between leafs + * gets another one with bigger key.offset to replace it. + * + * This one should be returned as well, or we can get leaf corruption + * later(esp. in __btrfs_drop_extents()). + * + * And a bit more explanation about this check, + * with ret > 0, the key isn't found, the path points to the slot + * where it should be inserted, so the path->slots[0] item must be the + * bigger one. + */ + if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { + ret = 0; + goto done; + } while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ba6b88528dc..be91397f4e9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include <asm/kmap_types.h> #include <linux/pagemap.h> #include <linux/btrfs.h> +#include <linux/workqueue.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -756,6 +757,12 @@ struct btrfs_dir_item { #define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) +/* + * Internal in-memory flag that a subvolume has been marked for deletion but + * still visible as a directory + */ +#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) + struct btrfs_root_item { struct btrfs_inode_item inode; __le64 generation; @@ -840,7 +847,10 @@ struct btrfs_disk_balance_args { /* BTRFS_BALANCE_ARGS_* */ __le64 flags; - __le64 unused[8]; + /* BTRFS_BALANCE_ARGS_LIMIT value */ + __le64 limit; + + __le64 unused[7]; } __attribute__ ((__packed__)); /* @@ -1113,6 +1123,12 @@ struct btrfs_qgroup_limit_item { __le64 rsv_excl; } __attribute__ ((__packed__)); +/* For raid type sysfs entries */ +struct raid_kobject { + int raid_type; + struct kobject kobj; +}; + struct btrfs_space_info { spinlock_t lock; @@ -1163,7 +1179,7 @@ struct btrfs_space_info { wait_queue_head_t wait; struct kobject kobj; - struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES]; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; #define BTRFS_BLOCK_RSV_GLOBAL 1 @@ -1243,11 +1259,19 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; + u64 delalloc_bytes; u64 bytes_super; u64 flags; u64 sectorsize; u64 cache_generation; + /* + * It is just used for the delayed data space allocation because + * only the data space allocation and the relative metadata update + * can be done cross the transaction. + */ + struct rw_semaphore data_rwsem; + /* for raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; @@ -1313,6 +1337,8 @@ struct btrfs_stripe_hash_table { #define BTRFS_STRIPE_HASH_TABLE_BITS 11 +void btrfs_init_async_reclaim_work(struct work_struct *work); + /* fs_info */ struct reloc_control; struct btrfs_device; @@ -1534,6 +1560,9 @@ struct btrfs_fs_info { */ struct btrfs_workqueue *fixup_workers; struct btrfs_workqueue *delayed_workers; + + /* the extent workers do delayed refs on the extent allocation tree */ + struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; int thread_pool_size; @@ -1636,7 +1665,10 @@ struct btrfs_fs_info { /* holds configuration and tracking. Protected by qgroup_lock */ struct rb_root qgroup_tree; + struct rb_root qgroup_op_tree; spinlock_t qgroup_lock; + spinlock_t qgroup_op_lock; + atomic_t qgroup_op_seq; /* * used to avoid frequently calling ulist_alloc()/ulist_free() @@ -1688,6 +1720,9 @@ struct btrfs_fs_info { struct semaphore uuid_tree_rescan_sem; unsigned int update_uuid_tree_gen:1; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; }; struct btrfs_subvolume_writers { @@ -1696,6 +1731,26 @@ struct btrfs_subvolume_writers { }; /* + * The state of btrfs root + */ +/* + * btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ +#define BTRFS_ROOT_IN_TRANS_SETUP 0 +#define BTRFS_ROOT_REF_COWS 1 +#define BTRFS_ROOT_TRACK_DIRTY 2 +#define BTRFS_ROOT_IN_RADIX 3 +#define BTRFS_ROOT_DUMMY_ROOT 4 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 +#define BTRFS_ROOT_DEFRAG_RUNNING 6 +#define BTRFS_ROOT_FORCE_COW 7 +#define BTRFS_ROOT_MULTI_LOG_TASKS 8 + +/* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ @@ -1706,6 +1761,7 @@ struct btrfs_root { struct btrfs_root *log_root; struct btrfs_root *reloc_root; + unsigned long state; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; @@ -1740,7 +1796,6 @@ struct btrfs_root { /* Just be updated when the commit succeeds. */ int last_log_commit; pid_t log_start_pid; - bool log_multiple_pids; u64 objectid; u64 last_trans; @@ -1760,23 +1815,13 @@ struct btrfs_root { u64 highest_objectid; - /* btrfs_record_root_in_trans is a multi-step process, - * and it can race with the balancing code. But the - * race is very small, and only the first time the root - * is added to each transaction. So in_trans_setup - * is used to tell us when more checks are required - */ - unsigned long in_trans_setup; - int ref_cows; - int track_dirty; - int in_radix; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - int dummy_root; + u64 alloc_bytenr; #endif + u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; - int defrag_running; char *name; /* the dirty list is only used by non-reference counted roots */ @@ -1790,7 +1835,6 @@ struct btrfs_root { spinlock_t orphan_lock; atomic_t orphan_inodes; struct btrfs_block_rsv *orphan_block_rsv; - int orphan_item_inserted; int orphan_cleanup_state; spinlock_t inode_lock; @@ -1808,8 +1852,6 @@ struct btrfs_root { */ dev_t anon_dev; - int force_cow; - spinlock_t root_item_lock; atomic_t refs; @@ -2788,6 +2830,11 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } +static inline bool btrfs_root_dead(struct btrfs_root *root) +{ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); @@ -2897,6 +2944,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, cpu->vend = le64_to_cpu(disk->vend); cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); } static inline void @@ -2914,6 +2962,7 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, disk->vend = cpu_to_le64(cpu->vend); disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); } /* struct btrfs_super_block */ @@ -3236,6 +3285,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, @@ -3273,11 +3324,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data); + struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref, int no_quota); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref, int no_quota); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, @@ -3285,9 +3336,10 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow); + u64 owner, u64 offset, int no_quota); -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, + int delalloc); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len); void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, @@ -3297,7 +3349,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow); + u64 root_objectid, u64 owner, u64 offset, int no_quota); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3385,7 +3437,6 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); - int btrfs_start_nocow_write(struct btrfs_root *root); void btrfs_end_nocow_write(struct btrfs_root *root); /* ctree.c */ @@ -3561,7 +3612,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); -u64 btrfs_tree_mod_seq_prev(u64 seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ @@ -3708,6 +3758,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em); + /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; @@ -4069,52 +4125,6 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); -/* qgroup.c */ -struct qgroup_update { - struct list_head list; - struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op; -}; - -int btrfs_quota_enable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_quota_disable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); -void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - char *name); -int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid); -int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - struct btrfs_qgroup_limit *limit); -int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); -void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); -struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_run_qgroups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, - struct btrfs_qgroup_inherit *inherit); -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); - -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || @@ -4131,6 +4141,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); #endif #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 33e561a8401..da775bfdebc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -149,8 +149,8 @@ again: spin_lock(&root->inode_lock); ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); if (ret == -EEXIST) { - kmem_cache_free(delayed_node_cache, node); spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); radix_tree_preload_end(); goto again; } @@ -267,14 +267,17 @@ static void __btrfs_release_delayed_node( mutex_unlock(&delayed_node->mutex); if (atomic_dec_and_test(&delayed_node->refs)) { + bool free = false; struct btrfs_root *root = delayed_node->root; spin_lock(&root->inode_lock); if (atomic_read(&delayed_node->refs) == 0) { radix_tree_delete(&root->delayed_nodes_tree, delayed_node->inode_id); - kmem_cache_free(delayed_node_cache, delayed_node); + free = true; } spin_unlock(&root->inode_lock); + if (free) + kmem_cache_free(delayed_node_cache, delayed_node); } } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 31299646024..6d16bea94e1 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; + if (ref1->no_quota > ref2->no_quota) + return 1; + if (ref1->no_quota < ref2->no_quota) + return -1; /* merging of sequenced refs is not allowed */ if (compare_seq) { if (ref1->seq < ref2->seq) @@ -635,7 +639,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, - int action, int for_cow) + int action, int no_quota) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; @@ -645,6 +649,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ @@ -655,9 +661,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - - if (need_ref_seq(for_cow, ref_root)) - seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -697,7 +701,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, - u64 offset, int action, int for_cow) + u64 offset, int action, int no_quota) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; @@ -709,6 +713,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); + /* first set the basic ref node struct up */ atomic_set(&ref->refs, 1); ref->bytenr = bytenr; @@ -717,9 +724,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - - if (need_ref_seq(for_cow, ref_root)) - seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -762,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow) + int no_quota) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; + BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -793,10 +801,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, - for_cow); + no_quota); spin_unlock(&delayed_refs->lock); - if (need_ref_seq(for_cow, ref_root)) - btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } @@ -810,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow) + int no_quota) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; + BUG_ON(extent_op && !extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) @@ -841,10 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, - action, for_cow); + action, no_quota); spin_unlock(&delayed_refs->lock); - if (need_ref_seq(for_cow, ref_root)) - btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 4ba9b93022f..a764e2340d4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; + unsigned int no_quota:1; /* is this node still in the rbtree? */ unsigned int is_head:1; unsigned int in_tree:1; @@ -196,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow); + int no_quota); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow); + int no_quota); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -231,25 +232,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); /* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ - if (for_cow) - return 0; - - if (rootid == BTRFS_FS_TREE_OBJECTID) - return 1; - - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) - return 1; - - return 0; -} - -/* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9f2290509ac..eea26e1b2fd 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -36,6 +36,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "sysfs.h" static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); @@ -313,7 +314,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (btrfs_fs_incompat(fs_info, RAID56)) { btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); - return -EINVAL; + return -EOPNOTSUPP; } switch (args->start.cont_reading_from_srcdev_mode) { @@ -562,6 +563,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_srcdev(fs_info, src_device); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 983314932af..08e65e9cf2a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -49,6 +49,7 @@ #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" +#include "qgroup.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -368,7 +369,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); - btrfs_tree_read_unlock_blocking(eb); + if (need_lock) + btrfs_tree_read_unlock_blocking(eb); return ret; } @@ -1109,6 +1111,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return alloc_test_extent_buffer(root->fs_info, bytenr, + blocksize); +#endif return alloc_extent_buffer(root->fs_info, bytenr, blocksize); } @@ -1201,10 +1208,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->nodesize = nodesize; root->leafsize = leafsize; root->stripesize = stripesize; - root->ref_cows = 0; - root->track_dirty = 0; - root->in_radix = 0; - root->orphan_item_inserted = 0; + root->state = 0; root->orphan_cleanup_state = 0; root->objectid = objectid; @@ -1265,7 +1269,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, else root->defrag_trans_start = 0; init_completion(&root->kobj_unregister); - root->defrag_running = 0; root->root_key.objectid = objectid; root->anon_dev = 0; @@ -1290,7 +1293,8 @@ struct btrfs_root *btrfs_alloc_dummy_root(void) if (!root) return ERR_PTR(-ENOMEM); __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); - root->dummy_root = 1; + set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); + root->alloc_bytenr = 0; return root; } @@ -1341,8 +1345,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); root->commit_root = btrfs_root_node(root); - root->track_dirty = 1; - + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); root->root_item.flags = 0; root->root_item.byte_limit = 0; @@ -1371,6 +1374,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, fail: if (leaf) { btrfs_tree_unlock(leaf); + free_extent_buffer(root->commit_root); free_extent_buffer(leaf); } kfree(root); @@ -1396,13 +1400,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + /* + * DON'T set REF_COWS for log trees + * * log trees do not get reference counted because they go away * before a real commit is actually done. They do store pointers * to file data extents, and those reference counts still get * updated (along with back refs to the log tree). */ - root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, BTRFS_TREE_LOG_OBJECTID, NULL, @@ -1536,7 +1542,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, return root; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - root->ref_cows = 1; + set_bit(BTRFS_ROOT_REF_COWS, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1606,7 +1612,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, (unsigned long)root->root_key.objectid, root); if (ret == 0) - root->in_radix = 1; + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); @@ -1662,7 +1668,7 @@ again: if (ret < 0) goto fail; if (ret == 0) - root->orphan_item_inserted = 1; + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { @@ -2064,6 +2070,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); + btrfs_destroy_workqueue(fs_info->extent_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2090,7 +2097,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_root_extent_buffers(info->chunk_root); } -static void del_fs_roots(struct btrfs_fs_info *fs_info) +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_root *gang[8]; @@ -2101,7 +2108,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root, root_list); list_del(&gang[0]->root_list); - if (gang[0]->in_radix) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { btrfs_drop_and_free_fs_root(fs_info, gang[0]); } else { free_extent_buffer(gang[0]->node); @@ -2221,6 +2228,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); @@ -2246,6 +2254,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); + atomic_set(&fs_info->qgroup_op_seq, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; @@ -2291,6 +2300,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); + btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2354,6 +2364,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_lock); mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_op_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->quota_enabled = 0; @@ -2577,6 +2588,10 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("readahead", flags, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + fs_info->extent_workers = + btrfs_alloc_workqueue("extent-refs", flags, + min_t(u64, fs_devices->num_devices, + max_active), 8); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->submit_workers && fs_info->flush_workers && @@ -2586,6 +2601,7 @@ int open_ctree(struct super_block *sb, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->fixup_workers && fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { err = -ENOMEM; goto fail_sb_buffer; @@ -2693,7 +2709,7 @@ retry_root_backup: ret = PTR_ERR(extent_root); goto recovery_tree_root; } - extent_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state); fs_info->extent_root = extent_root; location.objectid = BTRFS_DEV_TREE_OBJECTID; @@ -2702,7 +2718,7 @@ retry_root_backup: ret = PTR_ERR(dev_root); goto recovery_tree_root; } - dev_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state); fs_info->dev_root = dev_root; btrfs_init_devices_late(fs_info); @@ -2712,13 +2728,13 @@ retry_root_backup: ret = PTR_ERR(csum_root); goto recovery_tree_root; } - csum_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state); fs_info->csum_root = csum_root; location.objectid = BTRFS_QUOTA_TREE_OBJECTID; quota_root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(quota_root)) { - quota_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, "a_root->state); fs_info->quota_enabled = 1; fs_info->pending_quota_state = 1; fs_info->quota_root = quota_root; @@ -2733,7 +2749,7 @@ retry_root_backup: create_uuid_tree = true; check_uuid_tree = false; } else { - uuid_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state); fs_info->uuid_root = uuid_root; create_uuid_tree = false; check_uuid_tree = @@ -2889,7 +2905,9 @@ retry_root_backup: if (ret) goto fail_qgroup; + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { printk(KERN_WARNING "BTRFS: failed to recover relocation\n"); @@ -2966,7 +2984,7 @@ fail_qgroup: fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); btrfs_cleanup_transaction(fs_info->tree_root); - del_fs_roots(fs_info); + btrfs_free_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); @@ -3501,8 +3519,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log(NULL, root); - __btrfs_remove_free_space_cache(root->free_ino_pinned); - __btrfs_remove_free_space_cache(root->free_ino_ctl); + if (root->free_ino_pinned) + __btrfs_remove_free_space_cache(root->free_ino_pinned); + if (root->free_ino_ctl) + __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); } @@ -3533,28 +3553,51 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; - int i; - int ret; + int i = 0; + int err = 0; + unsigned int ret = 0; + int index; while (1) { + index = srcu_read_lock(&fs_info->subvol_srcu); ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); - if (!ret) + if (!ret) { + srcu_read_unlock(&fs_info->subvol_srcu, index); break; - + } root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { - int err; + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_fs_root(gang[i]); + } + srcu_read_unlock(&fs_info->subvol_srcu, index); + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; root_objectid = gang[i]->root_key.objectid; err = btrfs_orphan_cleanup(gang[i]); if (err) - return err; + break; + btrfs_put_fs_root(gang[i]); } root_objectid++; } - return 0; + + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_fs_root(gang[i]); + } + return err; } int btrfs_commit_super(struct btrfs_root *root) @@ -3603,6 +3646,8 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); + cancel_work_sync(&fs_info->async_reclaim_work); + if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) @@ -3627,12 +3672,17 @@ int close_ctree(struct btrfs_root *root) btrfs_sysfs_remove_one(fs_info); - del_fs_roots(fs_info); + btrfs_free_fs_roots(fs_info); btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); + /* + * we must make sure there is not any read request to + * submit after we stopping all workers. + */ + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); free_root_pointers(fs_info, 1); @@ -3709,6 +3759,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, buf->len, root->fs_info->dirty_metadata_batch); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + btrfs_print_leaf(root, buf); + ASSERT(0); + } +#endif } static void __btrfs_btree_balance_dirty(struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 53059df350f..23ce3ceba0a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, int btrfs_init_fs_root(struct btrfs_root *root); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5590af92094..813537f362f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -26,16 +26,16 @@ #include <linux/ratelimit.h> #include <linux/percpu_counter.h> #include "hash.h" -#include "ctree.h" +#include "tree-log.h" #include "disk-io.h" #include "print-tree.h" -#include "transaction.h" #include "volumes.h" #include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "math.h" #include "sysfs.h" +#include "qgroup.h" #undef SCRAMBLE_DELAYED_REFS @@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op); + struct btrfs_delayed_extent_op *extra_op, + int no_quota); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins); + int level, struct btrfs_key *ins, + int no_quota); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); @@ -103,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level, static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); + u64 num_bytes, int reserve, + int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, @@ -1271,7 +1274,7 @@ fail: static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop) + int refs_to_drop, int *last_ref) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -1307,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); + *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1764,7 +1768,8 @@ void update_inline_extent_backref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int *last_ref) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1808,6 +1813,7 @@ void update_inline_extent_backref(struct btrfs_root *root, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { + *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1839,7 +1845,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); update_inline_extent_backref(root, path, iref, - refs_to_add, extent_op); + refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { setup_inline_extent_backref(root, path, iref, parent, root_objectid, owner, offset, @@ -1872,17 +1878,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data) + int refs_to_drop, int is_data, int *last_ref) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) { update_inline_extent_backref(root, path, iref, - -refs_to_drop, NULL); + -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, + last_ref); } else { + *last_ref = 1; ret = btrfs_del_item(trans, root, path); } return ret; @@ -1946,7 +1954,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow) + u64 root_objectid, u64 owner, u64 offset, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1958,12 +1967,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } return ret; } @@ -1973,31 +1982,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, + int no_quota, struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_extent_item *item; + struct btrfs_key key; u64 refs; int ret; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) + no_quota = 1; + path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, num_bytes, parent, + ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, + bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if (ret != -EAGAIN) + if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) goto out; + /* + * Ok we were able to insert an inline extent and it appears to be a new + * reference, deal with the qgroup accounting. + */ + if (!ret && !no_quota) { + ASSERT(root->fs_info->quota_enabled); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) + type = BTRFS_QGROUP_OPER_ADD_SHARED; + btrfs_release_path(path); + + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + goto out; + } + /* + * Ok we had -EAGAIN which means we didn't have space to insert and + * inline extent ref, so just update the reference count and add a + * normal backref. + */ leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); + if (refs) + type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -2005,9 +2047,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + if (ret) + goto out; + } + path->reada = 1; path->leave_spinning = 1; - /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, root_objectid, @@ -2041,8 +2089,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { if (extent_op) @@ -2056,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + node->no_quota, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + extent_op, node->no_quota); } else { BUG(); } @@ -2199,8 +2246,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; ins.objectid = node->bytenr; if (skinny_metadata) { @@ -2218,15 +2264,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins); + ref->level, &ins, + node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, node->no_quota, + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, extent_op, + node->no_quota); } else { BUG(); } @@ -2574,42 +2623,6 @@ static u64 find_middle(struct rb_root *root) } #endif -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct qgroup_update *qgroup_update; - int ret = 0; - - if (list_empty(&trans->qgroup_ref_list) != - !trans->delayed_ref_elem.seq) { - /* list without seq or seq without list */ - btrfs_err(fs_info, - "qgroup accounting update error, list is%s empty, seq is %#x.%x", - list_empty(&trans->qgroup_ref_list) ? "" : " not", - (u32)(trans->delayed_ref_elem.seq >> 32), - (u32)trans->delayed_ref_elem.seq); - BUG(); - } - - if (!trans->delayed_ref_elem.seq) - return 0; - - while (!list_empty(&trans->qgroup_ref_list)) { - qgroup_update = list_first_entry(&trans->qgroup_ref_list, - struct qgroup_update, list); - list_del(&qgroup_update->list); - if (!ret) - ret = btrfs_qgroup_account_ref( - trans, fs_info, qgroup_update->node, - qgroup_update->extent_op); - kfree(qgroup_update); - } - - btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); - - return ret; -} - static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) { u64 num_bytes; @@ -2662,15 +2675,94 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, u64 num_entries = atomic_read(&trans->transaction->delayed_refs.num_entries); u64 avg_runtime; + u64 val; smp_mb(); avg_runtime = fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; if (num_entries * avg_runtime >= NSEC_PER_SEC) return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; return btrfs_check_space_for_delayed_refs(trans, root); } +struct async_delayed_refs { + struct btrfs_root *root; + int count; + int error; + int sync; + struct completion wait; + struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ + struct async_delayed_refs *async; + struct btrfs_trans_handle *trans; + int ret; + + async = container_of(work, struct async_delayed_refs, work); + + trans = btrfs_join_transaction(async->root); + if (IS_ERR(trans)) { + async->error = PTR_ERR(trans); + goto done; + } + + /* + * trans->sync means that when we call end_transaciton, we won't + * wait on delayed refs + */ + trans->sync = true; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); + if (ret) + async->error = ret; + + ret = btrfs_end_transaction(trans, async->root); + if (ret && !async->error) + async->error = ret; +done: + if (async->sync) + complete(&async->wait); + else + kfree(async); +} + +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait) +{ + struct async_delayed_refs *async; + int ret; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->root = root->fs_info->tree_root; + async->count = count; + async->error = 0; + if (wait) + async->sync = 1; + else + async->sync = 0; + init_completion(&async->wait); + + btrfs_init_work(&async->work, delayed_ref_async_start, + NULL, NULL); + + btrfs_queue_work(root->fs_info->extent_workers, &async->work); + + if (wait) { + wait_for_completion(&async->wait); + ret = async->error; + kfree(async); + return ret; + } + return 0; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2698,8 +2790,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - delayed_refs = &trans->transaction->delayed_refs; if (count == 0) { count = atomic_read(&delayed_refs->num_entries) * 2; @@ -2758,6 +2848,9 @@ again: goto again; } out: + ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); + if (ret) + return ret; assert_qgroups_uptodate(trans); return 0; } @@ -2964,7 +3057,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int for_cow) + int full_backref, int inc, int no_quota) { u64 bytenr; u64 num_bytes; @@ -2979,11 +3072,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, int); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!root->ref_cows && level == 0) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) return 0; if (inc) @@ -3014,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, for_cow); + key.offset, no_quota); if (ret) goto fail; } else { @@ -3022,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, - for_cow); + no_quota); if (ret) goto fail; } @@ -3033,15 +3130,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -3164,7 +3261,8 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE)) { + !btrfs_test_opt(root, SPACE_CACHE) || + block_group->delalloc_bytes) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3401,10 +3499,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, return ret; } - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&found->block_groups[i]); - kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype); - } init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; @@ -4204,6 +4300,104 @@ static int flush_space(struct btrfs_root *root, return ret; } + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, + struct btrfs_space_info *space_info) +{ + u64 used; + u64 expected; + u64 to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, + 16 * 1024 * 1024); + spin_lock(&space_info->lock); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) { + to_reclaim = 0; + goto out; + } + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (can_overcommit(root, space_info, 1024 * 1024, + BTRFS_RESERVE_FLUSH_ALL)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); +out: + spin_unlock(&space_info->lock); + + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info, u64 used) +{ + return (used >= div_factor_fine(space_info->total_bytes, 98) && + !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info) +{ + u64 used; + + spin_lock(&space_info->lock); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (need_do_async_reclaim(space_info, fs_info, used)) { + spin_unlock(&space_info->lock); + return 1; + } + spin_unlock(&space_info->lock); + + return 0; +} + +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) + return; + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + flush_state++; + if (!btrfs_need_do_async_reclaim(space_info, fs_info)) + return; + } while (flush_state <= COMMIT_TRANS); + + if (btrfs_need_do_async_reclaim(space_info, fs_info)) + queue_work(system_unbound_wq, work); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -4311,8 +4505,13 @@ again: if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + if (need_do_async_reclaim(space_info, root->fs_info, used) && + !work_busy(&root->fs_info->async_reclaim_work)) + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); } - spin_unlock(&space_info->lock); if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) @@ -4369,7 +4568,7 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (root->ref_cows) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) block_rsv = trans->block_rsv; if (root == root->fs_info->csum_root && trans->adding_csums) @@ -5416,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @reserve: One of the reservation enums + * @delalloc: The blocks are allocated for the delalloc write * * This is called by the allocator when it reserves space, or by somebody who is * freeing space that was never actually used on disk. For example if you @@ -5434,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, * succeeds. */ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) + u64 num_bytes, int reserve, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; @@ -5453,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, num_bytes, 0); space_info->bytes_may_use -= num_bytes; } + + if (delalloc) + cache->delalloc_bytes += num_bytes; } } else { if (cache->ro) space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -5472,7 +5678,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; - struct btrfs_space_info *space_info; down_write(&fs_info->commit_root_sem); @@ -5495,9 +5700,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->commit_root_sem); - list_for_each_entry_rcu(space_info, &fs_info->space_info, list) - percpu_counter_set(&space_info->total_bytes_pinned, 0); - update_global_block_rsv(fs_info); } @@ -5535,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; @@ -5621,7 +5824,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int no_quota) { struct btrfs_key key; struct btrfs_path *path; @@ -5637,9 +5841,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int num_to_del = 1; u32 item_size; u64 refs; + int last_ref = 0; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); + if (!info->quota_enabled || !is_fstree(root_objectid)) + no_quota = 1; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -5687,7 +5896,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, - is_data); + is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5806,7 +6015,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { btrfs_err(info, "trying to drop %d refs but we only have %Lu " - "for bytenr %Lu\n", refs_to_drop, refs, bytenr); + "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5814,6 +6023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs -= refs_to_drop; if (refs > 0) { + type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -5829,7 +6039,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, - is_data); + is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5850,6 +6060,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -5872,6 +6083,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } + btrfs_release_path(path); + + /* Deal with the quota accounting */ + if (!ret && last_ref && !no_quota) { + int mod_seq = 0; + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && + type == BTRFS_QGROUP_OPER_SUB_SHARED) + mod_seq = 1; + + ret = btrfs_qgroup_record_ref(trans, info, root_objectid, + bytenr, num_bytes, type, + mod_seq); + } out: btrfs_free_path(path); return ret; @@ -5987,7 +6212,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; } @@ -6008,11 +6233,15 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow) + u64 owner, u64 offset, int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); /* @@ -6028,13 +6257,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + BTRFS_DROP_DELAYED_REF, NULL, no_quota); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, BTRFS_DROP_DELAYED_REF, - NULL, for_cow); + NULL, no_quota); } return ret; } @@ -6142,6 +6371,70 @@ enum btrfs_loop_type { LOOP_NO_EMPTY_SIZE = 3, }; +static inline void +btrfs_lock_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + down_read(&cache->data_rwsem); +} + +static inline void +btrfs_grab_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + btrfs_get_block_group(cache); + if (delalloc) + down_read(&cache->data_rwsem); +} + +static struct btrfs_block_group_cache * +btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + int delalloc) +{ + struct btrfs_block_group_cache *used_bg; + bool locked = false; +again: + spin_lock(&cluster->refill_lock); + if (locked) { + if (used_bg == cluster->block_group) + return used_bg; + + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } + + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) + return used_bg; + + btrfs_get_block_group(used_bg); + + if (!delalloc) + return used_bg; + + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; + + spin_unlock(&cluster->refill_lock); + down_read(&used_bg->data_rwsem); + locked = true; + goto again; +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + up_read(&cache->data_rwsem); + btrfs_put_block_group(cache); +} + /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: @@ -6156,7 +6449,7 @@ enum btrfs_loop_type { static noinline int find_free_extent(struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, - u64 flags) + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -6244,6 +6537,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, up_read(&space_info->groups_sem); } else { index = get_block_group_index(block_group); + btrfs_lock_block_group(block_group, delalloc); goto have_block_group; } } else if (block_group) { @@ -6258,7 +6552,7 @@ search: u64 offset; int cached; - btrfs_get_block_group(block_group); + btrfs_grab_block_group(block_group, delalloc); search_start = block_group->key.objectid; /* @@ -6306,16 +6600,16 @@ have_block_group: * the refill lock keeps out other * people trying to start a new cluster */ - spin_lock(&last_ptr->refill_lock); - used_block_group = last_ptr->block_group; - if (used_block_group != block_group && - (!used_block_group || - used_block_group->ro || - !block_group_bits(used_block_group, flags))) + used_block_group = btrfs_lock_cluster(block_group, + last_ptr, + delalloc); + if (!used_block_group) goto refill_cluster; - if (used_block_group != block_group) - btrfs_get_block_group(used_block_group); + if (used_block_group != block_group && + (used_block_group->ro || + !block_group_bits(used_block_group, flags))) + goto release_cluster; offset = btrfs_alloc_from_cluster(used_block_group, last_ptr, @@ -6329,16 +6623,15 @@ have_block_group: used_block_group, search_start, num_bytes); if (used_block_group != block_group) { - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, + delalloc); block_group = used_block_group; } goto checks; } WARN_ON(last_ptr->block_group != used_block_group); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); -refill_cluster: +release_cluster: /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -6355,8 +6648,10 @@ refill_cluster: * succeeding in the unclustered * allocation. */ if (loop >= LOOP_NO_EMPTY_SIZE && - last_ptr->block_group != block_group) { + used_block_group != block_group) { spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(used_block_group, + delalloc); goto unclustered_alloc; } @@ -6366,6 +6661,10 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (used_block_group != block_group) + btrfs_release_block_group(used_block_group, + delalloc); +refill_cluster: if (loop >= LOOP_NO_EMPTY_SIZE) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; @@ -6473,7 +6772,7 @@ checks: BUG_ON(offset > search_start); ret = btrfs_update_reserved_bytes(block_group, num_bytes, - alloc_type); + alloc_type, delalloc); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -6485,13 +6784,13 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, delalloc); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, delalloc); } up_read(&space_info->groups_sem); @@ -6514,8 +6813,14 @@ loop: loop++; if (loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); - trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; @@ -6532,7 +6837,8 @@ loop: root, ret); else ret = 0; - btrfs_end_transaction(trans, root); + if (!exist) + btrfs_end_transaction(trans, root); if (ret) goto out; } @@ -6597,7 +6903,7 @@ again: int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data) + struct btrfs_key *ins, int is_data, int delalloc) { bool final_tried = false; u64 flags; @@ -6607,7 +6913,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, - flags); + flags, delalloc); if (ret == -ENOSPC) { if (!final_tried && ins->offset) { @@ -6632,7 +6938,8 @@ again: } static int __btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len, int pin) + u64 start, u64 len, + int pin, int delalloc) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -6651,7 +6958,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, pin_down_extent(root, cache, start, len, 1); else { btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); } btrfs_put_block_group(cache); @@ -6661,15 +6968,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, } int btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len) + u64 start, u64 len, int delalloc) { - return __btrfs_free_reserved_extent(root, start, len, 0); + return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); } int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len) { - return __btrfs_free_reserved_extent(root, start, len, 1); + return __btrfs_free_reserved_extent(root, start, len, 1, 0); } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -6733,6 +7040,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); + /* Always set parent to 0 here since its exclusive anyway. */ + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, ins->offset, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -6747,7 +7061,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) + int level, struct btrfs_key *ins, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -6757,6 +7072,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; u32 size = sizeof(*extent_item) + sizeof(*iref); + u64 num_bytes = ins->offset; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6790,6 +7106,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + num_bytes = root->leafsize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, key); @@ -6811,6 +7128,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, num_bytes, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + } + ret = update_block_group(root, ins->objectid, root->leafsize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -6866,7 +7191,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, return -EINVAL; ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT); + RESERVE_ALLOC_NO_ACCOUNT, 0); BUG_ON(ret); /* logic error */ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); @@ -6994,12 +7319,21 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { + buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, + blocksize, level); + if (!IS_ERR(buf)) + root->alloc_bytenr += blocksize; + return buf; + } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(root, blocksize, blocksize, - empty_size, hint, &ins, 0); + empty_size, hint, &ins, 0, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -7735,7 +8069,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } - if (root->in_radix) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { btrfs_drop_and_free_fs_root(tree_root->fs_info, root); } else { free_extent_buffer(root->node); @@ -8327,8 +8661,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) list_del(&space_info->list); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; - kobj = &space_info->block_group_kobjs[i]; - if (kobj->parent) { + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { kobject_del(kobj); kobject_put(kobj); } @@ -8352,17 +8687,26 @@ static void __link_block_group(struct btrfs_space_info *space_info, up_write(&space_info->groups_sem); if (first) { - struct kobject *kobj = &space_info->block_group_kobjs[index]; + struct raid_kobject *rkobj; int ret; - kobject_get(&space_info->kobj); /* put in release */ - ret = kobject_add(kobj, &space_info->kobj, "%s", - get_raid_name(index)); + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) + goto out_err; + rkobj->raid_type = index; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); if (ret) { - pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); - kobject_put(&space_info->kobj); + kobject_put(&rkobj->kobj); + goto out_err; } + space_info->block_group_kobjs[index] = &rkobj->kobj; } + + return; +out_err: + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); } static struct btrfs_block_group_cache * @@ -8392,6 +8736,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) start); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->new_bg_list); @@ -8611,7 +8956,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) @@ -8697,6 +9042,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_key key; struct inode *inode; + struct kobject *kobj = NULL; int ret; int index; int factor; @@ -8796,11 +9142,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, */ list_del_init(&block_group->list); if (list_empty(&block_group->space_info->block_groups[index])) { - kobject_del(&block_group->space_info->block_group_kobjs[index]); - kobject_put(&block_group->space_info->block_group_kobjs[index]); + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; clear_avail_alloc_bits(root->fs_info, block_group->flags); } up_write(&block_group->space_info->groups_sem); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3955e475cee..a389820d158 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1693,6 +1693,7 @@ again: * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); + cached_state = NULL; if (!loops) { max_bytes = PAGE_CACHE_SIZE; loops = 1; @@ -2353,7 +2354,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); struct extent_io_tree *tree; - int ret; + int ret = 0; tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -2367,6 +2368,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) if (!uptodate) { ClearPageUptodate(page); SetPageError(page); + ret = ret < 0 ? ret : -EIO; + mapping_set_error(page->mapping, ret); } return 0; } @@ -3098,143 +3101,130 @@ static noinline void update_nr_written(struct page *page, } /* - * the writepage semantics are similar to regular writepage. extent - * records are inserted to lock ranges in the tree, and as dirty areas - * are found, they are marked writeback. Then the lock bits are removed - * and the end_io handler clears the writeback ranges + * helper for __extent_writepage, doing all of the delayed allocation setup. + * + * This returns 1 if our fill_delalloc function did all the work required + * to write the page (copy into inline extent). In this case the IO has + * been started and the page is already unlocked. + * + * This returns 0 if all went well (page still locked) + * This returns < 0 if there were errors (page still locked) */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) +static noinline_for_stack int writepage_delalloc(struct inode *inode, + struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd, + u64 delalloc_start, + unsigned long *nr_written) +{ + struct extent_io_tree *tree = epd->tree; + u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1; + u64 nr_delalloc; + u64 delalloc_to_write = 0; + u64 delalloc_end = 0; + int ret; + int page_started = 0; + + if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) + return 0; + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + nr_written); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + /* fill_delalloc should be return < 0 for error + * but just in case, we use > 0 here meaning the + * IO is started, so we don't want to return > 0 + * unless things are going well. + */ + ret = ret < 0 ? ret : -EIO; + goto done; + } + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= *nr_written; + return 1; + } + + ret = 0; + +done: + return ret; +} + +/* + * helper for __extent_writepage. This calls the writepage start hooks, + * and does the loop to map the page into extents and bios. + * + * We return 1 if the IO is started and the page is unlocked, + * 0 if all went well (page still locked) + * < 0 if there were errors (page still locked) + */ +static noinline_for_stack int __extent_writepage_io(struct inode *inode, + struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd, + loff_t i_size, + unsigned long nr_written, + int write_flags, int *nr_ret) { - struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; struct extent_io_tree *tree = epd->tree; u64 start = page_offset(page); - u64 delalloc_start; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; u64 extent_offset; - u64 last_byte = i_size_read(inode); u64 block_start; u64 iosize; sector_t sector; struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; - int ret; - int nr = 0; size_t pg_offset = 0; size_t blocksize; - loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; - u64 nr_delalloc; - u64 delalloc_end; - int page_started; - int compressed; - int write_flags; - unsigned long nr_written = 0; - bool fill_delalloc = true; - - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; - else - write_flags = WRITE; - - trace___extent_writepage(page, inode, wbc); - - WARN_ON(!PageLocked(page)); - - ClearPageError(page); - - pg_offset = i_size & (PAGE_CACHE_SIZE - 1); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); - unlock_page(page); - return 0; - } - - if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, - PAGE_CACHE_SIZE - pg_offset); - kunmap_atomic(userpage); - flush_dcache_page(page); - } - pg_offset = 0; - - set_page_extent_mapped(page); - - if (!tree->ops || !tree->ops->fill_delalloc) - fill_delalloc = false; - - delalloc_start = start; - delalloc_end = 0; - page_started = 0; - if (!epd->extent_locked && fill_delalloc) { - u64 delalloc_to_write = 0; - /* - * make sure the wbc mapping index is at least updated - * to this page. - */ - update_nr_written(page, wbc, 0); - - while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, - page, - &delalloc_start, - &delalloc_end, - 128 * 1024 * 1024); - if (nr_delalloc == 0) { - delalloc_start = delalloc_end + 1; - continue; - } - ret = tree->ops->fill_delalloc(inode, page, - delalloc_start, - delalloc_end, - &page_started, - &nr_written); - /* File system has been set read-only */ - if (ret) { - SetPageError(page); - goto done; - } - /* - * delalloc_end is already one less than the total - * length, so we don't subtract one from - * PAGE_CACHE_SIZE - */ - delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; - delalloc_start = delalloc_end + 1; - } - if (wbc->nr_to_write < delalloc_to_write) { - int thresh = 8192; - - if (delalloc_to_write < thresh * 2) - thresh = delalloc_to_write; - wbc->nr_to_write = min_t(u64, delalloc_to_write, - thresh); - } + int ret = 0; + int nr = 0; + bool compressed; - /* did the fill delalloc function already unlock and start - * the IO? - */ - if (page_started) { - ret = 0; - /* - * we've unlocked the page, so we can't update - * the mapping's writeback index, just update - * nr_to_write. - */ - wbc->nr_to_write -= nr_written; - goto done_unlocked; - } - } if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); @@ -3244,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, wbc->pages_skipped++; else redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); - ret = 0; + ret = 1; goto done_unlocked; } } @@ -3258,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, update_nr_written(page, wbc, nr_written + 1); end = page_end; - if (last_byte <= start) { + if (i_size <= start) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -3268,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, blocksize = inode->i_sb->s_blocksize; while (cur <= end) { - if (cur >= last_byte) { + u64 em_end; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); @@ -3278,13 +3270,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, end - cur + 1, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); + ret = PTR_ERR_OR_ZERO(em); break; } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + em_end = extent_map_end(em); + BUG_ON(em_end <= cur); BUG_ON(end < cur); - iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; @@ -3320,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset += iosize; continue; } - /* leave this out until we have a page_mkwrite call */ - if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0, NULL)) { - cur = cur + iosize; - pg_offset += iosize; - continue; - } if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, @@ -3337,7 +3324,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret) { SetPageError(page); } else { - unsigned long max_nr = end_index + 1; + unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { @@ -3359,17 +3346,94 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, nr++; } done: + *nr_ret = nr; + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + u64 start = page_offset(page); + u64 page_end = start + PAGE_CACHE_SIZE - 1; + int ret; + int nr = 0; + size_t pg_offset = 0; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC; + else + write_flags = WRITE; + + trace___extent_writepage(page, inode, wbc); + + WARN_ON(!PageLocked(page)); + + ClearPageError(page); + + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage); + flush_dcache_page(page); + } + + pg_offset = 0; + + set_page_extent_mapped(page); + + ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + + ret = __extent_writepage_io(inode, page, wbc, epd, + i_size, nr_written, write_flags, &nr); + if (ret == 1) + goto done_unlocked; + +done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ set_page_writeback(page); end_page_writeback(page); } + if (PageError(page)) { + ret = ret < 0 ? ret : -EIO; + end_extent_writepage(page, ret, start, page_end); + } unlock_page(page); + return ret; done_unlocked: - - /* drop our reference on any cached states */ - free_extent_state(cached_state); return 0; } @@ -3385,9 +3449,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } -static int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, - struct extent_page_data *epd) +static noinline_for_stack int +lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) { unsigned long i, num_pages; int flush = 0; @@ -3458,7 +3523,7 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, static void end_extent_buffer_writeback(struct extent_buffer *eb) { clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } @@ -3492,7 +3557,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) bio_put(bio); } -static int write_one_eb(struct extent_buffer *eb, +static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct btrfs_fs_info *fs_info, struct writeback_control *wbc, struct extent_page_data *epd) @@ -3690,6 +3755,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, struct inode *inode = mapping->host; int ret = 0; int done = 0; + int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; @@ -3776,8 +3842,8 @@ retry: unlock_page(page); ret = 0; } - if (ret) - done = 1; + if (!err && ret < 0) + err = ret; /* * the filesystem may choose to bump up nr_to_write. @@ -3789,7 +3855,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!scanned && !done && !err) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -3799,7 +3865,7 @@ retry: goto retry; } btrfs_add_delayed_iput(inode); - return ret; + return err; } static void flush_epd_write_bio(struct extent_page_data *epd) @@ -4510,7 +4576,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } -static void mark_extent_buffer_accessed(struct extent_buffer *eb) +static void mark_extent_buffer_accessed(struct extent_buffer *eb, + struct page *accessed) { unsigned long num_pages, i; @@ -4519,7 +4586,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); - mark_page_accessed(p); + if (p != accessed) + mark_page_accessed(p); } } @@ -4533,7 +4601,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_extent_buffer_accessed(eb); + mark_extent_buffer_accessed(eb, NULL); return eb; } rcu_read_unlock(); @@ -4541,6 +4609,53 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) +{ + struct extent_buffer *eb, *exists = NULL; + int ret; + + eb = find_extent_buffer(fs_info, start); + if (eb) + return eb; + eb = alloc_dummy_extent_buffer(start, len); + if (!eb) + return NULL; + eb->fs_info = fs_info; +again: + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; + } + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + /* + * We will free dummy extent buffer's if they come into + * free_extent_buffer with a ref count of 2, but if we are using this we + * want the buffers to stay in memory until we're done with them, so + * bump the ref count again. + */ + atomic_inc(&eb->refs); + return eb; +free_eb: + btrfs_release_extent_buffer(eb); + return exists; +} +#endif + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { @@ -4581,7 +4696,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, spin_unlock(&mapping->private_lock); unlock_page(p); page_cache_release(p); - mark_extent_buffer_accessed(exists); + mark_extent_buffer_accessed(exists, p); goto free_eb; } @@ -4596,7 +4711,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); WARN_ON(PageDirty(p)); - mark_page_accessed(p); eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -4954,6 +5068,43 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } } +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char __user *dst = (char __user *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = page_address(page); + if (copy_to_user(dst, kaddr + offset, cur)) { + ret = -EFAULT; + break; + } + + dst += cur; + len -= cur; + offset = 0; + i++; + } + + return ret; +} + int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, char **map, unsigned long *map_start, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c488b45237b..ccc264e7bde 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -158,7 +158,6 @@ struct extent_buffer { * to unlock */ wait_queue_head_t read_lock_wq; - wait_queue_head_t lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; @@ -304,6 +303,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, void read_extent_buffer(struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst, + unsigned long start, + unsigned long len); void write_extent_buffer(struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, @@ -350,5 +352,7 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes); +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); #endif #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1874aee69c8..225302b39af 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -75,6 +75,8 @@ void free_extent_map(struct extent_map *em) if (atomic_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) + kfree(em->bdev); kmem_cache_free(extent_map_cache, em); } } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index e7fd8a56a14..b2991fd8583 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -15,6 +15,7 @@ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ +#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */ struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 127555b29f5..f46cfe45d68 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -281,10 +281,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, found: csum += count * csum_size; nblocks -= count; + bio_index += count; while (count--) { disk_bytenr += bvec->bv_len; offset += bvec->bv_len; - bio_index++; bvec++; } } @@ -750,7 +750,7 @@ again: int slot = path->slots[0] + 1; /* we didn't find a csum item, insert one */ nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems - 1) { + if (!nritems || (path->slots[0] >= nritems - 1)) { ret = btrfs_next_leaf(root, path); if (ret == 1) found_next = 1; @@ -885,3 +885,79 @@ out: fail_unlock: goto out; } + +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_key key; + u64 extent_start, extent_end; + u64 bytenr; + u8 type = btrfs_file_extent_type(leaf, fi); + int compress_type = btrfs_file_extent_compression(leaf, fi); + + em->bdev = root->fs_info->fs_devices->latest_bdev; + btrfs_item_key_to_cpu(leaf, &key, slot); + extent_start = key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, slot, fi); + extent_end = ALIGN(extent_start + size, root->sectorsize); + } + + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, fi); + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + return; + } + if (compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + em->block_start = bytenr; + em->block_len = em->orig_block_len; + } else { + bytenr += btrfs_file_extent_offset(leaf, fi); + em->block_start = bytenr; + em->block_len = em->len; + if (type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + em->block_start = EXTENT_MAP_INLINE; + em->start = extent_start; + em->len = extent_end - extent_start; + /* + * Initialize orig_start and block_len with the same values + * as in inode.c:btrfs_get_extent(). + */ + em->orig_start = EXTENT_MAP_HOLE; + em->block_len = (u64)-1; + if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } + } else { + btrfs_err(root->fs_info, + "unknown file extent item type %d, inode %llu, offset %llu, root %llu", + type, btrfs_ino(inode), extent_start, + root->root_key.objectid); + } +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ae6af072b63..1f2b99cb55e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -40,6 +40,7 @@ #include "tree-log.h" #include "locking.h" #include "volumes.h" +#include "qgroup.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -447,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, write_bytes -= copied; total_copied += copied; - /* Return to btrfs_file_aio_write to fault page */ + /* Return to btrfs_file_write_iter to fault page */ if (unlikely(copied == 0)) break; @@ -470,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) for (i = 0; i < num_pages; i++) { /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty - * clear it here + * clear it here. There should be no need to mark the pages + * accessed as prepare_pages should have marked them accessed + * in prepare_pages via find_or_create_page() */ ClearPageChecked(pages[i]); unlock_page(pages[i]); - mark_page_accessed(pages[i]); page_cache_release(pages[i]); } } @@ -714,7 +716,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int recow; int ret; int modify_tree = -1; - int update_refs = (root->ref_cows || root == root->fs_info->tree_root); + int update_refs; int found = 0; int leafs_visited = 0; @@ -724,6 +726,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; + update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -780,6 +784,18 @@ next_slot: extent_end = search_start; } + /* + * Don't skip extent items representing 0 byte lengths. They + * used to be created (bug) if while punching holes we hit + * -ENOSPC condition. So if we find one here, just ensure we + * delete it, otherwise we would insert a new file extent item + * with the same key (offset) as that 0 bytes length file + * extent item in the call to setup_items_for_insert() later + * in this function. + */ + if (extent_end == key.offset && extent_end >= search_start) + goto delete_extent_item; + if (extent_end <= search_start) { path->slots[0]++; goto next_slot; @@ -835,7 +851,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 0); + start - extent_offset, 1); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -893,6 +909,7 @@ next_slot: * | ------ extent ------ | */ if (start <= key.offset && end >= extent_end) { +delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; del_nr = 1; @@ -1191,7 +1208,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset, 1); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1658,27 +1675,22 @@ again: } static ssize_t __btrfs_direct_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - size_t count, size_t ocount) + struct iov_iter *from, + loff_t pos) { struct file *file = iocb->ki_filp; - struct iov_iter i; ssize_t written; ssize_t written_buffered; loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - count, ocount); + written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) + if (written < 0 || !iov_iter_count(from)) return written; pos += written; - count -= written; - iov_iter_init(&i, iov, nr_segs, count, written); - written_buffered = __btrfs_buffered_write(file, &i, pos); + written_buffered = __btrfs_buffered_write(file, from, pos); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1713,9 +1725,8 @@ static void update_time_for_write(struct inode *inode) inode_inc_iversion(inode); } -static ssize_t btrfs_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -1724,18 +1735,12 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, u64 end_pos; ssize_t num_written = 0; ssize_t err = 0; - size_t count, ocount; + size_t count = iov_iter_count(from); bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); + loff_t pos = iocb->ki_pos; mutex_lock(&inode->i_mutex); - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - count = ocount; - current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) { @@ -1748,6 +1753,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } + iov_iter_truncate(from, count); + err = file_remove_suid(file); if (err) { mutex_unlock(&inode->i_mutex); @@ -1789,14 +1796,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (unlikely(file->f_flags & O_DIRECT)) { - num_written = __btrfs_direct_write(iocb, iov, nr_segs, - pos, count, ocount); + num_written = __btrfs_direct_write(iocb, from, pos); } else { - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, num_written); - - num_written = __btrfs_buffered_write(file, &i, pos); + num_written = __btrfs_buffered_write(file, from, pos); if (num_written > 0) iocb->ki_pos = pos + num_written; } @@ -2009,8 +2011,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!full_sync) { ret = btrfs_wait_ordered_range(inode, start, end - start + 1); - if (ret) + if (ret) { + btrfs_end_transaction(trans, root); goto out; + } } ret = btrfs_commit_transaction(trans, root); } else { @@ -2168,6 +2172,37 @@ out: return 0; } +/* + * Find a hole extent on given inode and change start/len to the end of hole + * extent.(hole/vacuum extent whose em->start <= start && + * em->start + em->len > start) + * When a hole extent is found, return 1 and modify start/len. + */ +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +{ + struct extent_map *em; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + return ret; + } + + /* Hole or vacuum extent(only exists in no-hole mode) */ + if (em->block_start == EXTENT_MAP_HOLE) { + ret = 1; + *len = em->start + em->len > *start + *len ? + 0 : *start + *len - em->start - em->len; + *start = em->start + em->len; + } + free_extent_map(em); + return ret; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2175,25 +2210,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) struct btrfs_path *path; struct btrfs_block_rsv *rsv; struct btrfs_trans_handle *trans; - u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); - u64 lockend = round_down(offset + len, - BTRFS_I(inode)->root->sectorsize) - 1; - u64 cur_offset = lockstart; + u64 lockstart; + u64 lockend; + u64 tail_start; + u64 tail_len; + u64 orig_start = offset; + u64 cur_offset; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; int ret = 0; int err = 0; int rsv_count; - bool same_page = ((offset >> PAGE_CACHE_SHIFT) == - ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); - u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + u64 ino_size; ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) return ret; mutex_lock(&inode->i_mutex); + ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + /* Already in a large hole */ + ret = 0; + goto out_only_mutex; + } + + lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); + lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; + same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + /* * We needn't truncate any page which is beyond the end of the file * because we are sure there is no data there. @@ -2205,8 +2257,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (same_page && len < PAGE_CACHE_SIZE) { if (offset < ino_size) ret = btrfs_truncate_page(inode, offset, len, 0); - mutex_unlock(&inode->i_mutex); - return ret; + goto out_only_mutex; } /* zero back part of the first page */ @@ -2218,12 +2269,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } } - /* zero the front end of the last page */ - if (offset + len < ino_size) { - ret = btrfs_truncate_page(inode, offset + len, 0, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + /* Check the aligned pages after the first unaligned page, + * if offset != orig_start, which means the first unaligned page + * including serveral following pages are already in holes, + * the extra check can be skipped */ + if (offset == orig_start) { + /* after truncate page, check hole again */ + len = offset + len - lockstart; + offset = lockstart; + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + ret = 0; + goto out_only_mutex; + } + lockstart = offset; + } + + /* Check the tail unaligned part is in a hole */ + tail_start = lockend + 1; + tail_len = offset + len - tail_start; + if (tail_len) { + ret = find_first_non_hole(inode, &tail_start, &tail_len); + if (unlikely(ret < 0)) + goto out_only_mutex; + if (!ret) { + /* zero the front end of the last page */ + if (tail_start + tail_len < ino_size) { + ret = btrfs_truncate_page(inode, + tail_start + tail_len, 0, 1); + if (ret) + goto out_only_mutex; + } } } @@ -2249,9 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if ((!ordered || (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && - !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_UPTODATE, 0, - cached_state)) { + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; @@ -2299,6 +2375,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) BUG_ON(ret); trans->block_rsv = rsv; + cur_offset = lockstart; + len = lockend - cur_offset; while (cur_offset < lockend) { ret = __btrfs_drop_extents(trans, root, inode, path, cur_offset, lockend + 1, @@ -2339,6 +2417,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) rsv, min_size); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; + + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } } if (ret) { @@ -2347,7 +2433,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } trans->block_rsv = &root->fs_info->trans_block_rsv; - if (cur_offset < ino_size) { + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). + */ + if (cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_end); if (ret) { err = ret; @@ -2372,6 +2463,7 @@ out_free: out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); +out_only_mutex: mutex_unlock(&inode->i_mutex); if (ret && !err) err = ret; @@ -2633,11 +2725,11 @@ out: const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, - .aio_write = btrfs_file_aio_write, + .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 73f3de7a083..2b0a627cb5f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -274,18 +274,32 @@ struct io_ctl { }; static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, - struct btrfs_root *root) + struct btrfs_root *root, int write) { + int num_pages; + int check_crcs = 0; + + num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + check_crcs = 1; + + /* Make sure we can fit our crcs into the first page */ + if (write && check_crcs && + (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) + return -ENOSPC; + memset(io_ctl, 0, sizeof(struct io_ctl)); - io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, - GFP_NOFS); + + io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); if (!io_ctl->pages) return -ENOMEM; + + io_ctl->num_pages = num_pages; io_ctl->root = root; - if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) - io_ctl->check_crcs = 1; + io_ctl->check_crcs = check_crcs; + return 0; } @@ -666,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, generation = btrfs_free_space_generation(leaf, header); btrfs_release_path(path); + if (!BTRFS_I(inode)->generation) { + btrfs_info(root->fs_info, + "The free space cache file (%llu) is invalid. skip it\n", + offset); + return 0; + } + if (BTRFS_I(inode)->generation != generation) { btrfs_err(root->fs_info, "free space inode generation (%llu) " @@ -677,7 +698,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) return 0; - ret = io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root, 0); if (ret) return ret; @@ -831,7 +852,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, if (!matched) { __btrfs_remove_free_space_cache(ctl); - btrfs_err(fs_info, "block group %llu has wrong amount of free space", + btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->key.objectid); ret = -1; } @@ -843,7 +864,7 @@ out: spin_unlock(&block_group->lock); ret = 0; - btrfs_err(fs_info, "failed to load free space cache for block group %llu", + btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now", block_group->key.objectid); } @@ -851,90 +872,44 @@ out: return ret; } -/** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle - * @path - the path to use - * @offset - the offset for the key we'll insert - * - * This function writes out a free space cache struct to disk for quick recovery - * on mount. This will return 0 if it was successfull in writing the cache out, - * and -1 if it was not. - */ -static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, - struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 offset) +static noinline_for_stack +int write_cache_extent_entries(struct io_ctl *io_ctl, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + int *entries, int *bitmaps, + struct list_head *bitmap_list) { - struct btrfs_free_space_header *header; - struct extent_buffer *leaf; - struct rb_node *node; - struct list_head *pos, *n; - struct extent_state *cached_state = NULL; - struct btrfs_free_cluster *cluster = NULL; - struct extent_io_tree *unpin = NULL; - struct io_ctl io_ctl; - struct list_head bitmap_list; - struct btrfs_key key; - u64 start, extent_start, extent_end, len; - int entries = 0; - int bitmaps = 0; int ret; - int err = -1; - - INIT_LIST_HEAD(&bitmap_list); - - if (!i_size_read(inode)) - return -1; - - ret = io_ctl_init(&io_ctl, inode, root); - if (ret) - return -1; + struct btrfs_free_cluster *cluster = NULL; + struct rb_node *node = rb_first(&ctl->free_space_offset); /* Get the cluster for this block_group if it exists */ - if (block_group && !list_empty(&block_group->cluster_list)) + if (block_group && !list_empty(&block_group->cluster_list)) { cluster = list_entry(block_group->cluster_list.next, struct btrfs_free_cluster, block_group_list); + } - /* Lock all pages first so we can lock the extent safely. */ - io_ctl_prepare_pages(&io_ctl, inode, 0); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state); - - node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); cluster = NULL; } - /* Make sure we can fit our crcs into the first page */ - if (io_ctl.check_crcs && - (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) - goto out_nospc; - - io_ctl_set_generation(&io_ctl, trans->transid); - /* Write out the extent entries */ while (node) { struct btrfs_free_space *e; e = rb_entry(node, struct btrfs_free_space, offset_index); - entries++; + *entries += 1; - ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, + ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, e->bitmap); if (ret) - goto out_nospc; + goto fail; if (e->bitmap) { - list_add_tail(&e->list, &bitmap_list); - bitmaps++; + list_add_tail(&e->list, bitmap_list); + *bitmaps += 1; } node = rb_next(node); if (!node && cluster) { @@ -942,136 +917,289 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, cluster = NULL; } } + return 0; +fail: + return -ENOSPC; +} + +static noinline_for_stack int +update_cache_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, u64 offset, + int entries, int bitmaps) +{ + struct btrfs_key key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); + goto fail; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + ASSERT(path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != offset) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); + btrfs_release_path(path); + goto fail; + } + } + + BTRFS_I(inode)->generation = trans->transid; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + return 0; + +fail: + return -1; +} + +static noinline_for_stack int +write_pinned_extent_entries(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct io_ctl *io_ctl, + int *entries) +{ + u64 start, extent_start, extent_end, len; + struct extent_io_tree *unpin = NULL; + int ret; + + if (!block_group) + return 0; /* * We want to add any pinned extents to our free space cache * so we don't leak the space - */ - - /* + * * We shouldn't have switched the pinned extents yet so this is the * right one */ unpin = root->fs_info->pinned_extents; - if (block_group) - start = block_group->key.objectid; + start = block_group->key.objectid; - while (block_group && (start < block_group->key.objectid + - block_group->key.offset)) { + while (start < block_group->key.objectid + block_group->key.offset) { ret = find_first_extent_bit(unpin, start, &extent_start, &extent_end, EXTENT_DIRTY, NULL); - if (ret) { - ret = 0; - break; - } + if (ret) + return 0; /* This pinned extent is out of our range */ if (extent_start >= block_group->key.objectid + block_group->key.offset) - break; + return 0; extent_start = max(extent_start, start); extent_end = min(block_group->key.objectid + block_group->key.offset, extent_end + 1); len = extent_end - extent_start; - entries++; - ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); + *entries += 1; + ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); if (ret) - goto out_nospc; + return -ENOSPC; start = extent_end; } + return 0; +} + +static noinline_for_stack int +write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + int ret; + /* Write out the bitmaps */ - list_for_each_safe(pos, n, &bitmap_list) { + list_for_each_safe(pos, n, bitmap_list) { struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); + ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); if (ret) - goto out_nospc; + return -ENOSPC; list_del_init(&entry->list); } - /* Zero out the rest of the pages just to make sure */ - io_ctl_zero_remaining_pages(&io_ctl); - - ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, - 0, i_size_read(inode), &cached_state); - io_ctl_drop_pages(&io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); + return 0; +} - if (ret) - goto out; +static int flush_dirty_cache(struct inode *inode) +{ + int ret; ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); - if (ret) { + if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, GFP_NOFS); - goto out; - } - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; - key.type = 0; + return ret; +} - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); - goto out; +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + + list_for_each_safe(pos, n, bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); } - leaf = path->nodes[0]; - if (ret > 0) { - struct btrfs_key found_key; - ASSERT(path->slots[0]); - path->slots[0]--; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || - found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL, GFP_NOFS); - btrfs_release_path(path); + io_ctl_drop_pages(io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, cached_state, + GFP_NOFS); +} + +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. + */ +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 offset) +{ + struct extent_state *cached_state = NULL; + struct io_ctl io_ctl; + LIST_HEAD(bitmap_list); + int entries = 0; + int bitmaps = 0; + int ret; + + if (!i_size_read(inode)) + return -1; + + ret = io_ctl_init(&io_ctl, inode, root, 1); + if (ret) + return -1; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { + down_write(&block_group->data_rwsem); + spin_lock(&block_group->lock); + if (block_group->delalloc_bytes) { + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + up_write(&block_group->data_rwsem); + BTRFS_I(inode)->generation = 0; + ret = 0; goto out; } + spin_unlock(&block_group->lock); } - BTRFS_I(inode)->generation = trans->transid; - header = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_free_space_header); - btrfs_set_free_space_entries(leaf, header, entries); - btrfs_set_free_space_bitmaps(leaf, header, bitmaps); - btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); + /* Lock all pages first so we can lock the extent safely. */ + io_ctl_prepare_pages(&io_ctl, inode, 0); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + 0, &cached_state); + + io_ctl_set_generation(&io_ctl, trans->transid); + + /* Write out the extent entries in the free space cache */ + ret = write_cache_extent_entries(&io_ctl, ctl, + block_group, &entries, &bitmaps, + &bitmap_list); + if (ret) + goto out_nospc; + + /* + * Some spaces that are freed in the current transaction are pinned, + * they will be added into free space cache after the transaction is + * committed, we shouldn't lose them. + */ + ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); + if (ret) + goto out_nospc; + + /* At last, we write out all the bitmaps. */ + ret = write_bitmap_entries(&io_ctl, &bitmap_list); + if (ret) + goto out_nospc; + + /* Zero out the rest of the pages just to make sure */ + io_ctl_zero_remaining_pages(&io_ctl); + + /* Everything is written out, now we dirty the pages in the file. */ + ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + 0, i_size_read(inode), &cached_state); + if (ret) + goto out_nospc; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + /* + * Release the pages and unlock the extent, we will flush + * them out later + */ + io_ctl_drop_pages(&io_ctl); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + + /* Flush the dirty pages in the cache file. */ + ret = flush_dirty_cache(inode); + if (ret) + goto out; - err = 0; + /* Update the cache item to tell everyone this cache file is valid. */ + ret = update_cache_item(trans, root, inode, path, offset, + entries, bitmaps); out: io_ctl_free(&io_ctl); - if (err) { + if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); - return err; + return ret; out_nospc: - list_for_each_safe(pos, n, &bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); - list_del_init(&entry->list); - } - io_ctl_drop_pages(&io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); + cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + goto out; } @@ -1091,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); return 0; } + + if (block_group->delalloc_bytes) { + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } spin_unlock(&block_group->lock); inode = lookup_free_space_inode(root, block_group, path); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 86935f5ae29..888fbe19079 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -174,7 +174,7 @@ static void start_caching(struct btrfs_root *root) BTRFS_LAST_FREE_OBJECTID - objectid + 1); } - tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", root->root_key.objectid); if (IS_ERR(tsk)) { btrfs_warn(root->fs_info, "failed to start inode caching task"); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f805bc944f..3668048e16f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -125,7 +125,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * the btree. The caller should have done a btrfs_drop_extents so that * no overlapping inline items exist in the btree */ -static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, +static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, int extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, @@ -693,7 +693,7 @@ retry: ret = btrfs_reserve_extent(root, async_extent->compressed_size, async_extent->compressed_size, - 0, alloc_hint, &ins, 1); + 0, alloc_hint, &ins, 1, 1); if (ret) { int i; @@ -794,7 +794,7 @@ retry: out: return ret; out_free_reserve: - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + @@ -917,7 +917,7 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, root->sectorsize, 0, alloc_hint, - &ins, 1); + &ins, 1, 1); if (ret < 0) goto out_unlock; @@ -995,7 +995,7 @@ out: return ret; out_reserve: - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | @@ -2599,6 +2599,21 @@ out_kfree: return NULL; } +static void btrfs_release_delalloc_bytes(struct btrfs_root *root, + u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, start); + ASSERT(cache); + + spin_lock(&cache->lock); + cache->delalloc_bytes -= len; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); +} + /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -2678,6 +2693,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = NULL; goto out_unlock; } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -2697,6 +2713,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); + if (!ret) + btrfs_release_delalloc_bytes(root, + ordered_extent->start, + ordered_extent->disk_len); } unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, ordered_extent->len, @@ -2749,7 +2769,7 @@ out: !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) btrfs_free_reserved_extent(root, ordered_extent->start, - ordered_extent->disk_len); + ordered_extent->disk_len, 1); } @@ -2947,14 +2967,15 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, root->orphan_block_rsv = NULL; spin_unlock(&root->orphan_lock); - if (root->orphan_item_inserted && + if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret) btrfs_abort_transaction(trans, root, ret); else - root->orphan_item_inserted = 0; + clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state); } if (block_rsv) { @@ -3271,7 +3292,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) btrfs_block_rsv_release(root, root->orphan_block_rsv, (u64)-1); - if (root->orphan_block_rsv || root->orphan_item_inserted) { + if (root->orphan_block_rsv || + test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans, root); @@ -3473,7 +3495,7 @@ cache_acl: ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(root->fs_info, - "error loading props for ino %llu (root %llu): %d\n", + "error loading props for ino %llu (root %llu): %d", btrfs_ino(inode), root->root_key.objectid, ret); } @@ -3998,7 +4020,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * not block aligned since we will be keeping the last block of the * extent just the way it is. */ - if (root->ref_cows || root == root->fs_info->tree_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root) btrfs_drop_extent_cache(inode, ALIGN(new_size, root->sectorsize), (u64)-1, 0); @@ -4091,7 +4114,9 @@ search_again: extent_num_bytes); num_dec = (orig_num_bytes - extent_num_bytes); - if (root->ref_cows && extent_start != 0) + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state) && + extent_start != 0) inode_sub_bytes(inode, num_dec); btrfs_mark_buffer_dirty(leaf); } else { @@ -4105,7 +4130,8 @@ search_again: num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { found_extent = 1; - if (root->ref_cows) + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) inode_sub_bytes(inode, num_dec); } } @@ -4120,10 +4146,9 @@ search_again: btrfs_file_extent_other_encoding(leaf, fi) == 0) { u32 size = new_size - found_key.offset; - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); - } /* * update the ram bytes to properly reflect @@ -4133,7 +4158,8 @@ search_again: size = btrfs_file_extent_calc_inline_size(size); btrfs_truncate_item(root, path, size, 1); - } else if (root->ref_cows) { + } else if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) { inode_sub_bytes(inode, item_end + 1 - found_key.offset); } @@ -4155,8 +4181,9 @@ delete: } else { break; } - if (found_extent && (root->ref_cows || - root == root->fs_info->tree_root)) { + if (found_extent && + (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, @@ -5168,8 +5195,7 @@ static int btrfs_dentry_delete(const struct dentry *dentry) static void btrfs_dentry_release(struct dentry *dentry) { - if (dentry->d_fsdata) - kfree(dentry->d_fsdata); + kfree(dentry->d_fsdata); } static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, @@ -5553,6 +5579,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; + int nitems = name ? 2 : 1; unsigned long ptr; int ret; @@ -5572,7 +5599,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ inode->i_ino = objectid; - if (dir) { + if (dir && name) { trace_btrfs_inode_request(dir); ret = btrfs_set_inode_index(dir, index); @@ -5581,6 +5608,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, iput(inode); return ERR_PTR(ret); } + } else if (dir) { + *index = 0; } /* * index_cnt is ignored for everything but a dir, @@ -5605,21 +5634,24 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; - /* - * Start new inodes with an inode_ref. This is slightly more - * efficient for small numbers of hard links since they will - * be packed into one item. Extended refs will kick in if we - * add more hard links than can fit in the ref item. - */ - key[1].objectid = objectid; - btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); - key[1].offset = ref_objectid; - sizes[0] = sizeof(struct btrfs_inode_item); - sizes[1] = name_len + sizeof(*ref); + + if (name) { + /* + * Start new inodes with an inode_ref. This is slightly more + * efficient for small numbers of hard links since they will + * be packed into one item. Extended refs will kick in if we + * add more hard links than can fit in the ref item. + */ + key[1].objectid = objectid; + btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[1] = name_len + sizeof(*ref); + } path->leave_spinning = 1; - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) goto fail; @@ -5632,12 +5664,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); - ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, - struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); - ptr = (unsigned long)(ref + 1); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + if (name) { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + } btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); @@ -5673,7 +5707,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, return inode; fail: - if (dir) + if (dir && name) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); iput(inode); @@ -5958,6 +5992,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, err = btrfs_update_inode(trans, root, inode); if (err) goto fail; + if (inode->i_nlink == 1) { + /* + * If new hard link count is 1, it's a file created + * with open(2) O_TMPFILE flag. + */ + err = btrfs_orphan_del(trans, inode); + if (err) + goto fail; + } d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -6086,16 +6129,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); - if (ret) { - char *kaddr = kmap_atomic(page); - unsigned long copy_size = min_t(u64, - PAGE_CACHE_SIZE - pg_offset, - max_size - extent_offset); - memset(kaddr + pg_offset, 0, copy_size); - kunmap_atomic(kaddr); - } kfree(tmp); - return 0; + return ret; } /* @@ -6113,7 +6148,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, { int ret; int err = 0; - u64 bytenr; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); @@ -6127,7 +6161,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; - int compress_type; + const bool new_inline = !page || create; again: read_lock(&em_tree->lock); @@ -6201,7 +6235,6 @@ again: found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - compress_type = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + @@ -6236,32 +6269,10 @@ next: goto not_found_em; } - em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); + btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); + if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - em->start = extent_start; - em->len = extent_end - extent_start; - em->orig_start = extent_start - - btrfs_file_extent_offset(leaf, item); - em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, - item); - bytenr = btrfs_file_extent_disk_bytenr(leaf, item); - if (bytenr == 0) { - em->block_start = EXTENT_MAP_HOLE; - goto insert; - } - if (compress_type != BTRFS_COMPRESS_NONE) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - em->block_start = bytenr; - em->block_len = em->orig_block_len; - } else { - bytenr += btrfs_file_extent_offset(leaf, item); - em->block_start = bytenr; - em->block_len = em->len; - if (found_type == BTRFS_FILE_EXTENT_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); - } goto insert; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { unsigned long ptr; @@ -6270,12 +6281,8 @@ next: size_t extent_offset; size_t copy_size; - em->block_start = EXTENT_MAP_INLINE; - if (!page || create) { - em->start = extent_start; - em->len = extent_end - extent_start; + if (new_inline) goto out; - } size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; @@ -6285,10 +6292,6 @@ next: em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; - if (compress_type) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != @@ -6296,7 +6299,10 @@ next: ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + err = ret; + goto out; + } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6332,8 +6338,6 @@ next: set_extent_uptodate(io_tree, em->start, extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; - } else { - WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); } not_found: em->start = start; @@ -6550,21 +6554,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, alloc_hint = get_extent_allocation_hint(inode, start, len); ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, - alloc_hint, &ins, 1); + alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, 0); if (IS_ERR(em)) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); return em; } ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); free_extent_map(em); return ERR_PTR(ret); } @@ -6717,6 +6721,76 @@ out: return ret; } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) +{ + struct radix_tree_root *root = &inode->i_mapping->page_tree; + int found = false; + void **pagep = NULL; + struct page *page = NULL; + int start_idx; + int end_idx; + + start_idx = start >> PAGE_CACHE_SHIFT; + + /* + * end is the last byte in the last page. end == start is legal + */ + end_idx = end >> PAGE_CACHE_SHIFT; + + rcu_read_lock(); + + /* Most of the code in this while loop is lifted from + * find_get_page. It's been modified to begin searching from a + * page and return just the first page found in that range. If the + * found idx is less than or equal to the end idx then we know that + * a page exists. If no pages are found or if those pages are + * outside of the range then we're fine (yay!) */ + while (page == NULL && + radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page)) + break; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + page = NULL; + continue; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + page = NULL; + break; /* TODO: Is this relevant for this use case? */ + } + + if (!page_cache_get_speculative(page)) { + page = NULL; + continue; + } + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + page = NULL; + } + } + + if (page) { + if (page->index <= end_idx) + found = true; + page_cache_release(page); + } + + rcu_read_unlock(); + return found; +} + static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct extent_state **cached_state, int writing) { @@ -6741,10 +6815,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * invalidate needs to happen so that reads after a write do not * get stale data. */ - if (!ordered && (!writing || - !test_range_bit(&BTRFS_I(inode)->io_tree, - lockstart, lockend, EXTENT_UPTODATE, 0, - *cached_state))) + if (!ordered && + (!writing || + !btrfs_page_exists_in_range(inode, lockstart, lockend))) break; unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, @@ -7126,7 +7199,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) * before atomic variable goto zero, we must make sure * dip->errors is perceived to be set. */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); } /* if there are more bios still pending for this dio, just exit */ @@ -7306,7 +7379,7 @@ out_err: * before atomic variable goto zero, we must * make sure dip->errors is perceived to be set. */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); if (atomic_dec_and_test(&dip->pending_bios)) bio_io_error(dip->orig_bio); @@ -7383,7 +7456,7 @@ free_ordered: if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len); + ordered->disk_len, 1); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } @@ -7391,39 +7464,30 @@ free_ordered: } static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + const struct iov_iter *iter, loff_t offset) { int seg; int i; - size_t size; - unsigned long addr; unsigned blocksize_mask = root->sectorsize - 1; ssize_t retval = -EINVAL; - loff_t end = offset; if (offset & blocksize_mask) goto out; - /* Check the memory alignment. Blocks cannot straddle pages */ - for (seg = 0; seg < nr_segs; seg++) { - addr = (unsigned long)iov[seg].iov_base; - size = iov[seg].iov_len; - end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; - - /* If this is a write we don't need to check anymore */ - if (rw & WRITE) - continue; + if (iov_iter_alignment(iter) & blocksize_mask) + goto out; - /* - * Check to make sure we don't have duplicate iov_base's in this - * iovec, if so return EINVAL, otherwise we'll get csum errors - * when reading back. - */ - for (i = seg + 1; i < nr_segs; i++) { - if (iov[seg].iov_base == iov[i].iov_base) + /* If this is a write we don't need to check anymore */ + if (rw & WRITE) + return 0; + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. + */ + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + if (iter->iov[seg].iov_base == iter->iov[i].iov_base) goto out; } } @@ -7433,8 +7497,7 @@ out: } static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -7444,12 +7507,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, bool relock = false; ssize_t ret; - if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, - offset, nr_segs)) + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset)) return 0; atomic_inc(&inode->i_dio_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * The generic stuff only does filemap_write_and_wait_range, which @@ -7457,7 +7519,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, * we need to flush the dirty pages again to make absolutely sure * that any outstanding dirty pages are on disk. */ - count = iov_length(iov, nr_segs); + count = iov_iter_count(iter); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) filemap_fdatawrite_range(inode->i_mapping, offset, count); @@ -7484,7 +7546,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + iter, offset, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (rw & WRITE) { if (ret < 0 && ret != -EIOCBQUEUED) @@ -7992,7 +8054,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, err = btrfs_subvol_inherit_props(trans, new_root, parent_root); if (err) btrfs_err(new_root->fs_info, - "error inheriting subvolume %llu properties: %d\n", + "error inheriting subvolume %llu properties: %d", new_root->root_key.objectid, err); err = btrfs_update_inode(trans, new_root, inode); @@ -8311,7 +8373,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); } else { ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, @@ -8765,7 +8827,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); cur_bytes = max(cur_bytes, min_size); ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, - *alloc_hint, &ins, 1); + *alloc_hint, &ins, 1, 0); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -8779,7 +8841,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, BTRFS_FILE_EXTENT_PREALLOC); if (ret) { btrfs_free_reserved_extent(root, ins.objectid, - ins.offset); + ins.offset, 0); btrfs_abort_transaction(trans, root, ret); if (own_trans) btrfs_end_transaction(trans, root); @@ -8889,6 +8951,66 @@ static int btrfs_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } +static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + u64 objectid; + u64 index; + int ret = 0; + + /* + * 5 units required for adding orphan entry + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + goto out; + + inode = btrfs_new_inode(trans, root, dir, NULL, 0, + btrfs_ino(dir), objectid, mode, &index); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + goto out; + } + + ret = btrfs_init_inode_security(trans, inode, dir, NULL); + if (ret) + goto out; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out; + + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; + + d_tmpfile(dentry, inode); + mark_inode_dirty(inode); + +out: + btrfs_end_transaction(trans, root); + if (ret) + iput(inode); + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); + + return ret; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -8909,6 +9031,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, + .tmpfile = btrfs_tmpfile, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2ad7de94efe..47aceb494d1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -58,6 +58,7 @@ #include "dev-replace.h" #include "props.h" #include "sysfs.h" +#include "qgroup.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -135,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) void btrfs_update_iflags(struct inode *inode) { struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + unsigned int new_fl = 0; if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + + set_mask_bits(&inode->i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, + new_fl); } /* @@ -638,11 +642,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_trans_handle *trans; int ret; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; atomic_inc(&root->will_be_snapshoted); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); btrfs_wait_nocow_write(root); ret = btrfs_start_delalloc_inodes(root, 0); @@ -711,6 +715,35 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; + /* + * If orphan cleanup did remove any orphans, it means the tree was + * modified and therefore the commit root is not the same as the + * current root anymore. This is a problem, because send uses the + * commit root and therefore can see inode items that don't exist + * in the current root anymore, and for example make calls to + * btrfs_iget, which will do tree lookups based on the current root + * and not on the commit root. Those lookups will fail, returning a + * -ESTALE error, and making send fail with that error. So make sure + * a send does not see any orphans we have just removed, and that it + * will see the same inodes regardless of whether a transaction + * commit happened before it started (meaning that the commit root + * will be the same as the current root) or not. + */ + if (readonly && pending_snapshot->snap->node != + pending_snapshot->snap->commit_root) { + trans = btrfs_join_transaction(pending_snapshot->snap); + if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto fail; + } + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, + pending_snapshot->snap); + if (ret) + goto fail; + } + } + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -1502,11 +1535,12 @@ static noinline int btrfs_ioctl_resize(struct file *file, sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { - char *end; sizestr = devstr + 1; *devstr = '\0'; devstr = vol_args->name; - devid = simple_strtoull(devstr, &end, 10); + ret = kstrtoull(devstr, 10, &devid); + if (ret) + goto out_free; if (!devid) { ret = -EINVAL; goto out_free; @@ -1562,7 +1596,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = old_size - new_size; } else if (mod > 0) { if (new_size > ULLONG_MAX - old_size) { - ret = -EINVAL; + ret = -ERANGE; goto out_free; } new_size = old_size + new_size; @@ -1926,7 +1960,8 @@ static noinline int copy_to_sk(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - char *buf, + size_t *buf_size, + char __user *ubuf, unsigned long *sk_offset, int *num_found) { @@ -1958,13 +1993,25 @@ static noinline int copy_to_sk(struct btrfs_root *root, if (!key_in_sk(key, sk)) continue; - if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + if (sizeof(sh) + item_len > *buf_size) { + if (*num_found) { + ret = 1; + goto out; + } + + /* + * return one empty item back for v1, which does not + * handle -EOVERFLOW + */ + + *buf_size = sizeof(sh) + item_len; item_len = 0; + ret = -EOVERFLOW; + } - if (sizeof(sh) + item_len + *sk_offset > - BTRFS_SEARCH_ARGS_BUFSIZE) { + if (sizeof(sh) + item_len + *sk_offset > *buf_size) { ret = 1; - goto overflow; + goto out; } sh.objectid = key->objectid; @@ -1974,20 +2021,33 @@ static noinline int copy_to_sk(struct btrfs_root *root, sh.transid = found_transid; /* copy search result header */ - memcpy(buf + *sk_offset, &sh, sizeof(sh)); + if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = -EFAULT; + goto out; + } + *sk_offset += sizeof(sh); if (item_len) { - char *p = buf + *sk_offset; + char __user *up = ubuf + *sk_offset; /* copy the item */ - read_extent_buffer(leaf, p, - item_off, item_len); + if (read_extent_buffer_to_user(leaf, up, + item_off, item_len)) { + ret = -EFAULT; + goto out; + } + *sk_offset += item_len; } (*num_found)++; - if (*num_found >= sk->nr_items) - break; + if (ret) /* -EOVERFLOW from above */ + goto out; + + if (*num_found >= sk->nr_items) { + ret = 1; + goto out; + } } advance_key: ret = 0; @@ -2002,22 +2062,37 @@ advance_key: key->objectid++; } else ret = 1; -overflow: +out: + /* + * 0: all items from this leaf copied, continue with next + * 1: * more items can be copied, but unused buffer is too small + * * all items were found + * Either way, it will stops the loop which iterates to the next + * leaf + * -EOVERFLOW: item was to large for buffer + * -EFAULT: could not copy extent buffer back to userspace + */ return ret; } static noinline int search_ioctl(struct inode *inode, - struct btrfs_ioctl_search_args *args) + struct btrfs_ioctl_search_key *sk, + size_t *buf_size, + char __user *ubuf) { struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; - struct btrfs_ioctl_search_key *sk = &args->key; struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; int ret; int num_found = 0; unsigned long sk_offset = 0; + if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) { + *buf_size = sizeof(struct btrfs_ioctl_search_header); + return -EOVERFLOW; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2051,14 +2126,15 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; goto err; } - ret = copy_to_sk(root, path, &key, sk, args->buf, + ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); - if (ret || num_found >= sk->nr_items) + if (ret) break; } - ret = 0; + if (ret > 0) + ret = 0; err: sk->nr_items = num_found; btrfs_free_path(path); @@ -2068,22 +2144,73 @@ err: static noinline int btrfs_ioctl_tree_search(struct file *file, void __user *argp) { - struct btrfs_ioctl_search_args *args; - struct inode *inode; - int ret; + struct btrfs_ioctl_search_args __user *uargs; + struct btrfs_ioctl_search_key sk; + struct inode *inode; + int ret; + size_t buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - args = memdup_user(argp, sizeof(*args)); - if (IS_ERR(args)) - return PTR_ERR(args); + uargs = (struct btrfs_ioctl_search_args __user *)argp; + + if (copy_from_user(&sk, &uargs->key, sizeof(sk))) + return -EFAULT; + + buf_size = sizeof(uargs->buf); inode = file_inode(file); - ret = search_ioctl(inode, args); - if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + + /* + * In the origin implementation an overflow is handled by returning a + * search header with a len of zero, so reset ret. + */ + if (ret == -EOVERFLOW) + ret = 0; + + if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) ret = -EFAULT; - kfree(args); + return ret; +} + +static noinline int btrfs_ioctl_tree_search_v2(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_search_args_v2 __user *uarg; + struct btrfs_ioctl_search_args_v2 args; + struct inode *inode; + int ret; + size_t buf_size; + const size_t buf_limit = 16 * 1024 * 1024; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* copy search header and buffer size */ + uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; + if (copy_from_user(&args, uarg, sizeof(args))) + return -EFAULT; + + buf_size = args.buf_size; + + if (buf_size < sizeof(struct btrfs_ioctl_search_header)) + return -EOVERFLOW; + + /* limit result size to 16MB */ + if (buf_size > buf_limit) + buf_size = buf_limit; + + inode = file_inode(file); + ret = search_ioctl(inode, &args.key, &buf_size, + (char *)(&uarg->buf[0])); + if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) + ret = -EFAULT; + else if (ret == -EOVERFLOW && + copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) + ret = -EFAULT; + return ret; } @@ -2219,6 +2346,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; + u64 root_flags; u64 qgroup_reserved; int namelen; int ret; @@ -2240,6 +2368,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out; + err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); if (err == -EINTR) goto out_drop_write; @@ -2301,6 +2430,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } mutex_lock(&inode->i_mutex); + + /* + * Don't allow to delete a subvolume with send in progress. This is + * inside the i_mutex so the error handling that has to drop the bit + * again is not run concurrently. + */ + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + if (dest->send_in_progress == 0) { + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } else { + spin_unlock(&dest->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to delete subvolume %llu during send", + dest->root_key.objectid); + err = -EPERM; + goto out_dput; + } + err = d_invalidate(dentry); if (err) goto out_unlock; @@ -2346,7 +2496,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.drop_level = 0; btrfs_set_root_refs(&dest->root_item, 0); - if (!xchg(&dest->orphan_item_inserted, 1)) { + if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, dest->root_key.objectid); @@ -2389,11 +2539,19 @@ out_release: out_up_write: up_write(&root->fs_info->subvol_sem); out_unlock: + if (err) { + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } mutex_unlock(&inode->i_mutex); if (!err) { shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); + ASSERT(dest->send_in_progress == 0); /* the last ref */ if (dest->cache_inode) { @@ -2557,9 +2715,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); if (!fi_args) return -ENOMEM; @@ -2574,6 +2729,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) } mutex_unlock(&fs_devices->device_list_mutex); + fi_args->nodesize = root->fs_info->super_copy->nodesize; + fi_args->sectorsize = root->fs_info->super_copy->sectorsize; + fi_args->clone_alignment = root->fs_info->super_copy->sectorsize; + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; @@ -2589,9 +2748,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) int ret = 0; char *s_uuid = NULL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - di_args = memdup_user(arg, sizeof(*di_args)); if (IS_ERR(di_args)) return PTR_ERR(di_args); @@ -2669,10 +2825,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); ordered = btrfs_lookup_first_ordered_extent(inode, off + len - 1); - if (!ordered && + if ((!ordered || + ordered->file_offset + ordered->len <= off || + ordered->file_offset >= off + len) && !test_range_bit(&BTRFS_I(inode)->io_tree, off, - off + len - 1, EXTENT_DELALLOC, 0, NULL)) + off + len - 1, EXTENT_DELALLOC, 0, NULL)) { + if (ordered) + btrfs_put_ordered_extent(ordered); break; + } unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); @@ -2912,6 +3073,129 @@ out: return ret; } +/* Helper to check and see if this root currently has a ref on the given disk + * bytenr. If it does then we need to update the quota for this root. This + * doesn't do anything if quotas aren't enabled. + */ +static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 disko) +{ + struct seq_list tree_mod_seq_elem = {}; + struct ulist *roots; + struct ulist_iterator uiter; + struct ulist_node *root_node = NULL; + int ret; + + if (!root->fs_info->quota_enabled) + return 1; + + btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + ret = btrfs_find_all_roots(trans, root->fs_info, disko, + tree_mod_seq_elem.seq, &roots); + if (ret < 0) + goto out; + ret = 0; + ULIST_ITER_INIT(&uiter); + while ((root_node = ulist_next(roots, &uiter))) { + if (root_node->val == root->objectid) { + ret = 1; + break; + } + } + ulist_free(roots); +out: + btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + return ret; +} + +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, + struct inode *inode, + u64 endoff, + const u64 destoff, + const u64 olen) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* + * We round up to the block size at eof when determining which + * extents to clone above, but shouldn't round up the file size. + */ + if (endoff > destoff + olen) + endoff = destoff + olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_end_transaction(trans, root); +out: + return ret; +} + +static void clone_update_extent_map(struct inode *inode, + const struct btrfs_trans_handle *trans, + const struct btrfs_path *path, + const u64 hole_offset, + const u64 hole_len) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + return; + } + + if (path) { + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_extent_item_to_extent_map(inode, path, fi, false, em); + em->generation = -1; + if (btrfs_file_extent_type(path->nodes[0], fi) == + BTRFS_FILE_EXTENT_INLINE) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + em->start = hole_offset; + em->len = hole_len; + em->ram_bytes = em->len; + em->orig_start = hole_offset; + em->block_start = EXTENT_MAP_HOLE; + em->block_len = 0; + em->orig_block_len = 0; + em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = trans->transid; + } + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + + if (unlikely(ret)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -2924,7 +3208,8 @@ out: * @destoff: Offset within @inode to start clone */ static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff) + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path = NULL; @@ -2935,7 +3220,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u32 nritems; int slot; int ret; - u64 len = olen_aligned; + int no_quota; + const u64 len = olen_aligned; + u64 last_disko = 0; + u64 last_dest_end = destoff; ret = -ENOMEM; buf = vmalloc(btrfs_level_size(root, 0)); @@ -2952,7 +3240,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; + key.offset = off; while (1) { /* @@ -2964,9 +3252,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode, 0, 0); if (ret < 0) goto out; + /* + * First search, if no extent item that starts at offset off was + * found but the previous item is an extent item, it's possible + * it might overlap our target range, therefore process it. + */ + if (key.offset == off && ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } nritems = btrfs_header_nritems(path->nodes[0]); process_slot: + no_quota = 1; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -2991,7 +3291,7 @@ process_slot: u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; - u64 endoff; + u64 drop_start; extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -3012,10 +3312,16 @@ process_slot: extent); } - if (key.offset + datal <= off || - key.offset >= off + len - 1) { + /* + * The first search might have left us at an extent + * item that ends before our target range's start, can + * happen if we have holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { path->slots[0]++; goto process_slot; + } else if (key.offset >= off + len) { + break; } size = btrfs_item_size_nr(leaf, slot); @@ -3034,6 +3340,18 @@ process_slot: new_key.offset = destoff; /* + * Deal with a hole that doesn't have an extent item + * that represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning + * range or at the beginning (fully overlaps it or + * partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + /* * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -3051,18 +3369,18 @@ process_slot: * | ------------- extent ------------- | */ - /* substract range b */ + /* subtract range b */ if (key.offset + datal > off + len) datal = off + len - key.offset; - /* substract range a */ + /* subtract range a */ if (off > key.offset) { datao += off - key.offset; datal -= off - key.offset; } ret = btrfs_drop_extents(trans, root, inode, - new_key.offset, + drop_start, new_key.offset + datal, 1); if (ret) { @@ -3099,6 +3417,28 @@ process_slot: datao); btrfs_set_file_extent_num_bytes(leaf, extent, datal); + + /* + * We need to look up the roots that point at + * this bytenr and see if the new root does. If + * it does not we need to make sure we update + * quotas appropriately. + */ + if (disko && root != BTRFS_I(src)->root && + disko != last_disko) { + no_quota = check_ref(trans, root, + disko); + if (no_quota < 0) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + ret = no_quota; + goto out; + } + } + if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, @@ -3106,7 +3446,7 @@ process_slot: root->root_key.objectid, btrfs_ino(inode), new_key.offset - datao, - 0); + no_quota); if (ret) { btrfs_abort_transaction(trans, root, @@ -3120,6 +3460,8 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; + u64 aligned_end = 0; + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3136,9 +3478,11 @@ process_slot: size -= skip + trim; datal -= skip + trim; + aligned_end = ALIGN(new_key.offset + datal, + root->sectorsize); ret = btrfs_drop_extents(trans, root, inode, - new_key.offset, - new_key.offset + datal, + drop_start, + aligned_end, 1); if (ret) { if (ret != -EOPNOTSUPP) @@ -3172,38 +3516,62 @@ process_slot: inode_add_bytes(inode, datal); } + /* If we have an implicit hole (NO_HOLES feature). */ + if (drop_start < new_key.offset) + clone_update_extent_map(inode, trans, + NULL, drop_start, + new_key.offset - drop_start); + + clone_update_extent_map(inode, trans, path, 0, 0); + btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - - /* - * we round up to the block size at eof when - * determining which extents to clone above, - * but shouldn't round up the file size - */ - endoff = new_key.offset + datal; - if (endoff > destoff+olen) - endoff = destoff+olen; - if (endoff > inode->i_size) - btrfs_i_size_write(inode, endoff); - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); + last_dest_end = new_key.offset + datal; + ret = clone_finish_inode_update(trans, inode, + last_dest_end, + destoff, olen); + if (ret) goto out; - } - ret = btrfs_end_transaction(trans, root); + if (new_key.offset + datal >= destoff + len) + break; } btrfs_release_path(path); key.offset++; } ret = 0; + if (last_dest_end < destoff + len) { + /* + * We have an implicit hole (NO_HOLES feature is enabled) that + * fully or partially overlaps our cloning range at its end. + */ + btrfs_release_path(path); + + /* + * 1 - remove extent(s) + * 1 - inode update + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_drop_extents(trans, root, inode, + last_dest_end, destoff + len, 1); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + clone_update_extent_map(inode, trans, NULL, last_dest_end, + destoff + len - last_dest_end); + ret = clone_finish_inode_update(trans, inode, destoff + len, + destoff, olen); + } + out: - btrfs_release_path(path); btrfs_free_path(path); vfree(buf); return ret; @@ -3315,15 +3683,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_unlock; } - /* truncate page cache pages from target inode range */ - truncate_inode_pages_range(&inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len) - 1); + /* + * Lock the target range too. Right after we replace the file extent + * items in the fs tree (which now point to the cloned data), we might + * have a worker replace them with extent items relative to a write + * operation that was issued before this clone operation (i.e. confront + * with inode.c:btrfs_finish_ordered_io). + */ + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - lock_extent_range(src, off, len); + lock_extent_range(src, lock_start, lock_len); + } else { + lock_extent_range(src, off, len); + lock_extent_range(inode, destoff, len); + } ret = btrfs_clone(src, inode, off, olen, len, destoff); - unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_end = max_t(u64, off, destoff) + len - 1; + + unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); + } else { + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, + destoff + len - 1); + } + /* + * Truncate page cache pages so that future reads will see the cloned + * data immediately and not the previous data. + */ + truncate_inode_pages_range(&inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); out_unlock: if (!same_inode) { if (inode < src) { @@ -4898,6 +5292,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_trans_end(file); case BTRFS_IOC_TREE_SEARCH: return btrfs_ioctl_tree_search(file, argp); + case BTRFS_IOC_TREE_SEARCH_V2: + return btrfs_ioctl_tree_search_v2(file, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(file, argp); case BTRFS_IOC_INO_PATHS: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 01277b8f237..5665d214924 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -33,14 +33,14 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); */ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } + /* + * no lock is required. The lock owner may change if + * we have a read lock, but it won't change to or away + * from us. If we have the write lock, we are the owner + * and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; if (rw == BTRFS_WRITE_LOCK) { if (atomic_read(&eb->blocking_writers) == 0) { WARN_ON(atomic_read(&eb->spinning_writers) != 1); @@ -65,14 +65,15 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) */ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } + /* + * no lock is required. The lock owner may change if + * we have a read lock, but it won't change to or away + * from us. If we have the write lock, we are the owner + * and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + if (rw == BTRFS_WRITE_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); @@ -99,6 +100,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) void btrfs_tree_read_lock(struct extent_buffer *eb) { again: + BUG_ON(!atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner); + read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers) && current->pid == eb->lock_owner) { @@ -132,7 +136,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) if (atomic_read(&eb->blocking_writers)) return 0; - read_lock(&eb->lock); + if (!read_trylock(&eb->lock)) + return 0; + if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); return 0; @@ -151,7 +157,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) if (atomic_read(&eb->blocking_writers) || atomic_read(&eb->blocking_readers)) return 0; - write_lock(&eb->lock); + + if (!write_trylock(&eb->lock)) + return 0; + if (atomic_read(&eb->blocking_writers) || atomic_read(&eb->blocking_readers)) { write_unlock(&eb->lock); @@ -168,14 +177,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); + /* + * if we're nested, we have the write lock. No new locking + * is needed as long as we are the lock owner. + * The write unlock will do a barrier for us, and the lock_nested + * field only matters to the lock owner. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + return; } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->spinning_readers) == 0); @@ -189,14 +199,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); + /* + * if we're nested, we have the write lock. No new locking + * is needed as long as we are the lock owner. + * The write unlock will do a barrier for us, and the lock_nested + * field only matters to the lock owner. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + return; } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); @@ -244,6 +255,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb) BUG_ON(blockers > 1); btrfs_assert_tree_locked(eb); + eb->lock_owner = 0; atomic_dec(&eb->write_locks); if (blockers) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b47f669aca7..dfad8514f0d 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -143,7 +143,7 @@ static int lzo_compress_pages(struct list_head *ws, if (ret != LZO_E_OK) { printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); - ret = -1; + ret = -EIO; goto out; } @@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws, kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; - ret = -1; + ret = -E2BIG; goto out; } @@ -208,7 +208,7 @@ static int lzo_compress_pages(struct list_head *ws, /* we're making it bigger, give up */ if (tot_in > 8192 && tot_in < tot_out) { - ret = -1; + ret = -E2BIG; goto out; } @@ -335,7 +335,7 @@ cont: break; if (page_in_index + 1 >= total_pages_in) { - ret = -1; + ret = -EIO; goto done; } @@ -358,7 +358,7 @@ cont: kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { printk(KERN_WARNING "BTRFS: decompress failed\n"); - ret = -1; + ret = -EIO; break; } @@ -402,12 +402,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { printk(KERN_WARNING "BTRFS: decompress failed!\n"); - ret = -1; + ret = -EIO; goto out; } if (out_len < start_byte) { - ret = -1; + ret = -EIO; goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a94b05f7286..7187b14faa6 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " - "%llu\n", offset); + "%llu", offset); } /* @@ -484,8 +484,19 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) log_list); list_del_init(&ordered->log_list); spin_unlock_irq(&log->log_extents_lock[index]); + + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + struct inode *inode = ordered->inode; + u64 start = ordered->file_offset; + u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(!inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); + btrfs_put_ordered_extent(ordered); spin_lock_irq(&log->log_extents_lock[index]); } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6efd70d3b64..9626b4ad3b9 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot) +static void print_extent_item(struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - int type; u32 item_size = btrfs_item_size_nr(eb, slot); u64 flags; u64 offset; @@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot) btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), flags); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if ((type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); @@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - print_extent_item(l, i); + case BTRFS_METADATA_ITEM_KEY: + print_extent_item(l, i, type); break; case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref\n"); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2cf905877aa..98cb6b2630f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -32,6 +32,7 @@ #include "ulist.h" #include "backref.h" #include "extent_io.h" +#include "qgroup.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -84,8 +85,8 @@ struct btrfs_qgroup { /* * temp variables for accounting operations */ - u64 tag; - u64 refcnt; + u64 old_refcnt; + u64 new_refcnt; }; /* @@ -98,6 +99,9 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; +#define ptr_to_u64(x) ((u64)(uintptr_t)x) +#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x) + static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); @@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info, return -ENOENT; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl) +{ + struct btrfs_qgroup *qgroup; + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) + return -EINVAL; + if (qgroup->rfer != rfer || qgroup->excl != excl) + return -EINVAL; + return 0; +} +#endif + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, "a_root->state))) + return 0; +#endif path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -669,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroup->qgroupid; @@ -1174,33 +1201,198 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } +static int comp_oper(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->seq < oper2->seq) + return -1; + if (oper1->seq > oper2->seq) + return -1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + if (oper1->type < oper2->type) + return -1; + if (oper1->type > oper2->type) + return 1; + return 0; +} + +static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + p = &fs_info->qgroup_op_tree.rb_node; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct btrfs_qgroup_operation, n); + cmp = comp_oper(cur, oper); + if (cmp < 0) { + p = &(*p)->rb_right; + } else if (cmp) { + p = &(*p)->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + rb_link_node(&oper->n, parent, p); + rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} /* - * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts - * the modification into a list that's later used by btrfs_end_transaction to - * pass the recorded modifications on to btrfs_qgroup_account_ref. + * Record a quota operation for processing later on. + * @trans: the transaction we are adding the delayed op to. + * @fs_info: the fs_info for this fs. + * @ref_root: the root of the reference we are acting on, + * @bytenr: the bytenr we are acting on. + * @num_bytes: the number of bytes in the reference. + * @type: the type of operation this is. + * @mod_seq: do we need to get a sequence number for looking up roots. + * + * We just add it to our trans qgroup_ref_list and carry on and process these + * operations in order at some later point. If the reference root isn't a fs + * root then we don't bother with doing anything. + * + * MUST BE HOLDING THE REF LOCK. */ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, int mod_seq) { - struct qgroup_update *u; + struct btrfs_qgroup_operation *oper; + int ret; + + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + return 0; - BUG_ON(!trans->delayed_ref_elem.seq); - u = kmalloc(sizeof(*u), GFP_NOFS); - if (!u) + oper = kmalloc(sizeof(*oper), GFP_NOFS); + if (!oper) return -ENOMEM; - u->node = node; - u->extent_op = extent_op; - list_add_tail(&u->list, &trans->qgroup_ref_list); + oper->ref_root = ref_root; + oper->bytenr = bytenr; + oper->num_bytes = num_bytes; + oper->type = type; + oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); + INIT_LIST_HEAD(&oper->elem.list); + oper->elem.seq = 0; + ret = insert_qgroup_oper(fs_info, oper); + if (ret) { + /* Shouldn't happen so have an assert for developers */ + ASSERT(0); + kfree(oper); + return ret; + } + list_add_tail(&oper->list, &trans->qgroup_ref_list); + + if (mod_seq) + btrfs_get_tree_mod_seq(fs_info, &oper->elem); return 0; } -static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq) +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. + */ +static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct btrfs_qgroup *qgroup; + struct ulist *tmp; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int sign = 0; + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) + goto out; + qgroup = find_qgroup_rb(fs_info, oper->ref_root); + if (!qgroup) + goto out; + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + sign = 1; + break; + case BTRFS_QGROUP_OPER_SUB_EXCL: + sign = -1; + break; + default: + ASSERT(0); + } + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); + qgroup->excl += sign * oper->num_bytes; + qgroup->excl_cmpr += sign * oper->num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + qgroup->excl += sign * oper->num_bytes; + if (sign < 0) + WARN_ON(qgroup->excl < oper->num_bytes); + qgroup->excl_cmpr += sign * oper->num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(tmp); + return ret; +} + +/* + * Walk all of the roots that pointed to our bytenr and adjust their refcnts as + * properly. + */ +static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, + u64 root_to_skip, struct ulist *tmp, + struct ulist *roots, struct ulist *qgroups, + u64 seq, int *old_roots, int rescan) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1211,256 +1403,551 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { + /* We don't count our current root here */ + if (unode->val == root_to_skip) + continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; + /* + * We could have a pending removal of this same ref so we may + * not have actually found our ref root when doing + * btrfs_find_all_roots, so we need to keep track of how many + * old roots we find in case we removed ours and added a + * different one at the same time. I don't think this could + * happen in practice but that sort of thinking leads to pain + * and suffering and to the dark side. + */ + (*old_roots)++; ulist_reinit(tmp); - /* XXX id not needed */ - ret = ulist_add(tmp, qg->qgroupid, - (u64)(uintptr_t)qg, GFP_ATOMIC); + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC); if (ret < 0) return ret; ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->refcnt < seq) - qg->refcnt = seq + 1; + qg = u64_to_ptr(tmp_unode->aux); + /* + * We use this sequence number to keep from having to + * run the whole list and 0 out the refcnt every time. + * We basically use sequnce as the known 0 count and + * then add 1 everytime we see a qgroup. This is how we + * get how many of the roots actually point up to the + * upper level qgroups in order to determine exclusive + * counts. + * + * For rescan we want to set old_refcnt to seq so our + * exclusive calculations end up correct. + */ + if (rescan) + qg->old_refcnt = seq; + else if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; else - ++qg->refcnt; + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; ret = ulist_add(tmp, glist->group->qgroupid, - (u64)(uintptr_t)glist->group, + ptr_to_u64(glist->group), GFP_ATOMIC); if (ret < 0) return ret; } } } + return 0; +} + +/* + * We need to walk forward in our operation tree and account for any roots that + * were deleted after we made this operation. + */ +static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct ulist *tmp, + struct ulist *qgroups, u64 seq, + int *old_roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct btrfs_qgroup_operation *tmp_oper; + struct rb_node *n; + int ret; + + ulist_reinit(tmp); + /* + * We only walk forward in the tree since we're only interested in + * removals that happened _after_ our operation. + */ + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + return 0; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + while (tmp_oper->bytenr == oper->bytenr) { + /* + * If it's not a removal we don't care, additions work out + * properly with our refcnt tracking. + */ + if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && + tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) + goto next; + qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); + if (!qg) + goto next; + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret) { + if (ret < 0) + return ret; + /* + * We only want to increase old_roots if this qgroup is + * not already in the list of qgroups. If it is already + * there then that means it must have been re-added or + * the delete will be discarded because we had an + * existing ref that we haven't looked up yet. In this + * case we don't want to increase old_roots. So if ret + * == 1 then we know that this is the first time we've + * seen this qgroup and we can bump the old_roots. + */ + (*old_roots)++; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + } +next: + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&tmp_oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + break; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + } + + /* Ok now process the qgroups we found */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + } + } return 0; } -static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq, int sgn, u64 num_bytes, - struct btrfs_qgroup *qgroup) +/* Add refcnt for the newly added reference. */ +static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct btrfs_qgroup *qgroup, + struct ulist *tmp, struct ulist *qgroups, + u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; int ret; ulist_reinit(tmp); - ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); + ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); if (ret < 0) return ret; - ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(tmp, &uiter))) { - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; - if (qg->refcnt < seq) { - /* not visited by step 1 */ - qg->rfer += sgn * num_bytes; - qg->rfer_cmpr += sgn * num_bytes; - if (roots->nnodes == 0) { - qg->excl += sgn * num_bytes; - qg->excl_cmpr += sgn * num_bytes; - } - qgroup_dirty(fs_info, qg); - } - WARN_ON(qg->tag >= seq); - qg->tag = seq; + struct btrfs_qgroup_list *glist; + qg = u64_to_ptr(unode->aux); + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + } else { + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + } list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); if (ret < 0) return ret; } } - return 0; } -static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq, int sgn, u64 num_bytes) +/* + * This adjusts the counters for all referenced qgroups if need be. + */ +static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, + u64 root_to_skip, u64 num_bytes, + struct ulist *qgroups, u64 seq, + int old_roots, int new_roots, int rescan) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; - int ret; + u64 cur_new_count, cur_old_count; ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; + while ((unode = ulist_next(qgroups, &uiter))) { + bool dirty = false; - ulist_reinit(tmp); - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); - if (ret < 0) - return ret; + qg = u64_to_ptr(unode->aux); + /* + * Wasn't referenced before but is now, add to the reference + * counters. + */ + if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; + dirty = true; + } - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { - struct btrfs_qgroup_list *glist; + /* + * Was referenced before but isn't now, subtract from the + * reference counters. + */ + if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + qg->rfer -= num_bytes; + qg->rfer_cmpr -= num_bytes; + dirty = true; + } - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->tag == seq) - continue; + if (qg->old_refcnt < seq) + cur_old_count = 0; + else + cur_old_count = qg->old_refcnt - seq; + if (qg->new_refcnt < seq) + cur_new_count = 0; + else + cur_new_count = qg->new_refcnt - seq; - if (qg->refcnt - seq == roots->nnodes) { - qg->excl -= sgn * num_bytes; - qg->excl_cmpr -= sgn * num_bytes; - qgroup_dirty(fs_info, qg); - } + /* + * If our refcount was the same as the roots previously but our + * new count isn't the same as the number of roots now then we + * went from having a exclusive reference on this range to not. + */ + if (old_roots && cur_old_count == old_roots && + (cur_new_count != new_roots || new_roots == 0)) { + WARN_ON(cur_new_count != new_roots && new_roots == 0); + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - if (ret < 0) - return ret; - } + /* + * If we didn't reference all the roots before but now we do we + * have an exclusive reference to this range. + */ + if ((!old_roots || (old_roots && cur_old_count != old_roots)) + && cur_new_count == new_roots) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; } - } + if (dirty) + qgroup_dirty(fs_info, qg); + } return 0; } /* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. + * If we removed a data extent and there were other references for that bytenr + * then we need to lookup all referenced roots to make sure we still don't + * reference this bytenr. If we do then we can just discard this operation. */ -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) +static int check_existing_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) { - struct btrfs_root *quota_root; - u64 ref_root; - struct btrfs_qgroup *qgroup; struct ulist *roots = NULL; - u64 seq; + struct ulist_node *unode; + struct ulist_iterator uiter; int ret = 0; - int sgn; - if (!fs_info->quota_enabled) - return 0; - - BUG_ON(!fs_info->quota_root); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + oper->elem.seq, &roots); + if (ret < 0) + return ret; + ret = 0; - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - struct btrfs_delayed_tree_ref *ref; - ref = btrfs_delayed_node_to_tree_ref(node); - ref_root = ref->root; - } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || - node->type == BTRFS_SHARED_DATA_REF_KEY) { - struct btrfs_delayed_data_ref *ref; - ref = btrfs_delayed_node_to_data_ref(node); - ref_root = ref->root; - } else { - BUG(); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + if (unode->val == oper->ref_root) { + ret = 1; + break; + } } + ulist_free(roots); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); - if (!is_fstree(ref_root)) { - /* - * non-fs-trees are not being accounted - */ - return 0; - } + return ret; +} - switch (node->action) { - case BTRFS_ADD_DELAYED_REF: - case BTRFS_ADD_DELAYED_EXTENT: - sgn = 1; - seq = btrfs_tree_mod_seq_prev(node->seq); - break; - case BTRFS_DROP_DELAYED_REF: - sgn = -1; - seq = node->seq; - break; - case BTRFS_UPDATE_DELAYED_HEAD: - return 0; - default: - BUG(); - } +/* + * If we share a reference across multiple roots then we may need to adjust + * various qgroups referenced and exclusive counters. The basic premise is this + * + * 1) We have seq to represent a 0 count. Instead of looping through all of the + * qgroups and resetting their refcount to 0 we just constantly bump this + * sequence number to act as the base reference count. This means that if + * anybody is equal to or below this sequence they were never referenced. We + * jack this sequence up by the number of roots we found each time in order to + * make sure we don't have any overlap. + * + * 2) We first search all the roots that reference the area _except_ the root + * we're acting on currently. This makes up the old_refcnt of all the qgroups + * before. + * + * 3) We walk all of the qgroups referenced by the root we are currently acting + * on, and will either adjust old_refcnt in the case of a removal or the + * new_refcnt in the case of an addition. + * + * 4) Finally we walk all the qgroups that are referenced by this range + * including the root we are acting on currently. We will adjust the counters + * based on the number of roots we had and will have after this operation. + * + * Take this example as an illustration + * + * [qgroup 1/0] + * / | \ + * [qg 0/0] [qg 0/1] [qg 0/2] + * \ | / + * [ extent ] + * + * Say we are adding a reference that is covered by qg 0/0. The first step + * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with + * old_roots being 2. Because it is adding new_roots will be 1. We then go + * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's + * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we + * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a + * reference and thus must add the size to the referenced bytes. Everything + * else is the same so nothing else changes. + */ +static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist *qgroups, *tmp; + struct btrfs_qgroup *qgroup; + struct seq_list elem = {}; + u64 seq; + int old_roots = 0; + int new_roots = 0; + int ret = 0; - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); + if (oper->elem.seq) { + ret = check_existing_refs(trans, fs_info, oper); + if (ret < 0) + return ret; + if (ret) return 0; - } } - mutex_unlock(&fs_info->qgroup_rescan_lock); - /* - * the delayed ref sequence number we pass depends on the direction of - * the operation. for add operations, we pass - * tree_mod_log_prev_seq(node->seq) to skip - * the delayed ref's current sequence number, because we need the state - * of the tree before the add operation. for delete operations, we pass - * (node->seq) to include the delayed ref's current sequence number, - * because we need the state of the tree after the delete operation. - */ - ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots); - if (ret < 0) - return ret; - - spin_lock(&fs_info->qgroup_lock); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + return -ENOMEM; - quota_root = fs_info->quota_root; - if (!quota_root) - goto unlock; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) { + ulist_free(qgroups); + return -ENOMEM; + } - qgroup = find_qgroup_rb(fs_info, ref_root); + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, + &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) { + ulist_free(qgroups); + ulist_free(tmp); + return ret; + } + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, oper->ref_root); if (!qgroup) - goto unlock; + goto out; + seq = fs_info->qgroup_seq; /* - * step 1: for each old ref, visit all nodes once and inc refcnt + * So roots is the list of all the roots currently pointing at the + * bytenr, including the ref we are adding if we are adding, or not if + * we are removing a ref. So we pass in the ref_root to skip that root + * in our calculations. We set old_refnct and new_refcnt cause who the + * hell knows what everything looked like before, and it doesn't matter + * except... */ - ulist_reinit(fs_info->qgroup_ulist); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, + seq, &old_roots, 0); + if (ret < 0) + goto out; - ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, - seq); - if (ret) - goto unlock; + /* + * Now adjust the refcounts of the qgroups that care about this + * reference, either the old_count in the case of removal or new_count + * in the case of an addition. + */ + ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, + seq); + if (ret < 0) + goto out; /* - * step 2: walk from the new root + * ...in the case of removals. If we had a removal before we got around + * to processing this operation then we need to find that guy and count + * his references as if they really existed so we don't end up screwing + * up the exclusive counts. Then whenever we go to process the delete + * everything will be grand and we can account for whatever exclusive + * changes need to be made there. We also have to pass in old_roots so + * we have an accurate count of the roots as it pertains to this + * operations view of the world. */ - ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, - seq, sgn, node->num_bytes, qgroup); - if (ret) - goto unlock; + ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, + &old_roots); + if (ret < 0) + goto out; /* - * step 3: walk again from old refs + * We are adding our root, need to adjust up the number of roots, + * otherwise old_roots is the number of roots we want. */ - ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, - seq, sgn, node->num_bytes); - if (ret) - goto unlock; + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + new_roots = old_roots + 1; + } else { + new_roots = old_roots; + old_roots++; + } + fs_info->qgroup_seq += old_roots + 1; -unlock: + + /* + * And now the magic happens, bless Arne for having a pretty elegant + * solution for this. + */ + qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, + qgroups, seq, old_roots, new_roots, 0); +out: spin_unlock(&fs_info->qgroup_lock); + ulist_free(qgroups); ulist_free(roots); + ulist_free(tmp); + return ret; +} + +/* + * btrfs_qgroup_account_ref is called for every ref that is added to or deleted + * from the fs. First, all roots referencing the extent are searched, and + * then the space is accounted accordingly to the different roots. The + * accounting algorithm works in 3 steps documented inline. + */ +static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + BUG_ON(!fs_info->quota_root); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + return 0; + } + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + ASSERT(is_fstree(oper->ref_root)); + + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + case BTRFS_QGROUP_OPER_SUB_EXCL: + ret = qgroup_excl_accounting(fs_info, oper); + break; + case BTRFS_QGROUP_OPER_ADD_SHARED: + case BTRFS_QGROUP_OPER_SUB_SHARED: + ret = qgroup_shared_accounting(trans, fs_info, oper); + break; + default: + ASSERT(0); + } + return ret; +} + +/* + * Needs to be called everytime we run delayed refs, even if there is an error + * in order to cleanup outstanding operations. + */ +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup_operation *oper; + int ret = 0; + while (!list_empty(&trans->qgroup_ref_list)) { + oper = list_first_entry(&trans->qgroup_ref_list, + struct btrfs_qgroup_operation, list); + list_del_init(&oper->list); + if (!ret || !trans->aborted) + ret = btrfs_qgroup_account(trans, fs_info, oper); + spin_lock(&fs_info->qgroup_op_lock); + rb_erase(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); + kfree(oper); + } return ret; } @@ -1629,8 +2116,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; - dstgroup->rfer = srcgroup->rfer - level_size; - dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; + + /* + * We call inherit after we clone the root in order to make sure + * our counts don't go crazy, so at this point the only + * difference between the two roots should be the root node. + */ + dstgroup->rfer = srcgroup->rfer; + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; + dstgroup->excl = level_size; + dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; qgroup_dirty(fs_info, dstgroup); @@ -1734,7 +2229,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + (s64)qg->rfer + num_bytes > @@ -1766,7 +2261,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); qg->reserved += num_bytes; } @@ -1812,7 +2307,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); qg->reserved -= num_bytes; @@ -1848,15 +2343,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *tmp, - struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, struct ulist *qgroups, + struct ulist *tmp, struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; struct seq_list tree_mod_seq_elem = {}; + u64 num_bytes; u64 seq; + int new_roots; int slot; int ret; @@ -1897,8 +2392,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { - u64 num_bytes; - btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) @@ -1908,76 +2401,34 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, else num_bytes = found.offset; - ret = btrfs_find_all_roots(trans, fs_info, found.objectid, - tree_mod_seq_elem.seq, &roots); + ulist_reinit(qgroups); + ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, + &roots); if (ret < 0) goto out; spin_lock(&fs_info->qgroup_lock); seq = fs_info->qgroup_seq; fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); - if (ret) { + new_roots = 0; + ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, + seq, &new_roots, 1); + if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); goto out; } - /* - * step2 of btrfs_qgroup_account_ref works from a single root, - * we're doing all at once here. - */ - ulist_reinit(tmp); - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - struct btrfs_qgroup *qg; - - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; - - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, - GFP_ATOMIC); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - } - - /* this loop is similar to step 2 of btrfs_qgroup_account_ref */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; - qg->rfer += num_bytes; - qg->rfer_cmpr += num_bytes; - WARN_ON(qg->tag >= seq); - if (qg->refcnt - seq == roots->nnodes) { - qg->excl += num_bytes; - qg->excl_cmpr += num_bytes; - } - qgroup_dirty(fs_info, qg); - - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - } + ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, + seq, 0, new_roots, 1); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; } - spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); - ret = 0; } - out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -1990,13 +2441,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL; + struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; path = btrfs_alloc_path(); if (!path) goto out; + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + goto out; tmp = ulist_alloc(GFP_NOFS); if (!tmp) goto out; @@ -2015,7 +2469,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - tmp, scratch_leaf); + qgroups, tmp, scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2025,6 +2479,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); + ulist_free(qgroups); ulist_free(tmp); btrfs_free_path(path); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h new file mode 100644 index 00000000000..5952ff1fbd7 --- /dev/null +++ b/fs/btrfs/qgroup.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_QGROUP__ +#define __BTRFS_QGROUP__ + +/* + * A description of the operations, all of these operations only happen when we + * are adding the 1st reference for that subvolume in the case of adding space + * or on the last reference delete in the case of subtraction. The only + * exception is the last one, which is added for confusion. + * + * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only + * one pointing at the bytes we are adding. This is called on the first + * allocation. + * + * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be + * shared between subvols. This is called on the creation of a ref that already + * has refs from a different subvolume, so basically reflink. + * + * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only + * one referencing the range. + * + * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with + * refs with other subvolumes. + */ +enum btrfs_qgroup_operation_type { + BTRFS_QGROUP_OPER_ADD_EXCL, + BTRFS_QGROUP_OPER_ADD_SHARED, + BTRFS_QGROUP_OPER_SUB_EXCL, + BTRFS_QGROUP_OPER_SUB_SHARED, +}; + +struct btrfs_qgroup_operation { + u64 ref_root; + u64 bytenr; + u64 num_bytes; + u64 seq; + enum btrfs_qgroup_operation_type type; + struct seq_list elem; + struct rb_node n; + struct list_head list; +}; + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + char *name); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, + int mod_seq); +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit); +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); +#endif + +#endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4055291a523..4a88f073fdd 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1956,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * pages are going to be uptodate. */ for (stripe = 0; stripe < bbio->num_stripes; stripe++) { - if (rbio->faila == stripe || - rbio->failb == stripe) + if (rbio->faila == stripe || rbio->failb == stripe) { + atomic_inc(&rbio->bbio->error); continue; + } for (pagenr = 0; pagenr < nr_pages; pagenr++) { struct page *p; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 30947f92362..09230cf3a24 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -428,8 +428,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, continue; } if (!dev->bdev) { - /* cannot read ahead on missing device */ - continue; + /* + * cannot read ahead on missing device, but for RAID5/6, + * REQ_GET_READ_MIRRORS return 1. So don't skip missing + * device for such case. + */ + if (nzones > 1) + continue; } if (dev_replace_is_ongoing && dev == fs_info->dev_replace.tgtdev) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7f92ab1daa8..65245a07275 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -337,7 +337,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) if (bnode->root) fs_info = bnode->root->fs_info; btrfs_panic(fs_info, errno, "Inconsistency in backref cache " - "found at offset %llu\n", bytenr); + "found at offset %llu", bytenr); } /* @@ -528,7 +528,7 @@ static int should_ignore_root(struct btrfs_root *root) { struct btrfs_root *reloc_root; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; reloc_root = root->reloc_root; @@ -610,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc, root = read_fs_root(rc->extent_root->fs_info, root_objectid); BUG_ON(IS_ERR(root)); - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && generation != btrfs_root_generation(&root->root_item)) return NULL; @@ -887,7 +887,7 @@ again: goto out; } - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) cur->cowonly = 1; if (btrfs_root_level(&root->root_item) == cur->level) { @@ -954,7 +954,8 @@ again: upper->bytenr = eb->start; upper->owner = btrfs_header_owner(eb); upper->level = lower->level + 1; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) upper->cowonly = 1; /* @@ -1258,7 +1259,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (rb_node) { btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " "for start=%llu while inserting into relocation " - "tree\n", node->bytenr); + "tree", node->bytenr); kfree(node); return -EEXIST; } @@ -2441,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, next = walk_up_backref(next, edges, &index); root = next->root; BUG_ON(!root); - BUG_ON(!root->ref_cows); + BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { record_reloc_root_in_trans(trans, root); @@ -2506,7 +2507,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, BUG_ON(!root); /* no other choice for non-references counted tree */ - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return root; if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) @@ -2893,14 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, goto out; } - if (!root || root->ref_cows) { + if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = reserve_metadata_space(trans, rc, node); if (ret) goto out; } if (root) { - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { BUG_ON(node->new_bytenr); BUG_ON(!list_empty(&node->list)); btrfs_record_root_in_trans(trans, root); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 38bb47e7d6b..360a728a639 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -306,7 +306,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) break; } - root->orphan_item_inserted = 1; + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); if (err) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0be77993378..b6d198f5181 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -588,8 +588,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { - ret = tree_backref_for_extent(&ptr, eb, ei, item_size, - &ref_root, &ref_level); + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " @@ -717,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) out: if (page) put_page(page); - if (inode) - iput(inode); + + iput(inode); if (ret < 0) return ret; @@ -2724,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); - if (found_key.offset + length <= start) { - key.offset = found_key.offset + length; - btrfs_release_path(path); - continue; - } + if (found_key.offset + length <= start) + goto skip; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -2739,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * the chunk from going away while we scrub it */ cache = btrfs_lookup_block_group(fs_info, chunk_offset); - if (!cache) { - ret = -ENOENT; - break; - } + + /* some chunks are removed but not committed to disk yet, + * continue scrubbing */ + if (!cache) + goto skip; + dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; @@ -2801,7 +2801,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; - +skip: key.offset = found_key.offset + length; btrfs_release_path(path); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index eb6537a08c1..6528aa66218 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -360,10 +360,13 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) /* * First time the inline_buf does not suffice */ - if (p->buf == p->inline_buf) + if (p->buf == p->inline_buf) { tmp_buf = kmalloc(len, GFP_NOFS); - else + if (tmp_buf) + memcpy(tmp_buf, p->buf, old_buf_len); + } else { tmp_buf = krealloc(p->buf, len, GFP_NOFS); + } if (!tmp_buf) return -ENOMEM; p->buf = tmp_buf; @@ -972,7 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; - const int buf_len = PATH_MAX; + int buf_len; u32 name_len; u32 data_len; u32 cur; @@ -982,6 +985,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; + if (found_key->type == BTRFS_XATTR_ITEM_KEY) + buf_len = BTRFS_MAX_XATTR_SIZE(root); + else + buf_len = PATH_MAX; + buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -1003,12 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - /* - * Path too long - */ - if (name_len + data_len > buf_len) { - ret = -ENAMETOOLONG; - goto out; + if (type == BTRFS_FT_XATTR) { + if (name_len > XATTR_NAME_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + if (name_len + data_len > buf_len) { + ret = -E2BIG; + goto out; + } + } else { + /* + * Path too long + */ + if (name_len + data_len > buf_len) { + ret = -ENAMETOOLONG; + goto out; + } } read_extent_buffer(eb, buf, (unsigned long)(di + 1), @@ -1346,7 +1365,7 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -EIO; btrfs_err(sctx->send_root->fs_info, "did not find backref in " "send_root. inode=%llu, offset=%llu, " - "disk_byte=%llu found extent=%llu\n", + "disk_byte=%llu found extent=%llu", ino, data_offset, disk_byte, found_key.objectid); goto out; } @@ -1625,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root, goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.type == BTRFS_ROOT_ITEM_KEY) { + ret = -ENOENT; + goto out; + } *found_inode = key.objectid; *found_type = btrfs_dir_type(path->nodes[0], di); @@ -1668,7 +1691,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; } - if (key.type == BTRFS_INODE_REF_KEY) { + if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; iref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); @@ -1690,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; btrfs_release_path(path); - ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, - NULL, NULL); - if (ret < 0) - goto out; + if (dir_gen) { + ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, + NULL, NULL, NULL); + if (ret < 0) + goto out; + } *dir = parent_dir; @@ -1709,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root, int ret; struct fs_path *tmp_name; u64 tmp_dir; - u64 tmp_dir_gen; tmp_name = fs_path_alloc(); if (!tmp_name) return -ENOMEM; - ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); + ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); if (ret < 0) goto out; @@ -2026,7 +2050,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, { int ret; int nce_ret; - struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; /* @@ -2052,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, } } - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in @@ -2131,7 +2150,6 @@ out_cache: name_cache_clean_unused(sctx); out: - btrfs_free_path(path); return ret; } @@ -2942,7 +2960,9 @@ static void free_waiting_dir_move(struct send_ctx *sctx, static int add_pending_dir_move(struct send_ctx *sctx, u64 ino, u64 ino_gen, - u64 parent_ino) + u64 parent_ino, + struct list_head *new_refs, + struct list_head *deleted_refs) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2974,12 +2994,12 @@ static int add_pending_dir_move(struct send_ctx *sctx, } } - list_for_each_entry(cur, &sctx->deleted_refs, list) { + list_for_each_entry(cur, deleted_refs, list) { ret = dup_ref(cur, &pm->update_refs); if (ret < 0) goto out; } - list_for_each_entry(cur, &sctx->new_refs, list) { + list_for_each_entry(cur, new_refs, list) { ret = dup_ref(cur, &pm->update_refs); if (ret < 0) goto out; @@ -3022,6 +3042,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, return NULL; } +static int path_loop(struct send_ctx *sctx, struct fs_path *name, + u64 ino, u64 gen, u64 *ancestor_ino) +{ + int ret = 0; + u64 parent_inode = 0; + u64 parent_gen = 0; + u64 start_ino = ino; + + *ancestor_ino = 0; + while (ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino)) + break; + if (is_waiting_for_move(sctx, ino)) { + if (*ancestor_ino == 0) + *ancestor_ino = ino; + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret > 0) { + ret = 0; + break; + } + } + if (ret < 0) + break; + if (parent_inode == start_ino) { + ret = 1; + if (*ancestor_ino == 0) + *ancestor_ino = ino; + break; + } + ino = parent_inode; + gen = parent_gen; + } + return ret; +} + static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; @@ -3033,6 +3095,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; int ret; + u64 ancestor = 0; name = fs_path_alloc(); from_path = fs_path_alloc(); @@ -3051,34 +3114,33 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) if (ret < 0) goto out; - if (parent_ino == sctx->cur_ino) { - /* child only renamed, not moved */ - ASSERT(parent_gen == sctx->cur_inode_gen); - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, - from_path); - if (ret < 0) - goto out; - ret = fs_path_add_path(from_path, name); - if (ret < 0) - goto out; - } else { - /* child moved and maybe renamed too */ - sctx->send_progress = pm->ino; - ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + if (ret < 0) + goto out; + + sctx->send_progress = sctx->cur_ino + 1; + ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); + if (ret) { + LIST_HEAD(deleted_refs); + ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); + ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, + &pm->update_refs, &deleted_refs); if (ret < 0) goto out; - } - - fs_path_free(name); - name = NULL; - - to_path = fs_path_alloc(); - if (!to_path) { - ret = -ENOMEM; + if (rmdir_ino) { + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + dm->rmdir_ino = rmdir_ino; + } goto out; } - - sctx->send_progress = sctx->cur_ino + 1; + fs_path_reset(name); + to_path = name; + name = NULL; ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); if (ret < 0) goto out; @@ -3202,127 +3264,74 @@ out: static int wait_for_parent_move(struct send_ctx *sctx, struct recorded_ref *parent_ref) { - int ret; + int ret = 0; u64 ino = parent_ref->dir; u64 parent_ino_before, parent_ino_after; - u64 old_gen; struct fs_path *path_before = NULL; struct fs_path *path_after = NULL; int len1, len2; - int register_upper_dirs; - u64 gen; - - if (is_waiting_for_move(sctx, ino)) - return 1; - - if (parent_ref->dir <= sctx->cur_ino) - return 0; - - ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, - NULL, NULL, NULL, NULL); - if (ret == -ENOENT) - return 0; - else if (ret < 0) - return ret; - - if (parent_ref->dir_gen != old_gen) - return 0; - - path_before = fs_path_alloc(); - if (!path_before) - return -ENOMEM; - - ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, - NULL, path_before); - if (ret == -ENOENT) { - ret = 0; - goto out; - } else if (ret < 0) { - goto out; - } path_after = fs_path_alloc(); - if (!path_after) { + path_before = fs_path_alloc(); + if (!path_after || !path_before) { ret = -ENOMEM; goto out; } - ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - &gen, path_after); - if (ret == -ENOENT) { - ret = 0; - goto out; - } else if (ret < 0) { - goto out; - } - - len1 = fs_path_len(path_before); - len2 = fs_path_len(path_after); - if (parent_ino_before != parent_ino_after || len1 != len2 || - memcmp(path_before->start, path_after->start, len1)) { - ret = 1; - goto out; - } - ret = 0; - /* - * Ok, our new most direct ancestor has a higher inode number but - * wasn't moved/renamed. So maybe some of the new ancestors higher in - * the hierarchy have an higher inode number too *and* were renamed - * or moved - in this case we need to wait for the ancestor's rename - * or move operation before we can do the move/rename for the current - * inode. + * Our current directory inode may not yet be renamed/moved because some + * ancestor (immediate or not) has to be renamed/moved first. So find if + * such ancestor exists and make sure our own rename/move happens after + * that ancestor is processed. */ - register_upper_dirs = 0; - ino = parent_ino_after; -again: - while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) { - u64 parent_gen; + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + if (is_waiting_for_move(sctx, ino)) { + ret = 1; + break; + } fs_path_reset(path_before); fs_path_reset(path_after); ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - &parent_gen, path_after); + NULL, path_after); if (ret < 0) goto out; ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, NULL, path_before); - if (ret == -ENOENT) { - ret = 0; - break; - } else if (ret < 0) { + if (ret < 0 && ret != -ENOENT) { goto out; + } else if (ret == -ENOENT) { + ret = 1; + break; } len1 = fs_path_len(path_before); len2 = fs_path_len(path_after); - if (parent_ino_before != parent_ino_after || len1 != len2 || - memcmp(path_before->start, path_after->start, len1)) { + if (ino > sctx->cur_ino && + (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { ret = 1; - if (register_upper_dirs) { - break; - } else { - register_upper_dirs = 1; - ino = parent_ref->dir; - gen = parent_ref->dir_gen; - goto again; - } - } else if (register_upper_dirs) { - ret = add_pending_dir_move(sctx, ino, gen, - parent_ino_after); - if (ret < 0 && ret != -EEXIST) - goto out; + break; } - ino = parent_ino_after; - gen = parent_gen; } out: fs_path_free(path_before); fs_path_free(path_after); + if (ret == 1) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + ino, + &sctx->new_refs, + &sctx->deleted_refs); + if (!ret) + ret = 1; + } + return ret; } @@ -3483,10 +3492,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (ret) { - ret = add_pending_dir_move(sctx, - sctx->cur_ino, - sctx->cur_inode_gen, - cur->dir); *pending_move = 1; } else { ret = send_rename(sctx, valid_path, @@ -5487,7 +5492,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) */ if (root->send_in_progress < 0) btrfs_err(root->fs_info, - "send_in_progres unbalanced %d root %llu\n", + "send_in_progres unbalanced %d root %llu", root->send_in_progress, root->root_key.objectid); spin_unlock(&root->root_item_lock); } @@ -5515,7 +5520,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) /* * The subvolume must remain read-only during send, protect against - * making it RW. + * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); send_root->send_in_progress++; @@ -5575,6 +5580,15 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } sctx->send_root = send_root; + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started + */ + if (btrfs_root_dead(sctx->send_root)) { + ret = -EPERM; + goto out; + } + sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; @@ -5664,7 +5678,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) spin_lock(&sctx->parent_root->root_item_lock); sctx->parent_root->send_in_progress++; - if (!btrfs_root_readonly(sctx->parent_root)) { + if (!btrfs_root_readonly(sctx->parent_root) || + btrfs_root_dead(sctx->parent_root)) { spin_unlock(&sctx->parent_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9601d25a460..8e16bca69c5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -511,7 +511,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } else if (compress) { if (!btrfs_test_opt(root, COMPRESS)) btrfs_info(root->fs_info, - "btrfs: use %s compression\n", + "btrfs: use %s compression", compress_type); } break; @@ -522,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_ssd_spread: btrfs_set_and_info(root, SSD_SPREAD, "use spread ssd allocation scheme"); + btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_clear_and_info(root, NOSSD, + btrfs_set_and_info(root, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; @@ -580,8 +581,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } break; case Opt_acl: +#ifdef CONFIG_BTRFS_FS_POSIX_ACL root->fs_info->sb->s_flags |= MS_POSIXACL; break; +#else + btrfs_err(root->fs_info, + "support for ACL not compiled in!"); + ret = -EINVAL; + goto out; +#endif case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; @@ -1413,6 +1421,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * this also happens on 'umount -rf' or on shutdown, when * the filesystem is busy. */ + cancel_work_sync(&fs_info->async_reclaim_work); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); @@ -1459,7 +1468,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; /* recover relocation */ + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret) goto restore; @@ -1800,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) list_for_each_entry(dev, head, dev_list) { if (dev->missing) continue; + if (!dev->name) + continue; if (!first_dev || dev->devid < first_dev->devid) first_dev = dev; } @@ -1894,6 +1907,9 @@ static int btrfs_run_sanity_tests(void) if (ret) goto out; ret = btrfs_test_inodes(); + if (ret) + goto out; + ret = btrfs_test_qgroups(); out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c5eb2143dc6..78699364f53 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -254,6 +254,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj, BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) +#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); @@ -266,7 +267,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, { struct btrfs_space_info *sinfo = to_space_info(kobj->parent); struct btrfs_block_group_cache *block_group; - int index = kobj - sinfo->block_group_kobjs; + int index = to_raid_kobj(kobj)->raid_type; u64 val = 0; down_read(&sinfo->groups_sem); @@ -288,7 +289,7 @@ static struct attribute *raid_attributes[] = { static void release_raid_kobj(struct kobject *kobj) { - kobject_put(kobj->parent); + kfree(to_raid_kobj(kobj)); } struct kobj_type btrfs_raid_ktype = { @@ -374,11 +375,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_root *root = fs_info->fs_root; int ret; - if (len >= BTRFS_LABEL_SIZE) { - pr_err("BTRFS: unable to set label with more than %d bytes\n", - BTRFS_LABEL_SIZE - 1); + if (len >= BTRFS_LABEL_SIZE) return -EINVAL; - } trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) @@ -396,8 +394,48 @@ static ssize_t btrfs_label_store(struct kobject *kobj, } BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); +static ssize_t btrfs_no_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + return -EPERM; +} + +static ssize_t btrfs_nodesize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); +} + +BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); + +static ssize_t btrfs_sectorsize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); + +static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); + static struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), + BTRFS_ATTR_PTR(nodesize), + BTRFS_ATTR_PTR(sectorsize), + BTRFS_ATTR_PTR(clone_alignment), NULL, }; @@ -567,14 +605,37 @@ static void init_feature_attrs(void) } } -static int add_device_membership(struct btrfs_fs_info *fs_info) +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!fs_info->device_dir_kobj) + return -EINVAL; + + if (one_device) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_info->device_dir_kobj, + disk_kobj->name); + } + + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) { int error = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *dev; - fs_info->device_dir_kobj = kobject_create_and_add("devices", + if (!fs_info->device_dir_kobj) + fs_info->device_dir_kobj = kobject_create_and_add("devices", &fs_info->super_kobj); + if (!fs_info->device_dir_kobj) return -ENOMEM; @@ -585,6 +646,9 @@ static int add_device_membership(struct btrfs_fs_info *fs_info) if (!dev->bdev) continue; + if (one_device && one_device != dev) + continue; + disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; @@ -628,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) if (error) goto failure; - error = add_device_membership(fs_info); + error = btrfs_kobj_add_device(fs_info, NULL); if (error) goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 9ab576318a8..ac46df37504 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -66,4 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 757ef00a75a..9626252ee6b 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -21,6 +21,9 @@ #include <linux/magic.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../qgroup.h" static struct vfsmount *test_mnt = NULL; @@ -72,3 +75,97 @@ void btrfs_destroy_test_fs(void) kern_unmount(test_mnt); unregister_filesystem(&test_type); } + +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) +{ + struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), + GFP_NOFS); + + if (!fs_info) + return fs_info; + fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), + GFP_NOFS); + if (!fs_info->fs_devices) { + kfree(fs_info); + return NULL; + } + fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), + GFP_NOFS); + if (!fs_info->super_copy) { + kfree(fs_info->fs_devices); + kfree(fs_info); + return NULL; + } + + if (init_srcu_struct(&fs_info->subvol_srcu)) { + kfree(fs_info->fs_devices); + kfree(fs_info->super_copy); + kfree(fs_info); + return NULL; + } + + spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->qgroup_lock); + spin_lock_init(&fs_info->qgroup_op_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + mutex_init(&fs_info->qgroup_rescan_lock); + rwlock_init(&fs_info->tree_mod_log_lock); + fs_info->running_transaction = NULL; + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_ulist = NULL; + atomic64_set(&fs_info->tree_mod_seq, 0); + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + return fs_info; +} + +static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +{ + struct radix_tree_iter iter; + void **slot; + + spin_lock(&fs_info->buffer_lock); +restart: + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + goto restart; + continue; + } + spin_unlock(&fs_info->buffer_lock); + free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); + } + spin_unlock(&fs_info->buffer_lock); + + btrfs_free_qgroup_config(fs_info); + btrfs_free_fs_roots(fs_info); + cleanup_srcu_struct(&fs_info->subvol_srcu); + kfree(fs_info->super_copy); + kfree(fs_info->fs_devices); + kfree(fs_info); +} + +void btrfs_free_dummy_root(struct btrfs_root *root) +{ + if (!root) + return; + if (root->node) + free_extent_buffer(root->node); + if (root->fs_info) + btrfs_free_dummy_fs_info(root->fs_info); + kfree(root); +} + diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 312560a9123..fd395422448 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -23,13 +23,18 @@ #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) +struct btrfs_root; + int btrfs_test_free_space_cache(void); int btrfs_test_extent_buffer_operations(void); int btrfs_test_extent_io(void); int btrfs_test_inodes(void); +int btrfs_test_qgroups(void); int btrfs_init_test_fs(void); void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_root(struct btrfs_root *root); #else static inline int btrfs_test_free_space_cache(void) { @@ -54,6 +59,10 @@ static inline int btrfs_test_inodes(void) { return 0; } +static inline int btrfs_test_qgroups(void) +{ + return 0; +} #endif #endif diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 397d1f99a8e..3ae0f5b8bb8 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -23,33 +23,6 @@ #include "../extent_io.h" #include "../volumes.h" -static struct btrfs_fs_info *alloc_dummy_fs_info(void) -{ - struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), - GFP_NOFS); - if (!fs_info) - return fs_info; - fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), - GFP_NOFS); - if (!fs_info->fs_devices) { - kfree(fs_info); - return NULL; - } - return fs_info; -} -static void free_dummy_root(struct btrfs_root *root) -{ - if (!root) - return; - if (root->fs_info) { - kfree(root->fs_info->fs_devices); - kfree(root->fs_info); - } - if (root->node) - free_extent_buffer(root->node); - kfree(root); -} - static void insert_extent(struct btrfs_root *root, u64 start, u64 len, u64 ram_bytes, u64 offset, u64 disk_bytenr, u64 disk_len, u32 type, u8 compression, int slot) @@ -276,7 +249,7 @@ static noinline int test_btrfs_get_extent(void) * We do this since btrfs_get_extent wants to assign em->bdev to * root->fs_info->fs_devices->latest_bdev. */ - root->fs_info = alloc_dummy_fs_info(); + root->fs_info = btrfs_alloc_dummy_fs_info(); if (!root->fs_info) { test_msg("Couldn't allocate dummy fs info\n"); goto out; @@ -837,7 +810,7 @@ out: if (!IS_ERR(em)) free_extent_map(em); iput(inode); - free_dummy_root(root); + btrfs_free_dummy_root(root); return ret; } @@ -864,7 +837,7 @@ static int test_hole_first(void) goto out; } - root->fs_info = alloc_dummy_fs_info(); + root->fs_info = btrfs_alloc_dummy_fs_info(); if (!root->fs_info) { test_msg("Couldn't allocate dummy fs info\n"); goto out; @@ -934,7 +907,7 @@ out: if (!IS_ERR(em)) free_extent_map(em); iput(inode); - free_dummy_root(root); + btrfs_free_dummy_root(root); return ret; } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c new file mode 100644 index 00000000000..ec3dcb20235 --- /dev/null +++ b/fs/btrfs/tests/qgroup-tests.c @@ -0,0 +1,470 @@ +/* + * Copyright (C) 2013 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../transaction.h" +#include "../disk-io.h" +#include "../qgroup.h" + +static void init_dummy_trans(struct btrfs_trans_handle *trans) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + INIT_LIST_HEAD(&trans->qgroup_ref_list); + trans->type = __TRANS_DUMMY; +} + +static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_tree_block_info *block_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); + int ret; + + init_dummy_trans(&trans); + + ins.objectid = bytenr; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); + if (ret) { + test_msg("Couldn't insert ref %d\n", ret); + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, 1); + btrfs_set_extent_generation(leaf, item, 1); + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(item + 1); + btrfs_set_tree_block_level(leaf, block_info, 1); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_free_path(path); + return 0; +} + +static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs + 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); + if (ret) + test_msg("Failed to insert backref\n"); + btrfs_free_path(path); + return ret; +} + +static int remove_extent_item(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_trans_handle trans; + struct btrfs_key key; + struct btrfs_path *path; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + path->leave_spinning = 1; + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Didn't find our key %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return 0; +} + +static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs - 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Couldn't find backref %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return ret; +} + +static int test_no_shared_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup basic add\n"); + ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_item(root, 4096, 4096); + if (ret) + return -EINVAL; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_EXCL, 0); + if (ret) { + test_msg("Couldn't remove space from the qgroup %d\n", ret); + return -EINVAL; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +/* + * Add a ref for two different roots to make sure the shared value comes out + * right, also remove one of the roots and make sure the exclusive count is + * adjusted properly. + */ +static int test_multiple_refs(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup multiple refs test\n"); + + /* We have 5 created already from the previous test */ + ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = add_tree_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +int btrfs_test_qgroups(void) +{ + struct btrfs_root *root; + struct btrfs_root *tmp_root; + int ret = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + return PTR_ERR(root); + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; + goto out; + } + + /* + * Can't use bytenr 0, some things freak out + * *cough*backref walking code*cough* + */ + root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 8192; + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 5; + root->fs_info->fs_root = tmp_root; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 256; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + /* We are using this root as our extent root */ + root->fs_info->extent_root = root; + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to addt he root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + root->fs_info->quota_enabled = 1; + + test_msg("Running qgroup tests\n"); + ret = test_no_shared_qgroup(root); + if (ret) + goto out; + ret = test_multiple_refs(root); +out: + btrfs_free_dummy_root(root); + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7579f6d0b85..5f379affdf2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -31,6 +31,7 @@ #include "inode-map.h" #include "volumes.h" #include "dev-replace.h" +#include "qgroup.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -241,18 +242,19 @@ loop: static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (root->ref_cows && root->last_trans < trans->transid) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); /* - * see below for in_trans_setup usage rules + * see below for IN_TRANS_SETUP usage rules * we have the reloc mutex held now, so there * is only one writer in this function */ - root->in_trans_setup = 1; + set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); - /* make sure readers find in_trans_setup before + /* make sure readers find IN_TRANS_SETUP before * they find our root->last_trans update */ smp_wmb(); @@ -279,7 +281,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * But, we have to set root->last_trans before we * init the relocation root, otherwise, we trip over warnings * in ctree.c. The solution used here is to flag ourselves - * with root->in_trans_setup. When this is 1, we're still + * with root IN_TRANS_SETUP. When this is 1, we're still * fixing up the reloc trees and everyone must wait. * * When this is zero, they can trust root->last_trans and fly @@ -288,8 +290,8 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * done before we pop in the zero below */ btrfs_init_reloc_root(trans, root); - smp_wmb(); - root->in_trans_setup = 0; + smp_mb__before_atomic(); + clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); } return 0; } @@ -298,16 +300,16 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; /* - * see record_root_in_trans for comments about in_trans_setup usage + * see record_root_in_trans for comments about IN_TRANS_SETUP usage * and barriers */ smp_rmb(); if (root->last_trans == trans->transid && - !root->in_trans_setup) + !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) return 0; mutex_lock(&root->fs_info->reloc_mutex); @@ -365,7 +367,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type) static inline bool need_reserve_reloc_root(struct btrfs_root *root) { if (!root->fs_info->reloc_ctl || - !root->ref_cows || + !test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root) return false; @@ -384,11 +386,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, bool reloc_reserved = false; int ret; + /* Send isn't supposed to start transactions. */ + ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); - if (current->journal_info && - current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) { + if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; @@ -489,6 +493,7 @@ again: smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && may_wait_transaction(root, type)) { + current->journal_info = h; btrfs_commit_transaction(h, root); goto again; } @@ -695,6 +700,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; + int must_run_delayed_refs = 0; if (trans->use_count > 1) { trans->use_count--; @@ -702,14 +708,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } - /* - * do the qgroup accounting as early as possible - */ - err = btrfs_delayed_refs_qgroup_accounting(trans, info); - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + trans->delayed_ref_updates = 0; + if (!trans->sync) { + must_run_delayed_refs = + btrfs_should_throttle_delayed_refs(trans, root); + cur = max_t(unsigned long, cur, 32); + + /* + * don't make the caller wait if they are from a NOLOCK + * or ATTACH transaction, it will deadlock with commit + */ + if (must_run_delayed_refs == 1 && + (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) + must_run_delayed_refs = 2; + } + if (trans->qgroup_reserved) { /* * the same root has to be passed here between start_transaction @@ -719,16 +738,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->qgroup_reserved = 0; } - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, root); - - trans->delayed_ref_updates = 0; - if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) { - cur = max_t(unsigned long, cur, 32); - trans->delayed_ref_updates = 0; - btrfs_run_delayed_refs(trans, root, cur); - } - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -778,6 +787,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (must_run_delayed_refs) { + btrfs_async_run_delayed_refs(root, cur, + must_run_delayed_refs == 1); + } return err; } @@ -1049,8 +1062,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_save_ino_cache(root, trans); /* see comments in should_cow_block() */ - root->force_cow = 0; - smp_wmb(); + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); if (root->commit_root != root->node) { list_add_tail(&root->dirty_list, @@ -1081,7 +1094,7 @@ int btrfs_defrag_root(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - if (xchg(&root->defrag_running, 1)) + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) return 0; while (1) { @@ -1104,7 +1117,7 @@ int btrfs_defrag_root(struct btrfs_root *root) break; } } - root->defrag_running = 0; + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); return ret; } @@ -1168,12 +1181,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto no_free_objectid; } - pending->error = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (pending->error) - goto no_free_objectid; - key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; @@ -1270,8 +1277,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } + /* + * We need to flush delayed refs in order to make sure all of our quota + * operations have been done before we call btrfs_qgroup_inherit. + */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + /* see comments in should_cow_block() */ - root->force_cow = 1; + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); btrfs_set_root_node(new_root_item, tmp); @@ -1593,17 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, int ret; ret = btrfs_run_delayed_items(trans, root); - /* - * running the delayed items may have added new refs. account - * them now so that they hinder processing of more delayed refs - * as little as possible. - */ - if (ret) { - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - return ret; - } - - ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); if (ret) return ret; @@ -1984,19 +1998,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); - /* - * Make sure root is not involved in send, - * if we fail with first root, we return - * directly rather than continue. - */ - spin_lock(&root->root_item_lock); - if (root->send_in_progress) { - spin_unlock(&fs_info->trans_lock); - spin_unlock(&root->root_item_lock); - return 0; - } - spin_unlock(&root->root_item_lock); - list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b57b924e8e0..7dd558ed071 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -69,6 +69,7 @@ struct btrfs_transaction { #define __TRANS_ATTACH (1U << 10) #define __TRANS_JOIN (1U << 11) #define __TRANS_JOIN_NOLOCK (1U << 12) +#define __TRANS_DUMMY (1U << 13) #define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 76928ca9774..a63719cc957 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -49,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } - if (root->ref_cows == 0) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; if (btrfs_test_opt(root, SSD)) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e2f45fc0261..9e1f2cd5e67 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -20,13 +20,11 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/list_sort.h> -#include "ctree.h" -#include "transaction.h" +#include "tree-log.h" #include "disk-io.h" #include "locking.h" #include "print-tree.h" #include "backref.h" -#include "tree-log.h" #include "hash.h" /* magic values for the inode_only field in btrfs_log_inode: @@ -144,17 +142,15 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; goto out; } - if (!root->log_start_pid) { root->log_start_pid = current->pid; - root->log_multiple_pids = false; + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } else if (root->log_start_pid != current->pid) { - root->log_multiple_pids = true; + set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } atomic_inc(&root->log_batch); @@ -181,7 +177,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ret) goto out; } - root->log_multiple_pids = false; + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); @@ -2500,7 +2496,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { + if (!btrfs_test_opt(root, SSD) && + test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); @@ -2511,8 +2508,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2533,8 +2529,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); mutex_unlock(&root->log_mutex); goto out; } @@ -2577,8 +2572,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); + if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); @@ -2622,8 +2617,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); @@ -2637,8 +2631,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2667,8 +2660,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; } @@ -2886,7 +2878,7 @@ fail: out_unlock: mutex_unlock(&BTRFS_I(dir)->log_mutex); if (ret == -ENOSPC) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0) btrfs_abort_transaction(trans, root, ret); @@ -2919,7 +2911,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); if (ret == -ENOSPC) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) btrfs_abort_transaction(trans, root, ret); @@ -4130,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * make sure any commits to the log are forced * to be full commits */ - root->fs_info->last_trans_log_full_commit = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 1; break; } @@ -4177,6 +4168,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } + /* + * The prev transaction commit doesn't complete, we need do + * full commit by ourselves. + */ if (root->fs_info->last_trans_log_full_commit > root->fs_info->last_trans_committed) { ret = 1; @@ -4246,7 +4241,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 1; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 91b145fce33..7f5b41bd537 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,9 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +#include "ctree.h" +#include "transaction.h" + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 @@ -35,6 +38,19 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) INIT_LIST_HEAD(&ctx->list); } +static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid; +} + +static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + return ACCESS_ONCE(fs_info->last_trans_log_full_commit) == + trans->transid; +} + int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 49d7fab7336..6cb82f62cb7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,7 @@ #include "rcu-string.h" #include "math.h" #include "dev-replace.h" +#include "sysfs.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -554,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * This is ok to do without rcu read locked because we hold the * uuid mutex so nothing we touch in here is going to disappear. */ - name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); - if (!name) { - kfree(device); - goto error; + if (orig_dev->name) { + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { + kfree(device); + goto error; + } + rcu_assign_pointer(device->name, name); } - rcu_assign_pointer(device->name, name); list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; @@ -1452,6 +1455,22 @@ out: return ret; } +/* + * Function to update ctime/mtime for a given device path. + * Mainly used for ctime/mtime based probe like libblkid. + */ +static void update_dev_time(char *path_name) +{ + struct file *filp; + + filp = filp_open(path_name, O_RDWR, 0); + if (!filp) + return; + file_update_time(filp); + filp_close(filp, NULL); + return; +} + static int btrfs_rm_dev_item(struct btrfs_root *root, struct btrfs_device *device) { @@ -1661,8 +1680,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev == root->fs_info->fs_devices->latest_bdev) root->fs_info->fs_devices->latest_bdev = next_device->bdev; - if (device->bdev) + if (device->bdev) { device->fs_devices->open_devices--; + /* remove sysfs entry */ + btrfs_kobj_rm_device(root->fs_info, device); + } call_rcu(&device->rcu, free_device); @@ -1674,11 +1696,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) struct btrfs_fs_devices *fs_devices; fs_devices = root->fs_info->fs_devices; while (fs_devices) { - if (fs_devices->seed == cur_devices) + if (fs_devices->seed == cur_devices) { + fs_devices->seed = cur_devices->seed; break; + } fs_devices = fs_devices->seed; } - fs_devices->seed = cur_devices->seed; cur_devices->seed = NULL; lock_chunks(root); __btrfs_close_devices(cur_devices); @@ -1694,20 +1717,55 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * remove it from the devices list and zero out the old super */ if (clear_super && disk_super) { + u64 bytenr; + int i; + /* make sure this device isn't detected as part of * the FS anymore */ memset(&disk_super->magic, 0, sizeof(disk_super->magic)); set_buffer_dirty(bh); sync_dirty_buffer(bh); + + /* clear the mirror copies of super block on the disk + * being removed, 0th copy is been taken care above and + * the below would take of the rest + */ + for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + i_size_read(bdev->bd_inode)) + break; + + brelse(bh); + bh = __bread(bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + if (btrfs_super_bytenr(disk_super) != bytenr || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + continue; + } + memset(&disk_super->magic, 0, + sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } } ret = 0; - /* Notify udev that device has changed */ - if (bdev) + if (bdev) { + /* Notify udev that device has changed */ btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); + } + error_brelse: brelse(bh); if (bdev) @@ -1883,7 +1941,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) fs_devices->seeding = 0; fs_devices->num_devices = 0; fs_devices->open_devices = 0; - fs_devices->total_devices = 0; fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); @@ -2092,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); + + /* add sysfs device entry */ + btrfs_kobj_add_device(root->fs_info, device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; ret = init_first_rw_device(trans, root, device); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2105,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_abort_transaction(trans, root, ret); goto error_trans; } + + /* Sprouting would change fsid of the mounted root, + * so rename the fsid on the sysfs + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", + root->fs_info->fsid); + if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) + goto error_trans; } else { ret = btrfs_add_device(trans, root, device); if (ret) { @@ -2146,12 +2216,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_commit_transaction(trans, root); } + /* Update ctime/mtime for libblkid */ + update_dev_time(device_path); return ret; error_trans: unlock_chunks(root); btrfs_end_transaction(trans, root); rcu_string_free(device->name); + btrfs_kobj_rm_device(root->fs_info, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2490,9 +2563,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - kfree(map); - em->bdev = NULL; - /* once for the tree */ free_extent_map(em); /* once for us */ @@ -2922,6 +2992,16 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* + * limited by count, must be the last filter + */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { + if (bargs->limit == 0) + return 0; + else + bargs->limit--; + } + return 1; } @@ -2944,6 +3024,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int ret; int enospc_errors = 0; bool counting = true; + u64 limit_data = bctl->data.limit; + u64 limit_meta = bctl->meta.limit; + u64 limit_sys = bctl->sys.limit; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -2982,6 +3065,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) memset(&bctl->stat, 0, sizeof(bctl->stat)); spin_unlock(&fs_info->balance_lock); again: + if (!counting) { + bctl->data.limit = limit_data; + bctl->meta.limit = limit_meta; + bctl->sys.limit = limit_sys; + } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -3881,7 +3969,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root, u8 *ptr; array_size = btrfs_super_sys_array_size(super_copy); - if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + if (array_size + item_size + sizeof(disk_key) + > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) return -EFBIG; ptr = super_copy->sys_chunk_array + array_size; @@ -3986,6 +4075,16 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } +#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ + - sizeof(struct btrfs_item) \ + - sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ + - 2 * sizeof(struct btrfs_disk_key) \ + - 2 * sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 start, u64 type) @@ -4035,6 +4134,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) @@ -4042,11 +4143,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, else max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; } else { - btrfs_err(info, "invalid chunk type 0x%llx requested\n", + btrfs_err(info, "invalid chunk type 0x%llx requested", type); BUG_ON(1); } @@ -4213,9 +4318,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em = alloc_extent_map(); if (!em) { + kfree(map); ret = -ENOMEM; goto error; } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->bdev = (struct block_device *)map; em->start = start; em->len = num_bytes; @@ -4258,7 +4365,6 @@ error_del_extent: /* One for the tree reference */ free_extent_map(em); error: - kfree(map); kfree(devices_info); return ret; } @@ -4294,7 +4400,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, if (em->start != chunk_offset || em->len != chunk_size) { btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" - " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, + " %Lu-%Lu, found %Lu-%Lu", chunk_offset, chunk_size, em->start, em->len); free_extent_map(em); return -EINVAL; @@ -4470,7 +4576,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) write_unlock(&tree->map_tree.lock); if (!em) break; - kfree(em->bdev); /* once for us */ free_extent_map(em); /* once for the tree */ @@ -4496,14 +4601,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * and exit, so return 1 so the callers don't try to use other copies. */ if (!em) { - btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical, + btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, logical+len); return 1; } if (em->start > logical || em->start + em->len < logical) { btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " - "%Lu-%Lu\n", logical, logical+len, em->start, + "%Lu-%Lu", logical, logical+len, em->start, em->start + em->len); free_extent_map(em); return 1; @@ -4684,7 +4789,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (em->start > logical || em->start + em->len < logical) { btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " - "found %Lu-%Lu\n", logical, em->start, + "found %Lu-%Lu", logical, em->start, em->start + em->len); free_extent_map(em); return -EINVAL; @@ -5274,6 +5379,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } +static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) +{ + if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) + bio_endio_nodec(bio, err); + else + bio_endio(bio, err); + kfree(bbio); +} + static void btrfs_end_bio(struct bio *bio, int err) { struct btrfs_bio *bbio = bio->bi_private; @@ -5314,12 +5428,6 @@ static void btrfs_end_bio(struct bio *bio, int err) bio = bbio->orig_bio; } - /* - * We have original bio now. So increment bi_remaining to - * account for it in endio - */ - atomic_inc(&bio->bi_remaining); - bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; @@ -5336,9 +5444,8 @@ static void btrfs_end_bio(struct bio *bio, int err) set_bit(BIO_UPTODATE, &bio->bi_flags); err = 0; } - kfree(bbio); - bio_endio(bio, err); + btrfs_end_bbio(bbio, bio, err); } else if (!is_orig_bio) { bio_put(bio); } @@ -5501,12 +5608,15 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); if (atomic_dec_and_test(&bbio->stripes_pending)) { + /* Shoud be the original bio. */ + WARN_ON(bio != bbio->orig_bio); + bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - kfree(bbio); - bio_endio(bio, -EIO); + + btrfs_end_bbio(bbio, bio, -EIO); } } @@ -5593,6 +5703,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; + bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; } submit_stripe_bio(root, bbio, bio, @@ -5734,6 +5845,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, return -ENOMEM; } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->bdev = (struct block_device *)map; em->start = logical; em->len = length; @@ -5758,7 +5870,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { - kfree(map); free_extent_map(em); return -EIO; } @@ -5766,7 +5877,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, map->stripes[i].dev = add_missing_dev(root, devid, uuid); if (!map->stripes[i].dev) { - kfree(map); free_extent_map(em); return -EIO; } @@ -6058,10 +6168,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) - device->dev_root = fs_info->dev_root; - mutex_unlock(&fs_devices->device_list_mutex); + while (fs_devices) { + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->dev_root = fs_info->dev_root; + mutex_unlock(&fs_devices->device_list_mutex); + + fs_devices = fs_devices->seed; + } } static void __btrfs_reset_dev_stats(struct btrfs_device *dev) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 80754f9dd3d..2aaa00c4781 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -190,11 +190,14 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); +#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 + struct btrfs_bio { atomic_t stripes_pending; struct btrfs_fs_info *fs_info; bio_end_io_t *end_io; struct bio *orig_bio; + unsigned long flags; void *private; atomic_t error; int max_errors; @@ -255,6 +258,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) /* * Profile changing flags. When SOFT is set we won't relocate chunk if diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8e57191950c..b67d8fc8127 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -98,7 +98,7 @@ static int zlib_compress_pages(struct list_head *ws, if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { printk(KERN_WARNING "BTRFS: deflateInit failed\n"); - ret = -1; + ret = -EIO; goto out; } @@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws, out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { - ret = -1; + ret = -ENOMEM; goto out; } cpage_out = kmap(out_page); @@ -128,7 +128,7 @@ static int zlib_compress_pages(struct list_head *ws, printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); zlib_deflateEnd(&workspace->def_strm); - ret = -1; + ret = -EIO; goto out; } @@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws, if (workspace->def_strm.total_in > 8192 && workspace->def_strm.total_in < workspace->def_strm.total_out) { - ret = -1; + ret = -E2BIG; goto out; } /* we need another page for writing out. Test this @@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws, kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; - ret = -1; + ret = -E2BIG; goto out; } out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { - ret = -1; + ret = -ENOMEM; goto out; } cpage_out = kmap(out_page); @@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws, zlib_deflateEnd(&workspace->def_strm); if (ret != Z_STREAM_END) { - ret = -1; + ret = -EIO; goto out; } if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { - ret = -1; + ret = -E2BIG; goto out; } @@ -253,7 +253,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "BTRFS: inflateInit failed\n"); - return -1; + return -EIO; } while (workspace->inf_strm.total_in < srclen) { ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); @@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } } if (ret != Z_STREAM_END) - ret = -1; + ret = -EIO; else ret = 0; done: @@ -337,7 +337,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "BTRFS: inflateInit failed\n"); - return -1; + return -EIO; } while (bytes_left > 0) { @@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, total_out = workspace->inf_strm.total_out; if (total_out == buf_start) { - ret = -1; + ret = -EIO; break; } @@ -382,7 +382,7 @@ next: } if (ret != Z_STREAM_END && bytes_left != 0) - ret = -1; + ret = -EIO; else ret = 0; |