From 80ace85c915d0f41016f82917218997b72431258 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Jul 2014 15:15:32 +0200 Subject: btrfs: add RENAME_NOREPLACE RENAME_NOREPLACE is trivial to implement for most filesystems: switch over to ->rename2() and check for the supported flags. The rest is done by the VFS. Signed-off-by: Miklos Szeredi Cc: Chris Mason Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/btrfs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3668048e16f..3183742d6f0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8476,6 +8476,16 @@ out_notrans: return ret; } +static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; @@ -9019,7 +9029,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .link = btrfs_link, .mkdir = btrfs_mkdir, .rmdir = btrfs_rmdir, - .rename = btrfs_rename, + .rename2 = btrfs_rename2, .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, -- cgit v1.2.3-70-g09d2 From ce62003f690dff38d3164a632ec69efa15c32cbf Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 24 Jul 2014 22:48:05 +0800 Subject: Btrfs: fix compressed write corruption on enospc When failing to allocate space for the whole compressed extent, we'll fallback to uncompressed IO, but we've forgotten to redirty the pages which belong to this compressed extent, and these 'clean' pages will simply skip 'submit' part and go to endio directly, at last we got data corruption as we write nothing. 
Signed-off-by: Liu Bo Tested-By: Martin Steigerwald Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3668048e16f..8ea7610fbaf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -709,6 +709,18 @@ retry: unlock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); + + /* + * we need to redirty the pages if we decide to + * fallback to uncompressed IO, otherwise we + * will not submit these pages down to lower + * layers. + */ + extent_range_redirty_for_io(inode, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1); + goto retry; } goto out_free; -- cgit v1.2.3-70-g09d2 From 8d875f95da43c6a8f18f77869f2ef26e9594fecc Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 12 Aug 2014 10:47:42 -0700 Subject: btrfs: disable strict file flushes for renames and truncates Truncates and renames are often used to replace old versions of a file with new versions. Applications often expect this to be an atomic replacement, even if they haven't done anything to make sure the new version is fully on disk. Btrfs has strict flushing in place to make sure that renaming over an old file with a new file will fully flush out the new file before allowing the transaction commit with the rename to complete. This ordering means the commit code needs to be able to lock file pages, and there are a few paths in the filesystem where we will try to end a transaction with the page lock held. It's rare, but these things can deadlock. This patch removes the ordered flushes and switches to a best effort filemap_flush like ext4 uses. It's not perfect, but it should fix the deadlocks. 
Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 6 --- fs/btrfs/disk-io.c | 32 ------------- fs/btrfs/file.c | 26 +--------- fs/btrfs/inode.c | 47 ++---------------- fs/btrfs/ordered-data.c | 123 ------------------------------------------------ fs/btrfs/ordered-data.h | 5 -- fs/btrfs/transaction.c | 33 +------------ fs/btrfs/transaction.h | 1 - 8 files changed, 6 insertions(+), 267 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4794923c410..43527fd7882 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -84,12 +84,6 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; - /* - * list for tracking inodes that must be sent to disk before a - * rename or truncate commit - */ - struct list_head ordered_operations; - /* node for the red-black tree that links inodes in subvolume root */ struct rb_node rb_node; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 08e65e9cf2a..d0ed9e664f7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, - struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); @@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root) btrfs_cleanup_transaction(root); } -static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, - struct btrfs_root *root) -{ - struct btrfs_inode *btrfs_inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_root_lock); - - list_splice_init(&t->ordered_operations, 
&splice); - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - - list_del_init(&btrfs_inode->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - - btrfs_invalidate_inodes(btrfs_inode->root); - - spin_lock(&root->fs_info->ordered_root_lock); - } - - spin_unlock(&root->fs_info->ordered_root_lock); - mutex_unlock(&root->fs_info->ordered_operations_mutex); -} - static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct btrfs_ordered_extent *ordered; @@ -4093,8 +4063,6 @@ again: void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { - btrfs_destroy_ordered_operations(cur_trans, root); - btrfs_destroy_delayed_refs(cur_trans, root); cur_trans->state = TRANS_STATE_COMMIT_START; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1f2b99cb55e..d3afac292d6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1838,33 +1838,9 @@ out: int btrfs_release_file(struct inode *inode, struct file *filp) { - /* - * ordered_data_close is set by settattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. - */ - if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, - &BTRFS_I(inode)->runtime_flags)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We need to block on a committing transaction to keep us from - * throwing a ordered operation on to the list and causing - * something like sync to deadlock trying to flush out this - * inode. 
- */ - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); - btrfs_end_transaction(trans, root); - if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) - filemap_flush(inode->i_mapping); - } if (filp->private_data) btrfs_ioctl_trans_end(filp); + filemap_flush(inode->i_mapping); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ea7610fbaf..73098328d04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7950,27 +7950,6 @@ static int btrfs_truncate(struct inode *inode) min_size); BUG_ON(ret); - /* - * setattr is responsible for setting the ordered_data_close flag, - * but that is only tested during the last file release. That - * could happen well after the next commit, leaving a great big - * window where new writes may get lost if someone chooses to write - * to this file after truncating to zero - * - * The inode doesn't have any dirty data here, and so if we commit - * this is a noop. If someone immediately starts writing to the inode - * it is very likely we'll catch some of their writes in this - * transaction, and the commit will find this file on the ordered - * data list with good things to send down. - * - * This is a best effort solution, there is still a window where - * using truncate to replace the contents of the file will - * end up with a zero length file after a crash. 
- */ - if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, - &BTRFS_I(inode)->runtime_flags)) - btrfs_add_ordered_operation(trans, root, inode); - /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to @@ -8118,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); - INIT_LIST_HEAD(&ei->ordered_operations); RB_CLEAR_NODE(&ei->rb_node); return inode; @@ -8158,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode) if (!root) goto free; - /* - * Make sure we're properly removed from the ordered operation - * lists. - */ - smp_mb(); - if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_root_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - } - if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)) { btrfs_info(root->fs_info, "inode %llu still on the orphan list", @@ -8350,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = 0; /* - * we're using rename to replace one file with another. - * and the replacement file is large. Start IO on it now so - * we don't add too much work to the end of the transaction + * we're using rename to replace one file with another. 
Start IO on it + * now so we don't add too much work to the end of the transaction */ - if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && - old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); /* close the racy window with snapshot create/destroy ioctl */ @@ -8403,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ btrfs_pin_log_trans(root); } - /* - * make sure the inode gets flushed if it is replacing - * something. - */ - if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) - btrfs_add_ordered_operation(trans, root, old_inode); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7187b14faa6..963895c1f80 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, trace_btrfs_ordered_extent_remove(inode, entry); - /* - * we have no more ordered extents for this inode and - * no dirty pages. We can safely remove it from the - * list of ordered extents - */ - if (RB_EMPTY_ROOT(&tree->tree) && - !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { - spin_lock(&root->fs_info->ordered_root_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - } - if (!root->nr_ordered_extents) { spin_lock(&root->fs_info->ordered_root_lock); BUG_ON(list_empty(&root->ordered_root)); @@ -686,81 +674,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) mutex_unlock(&fs_info->ordered_operations_mutex); } -/* - * this is used during transaction commit to write all the inodes - * added to the ordered operation list. These files must be fully on - * disk before the transaction commits. 
- * - * we have two modes here, one is to just start the IO via filemap_flush - * and the other is to wait for all the io. When we wait, we have an - * extra check to make sure the ordered operation list really is empty - * before we return - */ -int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int wait) -{ - struct btrfs_inode *btrfs_inode; - struct inode *inode; - struct btrfs_transaction *cur_trans = trans->transaction; - struct list_head splice; - struct list_head works; - struct btrfs_delalloc_work *work, *next; - int ret = 0; - - INIT_LIST_HEAD(&splice); - INIT_LIST_HEAD(&works); - - mutex_lock(&root->fs_info->ordered_extent_flush_mutex); - spin_lock(&root->fs_info->ordered_root_lock); - list_splice_init(&cur_trans->ordered_operations, &splice); - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - inode = &btrfs_inode->vfs_inode; - - list_del_init(&btrfs_inode->ordered_operations); - - /* - * the inode may be getting freed (in sys_unlink path). 
- */ - inode = igrab(inode); - if (!inode) - continue; - - if (!wait) - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - - work = btrfs_alloc_delalloc_work(inode, wait, 1); - if (!work) { - spin_lock(&root->fs_info->ordered_root_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) - list_add_tail(&btrfs_inode->ordered_operations, - &splice); - list_splice_tail(&splice, - &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - ret = -ENOMEM; - goto out; - } - list_add_tail(&work->list, &works); - btrfs_queue_work(root->fs_info->flush_workers, - &work->work); - - cond_resched(); - spin_lock(&root->fs_info->ordered_root_lock); - } - spin_unlock(&root->fs_info->ordered_root_lock); -out: - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); - } - mutex_unlock(&root->fs_info->ordered_extent_flush_mutex); - return ret; -} - /* * Used to start IO or wait for a given ordered extent to finish. * @@ -1120,42 +1033,6 @@ out: return index; } - -/* - * add a given inode to the list of inodes that must be fully on - * disk before a transaction commit finishes. - * - * This basically gives us the ext3 style data=ordered mode, and it is mostly - * used to make sure renamed files are fully on disk. - * - * It is a noop if the inode is already fully on disk. - * - * If trans is not null, we'll do a friendly check for a transaction that - * is already flushing things and force the IO down ourselves. 
- */ -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) -{ - struct btrfs_transaction *cur_trans = trans->transaction; - u64 last_mod; - - last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); - - /* - * if this file hasn't been changed since the last transaction - * commit, we can safely return without doing anything - */ - if (last_mod <= root->fs_info->last_trans_committed) - return; - - spin_lock(&root->fs_info->ordered_root_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &cur_trans->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_root_lock); -} - int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 246897058ef..d81a274d621 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int wait); -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); void btrfs_get_logged_extents(struct inode *inode, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5f379affdf2..d89c6d3542c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -218,7 +218,6 @@ loop: spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); - INIT_LIST_HEAD(&cur_trans->ordered_operations); INIT_LIST_HEAD(&cur_trans->pending_chunks); 
INIT_LIST_HEAD(&cur_trans->switch_commits); list_add_tail(&cur_trans->list, &fs_info->trans_list); @@ -1612,27 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); } -static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - - ret = btrfs_run_delayed_items(trans, root); - if (ret) - return ret; - - /* - * rename don't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and no for sure that nothing new will be added - * to the list - */ - ret = btrfs_run_ordered_operations(trans, root, 1); - - return ret; -} - static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) @@ -1653,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *prev_trans = NULL; int ret; - ret = btrfs_run_ordered_operations(trans, root, 0); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); - return ret; - } - /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; @@ -1740,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) goto cleanup_transaction; - ret = btrfs_flush_all_pending_stuffs(trans, root); + ret = btrfs_run_delayed_items(trans, root); if (ret) goto cleanup_transaction; @@ -1748,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. 
*/ - ret = btrfs_flush_all_pending_stuffs(trans, root); + ret = btrfs_run_delayed_items(trans, root); if (ret) goto cleanup_transaction; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 7dd558ed071..579be51b27e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -55,7 +55,6 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; - struct list_head ordered_operations; struct list_head pending_chunks; struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; -- cgit v1.2.3-70-g09d2 From 7a5c3c9be1059feed0e470c6dc0994dcaed4f12c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 17 Jun 2014 18:58:59 +0800 Subject: Btrfs: fix put dio bio twice when we submit dio bio fail The caller of btrfs_submit_direct_hook() will put the original dio bio when btrfs_submit_direct_hook() returns an error number, so we needn't put the original bio in btrfs_submit_direct_hook(). Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 73098328d04..33c05188cbf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7306,10 +7306,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); - if (ret) { - bio_put(orig_bio); + if (ret) return -EIO; - } if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; @@ -7326,6 +7324,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); if (!bio) return -ENOMEM; + bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; atomic_inc(&dip->pending_bios); -- cgit v1.2.3-70-g09d2 From 9a025a0860ccc0f02af153c966bc1f83e5d9fc62 Mon Sep 17
00:00:00 2001 From: Wang Shilong Date: Thu, 17 Jul 2014 11:44:13 +0800 Subject: Btrfs: fix wrong write range for filemap_fdatawrite_range() filemap_fdatawrite_range() expects the third arg to be @end not @len, fix it. Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 33c05188cbf..73fadc7ead0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7533,7 +7533,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, count = iov_iter_count(iter); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, count); + filemap_fdatawrite_range(inode->i_mapping, offset, + offset + count - 1); if (rw & WRITE) { /* -- cgit v1.2.3-70-g09d2 From e2eca69dc6c09d968d69312b9899968a9b03a4a9 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 17 Jul 2014 11:44:14 +0800 Subject: Btrfs: fix wrong extent mapping for DirectIO btrfs_next_leaf() will use current leaf's last key to search and then return a bigger one. So it may still return a file extent item that is smaller than the expected value and we will get an overflow here for @em->len. This is easy to reproduce for Btrfs Direct writing; it did not cause any problem, because writing will re-insert the right mapping later. However, by hacking code to make DIO support compression, the wrong extent mapping is kept and it quickly encounters a merging failure (EEXIST). Fix this problem by looping to find next file extent item that is bigger than @start or we could not find anything more.
Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 73fadc7ead0..a3c6e76f5a4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6275,6 +6275,8 @@ next: goto not_found; if (start + len <= found_key.offset) goto not_found; + if (start > found_key.offset) + goto next; em->start = start; em->orig_start = start; em->len = found_key.offset - start; -- cgit v1.2.3-70-g09d2 From 9c3b306e1c9e6be4be09e99a8fe2227d1005effc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 31 Jul 2014 14:41:07 +0100 Subject: Btrfs: race free update of commit root for ro snapshots This is a better solution for the problem addressed in the following commit: Btrfs: update commit root on snapshot creation after orphan cleanup (3821f348889e506efbd268cc8149e0ebfa47c4e5) The previous solution wasn't the best because of 2 reasons: 1) It added another full transaction commit, which is more expensive than just swapping the commit root with the root; 2) If a reboot happened after the first transaction commit (the one that creates the snapshot) and before the second transaction commit, then we would end up with the same problem if a send using that snapshot was requested before the first transaction commit after the reboot. This change addresses those 2 issues. The second issue is addressed by switching the commit root in the dentry lookup VFS callback, which is also called by the snapshot/subvol creation ioctl and performs orphan cleanup if needed. Like the vfs, the ioctl locks the parent inode too, preventing race issues between a dentry lookup and snapshot creation. 
Cc: Alex Lyakas Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 36 ++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.c | 33 --------------------------------- 2 files changed, 36 insertions(+), 33 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a3c6e76f5a4..6dd6e50d143 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5181,6 +5181,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) iput(inode); inode = ERR_PTR(ret); } + /* + * If orphan cleanup did remove any orphans, it means the tree + * was modified and therefore the commit root is not the same as + * the current root anymore. This is a problem, because send + * uses the commit root and therefore can see inode items that + * don't exist in the current root anymore, and for example make + * calls to btrfs_iget, which will do tree lookups based on the + * current root and not on the commit root. Those lookups will + * fail, returning a -ESTALE error, and making send fail with + * that error. So make sure a send does not see any orphans we + * have just removed, and that it will see the same inodes + * regardless of whether a transaction commit happened before + * it started (meaning that the commit root will be the same as + * the current root) or not. + */ + if (sub_root->node != sub_root->commit_root) { + u64 sub_flags = btrfs_root_flags(&sub_root->root_item); + + if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) { + struct extent_buffer *eb; + + /* + * Assert we can't have races between dentry + * lookup called through the snapshot creation + * ioctl and the VFS. 
+ */ + ASSERT(mutex_is_locked(&dir->i_mutex)); + + down_write(&root->fs_info->commit_root_sem); + eb = sub_root->commit_root; + sub_root->commit_root = + btrfs_root_node(sub_root); + up_write(&root->fs_info->commit_root_sem); + free_extent_buffer(eb); + } + } } return inode; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 47aceb494d1..845287ca59c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -711,39 +711,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - ret = btrfs_orphan_cleanup(pending_snapshot->snap); - if (ret) - goto fail; - - /* - * If orphan cleanup did remove any orphans, it means the tree was - * modified and therefore the commit root is not the same as the - * current root anymore. This is a problem, because send uses the - * commit root and therefore can see inode items that don't exist - * in the current root anymore, and for example make calls to - * btrfs_iget, which will do tree lookups based on the current root - * and not on the commit root. Those lookups will fail, returning a - * -ESTALE error, and making send fail with that error. So make sure - * a send does not see any orphans we have just removed, and that it - * will see the same inodes regardless of whether a transaction - * commit happened before it started (meaning that the commit root - * will be the same as the current root) or not. 
- */ - if (readonly && pending_snapshot->snap->node != - pending_snapshot->snap->commit_root) { - trans = btrfs_join_transaction(pending_snapshot->snap); - if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto fail; - } - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, - pending_snapshot->snap); - if (ret) - goto fail; - } - } - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); -- cgit v1.2.3-70-g09d2 From 5762b5c958abbecb7fb9f4596a6476d1ce91ecf6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 1 Aug 2014 00:10:32 +0100 Subject: Btrfs: ensure tmpfile inode is always persisted with link count of 0 If we open a file with O_TMPFILE, don't do any further operation on it (so that the inode item isn't updated) and then force a transaction commit, we get a persisted inode item with a link count of 1, and not 0 as it should be. Steps to reproduce it (requires a modern xfs_io with -T support): $ mkfs.btrfs -f /dev/sdd $ mount -o /dev/sdd /mnt $ xfs_io -T /mnt & $ sync Then btrfs-debug-tree shows the inode item with a link count of 1: $ btrfs-debug-tree /dev/sdd (...) fs tree key (FS_TREE ROOT_ITEM 0) leaf 29556736 items 4 free space 15851 generation 6 owner 5 fs uuid f164d01b-1b92-481d-a4e4-435fb0f843d0 chunk uuid 0e3d0e56-bcca-4a1c-aa5f-cec2c6f4f7a6 item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160 inode generation 3 transid 6 size 0 block group 0 mode 40755 links 1 item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12 inode ref index 0 namelen 2 name: .. item 2 key (257 INODE_ITEM 0) itemoff 15951 itemsize 160 inode generation 6 transid 6 size 0 block group 0 mode 100600 links 1 item 3 key (ORPHAN ORPHAN_ITEM 257) itemoff 15951 itemsize 0 orphan item checksum tree key (CSUM_TREE ROOT_ITEM 0) (...) 
Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6dd6e50d143..57c3129ee2a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5641,6 +5641,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOMEM); } + /* + * O_TMPFILE, set link count to 0, so that after this point, + * we fill in an inode item with the correct link count. + */ + if (!name) + set_nlink(inode, 0); + /* * we have to initialize this early, so we can reclaim the inode * number if we fail afterwards in this function. @@ -9007,6 +9014,14 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (ret) goto out; + /* + * We set number of links to 0 in btrfs_new_inode(), and here we set + * it to 1 because d_tmpfile() will issue a warning if the count is 0, + * through: + * + * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() + */ + set_nlink(inode, 1); d_tmpfile(dentry, inode); mark_inode_dirty(inode); -- cgit v1.2.3-70-g09d2 From 7064dd5c36187725e7ccfd837e07678ae435d3f5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 8 Aug 2014 02:47:05 +0100 Subject: Btrfs: don't monopolize a core when evicting inode If an inode has a very large number of extent maps, we can spend a lot of time freeing them, which triggers a soft lockup warning. Therefore reschedule if we need to when freeing the extent maps while evicting the inode. I could trigger this all the time by running xfstests/generic/299 on a file system with the no-holes feature enabled. That test creates an inode with 11386677 extent maps. $ mkfs.btrfs -f -O no-holes $TEST_DEV $ MKFS_OPTIONS="-O no-holes" ./check generic/299 generic/299 382s ... Message from syslogd@debian-vm3 at Aug 7 10:44:29 ... kernel:[85304.208017] BUG: soft lockup - CPU#0 stuck for 22s! 
[umount:25330] 384s Ran: generic/299 Passed all 1 tests $ dmesg (...) [86304.300017] BUG: soft lockup - CPU#0 stuck for 23s! [umount:25330] (...) [86304.300036] Call Trace: [86304.300036] [] __slab_free+0x54/0x295 [86304.300036] [] ? free_extent_map+0x5c/0xb0 [btrfs] [86304.300036] [] kmem_cache_free+0x282/0x2a0 [86304.300036] [] free_extent_map+0x5c/0xb0 [btrfs] [86304.300036] [] btrfs_evict_inode+0xd5/0x660 [btrfs] [86304.300036] [] ? __inode_wait_for_writeback+0x6d/0xc0 [86304.300036] [] ? _raw_spin_unlock+0x2b/0x40 [86304.300036] [] evict+0xab/0x180 [86304.300036] [] dispose_list+0x3e/0x60 [86304.300036] [] evict_inodes+0xf4/0x110 [86304.300036] [] generic_shutdown_super+0x53/0x110 [86304.300036] [] kill_anon_super+0x16/0x30 [86304.300036] [] btrfs_kill_super+0x1a/0xa0 [btrfs] [86304.300036] [] deactivate_locked_super+0x59/0x80 [86304.300036] [] deactivate_super+0x4e/0x70 [86304.300036] [] mntput_no_expire+0x174/0x1f0 [86304.300036] [] ? mntput_no_expire+0x17/0x1f0 [86304.300036] [] SyS_umount+0x97/0x100 (...) 
Signed-off-by: Filipe Manana Reviewed-by: Satoru Takeuchi Tested-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 57c3129ee2a..2ac260d41cc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4674,6 +4674,11 @@ static void evict_inode_truncate_pages(struct inode *inode) clear_bit(EXTENT_FLAG_LOGGING, &em->flags); remove_extent_mapping(map_tree, em); free_extent_map(em); + if (need_resched()) { + write_unlock(&map_tree->lock); + cond_resched(); + write_lock(&map_tree->lock); + } } write_unlock(&map_tree->lock); @@ -4696,6 +4701,7 @@ static void evict_inode_truncate_pages(struct inode *inode) &cached_state, GFP_NOFS); free_extent_state(state); + cond_resched(); spin_lock(&io_tree->lock); } spin_unlock(&io_tree->lock); -- cgit v1.2.3-70-g09d2 From 51f395ad4058883e4273b02fdebe98072dbdc0d2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 8 Aug 2014 13:06:20 +0800 Subject: btrfs: Use right extent length when inserting overlap extent map. When current btrfs finds that a new extent map is going to be inserted but the insertion fails with -EEXIST, it will try again to insert the extent map but with the length of sectorsize. This is OK if we don't enable 'no-holes' feature since all extent space is continuous, we will not go into the not found->insert routine. But if we enable 'no-holes' feature, it will make things out of control. e.g. in 4K sectorsize, we pass the following args to btrfs_get_extent(): btrfs_get_extent() args: start: 27874 len 4100 28672 27874 28672 27874+4100 32768 |-----------------------| |---------hole--------------------|---------data----------| 1) not found and insert Since no extent map contains the range, btrfs_get_extent() will go into the not_found and insert routine, which will try to insert the extent map (27874, 27874 + 4100).
2) first overlap But it overlaps with (28672, 32768) extent, so -EEXIST will be returned by add_extent_mapping(). 3) retry but still overlap After catching the -EEXIST, then btrfs_get_extent() will try to insert it again but with 4K length, which still overlaps, so -EEXIST will be returned. This makes the following patch fail to punch hole. d77815461f047e561f77a07754ae923ade597d4e btrfs: Avoid trucating page or punching hole in a already existed hole. This patch will use the right length, which is the (existing->start - em->start) to insert, making the above patch work in 'no-holes' mode. Also, some small code style problems in above patch are fixed too. Reported-by: Filipe David Manana Signed-off-by: Qu Wenruo Reviewed-by: Filipe David Manana Tested-by: Filipe David Manana Signed-off-by: Chris Mason --- fs/btrfs/file.c | 4 ++-- fs/btrfs/inode.c | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 77e33534c7d..f15c13f9701 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2215,7 +2215,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); + lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); lockend = round_down(offset + len, BTRFS_I(inode)->root->sectorsize) - 1; same_page = ((offset >> PAGE_CACHE_SHIFT) == @@ -2276,7 +2276,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) tail_start + tail_len, 0, 1); if (ret) goto out_only_mutex; - } + } } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2ac260d41cc..ae98df67950 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6146,14 +6146,14 @@ out_fail: static int merge_extent_mapping(struct extent_map_tree *em_tree, struct extent_map *existing, struct extent_map *em, - u64 map_start, u64 map_len) + u64 map_start) { u64 start_diff; BUG_ON(map_start < em->start 
|| map_start >= extent_map_end(em)); start_diff = map_start - em->start; em->start = map_start; - em->len = map_len; + em->len = existing->start - em->start; if (em->block_start < EXTENT_MAP_LAST_BYTE && !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { em->block_start += start_diff; @@ -6441,8 +6441,7 @@ insert: em->len); if (existing) { err = merge_extent_mapping(em_tree, existing, - em, start, - root->sectorsize); + em, start); free_extent_map(existing); if (err) { free_extent_map(em); -- cgit v1.2.3-70-g09d2 From 9e0af23764344f7f1b68e4eefbe7dc865018b63d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 15 Aug 2014 23:36:53 +0800 Subject: Btrfs: fix task hang under heavy compressed write This has been reported and discussed for a long time, and this hang occurs in both 3.15 and 3.16. Btrfs now migrates to use kernel workqueue, but it introduces this hang problem. Btrfs has a kind of work queued as an ordered way, which means that its ordered_func() must be processed in the way of FIFO, so it usually looks like -- normal_work_helper(arg) work = container_of(arg, struct btrfs_work, normal_work); work->func() <---- (we name it work X) for ordered_work in wq->ordered_list ordered_work->ordered_func() ordered_work->ordered_free() The hang is a rare case, first when we find free space, we get an uncached block group, then we go to read its free space cache inode for free space information, so it will file a readahead request btrfs_readpages() for page that is not in page cache __do_readpage() submit_extent_page() btrfs_submit_bio_hook() btrfs_bio_wq_end_io() submit_bio() end_workqueue_bio() <--(ret by the 1st endio) queue a work(named work Y) for the 2nd also the real endio() So the hang occurs when work Y's work_struct and work X's work_struct happens to share the same address. 
A bit more explanation, A,B,C -- struct btrfs_work arg -- struct work_struct kthread: worker_thread() pick up a work_struct from @worklist process_one_work(arg) worker->current_work = arg; <-- arg is A->normal_work worker->current_func(arg) normal_work_helper(arg) A = container_of(arg, struct btrfs_work, normal_work); A->func() A->ordered_func() A->ordered_free() <-- A gets freed B->ordered_func() submit_compressed_extents() find_free_extent() load_free_space_inode() ... <-- (the above readahead stack) end_workqueue_bio() btrfs_queue_work(work C) B->ordered_free() As if work A has a high priority in wq->ordered_list and there are more ordered works queued after it, such as B->ordered_func(), its memory could have been freed before normal_work_helper() returns, which means that kernel workqueue code worker_thread() still has worker->current_work pointer to be work A->normal_work's, ie. arg's address. Meanwhile, work C is allocated after work A is freed, work C->normal_work and work A->normal_work are likely to share the same address(I confirmed this with ftrace output, so I'm not just guessing, it's rare though). When another kthread picks up work C->normal_work to process, and finds our kthread is processing it(see find_worker_executing_work()), it'll think work C as a collision and skip then, which ends up nobody processing work C. So the situation is that our kthread is waiting forever on work C. Besides, there're other cases that can lead to deadlock, but the real problem is that all btrfs workqueue shares one work->func, -- normal_work_helper, so this makes each workqueue to have its own helper function, but only a wrapper of normal_work_helper. With this patch, I no longer hit the above hang. 
Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 44 ++++++++++++++++++++++++++++++++-------- fs/btrfs/async-thread.h | 28 ++++++++++++++++++++++++- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/disk-io.c | 53 ++++++++++++++++++++++++++---------------------- fs/btrfs/extent-tree.c | 7 ++++--- fs/btrfs/inode.c | 35 +++++++++++++++++++++----------- fs/btrfs/ordered-data.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/raid56.c | 9 +++++--- fs/btrfs/reada.c | 3 ++- fs/btrfs/scrub.c | 14 +++++++------ fs/btrfs/volumes.c | 3 ++- 12 files changed, 141 insertions(+), 61 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5a201d81049..fbd76ded9a3 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "async-thread.h" #include "ctree.h" @@ -55,8 +54,39 @@ struct btrfs_workqueue { struct __btrfs_workqueue *high; }; -static inline struct __btrfs_workqueue -*__btrfs_alloc_workqueue(const char *name, int flags, int max_active, +static void normal_work_helper(struct btrfs_work *work); + +#define BTRFS_WORK_HELPER(name) \ +void btrfs_##name(struct work_struct *arg) \ +{ \ + struct btrfs_work *work = container_of(arg, struct btrfs_work, \ + normal_work); \ + normal_work_helper(work); \ +} + +BTRFS_WORK_HELPER(worker_helper); +BTRFS_WORK_HELPER(delalloc_helper); +BTRFS_WORK_HELPER(flush_delalloc_helper); +BTRFS_WORK_HELPER(cache_helper); +BTRFS_WORK_HELPER(submit_helper); +BTRFS_WORK_HELPER(fixup_helper); +BTRFS_WORK_HELPER(endio_helper); +BTRFS_WORK_HELPER(endio_meta_helper); +BTRFS_WORK_HELPER(endio_meta_write_helper); +BTRFS_WORK_HELPER(endio_raid56_helper); +BTRFS_WORK_HELPER(rmw_helper); +BTRFS_WORK_HELPER(endio_write_helper); +BTRFS_WORK_HELPER(freespace_write_helper); +BTRFS_WORK_HELPER(delayed_meta_helper); +BTRFS_WORK_HELPER(readahead_helper); +BTRFS_WORK_HELPER(qgroup_rescan_helper); 
+BTRFS_WORK_HELPER(extent_refs_helper); +BTRFS_WORK_HELPER(scrub_helper); +BTRFS_WORK_HELPER(scrubwrc_helper); +BTRFS_WORK_HELPER(scrubnc_helper); + +static struct __btrfs_workqueue * +__btrfs_alloc_workqueue(const char *name, int flags, int max_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -232,13 +262,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) spin_unlock_irqrestore(lock, flags); } -static void normal_work_helper(struct work_struct *arg) +static void normal_work_helper(struct btrfs_work *work) { - struct btrfs_work *work; struct __btrfs_workqueue *wq; int need_order = 0; - work = container_of(arg, struct btrfs_work, normal_work); /* * We should not touch things inside work in the following cases: * 1) after work->func() if it has no ordered_free @@ -262,7 +290,7 @@ static void normal_work_helper(struct work_struct *arg) trace_btrfs_all_work_done(work); } -void btrfs_init_work(struct btrfs_work *work, +void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free) @@ -270,7 +298,7 @@ void btrfs_init_work(struct btrfs_work *work, work->func = func; work->ordered_func = ordered_func; work->ordered_free = ordered_free; - INIT_WORK(&work->normal_work, normal_work_helper); + INIT_WORK(&work->normal_work, uniq_func); INIT_LIST_HEAD(&work->ordered_list); work->flags = 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 9c6b66d15fb..e9e31c94758 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -19,12 +19,14 @@ #ifndef __BTRFS_ASYNC_THREAD_ #define __BTRFS_ASYNC_THREAD_ +#include struct btrfs_workqueue; /* Internal use only */ struct __btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); +typedef void (*btrfs_work_func_t)(struct work_struct *arg); struct btrfs_work { btrfs_func_t func; @@ -38,11 +40,35 @@ struct btrfs_work { unsigned long flags; 
}; +#define BTRFS_WORK_HELPER_PROTO(name) \ +void btrfs_##name(struct work_struct *arg) + +BTRFS_WORK_HELPER_PROTO(worker_helper); +BTRFS_WORK_HELPER_PROTO(delalloc_helper); +BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper); +BTRFS_WORK_HELPER_PROTO(cache_helper); +BTRFS_WORK_HELPER_PROTO(submit_helper); +BTRFS_WORK_HELPER_PROTO(fixup_helper); +BTRFS_WORK_HELPER_PROTO(endio_helper); +BTRFS_WORK_HELPER_PROTO(endio_meta_helper); +BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); +BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); +BTRFS_WORK_HELPER_PROTO(rmw_helper); +BTRFS_WORK_HELPER_PROTO(endio_write_helper); +BTRFS_WORK_HELPER_PROTO(freespace_write_helper); +BTRFS_WORK_HELPER_PROTO(delayed_meta_helper); +BTRFS_WORK_HELPER_PROTO(readahead_helper); +BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper); +BTRFS_WORK_HELPER_PROTO(extent_refs_helper); +BTRFS_WORK_HELPER_PROTO(scrub_helper); +BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); +BTRFS_WORK_HELPER_PROTO(scrubnc_helper); + struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, int flags, int max_active, int thresh); -void btrfs_init_work(struct btrfs_work *work, +void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index da775bfdebc..a2e90f855d7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, - NULL, NULL); + btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper, + btrfs_async_run_delayed_root, NULL, NULL); async_work->nr = nr; btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c99a414813c..a1d36e62179 100644 --- a/fs/btrfs/disk-io.c +++ 
b/fs/btrfs/disk-io.c @@ -39,7 +39,6 @@ #include "btrfs_inode.h" #include "volumes.h" #include "print-tree.h" -#include "async-thread.h" #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" @@ -693,35 +692,41 @@ static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; fs_info = end_io_wq->info; end_io_wq->error = err; - btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - btrfs_queue_work(fs_info->endio_meta_write_workers, - &end_io_wq->work); - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - btrfs_queue_work(fs_info->endio_freespace_worker, - &end_io_wq->work); - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_work(fs_info->endio_raid56_workers, - &end_io_wq->work); - else - btrfs_queue_work(fs_info->endio_write_workers, - &end_io_wq->work); + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { + wq = fs_info->endio_meta_write_workers; + func = btrfs_endio_meta_write_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) { + wq = fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + wq = fs_info->endio_raid56_workers; + func = btrfs_endio_raid56_helper; + } else { + wq = fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_work(fs_info->endio_raid56_workers, - &end_io_wq->work); - else if (end_io_wq->metadata) - btrfs_queue_work(fs_info->endio_meta_workers, - &end_io_wq->work); - else - btrfs_queue_work(fs_info->endio_workers, - &end_io_wq->work); + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + wq = fs_info->endio_raid56_workers; + func = btrfs_endio_raid56_helper; + } else if 
(end_io_wq->metadata) { + wq = fs_info->endio_meta_workers; + func = btrfs_endio_meta_helper; + } else { + wq = fs_info->endio_workers; + func = btrfs_endio_helper; + } } + + btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL); + btrfs_queue_work(wq, &end_io_wq->work); } /* @@ -828,7 +833,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_start = submit_bio_start; async->submit_bio_done = submit_bio_done; - btrfs_init_work(&async->work, run_one_async_start, + btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start, run_one_async_done, run_one_async_free); async->bio_flags = bio_flags; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5524434da05..3efe1c3877b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; atomic_set(&caching_ctl->count, 1); - btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); + btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, + caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root, async->sync = 0; init_completion(&async->wait); - btrfs_init_work(&async->work, delayed_ref_async_start, - NULL, NULL); + btrfs_init_work(&async->work, btrfs_extent_refs_helper, + delayed_ref_async_start, NULL, NULL); btrfs_queue_work(root->fs_info->extent_workers, &async->work); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ae98df67950..3d020d6d9ac 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1096,8 +1096,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); - btrfs_init_work(&async_cow->work, async_cow_start, - async_cow_submit, async_cow_free); + 
btrfs_init_work(&async_cow->work, + btrfs_delalloc_helper, + async_cow_start, async_cow_submit, + async_cow_free); nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; @@ -1881,7 +1883,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) SetPageChecked(page); page_cache_get(page); - btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); + btrfs_init_work(&fixup->work, btrfs_fixup_helper, + btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); return -EBUSY; @@ -2822,7 +2825,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue *workers; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -2831,13 +2835,17 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, end - start + 1, uptodate)) return 0; - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); + if (btrfs_is_free_space_inode(inode)) { + wq = root->fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = root->fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } - if (btrfs_is_free_space_inode(inode)) - workers = root->fs_info->endio_freespace_worker; - else - workers = root->fs_info->endio_write_workers; - btrfs_queue_work(workers, &ordered_extent->work); + btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, + NULL); + btrfs_queue_work(wq, &ordered_extent->work); return 0; } @@ -7208,7 +7216,8 @@ again: if (!ret) goto out_test; - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_init_work(&ordered->work, btrfs_endio_write_helper, + finish_ordered_fn, NULL, NULL); 
btrfs_queue_work(root->fs_info->endio_write_workers, &ordered->work); out_test: @@ -8535,7 +8544,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, work->inode = inode; work->wait = wait; work->delay_iput = delay_iput; - btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + WARN_ON_ONCE(!inode); + btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, + btrfs_run_delalloc_work, NULL, NULL); return work; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 963895c1f80..ac734ec4cc2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -615,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, + btrfs_flush_delalloc_helper, btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); btrfs_queue_work(root->fs_info->flush_workers, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8abe45524de..ded5c601d91 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2720,6 +2720,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_work, 0, sizeof(fs_info->qgroup_rescan_work)); btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_helper, btrfs_qgroup_rescan_worker, NULL, NULL); if (ret) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4a88f073fdd..0a6b6e4bcbb 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1416,7 +1416,8 @@ cleanup: static void async_rmw_stripe(struct btrfs_raid_bio *rbio) { - btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + rmw_work, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); @@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio) static void async_read_rebuild(struct btrfs_raid_bio *rbio) { - btrfs_init_work(&rbio->work, read_rebuild_work, NULL, 
NULL); + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + read_rebuild_work, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); @@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, unplug_work, NULL, NULL); + btrfs_init_work(&plug->work, btrfs_rmw_helper, + unplug_work, NULL, NULL); btrfs_queue_work(plug->info->rmw_workers, &plug->work); return; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 09230cf3a24..20408c6b665 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) /* FIXME we cannot handle this properly right now */ BUG(); } - btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); + btrfs_init_work(&rmw->work, btrfs_readahead_helper, + reada_start_machine_worker, NULL, NULL); rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 23d3f6e6a48..f4a41f37be2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -428,8 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, - NULL, NULL); + btrfs_init_work(&sbio->work, btrfs_scrub_helper, + scrub_bio_end_io_worker, NULL, NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -999,8 +999,8 @@ nodatasum_case: fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; scrub_pending_trans_workers_inc(sctx); - btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, - NULL, NULL); + btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper, + scrub_fixup_nodatasum, NULL, NULL); btrfs_queue_work(fs_info->scrub_workers, &fixup_nodatasum->work); goto out; @@ 
-1616,7 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, + scrub_wr_bio_end_io_worker, NULL, NULL); btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } @@ -3214,7 +3215,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, nocow_ctx->len = len; nocow_ctx->mirror_num = mirror_num; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; - btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); + btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper, + copy_nocow_pages_worker, NULL, NULL); INIT_LIST_HEAD(&nocow_ctx->inodes); btrfs_queue_work(fs_info->scrub_nocow_workers, &nocow_ctx->work); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9d4ce53d756..340a92d08e8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5854,7 +5854,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); - btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); + btrfs_init_work(&dev->work, btrfs_submit_helper, + pending_bios_fn, NULL, NULL); return dev; } -- cgit v1.2.3-70-g09d2 From d9f85963e3f7f5582552fdae54a2b89d6c62daf5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 25 Aug 2014 10:43:00 +0100 Subject: Btrfs: fix corruption after write/fsync failure + fsync + log recovery While writing to a file, in inode.c:cow_file_range() (and same applies to submit_compressed_extents()), after reserving an extent for the file data, we create a new extent map for the written range and insert it into the extent map cache. After that, we create an ordered operation, but if it fails (due to a transient/temporary-ENOMEM), we return without dropping that extent map, which points to a reserved extent that is freed when we return. 
A subsequent incremental fsync (when the btrfs inode doesn't have the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and logs a file extent item based on that extent map, which points to a disk extent that doesn't contain valid data - it was freed by us earlier, at this point it might contain any random/garbage data. Therefore, if we reach an error condition when cowing a file range after we added the new extent map to the cache, drop it from the cache before returning. Some sequence of steps that lead to this: $ mkfs.btrfs -f /dev/sdd $ mount -o commit=9999 /dev/sdd /mnt $ cd /mnt $ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo $ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096" $ sync $ od -t x1 foo 0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 * 0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 * 0020000 $ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo # Now this write + fsync fail with -ENOMEM, which was returned by # btrfs_add_ordered_extent() in inode.c:cow_file_range(). $ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo $ xfs_io -c "fsync" foo fsync: Cannot allocate memory # Now do a new write + fsync, which will succeed. Our previous # -ENOMEM was a transient/temporary error. $ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo $ xfs_io -c "fsync" foo # Our file content (in page cache) is now: $ od -t x1 foo 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 * 0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0050000 # Now reboot the machine, and mount the fs, so that fsync log replay # takes place. # The file content is now weird, in particular the first 8Kb, which # do not match our data before nor after the sync command above. 
$ od -t x1 foo 0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 * 0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0050000 # In fact these first 4Kb are a duplicate of the last 4kb block. # The last write got an extent map/file extent item that points to # the same disk extent that we got in the write+fsync that failed # with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to # verify that: $ btrfs-debug-tree /dev/sdd (...) item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53 extent data disk byte 12582912 nr 8192 extent data offset 0 nr 8192 ram 8192 item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53 extent data disk byte 0 nr 0 extent data offset 0 nr 8192 ram 8192 item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53 extent data disk byte 12582912 nr 4096 extent data offset 0 nr 4096 ram 4096 $ umount /dev/sdd $ btrfsck /dev/sdd Checking filesystem on /dev/sdd UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f checking extents extent item 12582912 has multiple extent items ref mismatch on [12582912 4096] extent item 1, found 2 Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192 backpointer mismatch on [12582912 4096] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots root 5 inode 257 errors 1000, some csum missing found 131074 bytes used err is 1 total csum bytes: 4 total tree bytes: 131072 total fs tree bytes: 32768 total extent tree bytes: 16384 btree space waste bytes: 123404 file data blocks allocated: 274432 referenced 274432 Btrfs v3.14.1-96-gcc7fd5a-dirty Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d020d6d9ac..7313571e186 100644 
--- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -778,8 +778,12 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - if (ret) + if (ret) { + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); goto out_free_reserve; + } /* * clear dirty, set writeback and unlock the pages. @@ -971,14 +975,14 @@ static noinline int cow_file_range(struct inode *inode, ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); if (ret) - goto out_reserve; + goto out_drop_extent_cache; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); if (ret) - goto out_reserve; + goto out_drop_extent_cache; } if (disk_num_bytes < cur_alloc_size) @@ -1006,6 +1010,8 @@ static noinline int cow_file_range(struct inode *inode, out: return ret; +out_drop_extent_cache: + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: -- cgit v1.2.3-70-g09d2 From dac5705cad20070a70bb028ca52e1f0bc157b42d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Aug 2014 20:54:26 +0100 Subject: Btrfs: fix crash while doing a ranged fsync While doing a ranged fsync, that is, one whose range doesn't cover the whole possible file range (0 to LLONG_MAX), we can crash under certain circumstances with a trace like the following: [41074.641913] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC (...) [41074.642692] CPU: 0 PID: 24580 Comm: fsx Not tainted 3.16.0-fdm-btrfs-next-45+ #1 (...) [41074.643886] RIP: 0010:[] [] btrfs_ordered_update_i_size+0x279/0x2b0 [btrfs] (...) [41074.644919] Stack: (...) [41074.644919] Call Trace: [41074.644919] [] btrfs_truncate_inode_items+0x3f1/0xa10 [btrfs] [41074.644919] [] ? btrfs_get_logged_extents+0x4f/0x80 [btrfs] [41074.644919] [] btrfs_log_inode+0x2f9/0x970 [btrfs] [41074.644919] [] ? 
sched_clock_local+0x25/0xa0 [41074.644919] [] ? mutex_unlock+0xe/0x10 [41074.644919] [] ? trace_hardirqs_on+0xd/0x10 [41074.644919] [] btrfs_log_inode_parent+0x1ef/0x560 [btrfs] [41074.644919] [] ? dget_parent+0x5/0x180 [41074.644919] [] btrfs_log_dentry_safe+0x51/0x80 [btrfs] [41074.644919] [] btrfs_sync_file+0x1ba/0x3e0 [btrfs] [41074.644919] [] vfs_fsync_range+0x1b/0x30 (...) The necessary conditions that lead to such crash are: * an incremental fsync (when the inode doesn't have the BTRFS_INODE_NEEDS_FULL_SYNC flag set) happened for our file and it logged a file extent item ending at offset X; * the file got the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode, due to a file truncate operation that reduces the file to a size smaller than X; * a ranged fsync call happens (via an msync for example), with a range that doesn't cover the whole file and the end of this range, let's call it Y, is smaller than X; * btrfs_log_inode, sees the flag BTRFS_INODE_NEEDS_FULL_SYNC set and calls btrfs_truncate_inode_items() to remove all items from the log tree that are associated with our file; * btrfs_truncate_inode_items() removes all of the inode's items, and the lowest file extent item it removed is the one ending at offset X, where X > 0 and X > Y - before returning, it calls btrfs_ordered_update_i_size() with an offset parameter set to X; * btrfs_ordered_update_i_size() sees that X is greater than the current ordered size (btrfs_inode's disk_i_size) and then it assumes there can't be any ongoing ordered operation with a range covering the offset X, calling a BUG_ON() if such ordered operation exists. 
This assumption is made because the disk_i_size is only increased after the corresponding file extent item is added to the btree (btrfs_finish_ordered_io); * But because our fsync covers only a limited range, such an ordered extent might exist, and our fsync callback (btrfs_sync_file) doesn't wait for such ordered extent to finish when calling btrfs_wait_ordered_range(); And then by the time btrfs_ordered_update_i_size() is called, via: btrfs_sync_file() -> btrfs_log_dentry_safe() -> btrfs_log_inode_parent() -> btrfs_log_inode() -> btrfs_truncate_inode_items() -> btrfs_ordered_update_i_size() We hit the BUG_ON(), which could never happen if the fsync range covered the whole possible file range (0 to LLONG_MAX), as we would wait for all ordered extents to finish before calling btrfs_truncate_inode_items(). So just don't call btrfs_ordered_update_i_size() if we're removing the inode's items from a log tree, which isn't supposed to change the in memory inode's disk_i_size. Issue found while running xfstests/generic/127 (happens very rarely for me), more specifically via the fsx calls that use memory mapped IO (and issue msync calls). 
Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7313571e186..88823f4ca45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4248,7 +4248,8 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1) + if (last_size != (u64)-1 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); return err; -- cgit v1.2.3-70-g09d2 From b0d5d10f41a0f1cd839408dd94427f2db3553bca Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 8 Sep 2014 13:08:51 -0700 Subject: Btrfs: use insert_inode_locked4 for inode creation Btrfs was inserting inodes into the hash table before we had fully set the inode up on disk. This leaves us open to rare races that allow two different inodes in memory for the same [root, inode] pair. This patch fixes things by using insert_inode_locked4 to insert an I_NEW inode and unlock_new_inode when we're ready for the rest of the kernel to use the inode. It also makes sure to init the operations pointers on the inode before going into the error handling paths. 
Signed-off-by: Chris Mason Reported-by: Al Viro --- fs/btrfs/inode.c | 176 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 67 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 88823f4ca45..214b936bdd3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5634,6 +5634,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index) return ret; } +static int btrfs_insert_inode_locked(struct inode *inode) +{ + struct btrfs_iget_args args; + args.location = &BTRFS_I(inode)->location; + args.root = BTRFS_I(inode)->root; + + return insert_inode_locked4(inode, + btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), + btrfs_find_actor, &args); +} + static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, @@ -5726,10 +5737,19 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[1] = name_len + sizeof(*ref); } + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) + goto fail; + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) - goto fail; + goto fail_unlock; inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); @@ -5752,11 +5772,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { @@ -5767,7 +5782,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - btrfs_insert_inode_hash(inode); inode_tree_add(inode); trace_btrfs_inode_new(inode); @@ 
-5782,6 +5796,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_ino(inode), root->root_key.objectid, ret); return inode; + +fail_unlock: + unlock_new_inode(inode); fail: if (dir && name) BTRFS_I(dir)->index_cnt--; @@ -5916,28 +5933,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see * if the filesystem supports xattrs by looking at the * ops vector. */ - inode->i_op = &btrfs_special_inode_operations; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + init_special_inode(inode, inode->i_mode, rdev); + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - drop_inode = 1; - else { - init_special_inode(inode, inode->i_mode, rdev); + goto out_unlock_inode; + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) { + goto out_unlock_inode; + } else { btrfs_update_inode(trans, root, inode); + unlock_new_inode(inode); d_instantiate(dentry, inode); } + out_unlock: btrfs_end_transaction(trans, root); btrfs_balance_delayed_items(root); @@ -5947,6 +5964,12 @@ out_unlock: iput(inode); } return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; + } static int btrfs_create(struct inode *dir, struct dentry *dentry, @@ -5981,15 +6004,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } drop_inode_on_err = 1; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_update_inode(trans, root, inode); - if (err) - goto out_unlock; - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. 
Smack checks to see @@ -5998,14 +6012,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; + + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_unlock_inode; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - goto out_unlock; + goto out_unlock_inode; - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + unlock_new_inode(inode); d_instantiate(dentry, inode); out_unlock: @@ -6017,6 +6040,11 @@ out_unlock: btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; + +out_unlock_inode: + unlock_new_inode(inode); + goto out_unlock; + } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, @@ -6124,25 +6152,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } drop_on_err = 1; + /* these must be set before we unlock the inode */ + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - goto out_fail; - - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; + goto out_fail_inode; btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, root, inode); if (err) - goto out_fail; + goto out_fail_inode; err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, 0, index); if (err) - goto out_fail; + goto out_fail_inode; d_instantiate(dentry, inode); + /* + * mkdir is special. We're unlocking after we call d_instantiate + * to avoid a race with nfsd calling d_instantiate. 
+ */ + unlock_new_inode(inode); drop_on_err = 0; out_fail: @@ -6152,6 +6185,10 @@ out_fail: btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; + +out_fail_inode: + unlock_new_inode(inode); + goto out_fail; } /* helper for btfs_get_extent. Given an existing extent in the tree, @@ -8107,6 +8144,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + unlock_new_inode(inode); err = btrfs_subvol_inherit_props(trans, new_root, parent_root); if (err) @@ -8757,12 +8795,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -8771,23 +8803,22 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - } - if (drop_inode) - goto out_unlock; + goto out_unlock_inode; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - drop_inode = 1; - goto out_unlock; + goto out_unlock_inode; } key.objectid = btrfs_ino(inode); key.offset = 0; @@ -8796,9 +8827,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { - drop_inode = 1; 
btrfs_free_path(path); - goto out_unlock; + goto out_unlock_inode; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -8822,12 +8852,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); - if (err) + if (err) { drop_inode = 1; + goto out_unlock_inode; + } + + unlock_new_inode(inode); + d_instantiate(dentry, inode); out_unlock: - if (!err) - d_instantiate(dentry, inode); btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); @@ -8835,6 +8868,11 @@ out_unlock: } btrfs_btree_balance_dirty(root); return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -9018,14 +9056,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; } - ret = btrfs_init_inode_security(trans, inode, dir, NULL); - if (ret) - goto out; - - ret = btrfs_update_inode(trans, root, inode); - if (ret) - goto out; - inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; @@ -9033,9 +9063,16 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + ret = btrfs_init_inode_security(trans, inode, dir, NULL); + if (ret) + goto out_inode; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out_inode; ret = btrfs_orphan_add(trans, inode); if (ret) - goto out; + goto out_inode; /* * We set number of links to 0 in btrfs_new_inode(), and here we set @@ -9045,6 +9082,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); + unlock_new_inode(inode); d_tmpfile(dentry, inode); mark_inode_dirty(inode); 
@@ -9054,8 +9092,12 @@ out: iput(inode); btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); - return ret; + +out_inode: + unlock_new_inode(inode); + goto out; + } static const struct inode_operations btrfs_dir_inode_operations = { -- cgit v1.2.3-70-g09d2 From 962a298f35110edd8f326814ae41a3dd306ecb64 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 Jun 2014 18:41:45 +0200 Subject: btrfs: kill the key type accessor helpers btrfs_set_key_type and btrfs_key_type are used inconsistently along with open coded variants. Other members of btrfs_key are accessed directly without any helpers anyway. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 4 ++-- fs/btrfs/delayed-inode.c | 8 ++++---- fs/btrfs/dir-item.c | 12 ++++++------ fs/btrfs/export.c | 4 ++-- fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/extent_io.c | 2 +- fs/btrfs/file-item.c | 12 ++++++------ fs/btrfs/file.c | 4 ++-- fs/btrfs/inode-item.c | 12 ++++++------ fs/btrfs/inode.c | 28 ++++++++++++++-------------- fs/btrfs/ioctl.c | 6 +++--- fs/btrfs/orphan.c | 4 ++-- fs/btrfs/print-tree.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/tree-log.c | 6 +++--- fs/btrfs/volumes.c | 4 ++-- fs/btrfs/xattr.c | 4 ++-- 17 files changed, 60 insertions(+), 60 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 54a201dac7f..cfe8566e6e3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1193,7 +1193,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, unsigned long ptr; key.objectid = inode_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.type = BTRFS_INODE_EXTREF_KEY; key.offset = start_off; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -1233,7 +1233,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, ret = -ENOENT; if (found_key.objectid != inode_objectid) break; - if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY) + if (found_key.type != 
BTRFS_INODE_EXTREF_KEY) break; ret = 0; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a2e90f855d7..054577bddaf 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1042,7 +1042,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, int ret; key.objectid = node->inode_id; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) @@ -1099,7 +1099,7 @@ err_out: search: btrfs_release_path(path); - btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.type = BTRFS_INODE_EXTREF_KEY; key.offset = -1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -1473,7 +1473,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, } delayed_item->key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); + delayed_item->key.type = BTRFS_DIR_INDEX_KEY; delayed_item->key.offset = index; dir_item = (struct btrfs_dir_item *)delayed_item->data; @@ -1542,7 +1542,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, return PTR_ERR(node); item_key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); + item_key.type = BTRFS_DIR_INDEX_KEY; item_key.offset = index; ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index a0691df5dce..fc8df866e91 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -86,7 +86,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); data_size = sizeof(*dir_item) + name_len + data_len; @@ -137,7 +137,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct 
btrfs_root u32 data_size; key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); path = btrfs_alloc_path(); @@ -204,7 +204,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, int cow = mod != 0; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); @@ -234,7 +234,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, return -ENOMEM; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -297,7 +297,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, int cow = mod != 0; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.type = BTRFS_DIR_INDEX_KEY; key.offset = objectid; ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); @@ -367,7 +367,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, int cow = mod != 0; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 41422a3de8e..37d164540c3 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -70,7 +70,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, return ERR_PTR(-ESTALE); key.objectid = root_objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; index = srcu_read_lock(&fs_info->subvol_srcu); @@ -82,7 +82,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, } key.objectid = objectid; - 
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(sb, &key, root, NULL); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3efe1c3877b..4d1b50d4dc5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3097,7 +3097,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, for (i = 0; i < nritems; i++) { if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item); @@ -6464,7 +6464,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool have_caching_bg = false; WARN_ON(num_bytes < root->sectorsize); - btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -9009,7 +9009,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) root = info->extent_root; key.objectid = 0; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index af0359dcf33..1009fa8a08e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4224,7 +4224,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, WARN_ON(!ret); path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); + found_type = found_key.type; /* No extents, but there might be delalloc bits */ if (found_key.objectid != btrfs_ino(inode) || diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 54c84daec9b..991f056acab 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -55,7 +55,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, return -ENOMEM; file_key.objectid = objectid; 
file_key.offset = pos; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + file_key.type = BTRFS_EXTENT_DATA_KEY; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, @@ -100,7 +100,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + file_key.type = BTRFS_EXTENT_CSUM_KEY; ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; @@ -111,7 +111,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, goto fail; path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) + if (found_key.type != BTRFS_EXTENT_CSUM_KEY) goto fail; csum_offset = (bytenr - found_key.offset) >> @@ -148,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, file_key.objectid = objectid; file_key.offset = offset; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + file_key.type = BTRFS_EXTENT_DATA_KEY; ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); return ret; } @@ -720,7 +720,7 @@ again: bytenr = sums->bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + file_key.type = BTRFS_EXTENT_CSUM_KEY; item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { @@ -790,7 +790,7 @@ again: csum_offset = (bytenr - found_key.offset) >> root->fs_info->sb->s_blocksize_bits; - if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || + if (found_key.type != BTRFS_EXTENT_CSUM_KEY || found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { goto insert; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ff1cc0399b9..a9b56e32dd8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -299,7 +299,7 @@ static int 
__btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, /* get the inode */ key.objectid = defrag->root; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; index = srcu_read_lock(&fs_info->subvol_srcu); @@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, } key.objectid = defrag->ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 2be38df703c..8ffa4783cbf 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -135,7 +135,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, u32 item_size; key.objectid = inode_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.type = BTRFS_INODE_EXTREF_KEY; key.offset = btrfs_extref_hash(ref_objectid, name, name_len); path = btrfs_alloc_path(); @@ -209,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.offset = ref_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.type = BTRFS_INODE_REF_KEY; path = btrfs_alloc_path(); if (!path) @@ -337,7 +337,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.offset = ref_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.type = BTRFS_INODE_REF_KEY; path = btrfs_alloc_path(); if (!path) @@ -400,7 +400,7 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret; key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -420,13 +420,13 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_key found_key; ret = btrfs_search_slot(trans, root, 
location, path, ins_len, cow); - if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && + if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY && location->offset == (u64)-1 && path->slots[0] != 0) { slot = path->slots[0] - 1; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid == location->objectid && - btrfs_key_type(&found_key) == btrfs_key_type(location)) { + found_key.type == location->type) { path->slots[0]--; return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 016c403bfe7..e326ffdd5c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, key.objectid = btrfs_ino(inode); key.offset = start; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); path->leave_spinning = 1; @@ -3159,7 +3159,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) path->reada = -1; key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { @@ -3186,7 +3186,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* make sure the item matches what we want */ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) break; - if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) break; /* release the path since we're done with it */ @@ -4085,7 +4085,7 @@ search_again: fi = NULL; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); + found_type = found_key.type; if (found_key.objectid != ino) break; @@ -5331,7 +5331,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) btrfs_get_delayed_items(inode, &ins_list, &del_list); } - btrfs_set_key_type(&key, key_type); + key.type = key_type; key.offset = ctx->pos; key.objectid = 
btrfs_ino(inode); @@ -5356,7 +5356,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (found_key.objectid != key.objectid) break; - if (btrfs_key_type(&found_key) != key_type) + if (found_key.type != key_type) break; if (found_key.offset < ctx->pos) goto next; @@ -5568,7 +5568,7 @@ static int btrfs_set_inode_index_count(struct inode *inode) int ret; key.objectid = btrfs_ino(inode); - btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.type = BTRFS_DIR_INDEX_KEY; key.offset = (u64)-1; path = btrfs_alloc_path(); @@ -5600,7 +5600,7 @@ static int btrfs_set_inode_index_count(struct inode *inode) btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != btrfs_ino(inode) || - btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { + found_key.type != BTRFS_DIR_INDEX_KEY) { BTRFS_I(inode)->index_cnt = 2; goto out; } @@ -5718,7 +5718,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); key[0].objectid = objectid; - btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); + key[0].type = BTRFS_INODE_ITEM_KEY; key[0].offset = 0; sizes[0] = sizeof(struct btrfs_inode_item); @@ -5731,7 +5731,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * add more hard links than can fit in the ref item. 
*/ key[1].objectid = objectid; - btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].type = BTRFS_INODE_REF_KEY; key[1].offset = ref_objectid; sizes[1] = name_len + sizeof(*ref); @@ -5740,7 +5740,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, location = &BTRFS_I(inode)->location; location->objectid = objectid; location->offset = 0; - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + location->type = BTRFS_INODE_ITEM_KEY; ret = btrfs_insert_inode_locked(inode); if (ret < 0) @@ -5832,7 +5832,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); } else { key.objectid = ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; } @@ -6333,7 +6333,7 @@ again: struct btrfs_file_extent_item); /* are we inside the extent that was found? */ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); + found_type = found_key.type; if (found_key.objectid != objectid || found_type != BTRFS_EXTENT_DATA_KEY) { /* @@ -8832,7 +8832,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, } key.objectid = btrfs_ino(inode); key.offset = 0; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 091c4d35671..b61801ac052 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -535,7 +535,7 @@ static noinline int create_subvol(struct inode *dir, key.objectid = objectid; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); if (ret) @@ -3252,11 +3252,11 @@ process_slot: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) > 
BTRFS_EXTENT_DATA_KEY || + if (key.type > BTRFS_EXTENT_DATA_KEY || key.objectid != btrfs_ino(src)) break; - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + if (key.type == BTRFS_EXTENT_DATA_KEY) { struct btrfs_file_extent_item *extent; int type; u32 size; diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 65793edb38c..47767d5b8f0 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -27,7 +27,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = offset; path = btrfs_alloc_path(); @@ -48,7 +48,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = offset; path = btrfs_alloc_path(); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 9626b4ad3b9..1591620bee3 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -195,7 +195,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); - type = btrfs_key_type(&key); + type = key.type; printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " "itemsize %d\n", i, key.objectid, type, key.offset, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f4a41f37be2..053dd000d4e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2714,7 +2714,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (found_key.objectid != scrub_dev->devid) break; - if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) + if (found_key.type != BTRFS_DEV_EXTENT_KEY) break; if (found_key.offset >= end) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d296efe2d3e..2f5000c0a87 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1498,7 +1498,7 @@ static 
noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, return -EIO; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); @@ -3364,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * or deletes of this inode don't have to relog the inode * again */ - if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && !skip_csum) { int found_type; extent = btrfs_item_ptr(src, start_slot + i, @@ -4369,7 +4369,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) again: key.objectid = BTRFS_TREE_LOG_OBJECTID; key.offset = (u64)-1; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 340a92d08e8..a7a3863e380 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1054,7 +1054,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, if (key.objectid > device->devid) break; - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); @@ -1206,7 +1206,7 @@ again: if (key.objectid > device->devid) break; - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; if (key.offset > search_start) { diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ad8328d797e..dcf20131fbe 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -237,7 +237,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) * first xattr that we find and walk forward */ key.objectid = btrfs_ino(inode); - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.type = BTRFS_XATTR_ITEM_KEY; 
key.offset = 0; path = btrfs_alloc_path(); @@ -273,7 +273,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) /* check to make sure this item is what we want */ if (found_key.objectid != key.objectid) break; - if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) + if (found_key.type != BTRFS_XATTR_ITEM_KEY) break; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); -- cgit v1.2.3-70-g09d2 From 47059d930f0e002ff851beea87d738146804726d Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 3 Jul 2014 18:22:07 +0800 Subject: Btrfs: make defragment work with nodatacow option Btrfs defragment relies on the COW feature, which means it did not work with the nodatacow option; this problem was detected by xfstests generic/018 with the nodatacow mount option. Fix this problem by forcing COW for an extent that has the @EXTENT_DEFRAG state set. Signed-off-by: Wang Shilong Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 6 ++++++ fs/btrfs/inode.c | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 42 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 43527fd7882..fd879418fd4 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -120,6 +120,12 @@ struct btrfs_inode { */ u64 delalloc_bytes; + /* + * total number of bytes pending defrag, used by stat to check whether + * it needs COW. + */ + u64 defrag_bytes; + /* * the size of the file stored in the metadata on disk.
data=ordered * means the in-memory i_size might be larger than the size on disk diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e326ffdd5c7..2370d72972e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1445,6 +1445,26 @@ error: return ret; } +static inline int need_force_cow(struct inode *inode, u64 start, u64 end) +{ + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) + return 0; + + /* + * @defrag_bytes is a hint value, no spinlock held here, + * if is not zero, it means the file is defragging. + * Force cow if given extent needs to be defragged. + */ + if (BTRFS_I(inode)->defrag_bytes && + test_range_bit(&BTRFS_I(inode)->io_tree, start, end, + EXTENT_DEFRAG, 0, NULL)) + return 1; + + return 0; +} + /* * extent_io.c call back to do delayed allocation processing */ @@ -1454,11 +1474,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, { int ret; struct btrfs_root *root = BTRFS_I(inode)->root; + int force_cow = need_force_cow(inode, start, end); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { + } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!btrfs_test_opt(root, COMPRESS) && @@ -1555,6 +1576,8 @@ static void btrfs_set_bit_hook(struct inode *inode, struct extent_state *state, unsigned long *bits) { + if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) + WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC @@ -1577,6 +1600,8 @@ static void btrfs_set_bit_hook(struct inode *inode, root->fs_info->delalloc_batch); 
spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; + if (*bits & EXTENT_DEFRAG) + BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) btrfs_add_delalloc_inodes(root, inode); @@ -1591,6 +1616,13 @@ static void btrfs_clear_bit_hook(struct inode *inode, struct extent_state *state, unsigned long *bits) { + u64 len = state->end + 1 - state->start; + + spin_lock(&BTRFS_I(inode)->lock); + if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) + BTRFS_I(inode)->defrag_bytes -= len; + spin_unlock(&BTRFS_I(inode)->lock); + /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC @@ -1598,7 +1630,6 @@ static void btrfs_clear_bit_hook(struct inode *inode, */ if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 len = state->end + 1 - state->start; bool do_list = !btrfs_is_free_space_inode(inode); if (*bits & EXTENT_FIRST_DELALLOC) { @@ -8173,6 +8204,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; ei->csum_bytes = 0; @@ -8231,6 +8263,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); + WARN_ON(BTRFS_I(inode)->defrag_bytes); /* * This can happen where we create an inode, but somebody else also -- cgit v1.2.3-70-g09d2 From 555e12864063762964433139dee651c5b859a047 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 7 Jul 2014 12:35:21 +0100 Subject: Btrfs: set error return value in btrfs_get_blocks_direct We were returning with 0 (success) because we weren't extracting the error code from em (PTR_ERR(em)). Fix it. 
Signed-off-by: Filipe Manana Reviewed-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2370d72972e..177508abb86 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7143,8 +7143,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start, len, orig_block_len, ram_bytes, type); - if (IS_ERR(em)) + if (IS_ERR(em)) { + ret = PTR_ERR(em); goto unlock_err; + } } ret = btrfs_add_ordered_extent_dio(inode, start, -- cgit v1.2.3-70-g09d2 From f79707b092caff665a8a77e8e31fe4ab18b4d109 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 17 Jul 2014 11:44:09 +0800 Subject: Btrfs: fix wrong skipping compression for an inode If a file's compression ratio is bad, we set the NOCOMPRESS flag for it, and compression is skipped for that inode next time. However, if we remount the fs with COMPRESS_FORCE, we should still try to compress pages for that inode; this patch fixes the wrong check for this problem. 
Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 177508abb86..48a2886842b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow, return 0; } +static inline int inode_need_compress(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* force compress */ + if (btrfs_test_opt(root, FORCE_COMPRESS)) + return 1; + /* bad compression ratios */ + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + return 0; + if (btrfs_test_opt(root, COMPRESS) || + BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || + BTRFS_I(inode)->force_compress) + return 1; + return 0; +} + /* * we create compressed extents in two phases. The first * phase compresses a range of pages that have already been @@ -444,10 +461,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. 
*/ - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && - (btrfs_test_opt(root, COMPRESS) || - (BTRFS_I(inode)->force_compress) || - (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { + if (inode_need_compress(inode)) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); if (!pages) { @@ -1094,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->locked_page = locked_page; async_cow->start = start; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && + !btrfs_test_opt(root, FORCE_COMPRESS)) cur_end = end; else cur_end = min(end, start + 512 * 1024 - 1); -- cgit v1.2.3-70-g09d2 From 7816030eb48e00a36ecdc23d484b960922feee67 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 17 Jul 2014 11:44:10 +0800 Subject: Btrfs: fall into nocompression codes quickly if possible If the NOCOMPRESS flag is set, which means a bad compression ratio, we can avoid calling cow_file_range_async() for this case earlier. 
Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 48a2886842b..d1c69beb054 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1488,7 +1488,6 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, unsigned long *nr_written) { int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; int force_cow = need_force_cow(inode, start, end); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { @@ -1497,9 +1496,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - } else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress) && - !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) { + } else if (!inode_need_compress(inode)) { ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); } else { -- cgit v1.2.3-70-g09d2 From 354877befa852e9b62ddc92a6cc017210e982d46 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 17 Jul 2014 11:44:11 +0800 Subject: Btrfs: fix off-by-one in cow_file_range_inline() Btrfs could still inline file data if its size is same as page size, so don't skip max value here. 
Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d1c69beb054..75c6de26405 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, data_len = compressed_size; if (start > 0 || - actual_end >= PAGE_CACHE_SIZE || - data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + actual_end > PAGE_CACHE_SIZE || + data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || (!compressed_size && (actual_end & (root->sectorsize - 1)) == 0) || end + 1 < isize || -- cgit v1.2.3-70-g09d2 From 23ea8e5a07673127d05cb5cf6f9914d7a53e0847 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 12 Sep 2014 18:43:54 +0800 Subject: Btrfs: load checksum data once when submitting a direct read io The current code loads checksum data several times when we split a whole direct read io because of the limit of the raid stripe, which makes us search the csum tree several times. In fact, this just wastes time and makes the contention on the csum tree root more serious. This patch improves the situation by loading all the data at once. 
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 1 - fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent_io.c | 13 +++++++++++-- fs/btrfs/file-item.c | 14 ++------------ fs/btrfs/inode.c | 38 +++++++++++++++++++++----------------- 5 files changed, 35 insertions(+), 34 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index fd879418fd4..8bea70e02a3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -263,7 +263,6 @@ struct btrfs_dio_private { /* dio_bio came from fs/direct-io.c */ struct bio *dio_bio; - u8 csum[0]; }; /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6db3d4bac6f..0f3e4f7e454 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3719,8 +3719,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct btrfs_dio_private *dip, struct bio *bio, - u64 logical_offset); + struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d5e71d4646d..d2f8f39e11f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2621,9 +2621,18 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) { - return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); -} + struct btrfs_io_bio *btrfs_bio; + struct bio *new; + new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); + if (new) { + btrfs_bio = btrfs_io_bio(new); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + } + return new; +} /* this also allocates from the btrfs_bioset */ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) diff --git 
a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6e6262eca8b..783a94355ef 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -299,19 +299,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, } int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct btrfs_dio_private *dip, struct bio *bio, - u64 offset) + struct bio *bio, u64 offset) { - int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - int ret; - - len >>= inode->i_sb->s_blocksize_bits; - len *= csum_size; - - ret = __btrfs_lookup_bio_sums(root, inode, bio, offset, - (u32 *)(dip->csum + len), 1); - return ret; + return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1); } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 75c6de26405..fca944211bf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7240,7 +7240,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *dio_bio; - u32 *csums = (u32 *)dip->csum; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + u32 *csums = (u32 *)io_bio->csum; u64 start; int i; @@ -7282,6 +7283,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) if (err) clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); dio_end_io(dio_bio, err); + + if (io_bio->end_io) + io_bio->end_io(io_bio, err); bio_put(bio); } @@ -7421,13 +7425,20 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); if (ret) goto err; - } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio, + } else { + /* + * We have loaded all the csum data we need when we submit + * the first bio, so skip it. 
+ */ + if (dip->logical_offset != file_offset) + goto map; + + /* Load all csum data at once. */ + ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio, file_offset); if (ret) goto err; } - map: ret = btrfs_map_bio(root, rw, bio, 0, async_submit); err: @@ -7448,7 +7459,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, u64 submit_len = 0; u64 map_length; int nr_pages = 0; - int ret = 0; + int ret; int async_submit = 0; map_length = orig_bio->bi_iter.bi_size; @@ -7552,11 +7563,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; struct bio *io_bio; + struct btrfs_io_bio *btrfs_bio; int skip_sum; - int sum_len; int write = rw & REQ_WRITE; int ret = 0; - u16 csum_size; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -7566,16 +7576,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, goto free_ordered; } - if (!skip_sum && !write) { - csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - sum_len = dio_bio->bi_iter.bi_size >> - inode->i_sb->s_blocksize_bits; - sum_len *= csum_size; - } else { - sum_len = 0; - } - - dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS); + dip = kmalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; goto free_io_bio; @@ -7601,6 +7602,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (!ret) return; + btrfs_bio = btrfs_io_bio(io_bio); + if (btrfs_bio->end_io) + btrfs_bio->end_io(btrfs_bio, ret); free_io_bio: bio_put(io_bio); -- cgit v1.2.3-70-g09d2 From dc380aea5fa4636fc498a351eb720943bc644451 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 12 Sep 2014 18:43:55 +0800 Subject: Btrfs: cleanup similar code of the buffered data data check and dio read data check Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 102 +++++++++++++++++++++++++------------------------------ 1 file changed, 47 insertions(+), 55 deletions(-) (limited to 
'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fca944211bf..70eaae12718 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2899,6 +2899,40 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, return 0; } +static int __readpage_endio_check(struct inode *inode, + struct btrfs_io_bio *io_bio, + int icsum, struct page *page, + int pgoff, u64 start, size_t len) +{ + char *kaddr; + u32 csum_expected; + u32 csum = ~(u32)0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + csum_expected = *(((u32 *)io_bio->csum) + icsum); + + kaddr = kmap_atomic(page); + csum = btrfs_csum_data(kaddr + pgoff, csum, len); + btrfs_csum_final(csum, (char *)&csum); + if (csum != csum_expected) + goto zeroit; + + kunmap_atomic(kaddr); + return 0; +zeroit: + if (__ratelimit(&_rs)) + btrfs_info(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_ino(inode), start, csum, csum_expected); + memset(kaddr + pgoff, 1, len); + flush_dcache_page(page); + kunmap_atomic(kaddr); + if (csum_expected == 0) + return 0; + return -EIO; +} + /* * when reads are done, we need to check csums to verify the data is correct * if there's a match, we allow the bio to finish. 
If not, the code in @@ -2911,20 +2945,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - char *kaddr; struct btrfs_root *root = BTRFS_I(inode)->root; - u32 csum_expected; - u32 csum = ~(u32)0; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); if (PageChecked(page)) { ClearPageChecked(page); - goto good; + return 0; } if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - goto good; + return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { @@ -2934,28 +2963,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, } phy_offset >>= inode->i_sb->s_blocksize_bits; - csum_expected = *(((u32 *)io_bio->csum) + phy_offset); - - kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1); - btrfs_csum_final(csum, (char *)&csum); - if (csum != csum_expected) - goto zeroit; - - kunmap_atomic(kaddr); -good: - return 0; - -zeroit: - if (__ratelimit(&_rs)) - btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", - btrfs_ino(page->mapping->host), start, csum, csum_expected); - memset(kaddr + offset, 1, end - start + 1); - flush_dcache_page(page); - kunmap_atomic(kaddr); - if (csum_expected == 0) - return 0; - return -EIO; + return __readpage_endio_check(inode, io_bio, phy_offset, page, offset, + start, (size_t)(end - start + 1)); } struct delayed_iput { @@ -7238,41 +7247,24 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct btrfs_dio_private *dip = bio->bi_private; struct bio_vec *bvec; struct inode *inode = dip->inode; - struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - u32 *csums = (u32 *)io_bio->csum; u64 start; + int ret; 
int i; + if (err || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + goto skip_checksum; + start = dip->logical_offset; bio_for_each_segment_all(bvec, bio, i) { - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - struct page *page = bvec->bv_page; - char *kaddr; - u32 csum = ~(u32)0; - unsigned long flags; - - local_irq_save(flags); - kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + bvec->bv_offset, - csum, bvec->bv_len); - btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr); - local_irq_restore(flags); - - flush_dcache_page(bvec->bv_page); - if (csum != csums[i]) { - btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", - btrfs_ino(inode), start, csum, - csums[i]); - err = -EIO; - } - } - + ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, + 0, start, bvec->bv_len); + if (ret) + err = -EIO; start += bvec->bv_len; } - +skip_checksum: unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); dio_bio = dip->dio_bio; -- cgit v1.2.3-70-g09d2 From c1dc08967f69c6b5067f8302c600f6628123f3bf Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 12 Sep 2014 18:43:56 +0800 Subject: Btrfs: do file data check by sub-bio's self Direct IO splits the original bio to several sub-bios because of the limit of raid stripe, and the filesystem will wait for all sub-bios and then run final end io process. But it was very hard to implement the data repair when dio read failure happens, because at the final end io function, we didn't know which mirror the data was read from. So in order to implement the data repair, we have to move the file data check in the final end io function to the sub-bio end io function, in which we can get the mirror number of the device we access. This patch did this work as the first step of the direct io data repair implementation. 
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 9 +++++ fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 100 ++++++++++++++++++++++++++++++++++++------------- fs/btrfs/volumes.h | 5 ++- 4 files changed, 87 insertions(+), 29 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8bea70e02a3..4d309471294 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -245,8 +245,11 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) return 0; } +#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 + struct btrfs_dio_private { struct inode *inode; + unsigned long flags; u64 logical_offset; u64 disk_bytenr; u64 bytes; @@ -263,6 +266,12 @@ struct btrfs_dio_private { /* dio_bio came from fs/direct-io.c */ struct bio *dio_bio; + + /* + * The original bio may be splited to several sub-bios, this is + * done during endio of sub-bios + */ + int (*subio_endio)(struct inode *, struct btrfs_io_bio *); }; /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2f8f39e11f..ad04f85ac45 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2472,7 +2472,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, + "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 70eaae12718..09d8c5ee886 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7242,29 +7242,40 @@ unlock_err: return ret; } -static void btrfs_endio_direct_read(struct bio *bio, int err) +static int btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio) { - struct btrfs_dio_private *dip = bio->bi_private; struct bio_vec *bvec; - struct inode *inode = dip->inode; - struct bio *dio_bio; - struct btrfs_io_bio 
*io_bio = btrfs_io_bio(bio); u64 start; - int ret; int i; + int ret; + int err = 0; - if (err || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - goto skip_checksum; + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + return 0; - start = dip->logical_offset; - bio_for_each_segment_all(bvec, bio, i) { + start = io_bio->logical; + bio_for_each_segment_all(bvec, &io_bio->bio, i) { ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, 0, start, bvec->bv_len); if (ret) err = -EIO; start += bvec->bv_len; } -skip_checksum: + + return err; +} + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct bio *dio_bio; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + + if (!err && (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)) + err = btrfs_subio_endio_read(inode, io_bio); + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); dio_bio = dip->dio_bio; @@ -7342,6 +7353,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, static void btrfs_end_dio_bio(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; + int ret; if (err) { btrfs_err(BTRFS_I(dip->inode)->root->fs_info, @@ -7349,6 +7361,13 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) btrfs_ino(dip->inode), bio->bi_rw, (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); + } else if (dip->subio_endio) { + ret = dip->subio_endio(dip->inode, btrfs_io_bio(bio)); + if (ret) + err = ret; + } + + if (err) { dip->errors = 1; /* @@ -7379,6 +7398,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); } +static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, + struct inode *inode, + struct btrfs_dio_private *dip, + struct bio *bio, + u64 file_offset) +{ + struct btrfs_io_bio *io_bio = 
btrfs_io_bio(bio); + struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); + int ret; + + /* + * We load all the csum data we need when we submit + * the first bio to reduce the csum tree search and + * contention. + */ + if (dip->logical_offset == file_offset) { + ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio, + file_offset); + if (ret) + return ret; + } + + if (bio == dip->orig_bio) + return 0; + + file_offset -= dip->logical_offset; + file_offset >>= inode->i_sb->s_blocksize_bits; + io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); + + return 0; +} + static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int rw, u64 file_offset, int skip_sum, int async_submit) @@ -7418,16 +7469,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (ret) goto err; } else { - /* - * We have loaded all the csum data we need when we submit - * the first bio, so skip it. - */ - if (dip->logical_offset != file_offset) - goto map; - - /* Load all csum data at once. 
*/ - ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio, - file_offset); + ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio, + file_offset); if (ret) goto err; } @@ -7462,6 +7505,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; + dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; } @@ -7478,6 +7522,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; atomic_inc(&dip->pending_bios); while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { @@ -7512,6 +7557,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, goto out_err; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, @@ -7568,7 +7614,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, goto free_ordered; } - dip = kmalloc(sizeof(*dip), GFP_NOFS); + dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; goto free_io_bio; @@ -7580,21 +7626,23 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; io_bio->bi_private = dip; - dip->errors = 0; dip->orig_bio = io_bio; dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); + btrfs_bio = btrfs_io_bio(io_bio); + btrfs_bio->logical = file_offset; - if (write) + if (write) { io_bio->bi_end_io = btrfs_endio_direct_write; - else + } else { io_bio->bi_end_io = btrfs_endio_direct_read; + dip->subio_endio = btrfs_subio_endio_read; + } ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; - btrfs_bio = btrfs_io_bio(io_bio); if (btrfs_bio->end_io) btrfs_bio->end_io(btrfs_bio, ret); free_io_bio: diff --git a/fs/btrfs/volumes.h 
b/fs/btrfs/volumes.h index 2b37da3dd40..91998bc0b4c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -268,8 +268,9 @@ struct btrfs_fs_devices { */ typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); struct btrfs_io_bio { - unsigned long mirror_num; - unsigned long stripe_index; + unsigned int mirror_num; + unsigned int stripe_index; + u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; u8 *csum_allocated; -- cgit v1.2.3-70-g09d2 From 8b110e393c5a6e72d50fcdf9fa7ed8b647cfdfc9 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 12 Sep 2014 18:44:03 +0800 Subject: Btrfs: implement repair function when direct read fails This patch implements the data repair function for when a direct read fails. The details of the implementation are: - When we find the data is not right, we try to read the data from the other mirror. - When the io on the mirror ends, we insert the endio work into the dedicated btrfs workqueue, not the common read endio workqueue, because the original endio work is still blocked in the btrfs endio workqueue; if we inserted the endio work of the io on the mirror into that workqueue, a deadlock would happen. - After we get the right data, we write it back to the corrupted mirror. - And if the data on the new mirror is still corrupted, we try the next mirror until we read the right data or all the mirrors are traversed. - After the above work, we set the uptodate flag according to the result. 
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 1 + fs/btrfs/async-thread.h | 1 + fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 11 +- fs/btrfs/disk-io.h | 1 + fs/btrfs/extent_io.c | 12 ++- fs/btrfs/extent_io.h | 5 +- fs/btrfs/inode.c | 276 ++++++++++++++++++++++++++++++++++++++++++++---- 9 files changed, 281 insertions(+), 29 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index fbd76ded9a3..2da0a66790b 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -74,6 +74,7 @@ BTRFS_WORK_HELPER(endio_helper); BTRFS_WORK_HELPER(endio_meta_helper); BTRFS_WORK_HELPER(endio_meta_write_helper); BTRFS_WORK_HELPER(endio_raid56_helper); +BTRFS_WORK_HELPER(endio_repair_helper); BTRFS_WORK_HELPER(rmw_helper); BTRFS_WORK_HELPER(endio_write_helper); BTRFS_WORK_HELPER(freespace_write_helper); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index e9e31c94758..e386c29ef1f 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -53,6 +53,7 @@ BTRFS_WORK_HELPER_PROTO(endio_helper); BTRFS_WORK_HELPER_PROTO(endio_meta_helper); BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); +BTRFS_WORK_HELPER_PROTO(endio_repair_helper); BTRFS_WORK_HELPER_PROTO(rmw_helper); BTRFS_WORK_HELPER_PROTO(endio_write_helper); BTRFS_WORK_HELPER_PROTO(freespace_write_helper); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4d309471294..7a7521c87c8 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -271,7 +271,7 @@ struct btrfs_dio_private { * The original bio may be splited to several sub-bios, this is * done during endio of sub-bios */ - int (*subio_endio)(struct inode *, struct btrfs_io_bio *); + int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); }; /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0f3e4f7e454..51ff3f8dbab 100644 --- 
a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1538,6 +1538,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; struct btrfs_workqueue *endio_raid56_workers; + struct btrfs_workqueue *endio_repair_workers; struct btrfs_workqueue *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a224fb9b34a..48794f95142 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -713,7 +713,11 @@ static void end_workqueue_bio(struct bio *bio, int err) func = btrfs_endio_write_helper; } } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + if (unlikely(end_io_wq->metadata == + BTRFS_WQ_ENDIO_DIO_REPAIR)) { + wq = fs_info->endio_repair_workers; + func = btrfs_endio_repair_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { wq = fs_info->endio_raid56_workers; func = btrfs_endio_raid56_helper; } else if (end_io_wq->metadata) { @@ -741,6 +745,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata) { struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); if (!end_io_wq) return -ENOMEM; @@ -2055,6 +2060,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->endio_workers); btrfs_destroy_workqueue(fs_info->endio_meta_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + btrfs_destroy_workqueue(fs_info->endio_repair_workers); btrfs_destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); @@ -2572,6 +2578,8 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); fs_info->endio_raid56_workers = btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->endio_repair_workers = + btrfs_alloc_workqueue("endio-repair", 
flags, 1, 0); fs_info->rmw_workers = btrfs_alloc_workqueue("rmw", flags, max_active, 2); fs_info->endio_write_workers = @@ -2593,6 +2601,7 @@ int open_ctree(struct super_block *sb, fs_info->submit_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && + fs_info->endio_repair_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 52a17db700f..14d06ee1e14 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -30,6 +30,7 @@ enum { BTRFS_WQ_ENDIO_METADATA = 1, BTRFS_WQ_ENDIO_FREE_SPACE = 2, BTRFS_WQ_ENDIO_RAID56 = 3, + BTRFS_WQ_ENDIO_DIO_REPAIR = 4, }; static inline u64 btrfs_sb_offset(int mirror) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 05533c99f89..9e2ef27672e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1962,7 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) SetPageUptodate(page); } -static int free_io_failure(struct inode *inode, struct io_failure_record *rec) +int free_io_failure(struct inode *inode, struct io_failure_record *rec) { int ret; int err = 0; @@ -2081,8 +2081,8 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -static int clean_io_failure(struct inode *inode, u64 start, - struct page *page, unsigned int pg_offset) +int clean_io_failure(struct inode *inode, u64 start, struct page *page, + unsigned int pg_offset) { u64 private; u64 private_failure; @@ -2291,7 +2291,7 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct 
io_failure_record *failrec, struct page *page, int pg_offset, int icsum, - bio_end_io_t *endio_func) + bio_end_io_t *endio_func, void *data) { struct bio *bio; struct btrfs_io_bio *btrfs_failed_bio; @@ -2305,6 +2305,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio->bi_iter.bi_size = 0; + bio->bi_private = data; btrfs_failed_bio = btrfs_io_bio(failed_bio); if (btrfs_failed_bio->csum) { @@ -2362,7 +2363,8 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, phy_offset >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, start - page_offset(page), - (int)phy_offset, failed_bio->bi_end_io); + (int)phy_offset, failed_bio->bi_end_io, + NULL); if (!bio) { free_io_failure(inode, failrec); return -EIO; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bf0597f3a9e..176a4b1ed52 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -341,6 +341,8 @@ struct btrfs_fs_info; int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); +int clean_io_failure(struct inode *inode, u64 start, struct page *page, + unsigned int pg_offset); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num); @@ -371,7 +373,8 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, struct page *page, int pg_offset, int icsum, - bio_end_io_t *endio_func); + bio_end_io_t *endio_func, void *data); +int free_io_failure(struct inode *inode, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS noinline u64 
find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09d8c5ee886..c3c3269a9e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7242,30 +7242,267 @@ unlock_err: return ret; } -static int btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio) +static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int rw, int mirror_num) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + BUG_ON(rw & REQ_WRITE); + + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DIO_REPAIR); + if (ret) + goto err; + + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); +err: + bio_put(bio); + return ret; +} + +static int btrfs_check_dio_repairable(struct inode *inode, + struct bio *failed_bio, + struct io_failure_record *failrec, + int failed_mirror) +{ + int num_copies; + + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. 
+ */ + pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + + if (failrec->this_mirror > num_copies) { + pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + return 1; +} + +static int dio_read_error(struct inode *inode, struct bio *failed_bio, + struct page *page, u64 start, u64 end, + int failed_mirror, bio_end_io_t *repair_endio, + void *repair_arg) +{ + struct io_failure_record *failrec; + struct bio *bio; + int isector; + int read_mode; + int ret; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + if (ret) + return ret; + + ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, + failed_mirror); + if (!ret) { + free_io_failure(inode, failrec); + return -EIO; + } + + if (failed_bio->bi_vcnt > 1) + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + else + read_mode = READ_SYNC; + + isector = start - btrfs_io_bio(failed_bio)->logical; + isector >>= inode->i_sb->s_blocksize_bits; + bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, + 0, isector, repair_endio, repair_arg); + if (!bio) { + free_io_failure(inode, failrec); + return -EIO; + } + + btrfs_debug(BTRFS_I(inode)->root->fs_info, + "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", + read_mode, failrec->this_mirror, failrec->in_validation); + + ret = submit_dio_repair_bio(inode, bio, read_mode, + failrec->this_mirror); + if (ret) { + free_io_failure(inode, failrec); + bio_put(bio); + } + + return ret; +} + +struct btrfs_retry_complete { + struct completion done; + struct inode *inode; + u64 start; + int uptodate; +}; + +static void 
btrfs_retry_endio_nocsum(struct bio *bio, int err) +{ + struct btrfs_retry_complete *done = bio->bi_private; + struct bio_vec *bvec; + int i; + + if (err) + goto end; + + done->uptodate = 1; + bio_for_each_segment_all(bvec, bio, i) + clean_io_failure(done->inode, done->start, bvec->bv_page, 0); +end: + complete(&done->done); + bio_put(bio); +} + +static int __btrfs_correct_data_nocsum(struct inode *inode, + struct btrfs_io_bio *io_bio) { struct bio_vec *bvec; + struct btrfs_retry_complete done; u64 start; int i; int ret; - int err = 0; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - return 0; + start = io_bio->logical; + done.inode = inode; + + bio_for_each_segment_all(bvec, &io_bio->bio, i) { +try_again: + done.uptodate = 0; + done.start = start; + init_completion(&done.done); + + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, + start + bvec->bv_len - 1, + io_bio->mirror_num, + btrfs_retry_endio_nocsum, &done); + if (ret) + return ret; + + wait_for_completion(&done.done); + + if (!done.uptodate) { + /* We might have another mirror, so try again */ + goto try_again; + } + + start += bvec->bv_len; + } + + return 0; +} + +static void btrfs_retry_endio(struct bio *bio, int err) +{ + struct btrfs_retry_complete *done = bio->bi_private; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct bio_vec *bvec; + int uptodate; + int ret; + int i; + + if (err) + goto end; + + uptodate = 1; + bio_for_each_segment_all(bvec, bio, i) { + ret = __readpage_endio_check(done->inode, io_bio, i, + bvec->bv_page, 0, + done->start, bvec->bv_len); + if (!ret) + clean_io_failure(done->inode, done->start, + bvec->bv_page, 0); + else + uptodate = 0; + } + + done->uptodate = uptodate; +end: + complete(&done->done); + bio_put(bio); +} +static int __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, int err) +{ + struct bio_vec *bvec; + struct btrfs_retry_complete done; + u64 start; + u64 offset = 0; + int i; + int ret; + + err = 0; start = 
io_bio->logical; + done.inode = inode; + bio_for_each_segment_all(bvec, &io_bio->bio, i) { ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, 0, start, bvec->bv_len); - if (ret) - err = -EIO; + if (likely(!ret)) + goto next; +try_again: + done.uptodate = 0; + done.start = start; + init_completion(&done.done); + + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, + start + bvec->bv_len - 1, + io_bio->mirror_num, + btrfs_retry_endio, &done); + if (ret) { + err = ret; + goto next; + } + + wait_for_completion(&done.done); + + if (!done.uptodate) { + /* We might have another mirror, so try again */ + goto try_again; + } +next: + offset += bvec->bv_len; start += bvec->bv_len; } return err; } +static int btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, int err) +{ + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + if (skip_csum) { + if (unlikely(err)) + return __btrfs_correct_data_nocsum(inode, io_bio); + else + return 0; + } else { + return __btrfs_subio_endio_read(inode, io_bio, err); + } +} + static void btrfs_endio_direct_read(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; @@ -7273,8 +7510,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - if (!err && (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)) - err = btrfs_subio_endio_read(inode, io_bio); + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) + err = btrfs_subio_endio_read(inode, io_bio, err); unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -7353,19 +7590,16 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, static void btrfs_end_dio_bio(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; - int ret; - if (err) { - btrfs_err(BTRFS_I(dip->inode)->root->fs_info, - "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", - 
btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, err); - } else if (dip->subio_endio) { - ret = dip->subio_endio(dip->inode, btrfs_io_bio(bio)); - if (ret) - err = ret; - } + if (err) + btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", + btrfs_ino(dip->inode), bio->bi_rw, + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); + + if (dip->subio_endio) + err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); if (err) { dip->errors = 1; -- cgit v1.2.3-70-g09d2 From f612496bca664bff6a09a99a9a7506410b6e876e Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 12 Sep 2014 18:44:04 +0800 Subject: Btrfs: cleanup the read failure record after write or when the inode is freeing After the data is written successfully, we should clean up the read failure record in that range because - If we set data COW for the file, the range that the failure record pointed to is mapped to a new place, so it is invalid. - If we set no data COW for the file, and if there is no error during writing, the corrupted data is corrected, so the failure record can be removed. And if some errors happen on the mirrors, we also needn't worry about it because the failure record will be recreated if we read the same place again. Sometimes, we may fail to correct the data, so the failure records will be left in the tree; we need to free them when we free the inode, or a memory leak will happen.
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 34 ++++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 1 + fs/btrfs/inode.c | 6 ++++++ 3 files changed, 41 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9e2ef27672e..78229007f99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2138,6 +2138,40 @@ out: return 0; } +/* + * Can be called when + * - hold extent lock + * - under ordered extent + * - the inode is freeing + */ +void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) +{ + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct io_failure_record *failrec; + struct extent_state *state, *next; + + if (RB_EMPTY_ROOT(&failure_tree->state)) + return; + + spin_lock(&failure_tree->lock); + state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); + while (state) { + if (state->start > end) + break; + + ASSERT(state->end <= end); + + next = next_state(state); + + failrec = (struct io_failure_record *)state->private; + free_extent_state(state); + kfree(failrec); + + state = next; + } + spin_unlock(&failure_tree->lock); +} + int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, struct io_failure_record **failrec_ret) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 176a4b1ed52..5e91fb9d176 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -366,6 +366,7 @@ struct io_failure_record { int in_validation; }; +void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end); int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, struct io_failure_record **failrec_ret); int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c3c3269a9e0..3c16a1493e2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2703,6 +2703,10 @@ static int 
btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + btrfs_free_io_failure_record(inode, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1); + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; @@ -4799,6 +4803,8 @@ void btrfs_evict_inode(struct inode *inode) /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ btrfs_wait_ordered_range(inode, 0, (u64)-1); + btrfs_free_io_failure_record(inode, 0, (u64)-1); + if (root->fs_info->log_root_recovering) { BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)); -- cgit v1.2.3-70-g09d2 From e6c4efd87ab04e5ead363f24e6ac35ed3506d401 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 17 Sep 2014 11:53:35 +0800 Subject: btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map The following commit enhanced merge_extent_mapping() to reduce fragmentation in the extent map tree, but it can't handle the case where the existing extent map lies before map_start: 51f39 btrfs: Use right extent length when inserting overlap extent map. [BUG] When the existing extent map's start is before map_start, em->len will be negative, which will corrupt the extent map and fail to insert the new extent map. This will happen when someone gets a large extent map, but by the time it is inserted into the extent map tree, someone else has already committed some writes and split the huge extent into small parts. [REPRODUCER] It is very easy to trigger using filebench with the randomrw personality. It reproduces almost 100% of the time when using an 8G preallocated file in a 60s randomrw test. [FIX] This patch can now handle any existing extent position. Since it does not directly use existing->start, it will now find the previous and next extents around map_start. So the old existing->start < map_start bug will never happen again.
[ENHANCE] This patch will insert the best fitted extent map into the extent map tree, rather than the oldest [map_start, map_start + sectorsize) or the relatively newer but not perfect [map_start, existing->start). The patch will first search for an existing extent that does not intersect with the desired map range [map_start, map_start + len). The existing extent will be either before or behind map_start, and based on the existing extent, we can find out the previous and next extent around map_start. So the best fitted extent would be [prev->end, next->start). If prev or next is not found, em's own start or end is used for that side; otherwise em->start would be prev->end and em->end would be next->start. With this patch, fragmentation in the extent map tree should be reduced much more than with the 51f39 commit, and an unneeded extent map tree search is avoided. Reported-by: Tsutomu Itoh Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 79 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 22 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3c16a1493e2..b1106d0dcd5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6249,21 +6249,60 @@ out_fail_inode: goto out_fail; } +/* Find next extent map of a given extent map, caller needs to ensure locks */ +static struct extent_map *next_extent_map(struct extent_map *em) +{ + struct rb_node *next; + + next = rb_next(&em->rb_node); + if (!next) + return NULL; + return container_of(next, struct extent_map, rb_node); +} + +static struct extent_map *prev_extent_map(struct extent_map *em) +{ + struct rb_node *prev; + + prev = rb_prev(&em->rb_node); + if (!prev) + return NULL; + return container_of(prev, struct extent_map, rb_node); +} + /* helper for btfs_get_extent. Given an existing extent in the tree, + * the existing extent is the nearest extent to map_start, * and an extent that you want to insert, deal with overlap and insert - * the new extent into the tree.
+ * the best fitted new extent into the tree. */ static int merge_extent_mapping(struct extent_map_tree *em_tree, struct extent_map *existing, struct extent_map *em, u64 map_start) { + struct extent_map *prev; + struct extent_map *next; + u64 start; + u64 end; u64 start_diff; BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); - start_diff = map_start - em->start; - em->start = map_start; - em->len = existing->start - em->start; + + if (existing->start > map_start) { + next = existing; + prev = prev_extent_map(next); + } else { + prev = existing; + next = next_extent_map(prev); + } + + start = prev ? extent_map_end(prev) : em->start; + start = max_t(u64, start, em->start); + end = next ? next->start : extent_map_end(em); + end = min_t(u64, end, extent_map_end(em)); + start_diff = start - em->start; + em->start = start; + em->len = end - start; if (em->block_start < EXTENT_MAP_LAST_BYTE && !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { em->block_start += start_diff; @@ -6540,25 +6579,21 @@ insert: ret = 0; - existing = lookup_extent_mapping(em_tree, start, len); - if (existing && (existing->start > start || - existing->start + existing->len <= start)) { + existing = search_extent_mapping(em_tree, start, len); + /* + * existing will always be non-NULL, since there must be + * extent causing the -EEXIST. 
+ */ + if (start >= extent_map_end(existing) || + start + len <= existing->start) { + /* + * The existing extent map is the one nearest to + * the [start, start + len) range which overlaps + */ + err = merge_extent_mapping(em_tree, existing, + em, start); free_extent_map(existing); - existing = NULL; - } - if (!existing) { - existing = lookup_extent_mapping(em_tree, em->start, - em->len); - if (existing) { - err = merge_extent_mapping(em_tree, existing, - em, start); - free_extent_map(existing); - if (err) { - free_extent_map(em); - em = NULL; - } - } else { - err = -EIO; + if (err) { free_extent_map(em); em = NULL; } -- cgit v1.2.3-70-g09d2 From 1d52c78afbbf80b58299e076a159617d6b42fe3c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 18 Sep 2014 11:30:44 -0400 Subject: Btrfs: try not to ENOSPC on log replay When doing log replay we may have to update inodes, which traditionally goes through our delayed inode stuff. This will try to move space over from the trans handle, but we don't reserve space in our trans handle on replay since we don't know how much we will need, so instead we try to flush. But because we have a trans handle open we won't flush anything, so if we are out of reserve space we will simply return ENOSPC. Since we know that if an operation made it into the log then we definitely had space before the box bought the farm then we don't need to worry about doing this space reservation. Use the fs_info->log_root_recovering flag to skip the delayed inode stuff and update the item directly. 
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b1106d0dcd5..344a322eb38 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3718,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * without delay */ if (!btrfs_is_free_space_inode(inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && !root->fs_info->log_root_recovering) { btrfs_update_root_times(trans, root); ret = btrfs_delayed_update_inode(trans, root, inode); -- cgit v1.2.3-70-g09d2 From 5d99a998f375b7bff7ddff0162a6eed4d4ca1318 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 29 Sep 2014 19:20:37 +0200 Subject: btrfs: remove unlikely from NULL checks Unlikely is implicit for NULL checks of pointers. Signed-off-by: David Sterba --- fs/btrfs/async-thread.c | 10 +++++----- fs/btrfs/inode.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 2da0a66790b..4dabeb893b7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -92,7 +92,7 @@ __btrfs_alloc_workqueue(const char *name, int flags, int max_active, { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); - if (unlikely(!ret)) + if (!ret) return NULL; ret->max_active = max_active; @@ -116,7 +116,7 @@ __btrfs_alloc_workqueue(const char *name, int flags, int max_active, ret->normal_wq = alloc_workqueue("%s-%s", flags, ret->max_active, "btrfs", name); - if (unlikely(!ret->normal_wq)) { + if (!ret->normal_wq) { kfree(ret); return NULL; } @@ -138,12 +138,12 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, { struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); - if (unlikely(!ret)) + if (!ret) return NULL; 
ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, max_active, thresh); - if (unlikely(!ret->normal)) { + if (!ret->normal) { kfree(ret); return NULL; } @@ -151,7 +151,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, if (flags & WQ_HIGHPRI) { ret->high = __btrfs_alloc_workqueue(name, flags, max_active, thresh); - if (unlikely(!ret->high)) { + if (!ret->high) { __btrfs_destroy_workqueue(ret->normal); kfree(ret); return NULL; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 344a322eb38..998e67fdf2f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9013,7 +9013,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, spin_unlock(&root->delalloc_lock); work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); - if (unlikely(!work)) { + if (!work) { if (delay_iput) btrfs_add_delayed_iput(inode); else -- cgit v1.2.3-70-g09d2 From ee39b432b4ac083acdafd7b4f156283722e3bf14 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 30 Sep 2014 01:33:33 +0200 Subject: btrfs: remove unlikely from data-dependent branches and slow paths There are the branch hints that obviously depend on the data being processed, the CPU predictor will do better job according to the actual load. It also does not make sense to use the hints in slow paths that do a lot of other operations like locking, waiting or IO. 
Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/file.c | 4 ++-- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ioctl.c | 2 +- fs/btrfs/transaction.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 44d04979f07..ede740bfaac 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9694,7 +9694,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root) int btrfs_start_nocow_write(struct btrfs_root *root) { - if (unlikely(atomic_read(&root->will_be_snapshoted))) + if (atomic_read(&root->will_be_snapshoted)) return 0; percpu_counter_inc(&root->subv_writers->counter); @@ -9702,7 +9702,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root) * Make sure counter is updated before we check for snapshot creation. */ smp_mb(); - if (unlikely(atomic_read(&root->will_be_snapshoted))) { + if (atomic_read(&root->will_be_snapshoted)) { btrfs_end_nocow_write(root); return 0; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 29b147d46b0..a18ceabd99a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, if (unlikely(copied == 0)) break; - if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + if (copied < PAGE_CACHE_SIZE - offset) { offset += copied; } else { pg++; @@ -1792,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (sync) atomic_inc(&BTRFS_I(inode)->sync_writers); - if (unlikely(file->f_flags & O_DIRECT)) { + if (file->f_flags & O_DIRECT) { num_written = __btrfs_direct_write(iocb, from, pos); } else { num_written = __btrfs_buffered_write(file, from, pos); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 998e67fdf2f..47d21456083 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7802,9 +7802,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, atomic_inc(&dip->pending_bios); while (bvec <= 
(orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { - if (unlikely(map_length < submit_len + bvec->bv_len || + if (map_length < submit_len + bvec->bv_len || bio_add_page(bio, bvec->bv_page, bvec->bv_len, - bvec->bv_offset) < bvec->bv_len)) { + bvec->bv_offset) < bvec->bv_len) { /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -8017,8 +8017,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) goto out; - } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags))) { + } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags)) { inode_dio_done(inode); flags = DIO_LOCKING | DIO_SKIP_HOLES; wakeup = false; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0ff212757b9..f2c60cd70e6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3167,7 +3167,7 @@ static void clone_update_extent_map(struct inode *inode, em->start + em->len - 1, 0); } - if (unlikely(ret)) + if (ret) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 16d0c1b62b3..8eded14e8c5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -418,7 +418,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, /* * Do the reservation for the relocation root creation */ - if (unlikely(need_reserve_reloc_root(root))) { + if (need_reserve_reloc_root(root)) { num_bytes += root->nodesize; reloc_reserved = true; } -- cgit v1.2.3-70-g09d2 From bfebd8b5441755f228ad02273682d675d3335123 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 30 Jul 2014 00:25:45 +0200 Subject: btrfs: use enum for wq endio metadata type The enum exists but is not consistently used. 
Signed-off-by: David Sterba --- fs/btrfs/compression.c | 11 +++++++---- fs/btrfs/disk-io.c | 14 +++----------- fs/btrfs/disk-io.h | 4 ++-- fs/btrfs/inode.c | 3 ++- 4 files changed, 14 insertions(+), 18 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index eeee13842cd..d3220d31d3c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -388,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, * freed before we're done setting it up */ atomic_inc(&cb->pending_bios); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { @@ -419,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } bio_get(bio); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { @@ -668,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_CACHE_SIZE) { bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ /* @@ -706,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9b2a741370b..d7cb58ed294 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -82,7 +82,7 @@ struct end_io_wq { void *private; struct btrfs_fs_info *info; int error; - int metadata; + enum btrfs_wq_endio_type metadata; struct list_head list; struct btrfs_work work; }; @@ -733,16 +733,8 @@ static void 
end_workqueue_bio(struct bio *bio, int err) btrfs_queue_work(wq, &end_io_wq->work); } -/* - * For the metadata arg you want - * - * 0 - if data - * 1 - if normal metadta - * 2 - if writing to the free space cache area - * 3 - raid parity work - */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - int metadata) + enum btrfs_wq_endio_type metadata) { struct end_io_wq *end_io_wq; @@ -930,7 +922,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, * can happen in the async kernel threads */ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); + bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 14d06ee1e14..84da438fd9a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,7 +25,7 @@ #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 -enum { +enum btrfs_wq_endio_type { BTRFS_WQ_ENDIO_DATA = 0, BTRFS_WQ_ENDIO_METADATA = 1, BTRFS_WQ_ENDIO_FREE_SPACE = 2, @@ -120,7 +120,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - int metadata); + enum btrfs_wq_endio_type metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 344a322eb38..b1e388dea7b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7721,7 +7721,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, bio_get(bio); if (!write) { - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DATA); if (ret) goto err; } -- cgit v1.2.3-70-g09d2 From 
32be3a1ac6d09576c57063c6c350ca36eaebdbd3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 22 Sep 2014 09:13:03 +0800 Subject: btrfs: Fix the wrong condition judgment about subset extent map Previous commit: btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map uses the wrong condition to judge whether the range is a subset of an existing extent map. This may cause a bug in btrfs no-holes mode. This patch will correct the judgment and fix the bug. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 344a322eb38..ef6fc587d96 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6586,7 +6586,7 @@ insert: * extent causing the -EEXIST. */ if (start >= extent_map_end(existing) || - start + len <= existing->start) { + start <= existing->start) { /* * The existing extent map is the one nearest to * the [start, start + len) range which overlaps -- cgit v1.2.3-70-g09d2 From d37973082b453ba6b89ec07eb7b84305895d35e1 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 15 Oct 2014 13:50:56 -0700 Subject: Revert "Btrfs: race free update of commit root for ro snapshots" This reverts commit 9c3b306e1c9e6be4be09e99a8fe2227d1005effc. Switching only one commit root during a transaction is wrong because it leads the fs into an inconsistent state. All commit roots should be switched at once, at transaction commit time, otherwise backref walking can often miss important references that were only accessible through the old commit root. Plus, the root item for the snapshot's root wasn't getting updated, which prevented the next transaction commit from doing it. This caused several users to run into random corruption issues after creation of readonly snapshots. A regression test for xfstests will follow soon. 
Cc: stable@vger.kernel.org # 3.17 Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 36 ------------------------------------ fs/btrfs/ioctl.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 36 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fc9c0439caa..d23362f4464 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5261,42 +5261,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) iput(inode); inode = ERR_PTR(ret); } - /* - * If orphan cleanup did remove any orphans, it means the tree - * was modified and therefore the commit root is not the same as - * the current root anymore. This is a problem, because send - * uses the commit root and therefore can see inode items that - * don't exist in the current root anymore, and for example make - * calls to btrfs_iget, which will do tree lookups based on the - * current root and not on the commit root. Those lookups will - * fail, returning a -ESTALE error, and making send fail with - * that error. So make sure a send does not see any orphans we - * have just removed, and that it will see the same inodes - * regardless of whether a transaction commit happened before - * it started (meaning that the commit root will be the same as - * the current root) or not. - */ - if (sub_root->node != sub_root->commit_root) { - u64 sub_flags = btrfs_root_flags(&sub_root->root_item); - - if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) { - struct extent_buffer *eb; - - /* - * Assert we can't have races between dentry - * lookup called through the snapshot creation - * ioctl and the VFS. 
- */ - ASSERT(mutex_is_locked(&dir->i_mutex)); - - down_write(&root->fs_info->commit_root_sem); - eb = sub_root->commit_root; - sub_root->commit_root = - btrfs_root_node(sub_root); - up_write(&root->fs_info->commit_root_sem); - free_extent_buffer(eb); - } - } } return inode; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e732274f1af..33c80f560f9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -713,6 +713,39 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; + ret = btrfs_orphan_cleanup(pending_snapshot->snap); + if (ret) + goto fail; + + /* + * If orphan cleanup did remove any orphans, it means the tree was + * modified and therefore the commit root is not the same as the + * current root anymore. This is a problem, because send uses the + * commit root and therefore can see inode items that don't exist + * in the current root anymore, and for example make calls to + * btrfs_iget, which will do tree lookups based on the current root + * and not on the commit root. Those lookups will fail, returning a + * -ESTALE error, and making send fail with that error. So make sure + * a send does not see any orphans we have just removed, and that it + * will see the same inodes regardless of whether a transaction + * commit happened before it started (meaning that the commit root + * will be the same as the current root) or not. + */ + if (readonly && pending_snapshot->snap->node != + pending_snapshot->snap->commit_root) { + trans = btrfs_join_transaction(pending_snapshot->snap); + if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto fail; + } + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, + pending_snapshot->snap); + if (ret) + goto fail; + } + } + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); -- cgit v1.2.3-70-g09d2