From 1fa11e265fa2562fb713171b6a58e72bb7afd276 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 6 Aug 2012 14:18:51 -0600 Subject: Btrfs: fix deadlock in wait_for_more_refs Commit a168650c introduced a waiting mechanism to prevent busy waiting in btrfs_run_delayed_refs. This can deadlock with btrfs_run_ordered_operations, where a tree_mod_seq is held while waiting for the io to complete, while the end_io calls btrfs_run_delayed_refs. This whole mechanism is unnecessary. If not enough runnable refs are available to satisfy count, just return as count is more like a guideline than a strict requirement. In case we have to run all refs, commit transaction makes sure that no other threads are working in the transaction anymore, so we just assert here that no refs are blocked. Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 502b20c56e8..a7ad8fc8dc5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2035,8 +2035,6 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; - init_waitqueue_head(&fs_info->tree_mod_seq_wait); - /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); -- cgit v1.2.3-70-g09d2 From 66657b318e0e443ada229fccd40c8be86cfebdbf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 1 Aug 2012 15:36:24 -0400 Subject: Btrfs: barrier before waitqueue_active We need a barrir before calling waitqueue_active otherwise we will miss wakeups. So in places that do atomic_dec(); then atomic_read() use atomic_dec_return() which imply a memory barrier (see memory-barriers.txt) and then add an explicit memory barrier everywhere else that need them. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/compression.c | 1 + fs/btrfs/delayed-inode.c | 7 +++---- fs/btrfs/disk-io.c | 7 ++++--- fs/btrfs/inode.c | 4 +--- fs/btrfs/volumes.c | 3 +-- 5 files changed, 10 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 86eff48dab7..43d1c5a3a03 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -818,6 +818,7 @@ static void free_workspace(int type, struct list_head *workspace) btrfs_compress_op[idx]->free_workspace(workspace); atomic_dec(alloc_workspace); wake: + smp_mb(); if (waitqueue_active(workspace_wait)) wake_up(workspace_wait); } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 00deed4ef3e..07d5eeb1e6f 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -512,8 +512,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase(&delayed_item->rb_node, root); delayed_item->delayed_node->count--; - atomic_dec(&delayed_root->items); - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND && + if (atomic_dec_return(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND && waitqueue_active(&delayed_root->wait)) wake_up(&delayed_root->wait); } @@ -1056,8 +1056,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) delayed_node->count--; delayed_root = delayed_node->root->fs_info->delayed_root; - atomic_dec(&delayed_root->items); - if (atomic_read(&delayed_root->items) < + if (atomic_dec_return(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND && waitqueue_active(&delayed_root->wait)) wake_up(&delayed_root->wait); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a7ad8fc8dc5..dd86a5d8842 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -754,9 +754,7 @@ static void run_one_async_done(struct btrfs_work *work) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; - atomic_dec(&fs_info->nr_async_submits); - - if (atomic_read(&fs_info->nr_async_submits) < limit && + if (atomic_dec_return(&fs_info->nr_async_submits) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -3783,14 +3781,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) /* FIXME: cleanup wait for commit */ t->in_commit = 1; t->blocked = 1; + smp_mb(); if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); t->blocked = 0; + smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) wake_up(&root->fs_info->transaction_wait); t->commit_done = 1; + smp_mb(); if (waitqueue_active(&t->commit_wait)) wake_up(&t->commit_wait); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2d65c52b094..97baf00b40d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1007,9 +1007,7 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; - atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); - - if (atomic_read(&root->fs_info->async_delalloc_pages) < + if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < 5 * 1024 * 1024 && waitqueue_active(&root->fs_info->async_submit_wait)) wake_up(&root->fs_info->async_submit_wait); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3b394503bd4..0b1e69d380d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -227,9 +227,8 @@ loop_lock: cur = pending; pending = pending->bi_next; cur->bi_next = NULL; - atomic_dec(&fs_info->nr_async_bios); - if (atomic_read(&fs_info->nr_async_bios) < limit && + if (atomic_dec_return(&fs_info->nr_async_bios) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); -- cgit v1.2.3-70-g09d2 From 68ce9682a4bb95d6be5529cb57214bf2a1b7d20e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 1 Aug 2012 05:45:52 -0600 Subject: Btrfs: remove superblock writing after fatal error With commit acce952b0, btrfs was changed to flag the filesystem with BTRFS_SUPER_FLAG_ERROR and switch to read-only mode after a fatal error happened like a write I/O errors of all mirrors. In such situations, on unmount, the superblock is written in btrfs_error_commit_super(). This is done with the intention to be able to evaluate the error flag on the next mount. A warning is printed in this case during the next mount and the log tree is ignored. The issue is that it is possible that the superblock points to a root that was not written (due to write I/O errors). The result is that the filesystem cannot be mounted. btrfsck also does not start and all the other btrfs-progs tools fail to start as well. However, mount -o recovery is working well and does the right things to recover the filesystem (i.e., don't use the log root, clear the free space cache and use the next mountable root that is stored in the root backup array). This patch removes the writing of the superblock when BTRFS_SUPER_FLAG_ERROR is set, and removes the handling of the error flag in the mount function. These lines can be used to reproduce the issue (using /dev/sdm): SCRATCH_DEV=/dev/sdm SCRATCH_MNT=/mnt echo 0 25165824 linear $SCRATCH_DEV 0 | dmsetup create foo ls -alLF /dev/mapper/foo mkfs.btrfs /dev/mapper/foo mount /dev/mapper/foo $SCRATCH_MNT echo bar > $SCRATCH_MNT/foo sync echo 0 25165824 error | dmsetup reload foo dmsetup resume foo ls -alF $SCRATCH_MNT touch $SCRATCH_MNT/1 ls -alF $SCRATCH_MNT sleep 35 echo 0 25165824 linear $SCRATCH_DEV 0 | dmsetup reload foo dmsetup resume foo sleep 1 umount $SCRATCH_MNT btrfsck /dev/mapper/foo dmsetup remove foo Signed-off-by: Stefan Behrens Signed-off-by: Jan Schmidt --- fs/btrfs/disk-io.c | 36 ++++-------------------------------- fs/btrfs/disk-io.h | 2 +- 2 files changed, 5 insertions(+), 33 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dd86a5d8842..3c4c4397f47 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2527,8 +2527,7 @@ retry_root_backup: goto fail_trans_kthread; /* do not make disk changes in broken FS */ - if (btrfs_super_log_root(disk_super) != 0 && - !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { + if (btrfs_super_log_root(disk_super) != 0) { u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { @@ -3188,30 +3187,14 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_run_defrag_inodes(fs_info); - /* - * Here come 2 situations when btrfs is broken to flip readonly: - * - * 1. when btrfs flips readonly somewhere else before - * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, - * and btrfs will skip to write sb directly to keep - * ERROR state on disk. - * - * 2. when btrfs flips readonly just in btrfs_commit_super, - * and in such case, btrfs cannot write sb via btrfs_commit_super, - * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, - * btrfs will cleanup all FS resources first and write sb then. - */ if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - ret = btrfs_error_commit_super(root); - if (ret) - printk(KERN_ERR "btrfs: commit super ret %d\n", ret); - } + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + btrfs_error_commit_super(root); btrfs_put_block_group_cache(fs_info); @@ -3433,18 +3416,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, if (read_only) return 0; - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - printk(KERN_WARNING "warning: mount fs with errors, " - "running btrfsck is recommended\n"); - } - return 0; } -int btrfs_error_commit_super(struct btrfs_root *root) +void btrfs_error_commit_super(struct btrfs_root *root) { - int ret; - mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -3454,10 +3430,6 @@ int btrfs_error_commit_super(struct btrfs_root *root) /* cleanup FS via transaction */ btrfs_cleanup_transaction(root); - - ret = write_ctree_super(NULL, root, 0); - - return ret; } static void btrfs_destroy_ordered_operations(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 95e147eea23..c5b00a735fe 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -54,7 +54,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); -int btrfs_error_commit_super(struct btrfs_root *root); +void btrfs_error_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, -- cgit v1.2.3-70-g09d2 From 256dd1bb3750ac5ad49b40887c1691788dc44b33 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 10 Aug 2012 08:58:21 -0600 Subject: Btrfs: fix that repair code is spuriously executed for transid failures If verify_parent_transid() fails for all mirrors, the current code calls repair_io_failure() anyway which means: - that the disk block is rewritten without repairing anything and - that a kernel log message is printed which misleadingly claims that a read error was corrected. This is an example: parent transid verify failed on 615015833600 wanted 110423 found 110424 parent transid verify failed on 615015833600 wanted 110423 found 110424 btrfs read error corrected: ino 1 off 615015833600 (dev /dev/...) It is wrong to ignore the results from verify_parent_transid() and to call repair_eb_io_failure() when the verification of the transids failed. This commit fixes the issue. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3c4c4397f47..29c69e60d3b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -377,9 +377,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && !verify_parent_transid(io_tree, eb, + if (!ret) { + if (!verify_parent_transid(io_tree, eb, parent_transid, 0)) - break; + break; + else + ret = -EIO; + } /* * This buffer's crc is fine, but its contents are corrupted, so -- cgit v1.2.3-70-g09d2