From 74b2107543da4ed9607ec484f63c42362dc9fca6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Apr 2011 12:02:53 -0400 Subject: Btrfs: make sure to use the delalloc reserve when filling delalloc In the prealloc filling code and compressed code we don't set trans->block_rsv to the delalloc block reserve properly, which is going to make us use metadata from the wrong pool, this patch fixes that. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7cd8ab0ef04..3b9f1643aa5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -619,6 +619,7 @@ retry: trans = btrfs_join_transaction(root, 1); BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, @@ -1060,6 +1061,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, trans = btrfs_join_transaction(root, 1); } BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; cow_start = (u64)-1; cur_offset = start; -- cgit v1.2.3-70-g09d2 From 7a7eaa40a39bde4eefc91aadeb1ce3dc4e6a1252 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Apr 2011 12:54:33 -0400 Subject: Btrfs: take away the num_items argument from btrfs_join_transaction I keep forgetting that btrfs_join_transaction() just ignores the num_items argument, which leads me to sending pointless patches and looking stupid :). So just kill the num_items argument from btrfs_join_transaction and btrfs_start_ioctl_transaction, since neither of them use it. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/extent-tree.c | 12 ++++++------ fs/btrfs/inode.c | 34 +++++++++++++++++----------------- fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/relocation.c | 12 ++++++------ fs/btrfs/transaction.c | 13 +++++-------- fs/btrfs/transaction.h | 9 +++------ 7 files changed, 42 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 228cf36ece8..9d6c9e332ca 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1568,7 +1568,7 @@ static int transaction_kthread(void *arg) transid = cur->transid; spin_unlock(&root->fs_info->new_trans_lock); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); if (transid == trans->transid) { ret = btrfs_commit_transaction(trans, root); @@ -2495,13 +2495,13 @@ int btrfs_commit_super(struct btrfs_root *root) down_write(&root->fs_info->cleanup_work_sem); up_write(&root->fs_info->cleanup_work_sem); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); /* run commit again to drop the original snapshot */ - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_commit_transaction(trans, root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9ee6bd55e16..941b28e7893 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3174,7 +3174,7 @@ again: spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3202,7 +3202,7 @@ alloc: commit_trans: if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); @@ -3589,7 +3589,7 @@ again: goto out; ret = -ENOSPC; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) goto out; ret = btrfs_commit_transaction(trans, root); @@ -3816,7 +3816,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (trans) return -EAGAIN; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); return 0; @@ -7649,7 +7649,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) BUG_ON(reloc_root->commit_root != NULL); while (1) { - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); mutex_lock(&root->fs_info->drop_mutex); @@ -8176,7 +8176,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); alloc_flags = update_block_group_flags(root, cache->flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b9f1643aa5..e47bdf0fb75 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -420,7 +420,7 @@ again: } } if (start == 0) { - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -617,7 +617,7 @@ retry: async_extent->start + async_extent->ram_size - 1, GFP_NOFS); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_reserve_extent(trans, root, @@ -779,7 +779,7 @@ static noinline int cow_file_range(struct inode *inode, int ret = 0; BUG_ON(root == root->fs_info->tree_root); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1056,9 +1056,9 @@ static noinline int run_delalloc_nocow(struct inode *inode, BUG_ON(!path); if (root == root->fs_info->tree_root) { nolock = true; - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); } else { - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); } BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1718,9 +1718,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { if (nolock) - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); else - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1735,9 +1735,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 0, &cached_state, GFP_NOFS); if (nolock) - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); else - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -2415,7 +2415,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) (u64)-1); if (root->orphan_block_rsv || root->orphan_item_inserted) { - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans, root); } @@ -4378,9 +4378,9 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) { if (nolock) - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); else - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); @@ -4407,7 +4407,7 @@ void btrfs_dirty_inode(struct inode *inode) if (BTRFS_I(inode)->dummy_inode) return; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); @@ -5226,7 +5226,7 @@ again: free_extent_map(em); em = NULL; btrfs_release_path(root, path); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return ERR_CAST(trans); goto again; @@ -5470,7 +5470,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, btrfs_drop_extent_cache(inode, start, start + len - 1, 0); } - trans = btrfs_join_transaction(root, 0); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return ERR_CAST(trans); @@ -5703,7 +5703,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * to make sure the current transaction stays open * while we look for nocow cross refs */ - trans = btrfs_join_transaction(root, 0); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) goto must_cow; @@ -5841,7 +5841,7 @@ again: BUG_ON(!ordered); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { err = -ENOMEM; goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2616f7ed479..908c3d4b48c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -242,7 +242,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_update_inode(trans, root, inode); @@ -2182,7 +2182,7 @@ static long btrfs_ioctl_trans_start(struct file *file) mutex_unlock(&root->fs_info->trans_mutex); ret = -ENOMEM; - trans = btrfs_start_ioctl_transaction(root, 0); + trans = btrfs_start_ioctl_transaction(root); if (IS_ERR(trans)) goto out_drop; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 199a8013431..8bb256667f2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2149,7 +2149,7 @@ again: err = ret; } - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { if (!err) btrfs_block_rsv_release(rc->extent_root, @@ -3233,7 +3233,7 @@ truncate: goto out; } - trans = btrfs_join_transaction(root, 0); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_free_path(path); ret = PTR_ERR(trans); @@ -3642,7 +3642,7 @@ int prepare_to_relocate(struct reloc_control *rc) rc->create_reloc_tree = 1; set_reloc_control(rc); - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); BUG_ON(IS_ERR(trans)); btrfs_commit_transaction(trans, rc->extent_root); return 0; @@ -3831,7 +3831,7 @@ restart: btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); /* get rid of pinned extents */ - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) err = PTR_ERR(trans); else @@ -4156,7 +4156,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) set_reloc_control(rc); - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { unset_reloc_control(rc); err = PTR_ERR(trans); @@ -4190,7 +4190,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) err = PTR_ERR(trans); else diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c571734d5e5..70bfb26df96 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -257,22 +257,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, { return start_transaction(root, num_items, TRANS_START); } -struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks) +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN); } -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, - int num_blocks) +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN_NOLOCK); } -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks) +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { - return start_transaction(r, 0, TRANS_USERSPACE); + return start_transaction(root, 0, TRANS_USERSPACE); } /* wait for a transaction commit to be fully complete */ @@ -1171,7 +1168,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, INIT_DELAYED_WORK(&ac->work, do_async_commit); ac->root = root; - ac->newtrans = btrfs_join_transaction(root, 0); + ac->newtrans = btrfs_join_transaction(root); if (IS_ERR(ac->newtrans)) { int err = PTR_ERR(ac->newtrans); kfree(ac); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index e441acc6c58..1f573f09dba 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -92,12 +92,9 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); -struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks); -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, - int num_blocks); -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); -- cgit v1.2.3-70-g09d2 From 2a1eb4614d984d5cd4c928784e9afcf5c07f93be Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Apr 2011 15:15:59 -0400 Subject: Btrfs: if we've already started a trans handle, use that one We currently track trans handles in current->journal_info, but we don't actually use it. This patch fixes it. This will cover the case where we have multiple people starting transactions down the call chain. This keeps us from having to allocate a new handle and all of that, we just increase the use count of the current handle, save the old block_rsv, and return. I tested this with xfstests and it worked out fine. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 17 +++++++++++++++++ fs/btrfs/transaction.h | 2 ++ 2 files changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 70bfb26df96..46f40564c16 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -184,6 +184,15 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) return ERR_PTR(-EROFS); + + if (current->journal_info) { + WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); + h = current->journal_info; + h->use_count++; + h->orig_rsv = h->block_rsv; + h->block_rsv = NULL; + goto got_it; + } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -213,7 +222,9 @@ again: h->block_group = 0; h->bytes_reserved = 0; h->delayed_ref_updates = 0; + h->use_count = 1; h->block_rsv = NULL; + h->orig_rsv = NULL; smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -241,6 +252,7 @@ again: } } +got_it: if (type != TRANS_JOIN_NOLOCK) mutex_lock(&root->fs_info->trans_mutex); record_root_in_trans(h, root); @@ -428,6 +440,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info = root->fs_info; int count = 0; + if (--trans->use_count) { + trans->block_rsv = trans->orig_rsv; + return 0; + } + while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1f573f09dba..154314f80f8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,11 +47,13 @@ struct btrfs_trans_handle { u64 transid; u64 block_group; u64 bytes_reserved; + unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *orig_rsv; }; struct btrfs_pending_snapshot { -- cgit v1.2.3-70-g09d2 From a4abeea41adfa3c143c289045f4625dfaeba2212 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 11 Apr 2011 17:25:13 -0400 Subject: Btrfs: kill trans_mutex We use trans_mutex for lots of things, here's a basic list 1) To serialize trans_handles joining the currently running transaction 2) To make sure that no new trans handles are started while we are committing 3) To protect the dead_roots list and the transaction lists Really the serializing trans_handles joining is not too hard, and can really get bogged down in acquiring a reference to the transaction. So replace the trans_mutex with a trans_lock spinlock and use it to do the following 1) Protect fs_info->running_transaction. All trans handles have to do is check this, and then take a reference of the transaction and keep on going. 2) Protect the fs_info->trans_list. This doesn't get used too much, basically it just holds the current transactions, which will usually just be the currently committing transaction and the currently running transaction at most. 3) Protect the dead roots list. This is only ever processed by splicing the list so this is relatively simple. 4) Protect the fs_info->reloc_ctl stuff. This is very lightweight and was using the trans_mutex before, so this is a pretty straightforward change. 5) Protect fs_info->no_trans_join. Because we don't hold the trans_lock over the entirety of the commit we need to have a way to block new people from creating a new transaction while we're doing our work. So we set no_trans_join and in join_transaction we test to see if that is set, and if it is we do a wait_on_commit. 6) Make the transaction use count atomic so we don't need to take locks to modify it when we're dropping references. 7) Add a commit_lock to the transaction to make sure multiple people trying to commit the same transaction don't race and commit at the same time. 8) Make open_ioctl_trans an atomic so we don't have to take any locks for ioctl trans. I have tested this with xfstests, but obviously it is a pretty hairy change so lots of testing is greatly appreciated. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 6 +- fs/btrfs/disk-io.c | 30 +++--- fs/btrfs/extent-tree.c | 3 +- fs/btrfs/file.c | 4 +- fs/btrfs/ioctl.c | 12 +-- fs/btrfs/relocation.c | 16 +-- fs/btrfs/transaction.c | 271 ++++++++++++++++++++++++++----------------------- fs/btrfs/transaction.h | 4 +- 8 files changed, 177 insertions(+), 169 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8f4b81de3ae..522a39b0033 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -919,7 +919,6 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - u64 open_ioctl_trans; unsigned long mount_opt:20; unsigned long compress_type:4; u64 max_inline; @@ -936,7 +935,6 @@ struct btrfs_fs_info { struct super_block *sb; struct inode *btree_inode; struct backing_dev_info bdi; - struct mutex trans_mutex; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; @@ -957,6 +955,7 @@ struct btrfs_fs_info { struct rw_semaphore subvol_sem; struct srcu_struct subvol_srcu; + spinlock_t trans_lock; struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -969,6 +968,7 @@ struct btrfs_fs_info { atomic_t async_submit_draining; atomic_t nr_async_bios; atomic_t async_delalloc_pages; + atomic_t open_ioctl_trans; /* * this is used by the balancing code to wait for all the pending @@ -1032,6 +1032,7 @@ struct btrfs_fs_info { int closing; int log_root_recovering; int enospc_unlink; + int trans_no_join; u64 total_pinned; @@ -1053,7 +1054,6 @@ struct btrfs_fs_info { struct reloc_control *reloc_ctl; spinlock_t delalloc_lock; - spinlock_t new_trans_lock; u64 delalloc_bytes; /* data_alloc_cluster is only used in ssd mode */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9d6c9e332ca..93ef254ec43 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1551,22 +1551,22 @@ static int transaction_kthread(void *arg) vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); - spin_lock(&root->fs_info->new_trans_lock); + spin_lock(&root->fs_info->trans_lock); cur = root->fs_info->running_transaction; if (!cur) { - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); goto sleep; } now = get_seconds(); if (!cur->blocked && (now < cur->start_time || now - cur->start_time < 30)) { - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); delay = HZ * 5; goto sleep; } transid = cur->transid; - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); @@ -1658,7 +1658,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); - spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); @@ -1687,6 +1687,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; + fs_info->trans_no_join = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1735,7 +1736,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->do_barriers = 1; - mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); @@ -3006,10 +3006,13 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) WARN_ON(1); - mutex_lock(&root->fs_info->trans_mutex); mutex_lock(&root->fs_info->transaction_kthread_mutex); + spin_lock(&root->fs_info->trans_lock); list_splice_init(&root->fs_info->trans_list, &list); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + while (!list_empty(&list)) { t = list_entry(list.next, struct btrfs_transaction, list); if (!t) @@ -3034,23 +3037,18 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) t->blocked = 0; if (waitqueue_active(&root->fs_info->transaction_wait)) wake_up(&root->fs_info->transaction_wait); - mutex_unlock(&root->fs_info->trans_mutex); - mutex_lock(&root->fs_info->trans_mutex); t->commit_done = 1; if (waitqueue_active(&t->commit_wait)) wake_up(&t->commit_wait); - mutex_unlock(&root->fs_info->trans_mutex); - - mutex_lock(&root->fs_info->trans_mutex); btrfs_destroy_pending_snapshots(t); btrfs_destroy_delalloc_inodes(root); - spin_lock(&root->fs_info->new_trans_lock); + spin_lock(&root->fs_info->trans_lock); root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); btrfs_destroy_marked_extents(root, &t->dirty_pages, EXTENT_DIRTY); @@ -3064,8 +3062,10 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) kmem_cache_free(btrfs_transaction_cachep, t); } + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->transaction_kthread_mutex); - mutex_unlock(&root->fs_info->trans_mutex); return 0; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 941b28e7893..ca599654ce1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3200,7 +3200,8 @@ alloc: /* commit the current transaction and try again */ commit_trans: - if (!committed && !root->fs_info->open_ioctl_trans) { + if (!committed && + !atomic_read(&root->fs_info->open_ioctl_trans)) { committed = 1; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 75899a01dde..cd5e82e500c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1222,14 +1222,12 @@ int btrfs_sync_file(struct file *file, int datasync) * the current transaction, we can bail out now without any * syncing */ - mutex_lock(&root->fs_info->trans_mutex); + smp_mb(); if (BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; - mutex_unlock(&root->fs_info->trans_mutex); goto out; } - mutex_unlock(&root->fs_info->trans_mutex); /* * ok we haven't committed the transaction yet, lets do a commit diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 908c3d4b48c..a578620e06a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2177,9 +2177,7 @@ static long btrfs_ioctl_trans_start(struct file *file) if (ret) goto out; - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans++; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_inc(&root->fs_info->open_ioctl_trans); ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root); @@ -2190,9 +2188,7 @@ static long btrfs_ioctl_trans_start(struct file *file) return 0; out_drop: - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans--; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_dec(&root->fs_info->open_ioctl_trans); mnt_drop_write(file->f_path.mnt); out: return ret; @@ -2426,9 +2422,7 @@ long btrfs_ioctl_trans_end(struct file *file) btrfs_end_transaction(trans, root); - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans--; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_dec(&root->fs_info->open_ioctl_trans); mnt_drop_write(file->f_path.mnt); return 0; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8bb256667f2..09c30d37d43 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2136,10 +2136,10 @@ int prepare_to_merge(struct reloc_control *rc, int err) u64 num_bytes = 0; int ret; - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->trans_lock); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; rc->merging_rsv_size += rc->nodes_relocated * 2; - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); again: if (!err) { num_bytes = rc->merging_rsv_size; @@ -2208,9 +2208,9 @@ int merge_reloc_roots(struct reloc_control *rc) int ret; again: root = rc->extent_root; - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->trans_lock); list_splice_init(&rc->reloc_roots, &reloc_roots); - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); while (!list_empty(&reloc_roots)) { found = 1; @@ -3583,17 +3583,17 @@ next: static void set_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); + spin_lock(&fs_info->trans_lock); fs_info->reloc_ctl = rc; - mutex_unlock(&fs_info->trans_mutex); + spin_unlock(&fs_info->trans_lock); } static void unset_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); + spin_lock(&fs_info->trans_lock); fs_info->reloc_ctl = NULL; - mutex_unlock(&fs_info->trans_mutex); + spin_unlock(&fs_info->trans_lock); } static int check_extent_flags(u64 flags) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 46f40564c16..43816f8b23e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -34,6 +34,7 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { + BUG_ON(!list_empty(&transaction->list)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -48,47 +49,73 @@ static noinline void switch_commit_root(struct btrfs_root *root) /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root) +static noinline int join_transaction(struct btrfs_root *root, int nofail) { struct btrfs_transaction *cur_trans; + + spin_lock(&root->fs_info->trans_lock); + if (root->fs_info->trans_no_join) { + if (!nofail) { + spin_unlock(&root->fs_info->trans_lock); + return -EBUSY; + } + } + cur_trans = root->fs_info->running_transaction; - if (!cur_trans) { - cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, - GFP_NOFS); - if (!cur_trans) - return -ENOMEM; - root->fs_info->generation++; - atomic_set(&cur_trans->num_writers, 1); - cur_trans->num_joined = 0; - cur_trans->transid = root->fs_info->generation; - init_waitqueue_head(&cur_trans->writer_wait); - init_waitqueue_head(&cur_trans->commit_wait); - cur_trans->in_commit = 0; - cur_trans->blocked = 0; - atomic_set(&cur_trans->use_count, 1); - cur_trans->commit_done = 0; - cur_trans->start_time = get_seconds(); - - cur_trans->delayed_refs.root = RB_ROOT; - cur_trans->delayed_refs.num_entries = 0; - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - spin_lock_init(&cur_trans->delayed_refs.lock); - - INIT_LIST_HEAD(&cur_trans->pending_snapshots); - list_add_tail(&cur_trans->list, &root->fs_info->trans_list); - extent_io_tree_init(&cur_trans->dirty_pages, - root->fs_info->btree_inode->i_mapping, - GFP_NOFS); - spin_lock(&root->fs_info->new_trans_lock); - root->fs_info->running_transaction = cur_trans; - spin_unlock(&root->fs_info->new_trans_lock); - } else { + if (cur_trans) { + atomic_inc(&cur_trans->use_count); + atomic_inc(&cur_trans->num_writers); + cur_trans->num_joined++; + spin_unlock(&root->fs_info->trans_lock); + return 0; + } + spin_unlock(&root->fs_info->trans_lock); + + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + if (!cur_trans) + return -ENOMEM; + spin_lock(&root->fs_info->trans_lock); + if (root->fs_info->running_transaction) { + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + cur_trans = root->fs_info->running_transaction; + atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; + spin_unlock(&root->fs_info->trans_lock); + return 0; } + atomic_set(&cur_trans->num_writers, 1); + cur_trans->num_joined = 0; + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->in_commit = 0; + cur_trans->blocked = 0; + /* + * One for this trans handle, one so it will live on until we + * commit the transaction. + */ + atomic_set(&cur_trans->use_count, 2); + cur_trans->commit_done = 0; + cur_trans->start_time = get_seconds(); + + cur_trans->delayed_refs.root = RB_ROOT; + cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; + cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; + spin_lock_init(&cur_trans->commit_lock); + spin_lock_init(&cur_trans->delayed_refs.lock); + + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping, + GFP_NOFS); + root->fs_info->generation++; + cur_trans->transid = root->fs_info->generation; + root->fs_info->running_transaction = cur_trans; + spin_unlock(&root->fs_info->trans_lock); return 0; } @@ -99,39 +126,28 @@ static noinline int join_transaction(struct btrfs_root *root) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); + spin_lock(&root->fs_info->fs_roots_radix_lock); + if (root->last_trans == trans->transid) { + spin_unlock(&root->fs_info->fs_roots_radix_lock); + return 0; + } + root->last_trans = trans->transid; radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); - root->last_trans = trans->transid; + spin_unlock(&root->fs_info->fs_roots_radix_lock); btrfs_init_reloc_root(trans, root); } return 0; } -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (!root->ref_cows) - return 0; - - mutex_lock(&root->fs_info->trans_mutex); - if (root->last_trans == trans->transid) { - mutex_unlock(&root->fs_info->trans_mutex); - return 0; - } - - record_root_in_trans(trans, root); - mutex_unlock(&root->fs_info->trans_mutex); - return 0; -} - /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -140,21 +156,23 @@ static void wait_current_trans(struct btrfs_root *root) { struct btrfs_transaction *cur_trans; + spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; if (cur_trans && cur_trans->blocked) { DEFINE_WAIT(wait); atomic_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); while (1) { prepare_to_wait(&root->fs_info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); if (!cur_trans->blocked) break; - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); } finish_wait(&root->fs_info->transaction_wait, &wait); put_transaction(cur_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); } } @@ -167,10 +185,16 @@ enum btrfs_trans_type { static int may_wait_transaction(struct btrfs_root *root, int type) { - if (!root->fs_info->log_root_recovering && - ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || - type == TRANS_USERSPACE)) + if (root->fs_info->log_root_recovering) + return 0; + + if (type == TRANS_USERSPACE) + return 1; + + if (type == TRANS_START && + !atomic_read(&root->fs_info->open_ioctl_trans)) return 1; + return 0; } @@ -198,23 +222,21 @@ again: if (!h) return ERR_PTR(-ENOMEM); - if (type != TRANS_JOIN_NOLOCK) - mutex_lock(&root->fs_info->trans_mutex); if (may_wait_transaction(root, type)) wait_current_trans(root); - ret = join_transaction(root); + do { + ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); + if (ret == -EBUSY) + wait_current_trans(root); + } while (ret == -EBUSY); + if (ret < 0) { kmem_cache_free(btrfs_trans_handle_cachep, h); - if (type != TRANS_JOIN_NOLOCK) - mutex_unlock(&root->fs_info->trans_mutex); return ERR_PTR(ret); } cur_trans = root->fs_info->running_transaction; - atomic_inc(&cur_trans->use_count); - if (type != TRANS_JOIN_NOLOCK) - mutex_unlock(&root->fs_info->trans_mutex); h->transid = cur_trans->transid; h->transaction = cur_trans; @@ -253,11 +275,7 @@ again: } got_it: - if (type != TRANS_JOIN_NOLOCK) - mutex_lock(&root->fs_info->trans_mutex); - record_root_in_trans(h, root); - if (type != TRANS_JOIN_NOLOCK) - mutex_unlock(&root->fs_info->trans_mutex); + btrfs_record_root_in_trans(h, root); if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; @@ -289,17 +307,13 @@ static noinline int wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) { DEFINE_WAIT(wait); - mutex_lock(&root->fs_info->trans_mutex); while (!commit->commit_done) { prepare_to_wait(&commit->commit_wait, &wait, TASK_UNINTERRUPTIBLE); if (commit->commit_done) break; - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); } - mutex_unlock(&root->fs_info->trans_mutex); finish_wait(&commit->commit_wait, &wait); return 0; } @@ -309,50 +323,49 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) struct btrfs_transaction *cur_trans = NULL, *t; int ret; - mutex_lock(&root->fs_info->trans_mutex); - ret = 0; if (transid) { if (transid <= root->fs_info->last_trans_committed) - goto out_unlock; + goto out; /* find specified transaction */ + spin_lock(&root->fs_info->trans_lock); list_for_each_entry(t, &root->fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; + atomic_inc(&cur_trans->use_count); break; } if (t->transid > transid) break; } + spin_unlock(&root->fs_info->trans_lock); ret = -EINVAL; if (!cur_trans) - goto out_unlock; /* bad transid */ + goto out; /* bad transid */ } else { /* find newest transaction that is committing | committed */ + spin_lock(&root->fs_info->trans_lock); list_for_each_entry_reverse(t, &root->fs_info->trans_list, list) { if (t->in_commit) { if (t->commit_done) - goto out_unlock; + goto out; cur_trans = t; + atomic_inc(&cur_trans->use_count); break; } } + spin_unlock(&root->fs_info->trans_lock); if (!cur_trans) - goto out_unlock; /* nothing committing|committed */ + goto out; /* nothing committing|committed */ } - atomic_inc(&cur_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); - wait_for_commit(root, cur_trans); - mutex_lock(&root->fs_info->trans_mutex); put_transaction(cur_trans); ret = 0; -out_unlock: - mutex_unlock(&root->fs_info->trans_mutex); +out: return ret; } @@ -401,10 +414,8 @@ harder: void btrfs_throttle(struct btrfs_root *root) { - mutex_lock(&root->fs_info->trans_mutex); - if (!root->fs_info->open_ioctl_trans) + if (!atomic_read(&root->fs_info->open_ioctl_trans)) wait_current_trans(root); - mutex_unlock(&root->fs_info->trans_mutex); } static int should_end_transaction(struct btrfs_trans_handle *trans, @@ -422,6 +433,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; int updates; + smp_mb(); if (cur_trans->blocked || cur_trans->delayed_refs.flushing) return 1; @@ -467,9 +479,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); - if (lock && !root->fs_info->open_ioctl_trans && - should_end_transaction(trans, root)) + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && + should_end_transaction(trans, root)) { trans->transaction->blocked = 1; + smp_wmb(); + } if (lock && cur_trans->blocked && !cur_trans->in_commit) { if (throttle) @@ -739,9 +753,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, */ int btrfs_add_dead_root(struct btrfs_root *root) { - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->trans_lock); list_add(&root->root_list, &root->fs_info->dead_roots); - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); return 0; } @@ -757,6 +771,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, int ret; int err = 0; + spin_lock(&fs_info->fs_roots_radix_lock); while (1) { ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, (void **)gang, 0, @@ -769,6 +784,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); @@ -783,10 +799,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, err = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; } } + spin_unlock(&fs_info->fs_roots_radix_lock); return err; } @@ -972,7 +990,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent = dget_parent(dentry); parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root); + btrfs_record_root_in_trans(trans, parent_root); /* * insert the directory item @@ -990,7 +1008,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); - record_root_in_trans(trans, root); + btrfs_record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1080,20 +1098,20 @@ static void update_super_roots(struct btrfs_root *root) int btrfs_transaction_in_commit(struct btrfs_fs_info *info) { int ret = 0; - spin_lock(&info->new_trans_lock); + spin_lock(&info->trans_lock); if (info->running_transaction) ret = info->running_transaction->in_commit; - spin_unlock(&info->new_trans_lock); + spin_unlock(&info->trans_lock); return ret; } int btrfs_transaction_blocked(struct btrfs_fs_info *info) { int ret = 0; - spin_lock(&info->new_trans_lock); + spin_lock(&info->trans_lock); if (info->running_transaction) ret = info->running_transaction->blocked; - spin_unlock(&info->new_trans_lock); + spin_unlock(&info->trans_lock); return ret; } @@ -1117,9 +1135,7 @@ static void wait_current_trans_commit_start(struct btrfs_root *root, &wait); break; } - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); finish_wait(&root->fs_info->transaction_blocked_wait, &wait); } } @@ -1145,9 +1161,7 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, &wait); break; } - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); finish_wait(&root->fs_info->transaction_wait, &wait); } @@ -1193,22 +1207,18 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, } /* take transaction reference */ - mutex_lock(&root->fs_info->trans_mutex); cur_trans = trans->transaction; atomic_inc(&cur_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); schedule_delayed_work(&ac->work, 0); /* wait for transaction to start and unblock */ - mutex_lock(&root->fs_info->trans_mutex); if (wait_for_unblock) wait_current_trans_commit_start_and_unblock(root, cur_trans); else wait_current_trans_commit_start(root, cur_trans); put_transaction(cur_trans); - mutex_unlock(&root->fs_info->trans_mutex); return 0; } @@ -1252,38 +1262,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&cur_trans->commit_lock); if (cur_trans->in_commit) { + spin_unlock(&cur_trans->commit_lock); atomic_inc(&cur_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); ret = wait_for_commit(root, cur_trans); BUG_ON(ret); - mutex_lock(&root->fs_info->trans_mutex); put_transaction(cur_trans); - mutex_unlock(&root->fs_info->trans_mutex); return 0; } trans->transaction->in_commit = 1; trans->transaction->blocked = 1; + spin_unlock(&cur_trans->commit_lock); wake_up(&root->fs_info->transaction_blocked_wait); + spin_lock(&root->fs_info->trans_lock); if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (!prev_trans->commit_done) { atomic_inc(&prev_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); wait_for_commit(root, prev_trans); - mutex_lock(&root->fs_info->trans_mutex); put_transaction(prev_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); } + } else { + spin_unlock(&root->fs_info->trans_lock); } if (now < cur_trans->start_time || now - cur_trans->start_time < 1) @@ -1291,12 +1304,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, do { int snap_pending = 0; + joined = cur_trans->num_joined; if (!list_empty(&trans->transaction->pending_snapshots)) snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - mutex_unlock(&root->fs_info->trans_mutex); if (flush_on_commit || snap_pending) { btrfs_start_delalloc_inodes(root, 1); @@ -1316,14 +1329,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, prepare_to_wait(&cur_trans->writer_wait, &wait, TASK_UNINTERRUPTIBLE); - smp_mb(); if (atomic_read(&cur_trans->num_writers) > 1) schedule_timeout(MAX_SCHEDULE_TIMEOUT); else if (should_grow) schedule_timeout(1); - mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); @@ -1364,9 +1378,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; - spin_lock(&root->fs_info->new_trans_lock); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->new_trans_lock); btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); @@ -1387,10 +1398,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, sizeof(root->fs_info->super_copy)); trans->transaction->blocked = 0; + spin_lock(&root->fs_info->trans_lock); + root->fs_info->running_transaction = NULL; + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); wake_up(&root->fs_info->transaction_wait); - mutex_unlock(&root->fs_info->trans_mutex); ret = btrfs_write_and_wait_transaction(trans, root); BUG_ON(ret); write_ctree_super(trans, root, 0); @@ -1403,22 +1417,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - mutex_lock(&root->fs_info->trans_mutex); - cur_trans->commit_done = 1; root->fs_info->last_trans_committed = cur_trans->transid; wake_up(&cur_trans->commit_wait); + spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); + spin_unlock(&root->fs_info->trans_lock); + put_transaction(cur_trans); put_transaction(cur_trans); trace_btrfs_transaction_commit(root); - mutex_unlock(&root->fs_info->trans_mutex); - if (current->journal_info == trans) current->journal_info = NULL; @@ -1438,9 +1451,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) LIST_HEAD(list); struct btrfs_fs_info *fs_info = root->fs_info; - mutex_lock(&fs_info->trans_mutex); + spin_lock(&fs_info->trans_lock); list_splice_init(&fs_info->dead_roots, &list); - mutex_unlock(&fs_info->trans_mutex); + spin_unlock(&fs_info->trans_lock); while (!list_empty(&list)) { root = list_entry(list.next, struct btrfs_root, root_list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 154314f80f8..11c6efcd4ed 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -28,10 +28,12 @@ struct btrfs_transaction { * transaction can end */ atomic_t num_writers; + atomic_t use_count; unsigned long num_joined; + + spinlock_t commit_lock; int in_commit; - atomic_t use_count; int commit_done; int blocked; struct list_head list; -- cgit v1.2.3-70-g09d2 From fcb80c2affd63237cff5b34cba5756be7c976a5a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 May 2011 10:40:22 -0400 Subject: Btrfs: fix how we do space reservation for truncate The ceph guys keep running into problems where we have space reserved in our orphan block rsv when freeing it up. This is because they tend to do snapshots alot, so their truncates tend to use a bunch of space, so when we go to do things like update the inode we have to steal reservation space in order to make the reservation happen. This happens because truncate can use as much space as it freaking feels like, but we still have to hold space for removing the orphan item and updating the inode, which will definitely always happen. So in order to fix this we need to split all of the reservation stuf up. So with this patch we have 1) The orphan block reserve which only holds the space for deleting our orphan item when everything is over. 2) The truncate block reserve which gets allocated and used specifically for the space that the truncate will use on a per truncate basis. 3) The transaction will always have 1 item's worth of data reserved so we can update the inode normally. Hopefully this will make the ceph problem go away. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 ++ fs/btrfs/extent-tree.c | 46 +++++++++++++++----- fs/btrfs/inode.c | 111 +++++++++++++++++++++++++++++++++++++------------ 3 files changed, 123 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 522a39b0033..f31aed7fedd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2224,6 +2224,9 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *rsv); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ca599654ce1..a2ca561c70f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3980,6 +3980,37 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) 3 * num_items; } +int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; + u64 num_bytes; + int ret; + + /* + * Truncate should be freeing data, but give us 2 items just in case it + * needs to use some space. We may want to be smarter about this in the + * future. + */ + num_bytes = calc_trans_metadata_size(root, 2); + + /* We already have enough bytes, just return */ + if (rsv->reserved >= num_bytes) + return 0; + + num_bytes -= rsv->reserved; + + /* + * You should have reserved enough space before hand to do this, so this + * should not fail. + */ + ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); + BUG_ON(ret); + + return 0; +} + int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, int num_items) @@ -4020,23 +4051,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; /* - * one for deleting orphan item, one for updating inode and - * two for calling btrfs_truncate_inode_items. - * - * btrfs_truncate_inode_items is a delete operation, it frees - * more space than it uses in most cases. So two units of - * metadata space should be enough for calling it many times. - * If all of the metadata space is used, we can commit - * transaction and use space it freed. + * We need to hold space in order to delete our orphan item once we've + * added it, so this takes the reservation so we can release it later + * when we are truly done with the orphan item. */ - u64 num_bytes = calc_trans_metadata_size(root, 4); + u64 num_bytes = calc_trans_metadata_size(root, 1); return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } void btrfs_orphan_release_metadata(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 num_bytes = calc_trans_metadata_size(root, 4); + u64 num_bytes = calc_trans_metadata_size(root, 1); btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e47bdf0fb75..bc12ba23db5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6591,6 +6591,7 @@ out: static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv; int ret; int err = 0; struct btrfs_trans_handle *trans; @@ -6604,28 +6605,83 @@ static int btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); + /* + * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * 3 things going on here + * + * 1) We need to reserve space for our orphan item and the space to + * delete our orphan item. Lord knows we don't want to have a dangling + * orphan item because we didn't reserve space to remove it. + * + * 2) We need to reserve space to update our inode. + * + * 3) We need to have something to cache all the space that is going to + * be free'd up by the truncate operation, but also have some slack + * space reserved in case it uses space during the truncate (thank you + * very much snapshotting). + * + * And we need these to all be seperate. The fact is we can use alot of + * space doing the truncate, and we have no earthly idea how much space + * we will use, so we need the truncate reservation to be seperate so it + * doesn't end up using space reserved for updating the inode or + * removing the orphan item. We also need to be able to stop the + * transaction and start a new one, which means we need to be able to + * update the inode several times, and we have no idea of knowing how + * many times that will be, so we can't just reserve 1 item for the + * entirety of the opration, so that has to be done seperately as well. + * Then there is the orphan item, which does indeed need to be held on + * to for the whole operation, and we need nobody to touch this reserved + * space except the orphan code. + * + * So that leaves us with + * + * 1) root->orphan_block_rsv - for the orphan deletion. + * 2) rsv - for the truncate reservation, which we will steal from the + * transaction reservation. + * 3) fs_info->trans_block_rsv - this will have 1 items worth left for + * updating the inode. + */ + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) + return -ENOMEM; + btrfs_add_durable_block_rsv(root->fs_info, rsv); + + trans = btrfs_start_transaction(root, 4); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } btrfs_set_trans_block_group(trans, inode); + /* + * Reserve space for the truncate process. Truncate should be adding + * space, but if there are snapshots it may end up using space. + */ + ret = btrfs_truncate_reserve_metadata(trans, root, rsv); + BUG_ON(ret); + ret = btrfs_orphan_add(trans, inode); if (ret) { btrfs_end_transaction(trans, root); - return ret; + goto out; } nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); - /* Now start a transaction for the truncate */ - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + /* + * Ok so we've already migrated our bytes over for the truncate, so here + * just reserve the one slot we need for updating the inode. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } btrfs_set_trans_block_group(trans, inode); - trans->block_rsv = root->orphan_block_rsv; + trans->block_rsv = rsv; /* * setattr is responsible for setting the ordered_data_close flag, @@ -6649,24 +6705,18 @@ static int btrfs_truncate(struct inode *inode) while (1) { if (!trans) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); - trans->block_rsv = root->orphan_block_rsv; - } + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, 0, 5); - if (ret == -EAGAIN) { - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - trans = NULL; - continue; - } else if (ret) { - err = ret; - break; + ret = btrfs_truncate_reserve_metadata(trans, root, + rsv); + BUG_ON(ret); + + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = rsv; } ret = btrfs_truncate_inode_items(trans, root, inode, @@ -6677,6 +6727,7 @@ static int btrfs_truncate(struct inode *inode) break; } + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret) { err = ret; @@ -6690,6 +6741,7 @@ static int btrfs_truncate(struct inode *inode) } if (ret == 0 && inode->i_nlink > 0) { + trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; @@ -6701,15 +6753,20 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(NULL, inode); } + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret && !err) err = ret; nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + +out: + btrfs_free_block_rsv(root, rsv); + if (ret && !err) err = ret; - btrfs_btree_balance_dirty(root, nr); return err; } -- cgit v1.2.3-70-g09d2 From af60bed24eb0e3b6d93eaa6bb395a5721e6c09a8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 4 May 2011 11:11:17 -0400 Subject: Btrfs: set range_start to the right start in count_range_bits In count_range_bits we are adjusting total_bytes based on the range we are searching for, but we don't adjust the range start according to the range we are searching for, which makes for weird results. For example, if the range [0-8192] is set DELALLOC, but I search for 4096-8192, I will get back 4096 for the number of bytes found, but the range_start will be 0, which makes it look like the range is [0-4096]. So instead set range_start = max(cur_start, state->start). This makes everything come out right. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ba41da59e31..b5f6f227a97 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1480,7 +1480,7 @@ u64 count_range_bits(struct extent_io_tree *tree, if (total_bytes >= max_bytes) break; if (!found) { - *start = state->start; + *start = max(cur_start, state->start); found = 1; } last = state->end; -- cgit v1.2.3-70-g09d2 From cb25c2ea6a79702ab7895b873c6c43e0d3bc3c72 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 May 2011 12:17:34 -0400 Subject: Btrfs: map the node block when looking for readahead targets If we have particularly full nodes, we could call btrfs_node_blockptr up to 32 times, which is 32 pairs of kmap/kunmap, which _sucks_. So go ahead and map the extent buffer while we look for readahead targets. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 84d7ca1fe0b..009bcf7f1e4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1229,6 +1229,7 @@ static void reada_for_search(struct btrfs_root *root, u64 search; u64 target; u64 nread = 0; + u64 gen; int direction = path->reada; struct extent_buffer *eb; u32 nr; @@ -1256,6 +1257,15 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; while (1) { + if (!node->map_token) { + unsigned long offset = btrfs_node_key_ptr_offset(nr); + map_private_extent_buffer(node, offset, + sizeof(struct btrfs_key_ptr), + &node->map_token, + &node->kaddr, + &node->map_start, + &node->map_len, KM_USER1); + } if (direction < 0) { if (nr == 0) break; @@ -1273,14 +1283,23 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, nr); if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { - readahead_tree_block(root, search, blocksize, - btrfs_node_ptr_generation(node, nr)); + gen = btrfs_node_ptr_generation(node, nr); + if (node->map_token) { + unmap_extent_buffer(node, node->map_token, + KM_USER1); + node->map_token = NULL; + } + readahead_tree_block(root, search, blocksize, gen); nread += blocksize; } nscan++; if ((nread > 65536 || nscan > 32)) break; } + if (node->map_token) { + unmap_extent_buffer(node, node->map_token, KM_USER1); + node->map_token = NULL; + } } /* -- cgit v1.2.3-70-g09d2 From 7e2355ba1a11649f0b212a29fdb9f47476f1248e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 May 2011 12:25:37 -0400 Subject: Btrfs: don't look at the extent buffer level 3 times in a row We have a bit of debugging in btrfs_search_slot to make sure the level of the cow block is the same as the original block we were cow'ing. I don't think I've ever seen this tripped, so kill it. This saves us 2 kmap's per level in our search. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 009bcf7f1e4..f7a0a64b868 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1672,9 +1672,6 @@ again: } cow_done: BUG_ON(!cow && ins_len); - if (level != btrfs_header_level(b)) - WARN_ON(1); - level = btrfs_header_level(b); p->nodes[level] = b; if (!p->skip_locking) -- cgit v1.2.3-70-g09d2 From d82a6f1d7e8b61ed5996334d0db66651bb43641d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 May 2011 15:26:06 -0400 Subject: Btrfs: kill BTRFS_I(inode)->block_group Originally this was going to be used as a way to give hints to the allocator, but frankly we can get much better hints elsewhere and it's not even used at all for anything usefull. In addition to be completely useless, when we initialize an inode we try and find a freeish block group to set as the inodes block group, and with a completely full 40gb fs this takes _forever_, so I imagine with say 1tb fs this is just unbearable. So just axe the thing altoghether, we don't need it and it saves us 8 bytes in the inode and saves us 500 microseconds per inode lookup in my testcase. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 3 -- fs/btrfs/ctree.h | 3 +- fs/btrfs/extent-tree.c | 10 ++---- fs/btrfs/inode.c | 87 ++++++-------------------------------------------- fs/btrfs/ioctl.c | 3 +- fs/btrfs/transaction.c | 1 - fs/btrfs/transaction.h | 14 -------- fs/btrfs/xattr.c | 2 -- 8 files changed, 13 insertions(+), 110 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 57c3bb2884c..4bc852d3b83 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -120,9 +120,6 @@ struct btrfs_inode { */ u64 index_cnt; - /* the start of block group preferred for allocations. */ - u64 block_group; - /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before * the directory was logged. See tree-log.c for all the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f31aed7fedd..0f8c489bcc0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2512,8 +2512,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - u64 new_dirid, u64 alloc_hint); + struct btrfs_root *new_root, u64 new_dirid); int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a2ca561c70f..9f0a4e3bd8a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5319,6 +5319,7 @@ checks: btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); + btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; @@ -5411,14 +5412,7 @@ loop: ret = -ENOSPC; } else if (!ins->objectid) { ret = -ENOSPC; - } - - /* we found what we needed */ - if (ins->objectid) { - if (!(data & BTRFS_BLOCK_GROUP_DATA)) - trans->block_group = block_group->key.objectid; - - btrfs_put_block_group(block_group); + } else if (ins->objectid) { ret = 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bc12ba23db5..dd5938a7de2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -136,7 +136,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; - btrfs_set_trans_block_group(trans, inode); key.objectid = inode->i_ino; key.offset = start; @@ -422,7 +421,6 @@ again: if (start == 0) { trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ @@ -781,7 +779,6 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(root == root->fs_info->tree_root); trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; num_bytes = (end - start + blocksize) & ~(blocksize - 1); @@ -1502,8 +1499,6 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, { struct btrfs_ordered_sum *sum; - btrfs_set_trans_block_group(trans, inode); - list_for_each_entry(sum, list, list) { btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); @@ -1722,7 +1717,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) else trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -1739,7 +1733,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) else trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -2495,7 +2488,6 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; int maybe_acls; - u64 alloc_group_block; u32 rdev; int ret; @@ -2539,8 +2531,6 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - alloc_group_block = btrfs_inode_block_group(leaf, inode_item); - /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls @@ -2549,8 +2539,6 @@ static void btrfs_read_locked_inode(struct inode *inode) if (!maybe_acls) cache_no_acl(inode); - BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, - alloc_group_block, 0); btrfs_free_path(path); inode_item = NULL; @@ -2630,7 +2618,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_transid(leaf, item, trans->transid); btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); + btrfs_set_inode_block_group(leaf, item, 0); if (leaf->map_token) { unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); @@ -2971,8 +2959,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, @@ -3068,8 +3054,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, root, dir, BTRFS_I(inode)->location.objectid, @@ -3649,7 +3633,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) err = PTR_ERR(trans); break; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_drop_extents(trans, inode, cur_offset, cur_offset + hole_size, @@ -3785,7 +3768,6 @@ void btrfs_evict_inode(struct inode *inode) while (1) { trans = btrfs_start_transaction(root, 0); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = root->orphan_block_rsv; ret = btrfs_block_rsv_check(trans, root, @@ -4383,7 +4365,6 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); if (nolock) ret = btrfs_end_transaction_nolock(trans, root); else @@ -4409,7 +4390,6 @@ void btrfs_dirty_inode(struct inode *inode) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); ret = btrfs_update_inode(trans, root, inode); if (ret && ret == -ENOSPC) { @@ -4424,7 +4404,6 @@ void btrfs_dirty_inode(struct inode *inode) } return; } - btrfs_set_trans_block_group(trans, inode); ret = btrfs_update_inode(trans, root, inode); if (ret) { @@ -4519,8 +4498,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, const char *name, int name_len, - u64 ref_objectid, u64 objectid, - u64 alloc_hint, int mode, u64 *index) + u64 ref_objectid, u64 objectid, int mode, + u64 *index) { struct inode *inode; struct btrfs_inode_item *inode_item; @@ -4567,8 +4546,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, owner = 0; else owner = 1; - BTRFS_I(inode)->block_group = - btrfs_find_block_group(root, 0, alloc_hint, owner); key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); @@ -4729,11 +4706,9 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dir->i_ino, objectid, - BTRFS_I(dir)->block_group, mode, &index); + mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -4745,7 +4720,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; @@ -4754,8 +4728,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); @@ -4791,11 +4763,9 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dir->i_ino, objectid, - BTRFS_I(dir)->block_group, mode, &index); + mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -4807,7 +4777,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; @@ -4818,8 +4787,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); @@ -4866,8 +4833,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_inc_nlink(inode); inode->i_ctime = CURRENT_TIME; - - btrfs_set_trans_block_group(trans, dir); ihold(inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); @@ -4876,7 +4841,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dget_parent(dentry); - btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); BUG_ON(err); btrfs_log_new_name(trans, inode, NULL, parent); @@ -4917,12 +4881,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dir->i_ino, objectid, - BTRFS_I(dir)->block_group, S_IFDIR | mode, - &index); + S_IFDIR | mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fail; @@ -4936,7 +4898,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, root, inode); @@ -4950,8 +4911,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); drop_on_err = 0; - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_fail: nr = trans->blocks_used; @@ -6652,8 +6611,6 @@ static int btrfs_truncate(struct inode *inode) goto out; } - btrfs_set_trans_block_group(trans, inode); - /* * Reserve space for the truncate process. Truncate should be adding * space, but if there are snapshots it may end up using space. @@ -6680,7 +6637,6 @@ static int btrfs_truncate(struct inode *inode) err = PTR_ERR(trans); goto out; } - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = rsv; /* @@ -6715,7 +6671,6 @@ static int btrfs_truncate(struct inode *inode) rsv); BUG_ON(ret); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = rsv; } @@ -6775,15 +6730,14 @@ out: * create a new subvolume directory/inode (helper for the ioctl). */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - u64 new_dirid, u64 alloc_hint) + struct btrfs_root *new_root, u64 new_dirid) { struct inode *inode; int err; u64 index = 0; inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, - new_dirid, alloc_hint, S_IFDIR | 0700, &index); + new_dirid, S_IFDIR | 0700, &index); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &btrfs_dir_inode_operations; @@ -6893,21 +6847,6 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } - if (root == root->fs_info->tree_root) { - struct btrfs_block_group_cache *block_group; - - block_group = btrfs_lookup_block_group(root->fs_info, - BTRFS_I(inode)->block_group); - if (block_group && block_group->inode == inode) { - spin_lock(&block_group->lock); - block_group->inode = NULL; - spin_unlock(&block_group->lock); - btrfs_put_block_group(block_group); - } else if (block_group) { - btrfs_put_block_group(block_group); - } - } - spin_lock(&root->orphan_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", @@ -7091,8 +7030,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_notrans; } - btrfs_set_trans_block_group(trans, new_dir); - if (dest != root) btrfs_record_root_in_trans(trans, dest); @@ -7331,12 +7268,9 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dir->i_ino, objectid, - BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, - &index); + S_IFLNK|S_IRWXUGO, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -7348,7 +7282,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; @@ -7359,8 +7292,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); if (drop_inode) goto out_unlock; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a578620e06a..8e90ccf4b76 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -413,8 +413,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, new_dirid, - BTRFS_I(dir)->block_group); + ret = btrfs_create_subvol_root(trans, new_root, new_dirid); /* * insert the directory item */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 43816f8b23e..f4ea695325b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -241,7 +241,6 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; h->blocks_used = 0; - h->block_group = 0; h->bytes_reserved = 0; h->delayed_ref_updates = 0; h->use_count = 1; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 11c6efcd4ed..da7289e06a8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,7 +47,6 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; - u64 block_group; u64 bytes_reserved; unsigned long use_count; unsigned long blocks_reserved; @@ -70,19 +69,6 @@ struct btrfs_pending_snapshot { struct list_head list; }; -static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - trans->block_group = BTRFS_I(inode)->block_group; -} - -static inline void btrfs_update_inode_block_group( - struct btrfs_trans_handle *trans, - struct inode *inode) -{ - BTRFS_I(inode)->block_group = trans->block_group; -} - static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index cfd660550de..72ab0295ca7 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -158,8 +158,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); - ret = do_setxattr(trans, inode, name, value, size, flags); if (ret) goto out; -- cgit v1.2.3-70-g09d2 From 589d8ade83f07c0f11c8191c0ca309f34d7a2c14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 May 2011 17:30:53 -0400 Subject: Btrfs: try not to sleep as much when doing slow caching When the fs is super full and we unmount the fs, we could get stuck in this thing where unmount is waiting for the caching kthread to make progress and the caching kthread keeps scheduling because we're in the middle of a commit. So instead just let the caching kthread keep going and only yeild if need_resched(). This makes my horrible umount case go from taking up to 10 minutes to taking less than 20 seconds. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9f0a4e3bd8a..96be6245031 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -378,15 +378,18 @@ again: if (ret) break; - caching_ctl->progress = last; - btrfs_release_path(extent_root, path); - up_read(&fs_info->extent_commit_sem); - mutex_unlock(&caching_ctl->mutex); - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); - else + if (need_resched() || + btrfs_next_leaf(extent_root, path)) { + caching_ctl->progress = last; + btrfs_release_path(extent_root, path); + up_read(&fs_info->extent_commit_sem); + mutex_unlock(&caching_ctl->mutex); cond_resched(); - goto again; + goto again; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; } if (key.objectid < block_group->key.objectid) { -- cgit v1.2.3-70-g09d2 From 026fd317828500524cdc7e5ff9e8e7923abb2868 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 May 2011 10:32:11 -0400 Subject: Btrfs: don't always do readahead Our readahead is sort of sloppy, and really isn't always needed. For example if ls is doing a stating ls (which is the default) it's going to stat in non-disk order, so if say you have a directory with a stupid amount of files, readahead is going to do nothing but waste time in the case of doing the stat. Taking the unconditional readahead out made my test go from 57 minutes to 36 minutes. This means that everywhere we do loop through the tree we want to make sure we do set path->reada properly, so I went through and found all of the places where we loop through the path and set reada to 1. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 2 -- fs/btrfs/extent-tree.c | 3 ++- fs/btrfs/inode.c | 14 ++++++++++++-- fs/btrfs/relocation.c | 6 ++++++ 4 files changed, 20 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f7a0a64b868..f61c16c1481 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -48,8 +48,6 @@ struct btrfs_path *btrfs_alloc_path(void) { struct btrfs_path *path; path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); - if (path) - path->reada = 1; return path; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 96be6245031..1ba2cc58eab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -347,7 +347,7 @@ static int caching_kthread(void *data) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 2; + path->reada = 1; key.objectid = last; key.offset = 0; @@ -8556,6 +8556,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); if (cache_gen != 0 && diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd5938a7de2..6228a304b54 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4242,7 +4242,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, filp->f_pos = 2; } path = btrfs_alloc_path(); - path->reada = 2; + if (!path) + return -ENOMEM; + path->reada = 1; btrfs_set_key_type(&key, key_type); key.offset = filp->f_pos; @@ -5043,7 +5045,15 @@ again: if (!path) { path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + err = -ENOMEM; + goto out; + } + /* + * Chances are we'll be called again, so go ahead and do + * readahead + */ + path->reada = 1; } ret = btrfs_lookup_file_extent(trans, root, path, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 09c30d37d43..5872b41581f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -676,6 +676,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } + path1->reada = 1; + path2->reada = 2; node = alloc_backref_node(cache); if (!node) { @@ -1996,6 +1998,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -3297,6 +3300,7 @@ static int find_data_references(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { @@ -3665,6 +3669,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; ret = prepare_to_relocate(rc); if (ret) { @@ -4090,6 +4095,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = -1; key.objectid = BTRFS_TREE_RELOC_OBJECTID; key.type = BTRFS_ROOT_ITEM_KEY; -- cgit v1.2.3-70-g09d2 From cca1c81f43e26ab60c0d1090fb90992358d69bdf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 May 2011 11:07:12 -0400 Subject: Btrfs: don't try to allocate from a block group that doesn't have enough space If we have a very large filesystem, we can spend a lot of time in find_free_extent just trying to allocate from empty block groups. So instead check to see if the block group even has enough space for the allocation, and if not go on to the next block group. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1ba2cc58eab..c8c318494de 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5159,6 +5159,14 @@ have_block_group: if (unlikely(block_group->ro)) goto loop; + spin_lock(&block_group->tree_lock); + if (cached && + block_group->free_space < num_bytes + empty_size) { + spin_unlock(&block_group->tree_lock); + goto loop; + } + spin_unlock(&block_group->tree_lock); + /* * Ok we want to try and use the cluster allocator, so lets look * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will -- cgit v1.2.3-70-g09d2 From 207dde8289d9b005b665cb9d8d2bb9464256101d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 May 2011 14:49:23 -0400 Subject: Btrfs: check for duplicate entries in the free space cache If there are duplicate entries in the free space cache, discard the entire cache and load it the old fashioned way. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 63731a1fb0a..d634a7e4220 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -420,7 +420,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, spin_lock(&block_group->tree_lock); ret = link_free_space(block_group, e); spin_unlock(&block_group->tree_lock); - BUG_ON(ret); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } } else { e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); if (!e->bitmap) { @@ -437,6 +444,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, recalculate_thresholds(block_group); spin_unlock(&block_group->tree_lock); list_add_tail(&e->list, &bitmaps); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } } num_entries--; @@ -909,10 +924,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, * logically. */ if (bitmap) { - WARN_ON(info->bitmap); + if (info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } p = &(*p)->rb_right; } else { - WARN_ON(!info->bitmap); + if (!info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } p = &(*p)->rb_left; } } -- cgit v1.2.3-70-g09d2 From d90c732122a1f6d0efe388a8a204f67f144b2eb3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 17 May 2011 09:50:54 -0400 Subject: Btrfs: leave spinning on lookup and map the leaf On lookup we only want to read the inode item, so leave the path spinning. Also we're just wholesale reading the leaf off, so map the leaf so we don't do a bunch of kmap/kunmaps. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6228a304b54..dc8fb2b3a14 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2493,6 +2493,7 @@ static void btrfs_read_locked_inode(struct inode *inode) path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); @@ -2502,6 +2503,12 @@ static void btrfs_read_locked_inode(struct inode *inode) leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + if (!leaf->map_token) + map_private_extent_buffer(leaf, (unsigned long)inode_item, + sizeof(struct btrfs_inode_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); inode->i_mode = btrfs_inode_mode(leaf, inode_item); inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); @@ -2539,6 +2546,11 @@ static void btrfs_read_locked_inode(struct inode *inode) if (!maybe_acls) cache_no_acl(inode); + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + btrfs_free_path(path); inode_item = NULL; -- cgit v1.2.3-70-g09d2 From f90e5b5b136ede1f0fd15999e95f13124d6b0dbd Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 24 May 2011 10:44:42 -0400 Subject: GFS2: Processes waiting on inode glock that no processes are holding This patch fixes a race in the GFS2 glock state machine that may result in lockups. The symptom is that all nodes but one will hang, waiting for a particular glock. All the holder records will have the "W" (Waiting) bit set. The other node will typically have the glock stuck in Exclusive mode (EX) with no holder records, but the dinode will be cached. In other words, an entry with "I:" will appear in the glock dump for that glock, but nothing else. The race has to do with the glock "Pending Demote" bit, which can be set, then immediately reset, thus losing the fact that another node needs the glock. The sequence of events is: 1. Something schedules the glock workqueue (e.g. glock request from fs) 2. The glock workqueue gets to the point between the test of the reply pending bit and the spin lock: if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); drop_ref = 1; } down_read(&gfs2_umount_flush_sem); <---- i.e. here spin_lock(&gl->gl_spin); 3. In comes (a) the reply to our EX lock request setting GLF_REPLY_PENDING and (b) the demote request which sets GLF_PENDING_DEMOTE 4. The following test is executed: if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { This resets the pending demote flag, and gl->gl_demote_state is not equal to exclusive, however because the reply from the dlm arrived after we checked for the GLF_REPLY_PENDING flag, gl->gl_state is still equal to unlocked, so although we reset the GLF_PENDING_DEMOTE flag, we didn't then set the GLF_DEMOTE flag or reinstate the GLF_PENDING_DEMOTE_FLAG. The patch closes the timing window by only transitioning the "Pending demote" bit to the "demote" flag once we know the other conditions (not unlocked and not exclusive) are met. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a2a6abbccc0..7137750f17f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -663,14 +663,19 @@ static void glock_work_func(struct work_struct *work) drop_ref = 1; } spin_lock(&gl->gl_spin); - if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { unsigned long holdtime, now = jiffies; + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; - set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags); + + if (!delay) { + clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + set_bit(GLF_DEMOTE, &gl->gl_flags); + } } run_queue(gl, 0); spin_unlock(&gl->gl_spin); -- cgit v1.2.3-70-g09d2 From 1adffbae22332bb558c2a29de19d9aca391869f6 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 31 May 2011 19:38:07 +0900 Subject: fat: Fix corrupt inode flags when remove ATTR_SYS flag We are clearly missing '~' in fat_ioctl_set_attributes(). Cc: Reported-by: Dmitry Dmitriev Signed-off-by: OGAWA Hirofumi --- fs/fat/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fat/file.c b/fs/fat/file.c index 7257752b6d5..7018e1d8902 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -102,7 +102,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) if (attr & ATTR_SYS) inode->i_flags |= S_IMMUTABLE; else - inode->i_flags &= S_IMMUTABLE; + inode->i_flags &= ~S_IMMUTABLE; } fat_save_attrs(inode, attr); -- cgit v1.2.3-70-g09d2 From fe47ae7f53e179d2ef6771024feb000cbb86640f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 31 May 2011 12:35:41 +0200 Subject: oprofile, dcookies: Fix possible circular locking dependency The lockdep warning below detects a possible A->B/B->A locking dependency of mm->mmap_sem and dcookie_mutex. The order in sync_buffer() is mm->mmap_sem/dcookie_mutex, while in sys_lookup_dcookie() it is vice versa. Fixing it in sys_lookup_dcookie() by unlocking dcookie_mutex before copy_to_user(). oprofiled/4432 is trying to acquire lock: (&mm->mmap_sem){++++++}, at: [] might_fault+0x53/0xa3 but task is already holding lock: (dcookie_mutex){+.+.+.}, at: [] sys_lookup_dcookie+0x45/0x149 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (dcookie_mutex){+.+.+.}: [] lock_acquire+0xf8/0x11e [] mutex_lock_nested+0x63/0x309 [] get_dcookie+0x30/0x144 [] sync_buffer+0x196/0x3ec [oprofile] [] task_exit_notify+0x16/0x1a [oprofile] [] notifier_call_chain+0x37/0x63 [] __blocking_notifier_call_chain+0x50/0x67 [] blocking_notifier_call_chain+0x14/0x16 [] profile_task_exit+0x1a/0x1c [] do_exit+0x2a/0x6fc [] do_group_exit+0x83/0xae [] sys_exit_group+0x17/0x1b [] system_call_fastpath+0x16/0x1b -> #0 (&mm->mmap_sem){++++++}: [] __lock_acquire+0x1085/0x1711 [] lock_acquire+0xf8/0x11e [] might_fault+0x80/0xa3 [] sys_lookup_dcookie+0x104/0x149 [] system_call_fastpath+0x16/0x1b other info that might help us debug this: 1 lock held by oprofiled/4432: #0: (dcookie_mutex){+.+.+.}, at: [] sys_lookup_dcookie+0x45/0x149 stack backtrace: Pid: 4432, comm: oprofiled Not tainted 2.6.39-00008-ge5a450d #9 Call Trace: [] print_circular_bug+0xae/0xbc [] __lock_acquire+0x1085/0x1711 [] ? get_parent_ip+0x11/0x42 [] ? might_fault+0x53/0xa3 [] lock_acquire+0xf8/0x11e [] ? might_fault+0x53/0xa3 [] ? path_put+0x22/0x27 [] might_fault+0x80/0xa3 [] ? might_fault+0x53/0xa3 [] sys_lookup_dcookie+0x104/0x149 [] system_call_fastpath+0x16/0x1b References: https://bugzilla.kernel.org/show_bug.cgi?id=13809 Cc: # .27+ Signed-off-by: Robert Richter --- fs/dcookies.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/dcookies.c b/fs/dcookies.c index a21cabdbd87..dda0dc702d1 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -178,6 +178,8 @@ SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) /* FIXME: (deleted) ? */ path = d_path(&dcs->path, kbuf, PAGE_SIZE); + mutex_unlock(&dcookie_mutex); + if (IS_ERR(path)) { err = PTR_ERR(path); goto out_free; @@ -194,6 +196,7 @@ SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) out_free: kfree(kbuf); + return err; out: mutex_unlock(&dcookie_mutex); return err; -- cgit v1.2.3-70-g09d2 From c592a7073796de1cd221f957ab693c19d383423f Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Fri, 3 Jun 2011 12:06:19 +0530 Subject: cifs: fix the kernel release version in the default security warning message When ntlm security mechanim is used, the message that warns about the upgrade to ntlmv2 got the kernel release version wrong (Blame it on Linus :). Fix it. Signed-off-by: Suresh Jayaraman Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6d88b82537c..84c730701c0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1976,7 +1976,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) warned_on_ntlm = true; cERROR(1, "default security mechanism requested. The default " "security mechanism will be upgraded from ntlm to " - "ntlmv2 in kernel release 2.6.41"); + "ntlmv2 in kernel release 3.1"); } ses->overrideSecFlg = volume_info->secFlg; -- cgit v1.2.3-70-g09d2 From 5f0b23eeba2d9105944148e5a85b0bfb34a8ecf5 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Fri, 3 Jun 2011 14:19:01 +0530 Subject: cifs: make CIFS depend on CRYPTO_ECB When CONFIG_CRYPTO_ECB is not set, trying to mount a CIFS share with NTLM security resulted in mount failure with the following error: "CIFS VFS: could not allocate des crypto API" Seems like a leftover from commit 43988d7. Signed-off-by: Suresh Jayaraman CC: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 1cd4c3a1862..66c68ab9e7d 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -7,6 +7,7 @@ config CIFS select CRYPTO_MD5 select CRYPTO_HMAC select CRYPTO_ARC4 + select CRYPTO_ECB select CRYPTO_DES help This is the client VFS module for the Common Internet File System -- cgit v1.2.3-70-g09d2 From 9e1f1de02c2275d7172e18dc4e7c2065777611bf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jun 2011 18:24:58 -0400 Subject: more conservative S_NOSEC handling Caching "we have already removed suid/caps" was overenthusiastic as merged. On network filesystems we might have had suid/caps set on another client, silently picked by this client on revalidate, all of that *without* clearing the S_NOSEC flag. AFAICS, the only reasonably sane way to deal with that is * new superblock flag; unless set, S_NOSEC is not going to be set. * local block filesystems set it in their ->mount() (more accurately, mount_bdev() does, so does btrfs ->mount(), users of mount_bdev() other than local block ones clear it) * if any network filesystem (or a cluster one) wants to use S_NOSEC, it'll need to set MS_NOSEC in sb->s_flags *AND* take care to clear S_NOSEC when inode attribute changes are picked from other clients. It's not an earth-shattering hole (anybody that can set suid on another client will almost certainly be able to write to the file before doing that anyway), but it's a bug that needs fixing. Signed-off-by: Al Viro --- fs/btrfs/super.c | 2 +- fs/fuse/inode.c | 2 ++ fs/ocfs2/super.c | 2 +- fs/super.c | 2 +- include/linux/fs.h | 3 ++- mm/filemap.c | 2 +- 6 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9b2e7e5bc3e..d158b672a2d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -819,7 +819,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; + s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cc6ec4b2f0f..38f84cd48b6 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -921,6 +921,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; + sb->s_flags &= ~MS_NOSEC; + if (!parse_fuse_opt((char *) data, &d, is_bdev)) goto err; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index cdbaf5e9730..56f61027236 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1072,7 +1072,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OCFS2_SUPER_MAGIC; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, diff --git a/fs/super.c b/fs/super.c index c75593953c5..ab3d672db0d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -822,7 +822,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; + s->s_flags = flags | MS_NOSEC; s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); diff --git a/include/linux/fs.h b/include/linux/fs.h index c55d6b7cd5d..646a1836152 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -208,6 +208,7 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_NOSEC (1<<28) #define MS_BORN (1<<29) #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) @@ -2591,7 +2592,7 @@ static inline int is_sxid(mode_t mode) static inline void inode_has_no_xattr(struct inode *inode) { - if (!is_sxid(inode->i_mode)) + if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) inode->i_flags |= S_NOSEC; } diff --git a/mm/filemap.c b/mm/filemap.c index d7b10578a64..a8251a8d345 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2000,7 +2000,7 @@ int file_remove_suid(struct file *file) error = security_inode_killpriv(dentry); if (!error && killsuid) error = __remove_suid(dentry, killsuid); - if (!error) + if (!error && (inode->i_sb->s_flags & MS_NOSEC)) inode->i_flags |= S_NOSEC; return error; -- cgit v1.2.3-70-g09d2 From 1bc8779349d6278e2713a1ff94418c2a6746a791 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Sat, 28 May 2011 21:57:55 +0200 Subject: btrfs: scrub: don't reuse bios and pages The current scrub implementation reuses bios and pages as often as possible, allocating them only on start and releasing them when finished. This leads to more problems with the block layer than it's worth. The elevator gets confused when there are more pages added to the bio than bi_size suggests. This patch completely rips out the reuse of bios and pages and allocates them freshly for each submit. Signed-off-by: Arne Jansen Signed-off-by: Chris Maosn --- fs/btrfs/scrub.c | 114 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6dfed0c27ac..2d1f8909a8e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -117,33 +117,37 @@ static void scrub_free_csums(struct scrub_dev *sdev) } } +static void scrub_free_bio(struct bio *bio) +{ + int i; + struct page *last_page = NULL; + + if (!bio) + return; + + for (i = 0; i < bio->bi_vcnt; ++i) { + if (bio->bi_io_vec[i].bv_page == last_page) + continue; + last_page = bio->bi_io_vec[i].bv_page; + __free_page(last_page); + } + bio_put(bio); +} + static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) { int i; - int j; - struct page *last_page; if (!sdev) return; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio = sdev->bios[i]; - struct bio *bio; if (!sbio) break; - bio = sbio->bio; - if (bio) { - last_page = NULL; - for (j = 0; j < bio->bi_vcnt; ++j) { - if (bio->bi_io_vec[j].bv_page == last_page) - continue; - last_page = bio->bi_io_vec[j].bv_page; - __free_page(last_page); - } - bio_put(bio); - } + scrub_free_bio(sbio->bio); kfree(sbio); } @@ -156,8 +160,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) { struct scrub_dev *sdev; int i; - int j; - int ret; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; sdev = kzalloc(sizeof(*sdev), GFP_NOFS); @@ -165,7 +167,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) goto nomem; sdev->dev = dev; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { - struct bio *bio; struct scrub_bio *sbio; sbio = kzalloc(sizeof(*sbio), GFP_NOFS); @@ -173,32 +174,10 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) goto nomem; sdev->bios[i] = sbio; - bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); - if (!bio) - goto nomem; - sbio->index = i; sbio->sdev = sdev; - sbio->bio = bio; sbio->count = 0; sbio->work.func = scrub_checksum; - bio->bi_private = sdev->bios[i]; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_sector = 0; - bio->bi_bdev = dev->bdev; - bio->bi_size = 0; - - for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) { - struct page *page; - page = alloc_page(GFP_NOFS); - if (!page) - goto nomem; - - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (!ret) - goto nomem; - } - WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO); if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; @@ -394,6 +373,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; sbio->err = err; + sbio->bio = bio; btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); } @@ -453,6 +433,8 @@ static void scrub_checksum(struct btrfs_work *work) } out: + scrub_free_bio(sbio->bio); + sbio->bio = NULL; spin_lock(&sdev->list_lock); sbio->next_free = sdev->first_free; sdev->first_free = sbio->index; @@ -583,25 +565,50 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; + struct bio *bio; + int i; if (sdev->curr == -1) return 0; sbio = sdev->bios[sdev->curr]; - sbio->bio->bi_sector = sbio->physical >> 9; - sbio->bio->bi_size = sbio->count * PAGE_SIZE; - sbio->bio->bi_next = NULL; - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_comp_cpu = -1; - sbio->bio->bi_bdev = sdev->dev->bdev; + bio = bio_alloc(GFP_NOFS, sbio->count); + if (!bio) + goto nomem; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + + for (i = 0; i < sbio->count; ++i) { + struct page *page; + int ret; + + page = alloc_page(GFP_NOFS); + if (!page) + goto nomem; + + ret = bio_add_page(bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + goto nomem; + } + } + sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(0, sbio->bio); + submit_bio(READ, bio); return 0; + +nomem: + scrub_free_bio(bio); + + return -ENOMEM; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, @@ -633,7 +640,11 @@ again: sbio->logical = logical; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - scrub_submit(sdev); + int ret; + + ret = scrub_submit(sdev); + if (ret) + return ret; goto again; } sbio->spag[sbio->count].flags = flags; @@ -645,8 +656,13 @@ again: memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); } ++sbio->count; - if (sbio->count == SCRUB_PAGES_PER_BIO || force) - scrub_submit(sdev); + if (sbio->count == SCRUB_PAGES_PER_BIO || force) { + int ret; + + ret = scrub_submit(sdev); + if (ret) + return ret; + } return 0; } -- cgit v1.2.3-70-g09d2 From 17aca1c987cff89dc4279371857035da902c8854 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Jun 2011 01:13:45 -0400 Subject: Btrfs: fix uninit variable in the delayed inode code The nitems counter needs to start at zero Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index b46d94d1dea..c61c32cf0f7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -678,6 +678,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&head); next = item; + nitems = 0; /* * count the number of the continuous items that we can insert in batch -- cgit v1.2.3-70-g09d2 From 211f96c24f117fcc6e9e2431e40d92f4de22625e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Jun 2011 01:26:53 -0400 Subject: Btrfs: make sure we don't overflow the free space cache crc page The free space cache uses only one page for crcs right now, which means we can't have a cache file bigger than the crcs we can fit in the first page. This adds a check to enforce that restriction. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index dd38d4c3a59..1cb72394498 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -590,10 +590,25 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + /* Since the first page has all of our checksums and our generation we + * need to calculate the offset into the page that we can start writing + * our entries. + */ + first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); + filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & ~(root->sectorsize - 1), (u64)-1); + /* make sure we don't overflow that first page */ + if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) { + /* this is really the same as running out of space, where we also return 0 */ + printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n"); + ret = 0; + goto out_update; + } + /* We need a checksum per page. */ crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); if (!crc) @@ -605,12 +620,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, return -1; } - /* Since the first page has all of our checksums and our generation we - * need to calculate the offset into the page that we can start writing - * our entries. - */ - first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); - /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) cluster = list_entry(block_group->cluster_list.next, @@ -872,12 +881,14 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = 1; out_free: + kfree(checksums); + kfree(pages); + +out_update: if (ret != 1) { invalidate_inode_pages2_range(inode->i_mapping, 0, index); BTRFS_I(inode)->generation = 0; } - kfree(checksums); - kfree(pages); btrfs_update_inode(trans, root, inode); return ret; } -- cgit v1.2.3-70-g09d2 From ca456ae280c0646e1e571c3b9a3834c55e90adfe Mon Sep 17 00:00:00 2001 From: liubo Date: Wed, 1 Jun 2011 09:42:49 +0000 Subject: Btrfs: don't save the inode cache in non-FS roots This adds extra checks to make sure the inode map we are caching really belongs to a FS root instead of a special relocation tree. It prevents crashes during balancing operations. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 3262cd17a12..04f7199facb 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -388,6 +388,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root, int prealloc; bool retry = false; + /* only fs tree and subvol/snap needs ino cache */ + if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && + (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || + root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 5f3f302a6f4cb74906c05fad1d03fc5e95c7e5af Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 30 May 2011 08:36:16 +0000 Subject: btrfs: false BUG_ON when degraded In degraded mode the struct btrfs_device of missing devs don't have device->name set. A kstrdup of NULL correctly returns NULL. Don't BUG in this case. Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c48214ef5c0..da541dfca2e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -504,7 +504,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) BUG_ON(!new_device); memcpy(new_device, device, sizeof(*new_device)); new_device->name = kstrdup(device->name, GFP_NOFS); - BUG_ON(!new_device->name); + BUG_ON(device->name && !new_device->name); new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; -- cgit v1.2.3-70-g09d2 From d132a538d258f8f52fd0cd8b5017755f4e915386 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 31 May 2011 19:33:33 +0000 Subject: Btrfs: don't save the inode cache if we are deleting this root With xfstest 254 I can panic the box every time with the inode number caching stuff on. This is because we clean the inodes out when we delete the subvolume, but then we write out the inode cache which adds an inode to the subvolume inode tree, and then when it gets evicted again the root gets added back on the dead roots list and is deleted again, so we have a double free. To stop this from happening just return 0 if refs is 0 (and we're not the tree root since tree root always has refs of 0). With this fix 254 no longer panics. Thanks, Signed-off-by: Josef Bacik Tested-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 04f7199facb..2d0d50067a7 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -394,6 +394,11 @@ int btrfs_save_ino_cache(struct btrfs_root *root, root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) return 0; + /* Don't save inode cache if we are deleting this root */ + if (btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From a4689d2bd3b00dcf5c4320f06e0ab88810fbff9c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 31 May 2011 17:08:14 +0000 Subject: btrfs: use btrfs_ino to access inode number commit 4cb5300bc ("Btrfs: add mount -o auto_defrag") accesses inode number directly while it should use the helper with the new inode number allocator. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- fs/btrfs/ioctl.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e3a1b0c2394..982b5ea9762 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -144,7 +144,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!defrag) return -ENOMEM; - defrag->ino = inode->i_ino; + defrag->ino = btrfs_ino(inode); defrag->transid = transid; defrag->root = root->root_key.objectid; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 74c80595d70..ac37040e426 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -706,16 +706,17 @@ static int find_new_extents(struct btrfs_root *root, struct btrfs_file_extent_item *extent; int type; int ret; + u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - min_key.objectid = inode->i_ino; + min_key.objectid = ino; min_key.type = BTRFS_EXTENT_DATA_KEY; min_key.offset = *off; - max_key.objectid = inode->i_ino; + max_key.objectid = ino; max_key.type = (u8)-1; max_key.offset = (u64)-1; @@ -726,7 +727,7 @@ static int find_new_extents(struct btrfs_root *root, path, 0, newer_than); if (ret != 0) goto none; - if (min_key.objectid != inode->i_ino) + if (min_key.objectid != ino) goto none; if (min_key.type != BTRFS_EXTENT_DATA_KEY) goto none; -- cgit v1.2.3-70-g09d2 From e7786c3ae517b2c433edc91714e86be770e9f1ce Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Sat, 28 May 2011 20:58:38 +0000 Subject: btrfs: scrub: add explicit plugging With the removal of the implicit plugging scrub ends up doing more and smaller I/O than necessary. This patch adds explicit plugging per chunk. Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2d1f8909a8e..1204eab9402 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -348,9 +348,6 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, int ret; DECLARE_COMPLETION_ONSTACK(complete); - /* we are going to wait on this IO */ - rw |= REQ_SYNC; - bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = bdev; bio->bi_sector = sector; @@ -359,6 +356,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, bio->bi_private = &complete; submit_bio(rw, bio); + /* this will also unplug the queue */ wait_for_completion(&complete); ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -743,6 +741,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; + struct blk_plug plug; u64 flags; int ret; int slot; @@ -847,6 +846,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, * the scrub. This might currently (crc32) end up to be about 1MB */ start_stripe = 0; + blk_start_plug(&plug); again: logical = base + offset + start_stripe * increment; for (i = start_stripe; i < nstripes; ++i) { @@ -988,6 +988,7 @@ next: scrub_submit(sdev); out: + blk_finish_plug(&plug); btrfs_free_path(path); return ret < 0 ? ret : 0; } -- cgit v1.2.3-70-g09d2 From 4b9465cb9e3859186eefa1ca3b990a5849386320 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Jun 2011 09:36:29 -0400 Subject: Btrfs: add mount -o inode_cache This makes the inode map cache default to off until we fix the overflow problem when the free space crcs don't fit inside a single page. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/free-space-cache.c | 6 ++++++ fs/btrfs/inode-map.c | 20 ++++++++++++++++++++ fs/btrfs/super.c | 8 +++++++- 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8f98c200571..4958ef5417d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1340,6 +1340,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) +#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1cb72394498..bffa5c4a633 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2536,6 +2536,9 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) int ret = 0; u64 root_gen = btrfs_root_generation(&root->root_item); + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + /* * If we're unmounting then just return, since this does a search on the * normal root and not the commit root and we could deadlock. @@ -2575,6 +2578,9 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct inode *inode; int ret; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode)) return 0; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2d0d50067a7..cb79b8975c9 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -38,6 +38,9 @@ static int caching_kthread(void *data) int slot; int ret; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -141,6 +144,9 @@ static void start_caching(struct btrfs_root *root) int ret; u64 objectid; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + spin_lock(&root->cache_lock); if (root->cached != BTRFS_CACHE_NO) { spin_unlock(&root->cache_lock); @@ -178,6 +184,9 @@ static void start_caching(struct btrfs_root *root) int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) { + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return btrfs_find_free_objectid(root, objectid); + again: *objectid = btrfs_find_ino_for_alloc(root); @@ -201,6 +210,10 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + again: if (root->cached == BTRFS_CACHE_FINISHED) { __btrfs_add_free_space(ctl, objectid, 1); @@ -250,6 +263,9 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) struct rb_node *n; u64 count; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + while (1) { n = rb_first(rbroot); if (!n) @@ -399,9 +415,13 @@ int btrfs_save_ino_cache(struct btrfs_root *root, root != root->fs_info->tree_root) return 0; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; + again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 28e3cb2607f..3559d0b3518 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -160,7 +160,8 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, + Opt_inode_cache, Opt_err, }; static match_table_t tokens = { @@ -192,6 +193,7 @@ static match_table_t tokens = { {Opt_enospc_debug, "enospc_debug"}, {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, + {Opt_inode_cache, "inode_cache"}, {Opt_err, NULL}, }; @@ -360,6 +362,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_inode_cache: + printk(KERN_INFO "btrfs: enabling inode map caching\n"); + btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); + break; case Opt_clear_cache: printk(KERN_INFO "btrfs: force clearing of disk cache\n"); btrfs_set_opt(info->mount_opt, CLEAR_CACHE); -- cgit v1.2.3-70-g09d2 From 7841cb2898f66a73062c64d0ef5733dde7279e46 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 31 May 2011 18:07:27 +0200 Subject: btrfs: add helper for fs_info->closing wrap checking of filesystem 'closing' flag and fix a few missing memory barriers. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 +++++++++ fs/btrfs/extent-tree.c | 3 +-- fs/btrfs/file.c | 4 ++-- fs/btrfs/free-space-cache.c | 10 ++++------ fs/btrfs/inode-map.c | 3 +-- fs/btrfs/inode.c | 3 +-- fs/btrfs/scrub.c | 2 +- fs/btrfs/transaction.c | 2 +- 8 files changed, 20 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4958ef5417d..8490ee06370 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2354,6 +2354,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); +static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +{ + /* + * Get synced with close_ctree() + */ + smp_mb(); + return fs_info->closing; +} + /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, struct btrfs_path *path, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c9173a7827b..5b9b6b6df24 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -366,8 +366,7 @@ again: nritems = btrfs_header_nritems(leaf); while (1) { - smp_mb(); - if (fs_info->closing > 1) { + if (btrfs_fs_closing(fs_info) > 1) { last = (u64)-1; break; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 982b5ea9762..fa4ef18b66b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -129,7 +129,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!btrfs_test_opt(root, AUTO_DEFRAG)) return 0; - if (root->fs_info->closing) + if (btrfs_fs_closing(root->fs_info)) return 0; if (BTRFS_I(inode)->in_defrag) @@ -229,7 +229,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) goto next_free; spin_unlock(&fs_info->defrag_inodes_lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bffa5c4a633..ad144736a5f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -98,7 +98,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); - if (!root->fs_info->closing) { + if (!btrfs_fs_closing(root->fs_info)) { block_group->inode = igrab(inode); block_group->iref = 1; } @@ -493,8 +493,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, * If we're unmounting then just return, since this does a search on the * normal root and not the commit root and we could deadlock. */ - smp_mb(); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) return 0; /* @@ -2513,7 +2512,7 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root, return inode; spin_lock(&root->cache_lock); - if (!root->fs_info->closing) + if (!btrfs_fs_closing(root->fs_info)) root->cache_inode = igrab(inode); spin_unlock(&root->cache_lock); @@ -2543,8 +2542,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) * If we're unmounting then just return, since this does a search on the * normal root and not the commit root and we could deadlock. */ - smp_mb(); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) return 0; path = btrfs_alloc_path(); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cb79b8975c9..b4087e0fa87 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -62,8 +62,7 @@ again: goto out; while (1) { - smp_mb(); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) goto out; leaf = path->nodes[0]; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a83e44bf320..02ff4a1b968 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4266,8 +4266,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (BTRFS_I(inode)->dummy_inode) return 0; - smp_mb(); - if (root->fs_info->closing && is_free_space_inode(root, inode)) + if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) nolock = true; if (wbc->sync_mode == WB_SYNC_ALL) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1204eab9402..df50fd1eca8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1183,7 +1183,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, int ret; struct btrfs_device *dev; - if (root->fs_info->closing) + if (btrfs_fs_closing(root->fs_info)) return -EINVAL; /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2d5c6d2aa4e..dd719662340 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -817,7 +817,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) btrfs_btree_balance_dirty(info->tree_root, nr); cond_resched(); - if (root->fs_info->closing || ret != -EAGAIN) + if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) break; } root->defrag_running = 0; -- cgit v1.2.3-70-g09d2 From aa0467d8d2a00e75b2bb6a56a4ee6d70c5d1928f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 3 Jun 2011 16:29:08 +0200 Subject: btrfs: fix uninitialized variable warning With Linus' tree, today's linux-next build (powercp ppc64_defconfig) produced this warning: fs/btrfs/delayed-inode.c: In function 'btrfs_delayed_update_inode': fs/btrfs/delayed-inode.c:1598:6: warning: 'ret' may be used uninitialized in this function Introduced by commit 16cdcec736cd ("btrfs: implement delayed inode items operation"). This fixes a bug in btrfs_update_inode(): if the returned value from btrfs_delayed_update_inode is a nonzero garbage, inode stat data are not updated and several call paths may hit a BUG_ON or fail with strange code. Reported-by: Stephen Rothwell Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c61c32cf0f7..6462c29d2d3 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1595,7 +1595,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_delayed_node *delayed_node; - int ret; + int ret = 0; delayed_node = btrfs_get_or_create_delayed_node(inode); if (IS_ERR(delayed_node)) -- cgit v1.2.3-70-g09d2 From 5def1360252b974faeb438775c19c14338bc1903 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sun, 5 Jun 2011 23:26:40 -0400 Subject: ext4: correct comments for ext4_free_blocks() metadata is not parameter of ext4_free_blocks() any more. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 859f2ae8864..eb71dd59f2e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4448,7 +4448,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * @inode: inode * @block: start physical block to free * @count: number of blocks to count - * @metadata: Are these metadata blocks + * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, -- cgit v1.2.3-70-g09d2 From f17722f917b2f21497deb6edc62fb1683daa08e6 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 6 Jun 2011 00:05:17 -0400 Subject: ext4: Fix max file size and logical block counting of extent format file Kazuya Mio reported that he was able to hit BUG_ON(next == lblock) in ext4_ext_put_gap_in_cache() while creating a sparse file in extent format and fill the tail of file up to its end. We will hit the BUG_ON when we write the last block (2^32-1) into the sparse file. The root cause of the problem lies in the fact that we specifically set s_maxbytes so that block at s_maxbytes fit into on-disk extent format, which is 32 bit long. However, we are not storing start and end block number, but rather start block number and length in blocks. It means that in order to cover extent from 0 to EXT_MAX_BLOCK we need EXT_MAX_BLOCK+1 to fit into len (because we counting block 0 as well) - and it does not. The only way to fix it without changing the meaning of the struct ext4_extent members is, as Kazuya Mio suggested, to lower s_maxbytes by one fs block so we can cover the whole extent we can get by the on-disk extent format. Also in many places EXT_MAX_BLOCK is used as length instead of maximum logical block number as the name suggests, it is all a bit messy. So this commit renames it to EXT_MAX_BLOCKS and change its usage in some places to actually be maximum number of blocks in the extent. The bug which this commit fixes can be reproduced as follows: dd if=/dev/zero of=/mnt/mp1/file bs= count=1 seek=$((2**32-2)) sync dd if=/dev/zero of=/mnt/mp1/file bs= count=1 seek=$((2**32-1)) Reported-by: Kazuya Mio Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_extents.h | 7 +++++-- fs/ext4/extents.c | 34 +++++++++++++++++----------------- fs/ext4/move_extent.c | 10 +++++----- fs/ext4/super.c | 15 ++++++++++++--- 4 files changed, 39 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 2e29abb30f7..47641464438 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -133,8 +133,11 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, #define EXT_BREAK 1 #define EXT_REPEAT 2 -/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */ -#define EXT_MAX_BLOCK 0xffffffff +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff /* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5199bac7fc6..41575704600 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1408,7 +1408,7 @@ got_index: /* * ext4_ext_next_allocated_block: - * returns allocated block in subsequent extent or EXT_MAX_BLOCK. + * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. * NOTE: it considers block number from index entry as * allocated block. Thus, index entries have to be consistent * with leaves. @@ -1422,7 +1422,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) depth = path->p_depth; if (depth == 0 && path->p_ext == NULL) - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; while (depth >= 0) { if (depth == path->p_depth) { @@ -1439,12 +1439,12 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) depth--; } - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; } /* * ext4_ext_next_leaf_block: - * returns first allocated block from next leaf or EXT_MAX_BLOCK + * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, struct ext4_ext_path *path) @@ -1456,7 +1456,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, /* zero-tree has no leaf blocks at all */ if (depth == 0) - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; /* go to index block */ depth--; @@ -1469,7 +1469,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, depth--; } - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; } /* @@ -1677,13 +1677,13 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); - if (b2 == EXT_MAX_BLOCK) + if (b2 == EXT_MAX_BLOCKS) goto out; } /* check for wrap through zero on extent logical start block*/ if (b1 + len1 < b1) { - len1 = EXT_MAX_BLOCK - b1; + len1 = EXT_MAX_BLOCKS - b1; newext->ee_len = cpu_to_le16(len1); ret = 1; } @@ -1767,7 +1767,7 @@ repeat: fex = EXT_LAST_EXTENT(eh); next = ext4_ext_next_leaf_block(inode, path); if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) - && next != EXT_MAX_BLOCK) { + && next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %d\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); @@ -1887,7 +1887,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, BUG_ON(func == NULL); BUG_ON(inode == NULL); - while (block < last && block != EXT_MAX_BLOCK) { + while (block < last && block != EXT_MAX_BLOCKS) { num = last - block; /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); @@ -2020,7 +2020,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ lblock = 0; - len = EXT_MAX_BLOCK; + len = EXT_MAX_BLOCKS; ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -2350,7 +2350,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * never happen because at least one of the end points * needs to be on the edge of the extent. */ - if (end == EXT_MAX_BLOCK) { + if (end == EXT_MAX_BLOCKS - 1) { ext_debug(" bad truncate %u:%u\n", start, end); block = 0; @@ -2398,7 +2398,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * If this is a truncate, this condition * should never happen */ - if (end == EXT_MAX_BLOCK) { + if (end == EXT_MAX_BLOCKS - 1) { ext_debug(" bad truncate %u:%u\n", start, end); err = -EIO; @@ -2478,7 +2478,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * we need to remove it from the leaf */ if (num == 0) { - if (end != EXT_MAX_BLOCK) { + if (end != EXT_MAX_BLOCKS - 1) { /* * For hole punching, we need to scoot all the * extents up when an extent is removed so that @@ -3699,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -4347,8 +4347,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, start_blk = start >> inode->i_sb->s_blocksize_bits; last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; - if (last_blk >= EXT_MAX_BLOCK) - last_blk = EXT_MAX_BLOCK-1; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2b8304bf3c5..f57455a1b1b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1002,12 +1002,12 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - if ((orig_start > EXT_MAX_BLOCK) || - (donor_start > EXT_MAX_BLOCK) || - (*len > EXT_MAX_BLOCK) || - (orig_start + *len > EXT_MAX_BLOCK)) { + if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || + (*len > EXT_MAX_BLOCKS) || + (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " - "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK, + "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cc5c157aa11..9ea71aa864b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2243,6 +2243,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * + * However there is other limiting factor. We do store extents in the form + * of starting block and length, hence the resulting length of the extent + * covering maximum file size must fit into on-disk format containers as + * well. Given that length is always by 1 unit bigger than max unit (because + * we count 0 as well) we have to lower the s_maxbytes by one fs block. + * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ static loff_t ext4_max_size(int blkbits, int has_huge_files) @@ -2264,10 +2270,13 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) upper_limit <<= blkbits; } - /* 32-bit extent-start container, ee_block */ - res = 1LL << 32; + /* + * 32-bit extent-start container, ee_block. We lower the maxbytes + * by one fs block, so ee_len can cover the extent of maximum file + * size + */ + res = (1LL << 32) - 1; res <<= blkbits; - res -= 1; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) -- cgit v1.2.3-70-g09d2 From c03f8aa9abdd517477c2021ea1251939b4da49e6 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 6 Jun 2011 00:06:52 -0400 Subject: ext4: use FIEMAP_EXTENT_LAST flag for last extent in fiemap Currently we are not marking the extent as the last one (FIEMAP_EXTENT_LAST) if there is a hole at the end of the file. This is because we just do not check for it right now and continue searching for next extent. But at the point we hit the hole at the end of the file, it is too late. This commit adds check for the allocated block in subsequent extent and if there is no more extents (block = EXT_MAX_BLOCKS) just flag the current one as the last one. This behaviour has been spotted unintentionally by 252 xfstest, when the test hangs out, because of wrong loop condition. However on other filesystems (like xfs) it will exit anyway, because we notice the last extent flag and exit. With this patch xfstest 252 does not hang anymore, ext4 fiemap implementation still reports bad extent type in some cases, however this seems to be different issue. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_extents.h | 2 +- fs/ext4/extents.c | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 47641464438..095c36f3b61 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -125,7 +125,7 @@ struct ext4_ext_path { * positive retcode - signal for ext4_ext_walk_space(), see below * callback must return valid extent (passed or newly created) */ -typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, +typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, struct ext4_ext_cache *, struct ext4_extent *, void *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 41575704600..f815cc81e7a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1958,7 +1958,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, err = -EIO; break; } - err = func(inode, path, &cbex, ex, cbdata); + err = func(inode, next, &cbex, ex, cbdata); ext4_ext_drop_refs(path); if (err < 0) @@ -3914,14 +3914,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, /* * Callback function called for each extent to gather FIEMAP information. */ -static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, +static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, struct ext4_ext_cache *newex, struct ext4_extent *ex, void *data) { __u64 logical; __u64 physical; __u64 length; - loff_t size; __u32 flags = 0; int ret = 0; struct fiemap_extent_info *fieinfo = data; @@ -4103,8 +4102,7 @@ found_delayed_extent: if (ex && ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; - size = i_size_read(inode); - if (logical + length >= size) + if (next == EXT_MAX_BLOCKS) flags |= FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent(fieinfo, logical, physical, -- cgit v1.2.3-70-g09d2 From a9c667f8f0656631ee5438baaf21bf30d5f67375 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 6 Jun 2011 09:51:52 -0400 Subject: ext4: fixed tracepoints cleanup While creating fixed tracepoints for ext3, basically by porting them from ext4, I found a lot of useless retyping, wrong type usage, useless variable passing and other inconsistencies in the ext4 fixed tracepoint code. This patch cleans the fixed tracepoint code for ext4 and also simplify some of them. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.c | 6 +- include/trace/events/ext4.h | 179 +++++++++++++++++++------------------------- 3 files changed, 80 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a5763e3505b..e3126c05100 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2634,7 +2634,7 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; - trace_ext4_writepage(inode, page); + trace_ext4_writepage(page); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index eb71dd59f2e..6ed859d5685 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3578,8 +3578,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); - trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa, - grp_blk_start + bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, + next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3608,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(sb, pa); + trace_ext4_mb_release_group_pa(pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index e09592d2f91..5ce2b2f5f52 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -26,7 +26,7 @@ TRACE_EVENT(ext4_free_inode, __field( umode_t, mode ) __field( uid_t, uid ) __field( gid_t, gid ) - __field( blkcnt_t, blocks ) + __field( __u64, blocks ) ), TP_fast_assign( @@ -40,9 +40,8 @@ TRACE_EVENT(ext4_free_inode, TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->mode, __entry->uid, __entry->gid, - (unsigned long long) __entry->blocks) + (unsigned long) __entry->ino, __entry->mode, + __entry->uid, __entry->gid, __entry->blocks) ); TRACE_EVENT(ext4_request_inode, @@ -178,7 +177,7 @@ TRACE_EVENT(ext4_begin_ordered_truncate, TP_printk("dev %d,%d ino %lu new_size %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - (long long) __entry->new_size) + __entry->new_size) ); DECLARE_EVENT_CLASS(ext4__write_begin, @@ -204,7 +203,7 @@ DECLARE_EVENT_CLASS(ext4__write_begin, __entry->flags = flags; ), - TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u", + TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->flags) @@ -248,7 +247,7 @@ DECLARE_EVENT_CLASS(ext4__write_end, __entry->copied = copied; ), - TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u", + TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->copied) @@ -286,29 +285,6 @@ DEFINE_EVENT(ext4__write_end, ext4_da_write_end, TP_ARGS(inode, pos, len, copied) ); -TRACE_EVENT(ext4_writepage, - TP_PROTO(struct inode *inode, struct page *page), - - TP_ARGS(inode, page), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( pgoff_t, index ) - - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->index = page->index; - ), - - TP_printk("dev %d,%d ino %lu page_index %lu", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->index) -); - TRACE_EVENT(ext4_da_writepages, TP_PROTO(struct inode *inode, struct writeback_control *wbc), @@ -341,7 +317,7 @@ TRACE_EVENT(ext4_da_writepages, ), TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " - "range_start %llu range_end %llu sync_mode %d" + "range_start %lld range_end %lld sync_mode %d" "for_kupdate %d range_cyclic %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, @@ -449,7 +425,14 @@ DECLARE_EVENT_CLASS(ext4__page_op, TP_printk("dev %d,%d ino %lu page_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->index) + (unsigned long) __entry->index) +); + +DEFINE_EVENT(ext4__page_op, ext4_writepage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) ); DEFINE_EVENT(ext4__page_op, ext4_readpage, @@ -489,7 +472,7 @@ TRACE_EVENT(ext4_invalidatepage, TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->index, __entry->offset) + (unsigned long) __entry->index, __entry->offset) ); TRACE_EVENT(ext4_discard_blocks, @@ -562,12 +545,10 @@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, ); TRACE_EVENT(ext4_mb_release_inode_pa, - TP_PROTO(struct super_block *sb, - struct inode *inode, - struct ext4_prealloc_space *pa, + TP_PROTO(struct ext4_prealloc_space *pa, unsigned long long block, unsigned int count), - TP_ARGS(sb, inode, pa, block, count), + TP_ARGS(pa, block, count), TP_STRUCT__entry( __field( dev_t, dev ) @@ -578,8 +559,8 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ), TP_fast_assign( - __entry->dev = sb->s_dev; - __entry->ino = inode->i_ino; + __entry->dev = pa->pa_inode->i_sb->s_dev; + __entry->ino = pa->pa_inode->i_ino; __entry->block = block; __entry->count = count; ), @@ -591,10 +572,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ); TRACE_EVENT(ext4_mb_release_group_pa, - TP_PROTO(struct super_block *sb, - struct ext4_prealloc_space *pa), + TP_PROTO(struct ext4_prealloc_space *pa), - TP_ARGS(sb, pa), + TP_ARGS(pa), TP_STRUCT__entry( __field( dev_t, dev ) @@ -604,7 +584,7 @@ TRACE_EVENT(ext4_mb_release_group_pa, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = pa->pa_inode->i_sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), @@ -666,10 +646,10 @@ TRACE_EVENT(ext4_request_blocks, __field( ino_t, ino ) __field( unsigned int, flags ) __field( unsigned int, len ) - __field( __u64, logical ) + __field( __u32, logical ) + __field( __u32, lleft ) + __field( __u32, lright ) __field( __u64, goal ) - __field( __u64, lleft ) - __field( __u64, lright ) __field( __u64, pleft ) __field( __u64, pright ) ), @@ -687,17 +667,13 @@ TRACE_EVENT(ext4_request_blocks, __entry->pright = ar->pright; ), - TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu " - "lleft %llu lright %llu pleft %llu pright %llu ", + TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu " + "lleft %u lright %u pleft %llu pright %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->flags, __entry->len, - (unsigned long long) __entry->logical, - (unsigned long long) __entry->goal, - (unsigned long long) __entry->lleft, - (unsigned long long) __entry->lright, - (unsigned long long) __entry->pleft, - (unsigned long long) __entry->pright) + (unsigned long) __entry->ino, __entry->flags, + __entry->len, __entry->logical, __entry->goal, + __entry->lleft, __entry->lright, __entry->pleft, + __entry->pright) ); TRACE_EVENT(ext4_allocate_blocks, @@ -711,10 +687,10 @@ TRACE_EVENT(ext4_allocate_blocks, __field( __u64, block ) __field( unsigned int, flags ) __field( unsigned int, len ) - __field( __u64, logical ) + __field( __u32, logical ) + __field( __u32, lleft ) + __field( __u32, lright ) __field( __u64, goal ) - __field( __u64, lleft ) - __field( __u64, lright ) __field( __u64, pleft ) __field( __u64, pright ) ), @@ -733,17 +709,13 @@ TRACE_EVENT(ext4_allocate_blocks, __entry->pright = ar->pright; ), - TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu " - "goal %llu lleft %llu lright %llu pleft %llu pright %llu", + TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u " + "goal %llu lleft %u lright %u pleft %llu pright %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->flags, __entry->len, __entry->block, - (unsigned long long) __entry->logical, - (unsigned long long) __entry->goal, - (unsigned long long) __entry->lleft, - (unsigned long long) __entry->lright, - (unsigned long long) __entry->pleft, - (unsigned long long) __entry->pright) + (unsigned long) __entry->ino, __entry->flags, + __entry->len, __entry->block, __entry->logical, + __entry->goal, __entry->lleft, __entry->lright, + __entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_free_blocks, @@ -755,10 +727,10 @@ TRACE_EVENT(ext4_free_blocks, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( umode_t, mode ) __field( __u64, block ) __field( unsigned long, count ) - __field( int, flags ) + __field( int, flags ) ), TP_fast_assign( @@ -798,7 +770,7 @@ TRACE_EVENT(ext4_sync_file_enter, __entry->parent = dentry->d_parent->d_inode->i_ino; ), - TP_printk("dev %d,%d ino %ld parent %ld datasync %d ", + TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->parent, __entry->datasync) @@ -821,7 +793,7 @@ TRACE_EVENT(ext4_sync_file_exit, __entry->dev = inode->i_sb->s_dev; ), - TP_printk("dev %d,%d ino %ld ret %d", + TP_printk("dev %d,%d ino %lu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret) @@ -1005,7 +977,7 @@ DECLARE_EVENT_CLASS(ext4__mballoc, __entry->result_len = len; ), - TP_printk("dev %d,%d inode %lu extent %u/%d/%u ", + TP_printk("dev %d,%d inode %lu extent %u/%d/%d ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->result_group, __entry->result_start, @@ -1093,7 +1065,7 @@ TRACE_EVENT(ext4_da_update_reserve_space, "allocated_meta_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->mode, (unsigned long long) __entry->i_blocks, + __entry->mode, __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) ); @@ -1127,7 +1099,7 @@ TRACE_EVENT(ext4_da_reserve_space, "reserved_data_blocks %d reserved_meta_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->mode, (unsigned long long) __entry->i_blocks, + __entry->mode, __entry->i_blocks, __entry->md_needed, __entry->reserved_data_blocks, __entry->reserved_meta_blocks) ); @@ -1164,7 +1136,7 @@ TRACE_EVENT(ext4_da_release_space, "allocated_meta_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->mode, (unsigned long long) __entry->i_blocks, + __entry->mode, __entry->i_blocks, __entry->freed_blocks, __entry->reserved_data_blocks, __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) ); @@ -1239,14 +1211,15 @@ TRACE_EVENT(ext4_direct_IO_enter, __entry->rw = rw; ), - TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d", + TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - (unsigned long long) __entry->pos, __entry->len, __entry->rw) + __entry->pos, __entry->len, __entry->rw) ); TRACE_EVENT(ext4_direct_IO_exit, - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw, int ret), + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, + int rw, int ret), TP_ARGS(inode, offset, len, rw, ret), @@ -1268,10 +1241,10 @@ TRACE_EVENT(ext4_direct_IO_exit, __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d", + TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - (unsigned long long) __entry->pos, __entry->len, + __entry->pos, __entry->len, __entry->rw, __entry->ret) ); @@ -1296,15 +1269,15 @@ TRACE_EVENT(ext4_fallocate_enter, __entry->mode = mode; ), - TP_printk("dev %d,%d ino %ld pos %llu len %llu mode %d", + TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned long long) __entry->pos, - (unsigned long long) __entry->len, __entry->mode) + (unsigned long) __entry->ino, __entry->pos, + __entry->len, __entry->mode) ); TRACE_EVENT(ext4_fallocate_exit, - TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret), + TP_PROTO(struct inode *inode, loff_t offset, + unsigned int max_blocks, int ret), TP_ARGS(inode, offset, max_blocks, ret), @@ -1312,7 +1285,7 @@ TRACE_EVENT(ext4_fallocate_exit, __field( ino_t, ino ) __field( dev_t, dev ) __field( loff_t, pos ) - __field( unsigned, blocks ) + __field( unsigned int, blocks ) __field( int, ret ) ), @@ -1324,10 +1297,10 @@ TRACE_EVENT(ext4_fallocate_exit, __entry->ret = ret; ), - TP_printk("dev %d,%d ino %ld pos %llu blocks %d ret %d", + TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - (unsigned long long) __entry->pos, __entry->blocks, + __entry->pos, __entry->blocks, __entry->ret) ); @@ -1350,7 +1323,7 @@ TRACE_EVENT(ext4_unlink_enter, __entry->dev = dentry->d_inode->i_sb->s_dev; ), - TP_printk("dev %d,%d ino %ld size %lld parent %ld", + TP_printk("dev %d,%d ino %lu size %lld parent %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->size, (unsigned long) __entry->parent) @@ -1373,7 +1346,7 @@ TRACE_EVENT(ext4_unlink_exit, __entry->ret = ret; ), - TP_printk("dev %d,%d ino %ld ret %d", + TP_printk("dev %d,%d ino %lu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret) @@ -1387,7 +1360,7 @@ DECLARE_EVENT_CLASS(ext4__truncate, TP_STRUCT__entry( __field( ino_t, ino ) __field( dev_t, dev ) - __field( blkcnt_t, blocks ) + __field( __u64, blocks ) ), TP_fast_assign( @@ -1396,9 +1369,9 @@ DECLARE_EVENT_CLASS(ext4__truncate, __entry->blocks = inode->i_blocks; ), - TP_printk("dev %d,%d ino %lu blocks %lu", + TP_printk("dev %d,%d ino %lu blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, (unsigned long) __entry->blocks) + (unsigned long) __entry->ino, __entry->blocks) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, @@ -1417,7 +1390,7 @@ DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - unsigned len, unsigned flags), + unsigned int len, unsigned int flags), TP_ARGS(inode, lblk, len, flags), @@ -1425,8 +1398,8 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_enter, __field( ino_t, ino ) __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) - __field( unsigned, len ) - __field( unsigned, flags ) + __field( unsigned int, len ) + __field( unsigned int, flags ) ), TP_fast_assign( @@ -1440,7 +1413,7 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - (unsigned) __entry->lblk, __entry->len, __entry->flags) + __entry->lblk, __entry->len, __entry->flags) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, @@ -1459,7 +1432,7 @@ DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, DECLARE_EVENT_CLASS(ext4__map_blocks_exit, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, unsigned len, int ret), + ext4_fsblk_t pblk, unsigned int len, int ret), TP_ARGS(inode, lblk, pblk, len, ret), @@ -1468,7 +1441,7 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit, __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) - __field( unsigned, len ) + __field( unsigned int, len ) __field( int, ret ) ), @@ -1484,7 +1457,7 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit, TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, + __entry->lblk, __entry->pblk, __entry->len, __entry->ret) ); @@ -1524,7 +1497,7 @@ TRACE_EVENT(ext4_ext_load_extent, TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - (unsigned) __entry->lblk, (unsigned long long) __entry->pblk) + __entry->lblk, __entry->pblk) ); TRACE_EVENT(ext4_load_inode, -- cgit v1.2.3-70-g09d2 From 957df4535d06a8e009101239937ca5e50a6218c6 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Mon, 6 Jun 2011 11:33:12 +0400 Subject: possible memory corruption in cifs_parse_mount_options() error path after mountdata check frees uninitialized mountdata_copy Signed-off-by: Vasily Averin Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 84c730701c0..fb31c2c1bb1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -784,7 +784,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, struct smb_vol *vol) { char *value, *data, *end; - char *mountdata_copy, *options; + char *mountdata_copy = NULL, *options; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; -- cgit v1.2.3-70-g09d2 From 243e2dd38e1b322b29a1714034cc60b84d3d5e07 Mon Sep 17 00:00:00 2001 From: Darren Salt Date: Mon, 6 Jun 2011 16:58:16 +0000 Subject: CIFS ACL support needs CONFIG_KEYS, so depend on it Build fails if CONFIG_KEYS is not selected. Signed-off-by: Darren Salt Reviewed-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 66c68ab9e7d..53ed1ad2c11 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -149,7 +149,7 @@ config CIFS_FSCACHE config CIFS_ACL bool "Provide CIFS ACL support (EXPERIMENTAL)" - depends on EXPERIMENTAL && CIFS_XATTR + depends on EXPERIMENTAL && CIFS_XATTR && KEYS help Allows to fetch CIFS/NTFS ACL from the server. The DACL blob is handed over to the application/caller. -- cgit v1.2.3-70-g09d2 From b084f598df36b62dfae83c10ed17f0b66b50f442 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 31 May 2011 12:24:58 -0400 Subject: nfsd: fix dependency of nfsd on auth_rpcgss Commit b0b0c0a26e84 "nfsd: add proc file listing kernel's gss_krb5 enctypes" added an nunnecessary dependency of nfsd on the auth_rpcgss module. It's a little ad hoc, but since the only piece of information nfsd needs from rpcsec_gss_krb5 is a single static string, one solution is just to share it with an include file. Cc: stable@kernel.org Reported-by: Michael Guntsche Cc: Kevin Coffman Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 19 ++++++------------- include/linux/sunrpc/gss_krb5_enctypes.h | 4 ++++ net/sunrpc/auth_gss/gss_krb5_mech.c | 3 ++- 3 files changed, 12 insertions(+), 14 deletions(-) create mode 100644 include/linux/sunrpc/gss_krb5_enctypes.h (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1f5eae40f34..2b1449dd2f4 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "idmap.h" #include "nfsd.h" @@ -189,18 +190,10 @@ static struct file_operations export_features_operations = { .release = single_release, }; -#ifdef CONFIG_SUNRPC_GSS +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) static int supported_enctypes_show(struct seq_file *m, void *v) { - struct gss_api_mech *k5mech; - - k5mech = gss_mech_get_by_name("krb5"); - if (k5mech == NULL) - goto out; - if (k5mech->gm_upcall_enctypes != NULL) - seq_printf(m, k5mech->gm_upcall_enctypes); - gss_mech_put(k5mech); -out: + seq_printf(m, KRB5_SUPPORTED_ENCTYPES); return 0; } @@ -215,7 +208,7 @@ static struct file_operations supported_enctypes_ops = { .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_SUNRPC_GSS */ +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); @@ -1427,9 +1420,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, -#ifdef CONFIG_SUNRPC_GSS +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, -#endif /* CONFIG_SUNRPC_GSS */ +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, diff --git a/include/linux/sunrpc/gss_krb5_enctypes.h b/include/linux/sunrpc/gss_krb5_enctypes.h new file mode 100644 index 00000000000..ec6234eee89 --- /dev/null +++ b/include/linux/sunrpc/gss_krb5_enctypes.h @@ -0,0 +1,4 @@ +/* + * Dumb way to share this static piece of information with nfsd + */ +#define KRB5_SUPPORTED_ENCTYPES "18,17,16,23,3,1,2" diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 0a9a2ec2e46..c3b75333b82 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -43,6 +43,7 @@ #include #include #include +#include #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_AUTH @@ -750,7 +751,7 @@ static struct gss_api_mech gss_kerberos_mech = { .gm_ops = &gss_kerberos_ops, .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), .gm_pfs = gss_kerberos_pfs, - .gm_upcall_enctypes = "18,17,16,23,3,1,2", + .gm_upcall_enctypes = KRB5_SUPPORTED_ENCTYPES, }; static int __init init_kerberos_module(void) -- cgit v1.2.3-70-g09d2 From be1f4084b4824301e640e81d63b6275cd99ee6a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 6 Jun 2011 11:22:17 -0700 Subject: nfsd: v4 support requires CRYPTO nfsd V4 support uses crypto interfaces, so select CRYPTO to fix build errors in 2.6.39: ERROR: "crypto_destroy_tfm" [fs/nfsd/nfsd.ko] undefined! ERROR: "crypto_alloc_base" [fs/nfsd/nfsd.ko] undefined! Reported-by: Wakko Warner Signed-off-by: Randy Dunlap Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 18b3e8975fe..fbb2a5ef581 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -82,6 +82,7 @@ config NFSD_V4 select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS + select CRYPTO help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). -- cgit v1.2.3-70-g09d2 From 7d751f6f8c679f51b73d01a1b5269347a929004c Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 3 Jun 2011 12:21:23 -0400 Subject: nfsd: link returns nfserr_delay when breaking lease fix for commit 4795bb37effb7b8fe77e2d2034545d062d3788a8, nfsd: break lease on unlink, link, and rename if the LINK operation breaks a delegation, it returns NFS4ERR_NOENT (which is not a valid error in rfc 5661) instead of NFS4ERR_DELAY. the return value of nfsd_break_lease() in nfsd_link() must be converted from host_err to err Signed-off-by: Casey Bodley Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d5718273bb3..f3fb61b69a1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1660,8 +1660,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (!dold->d_inode) goto out_drop_write; host_err = nfsd_break_lease(dold->d_inode); - if (host_err) + if (host_err) { + err = nfserrno(host_err); goto out_drop_write; + } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); -- cgit v1.2.3-70-g09d2 From 9c4843ea576107a3c1fb94f2f758f198e9fe9e54 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 6 Jun 2011 15:40:23 -0400 Subject: cifs: silence printk when establishing first session on socket When signing is enabled, the first session that's established on a socket will cause a printk like this to pop: CIFS VFS: Unexpected SMB signature This is because the key exchange hasn't happened yet, so the signature field is bogus. Don't try to check the signature on the socket until the first session has been established. Also, eliminate the specific check for SMB_COM_NEGOTIATE since this check covers that case too. Cc: stable@kernel.org Cc: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index dfbd9f1f373..5a0ee7f2af0 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -184,7 +184,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, if (cifs_pdu == NULL || server == NULL) return -EINVAL; - if (cifs_pdu->Command == SMB_COM_NEGOTIATE) + if (!server->session_estab) return 0; if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) { -- cgit v1.2.3-70-g09d2 From 9054760ff585a7fa436599990b63a585ae89ff4d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 Jun 2011 19:22:56 +0100 Subject: lmLogOpen() broken failure exit Callers of lmLogOpen() expect it to return -E... on failure exits, which is what it returns, except for the case of blkdev_get_by_dev() failure. It that case lmLogOpen() return the error with the wrong sign... Signed-off-by: Al Viro Acked-by: Dave Kleikamp --- fs/jfs/jfs_logmgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 278e3fb40b7..583636f745e 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1123,7 +1123,7 @@ int lmLogOpen(struct super_block *sb) bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, log); if (IS_ERR(bdev)) { - rc = -PTR_ERR(bdev); + rc = PTR_ERR(bdev); goto free; } -- cgit v1.2.3-70-g09d2 From e6bc45d65df8599fdbae73be9cec4ceed274db53 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 6 Jun 2011 19:19:40 -0400 Subject: vfs: make unlink() and rmdir() return ENOENT in preference to EROFS If user space attempts to remove a non-existent file or directory, and the file system is mounted read-only, return ENOENT instead of EROFS. Either error code is arguably valid/correct, but ENOENT is a more specific error message. Reported-by: Michael Tokarev Signed-off-by: "Theodore Ts'o" Signed-off-by: Al Viro --- fs/namei.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index e2e4e8d032e..9802345df5e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2624,6 +2624,10 @@ static long do_rmdir(int dfd, const char __user *pathname) error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit2; + if (!dentry->d_inode) { + error = -ENOENT; + goto exit3; + } error = mnt_want_write(nd.path.mnt); if (error) goto exit3; @@ -2709,11 +2713,10 @@ static long do_unlinkat(int dfd, const char __user *pathname) error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ - if (nd.last.name[nd.last.len]) - goto slashes; inode = dentry->d_inode; - if (inode) - ihold(inode); + if (nd.last.name[nd.last.len] || !inode) + goto slashes; + ihold(inode); error = mnt_want_write(nd.path.mnt); if (error) goto exit2; -- cgit v1.2.3-70-g09d2 From 70b666c3b4cb2b96098d80e6f515e4bc6d37db5a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 27 May 2011 09:24:26 -0700 Subject: ceph: use ihold when we already have an inode ref We should use ihold whenever we already have a stable inode ref, even when we aren't holding i_lock. This avoids adding new and unnecessary locking dependencies. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 2 +- fs/ceph/caps.c | 10 ++++------ fs/ceph/dir.c | 11 +++++++---- fs/ceph/export.c | 4 ++-- fs/ceph/file.c | 3 ++- fs/ceph/inode.c | 18 ++++++++++-------- fs/ceph/ioctl.c | 6 ++++-- fs/ceph/locks.c | 3 ++- fs/ceph/snap.c | 2 +- fs/ceph/xattr.c | 6 ++++-- 10 files changed, 37 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 33da49dc3cc..5a3953db811 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -453,7 +453,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) int err; struct inode *inode = page->mapping->host; BUG_ON(!inode); - igrab(inode); + ihold(inode); err = writepage_nounlock(page, wbc); unlock_page(page); iput(inode); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1f72b00447c..f605753c8fe 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2940,14 +2940,12 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) while (!list_empty(&mdsc->cap_dirty)) { ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, i_dirty_item); - inode = igrab(&ci->vfs_inode); + inode = &ci->vfs_inode; + ihold(inode); dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); - if (inode) { - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, - NULL); - iput(inode); - } + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 33729e822bb..ef8f08c343e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -308,7 +308,8 @@ more: req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_dentry = dget(filp->f_dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; @@ -787,10 +788,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); - if (err) + if (err) { d_drop(dentry); - else if (!req->r_reply_info.head->is_dentry) - d_instantiate(dentry, igrab(old_dentry->d_inode)); + } else if (!req->r_reply_info.head->is_dentry) { + ihold(old_dentry->d_inode); + d_instantiate(dentry, old_dentry->d_inode); + } ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index a610d3d6748..f67b687550d 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -109,7 +109,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, err = ceph_mdsc_do_request(mdsc, NULL, req); inode = req->r_target_inode; if (inode) - igrab(inode); + ihold(inode); ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(-ESTALE); @@ -167,7 +167,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, err = ceph_mdsc_do_request(mdsc, NULL, req); inode = req->r_target_inode; if (inode) - igrab(inode); + ihold(inode); ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(err ? err : -ESTALE); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 203252d88d9..8c5ac4e7283 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -191,7 +191,8 @@ int ceph_open(struct inode *inode, struct file *file) err = PTR_ERR(req); goto out; } - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, parent_inode, req); if (!err) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 70b6a4839c3..d8858e96ab1 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1101,10 +1101,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - igrab(in); + ihold(in); } else if (ceph_ino(in) == vino.ino && ceph_snap(in) == vino.snap) { - igrab(in); + ihold(in); } else { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", dn, in, ceph_ino(in), ceph_snap(in), @@ -1144,7 +1144,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - igrab(in); + ihold(in); rinfo->head->is_dentry = 1; /* fool notrace handlers */ } @@ -1328,7 +1328,7 @@ void ceph_queue_writeback(struct inode *inode) if (queue_work(ceph_inode_to_client(inode)->wb_wq, &ceph_inode(inode)->i_wb_work)) { dout("ceph_queue_writeback %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_writeback %p failed\n", inode); } @@ -1353,7 +1353,7 @@ void ceph_queue_invalidate(struct inode *inode) if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, &ceph_inode(inode)->i_pg_inv_work)) { dout("ceph_queue_invalidate %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_invalidate %p failed\n", inode); } @@ -1477,7 +1477,7 @@ void ceph_queue_vmtruncate(struct inode *inode) if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_vmtruncate %p failed, pending=%d\n", inode, ci->i_truncate_pending); @@ -1738,7 +1738,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) __mark_inode_dirty(inode, inode_dirty_flags); if (mask) { - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; @@ -1779,7 +1780,8 @@ int ceph_do_getattr(struct inode *inode, int mask) req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_num_caps = 1; req->r_args.getattr.mask = cpu_to_le32(mask); err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 8888c9ba68d..ef0b5f48e13 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -73,7 +73,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -135,7 +136,8 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 476b329867d..7f0f72c53f7 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -23,7 +23,8 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 24067d68a55..54b14de2e72 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -722,7 +722,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); inode = &ci->vfs_inode; - igrab(inode); + ihold(inode); spin_unlock(&mdsc->snap_flush_lock); spin_lock(&inode->i_lock); __ceph_flush_snaps(ci, &session, 0); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f2b62869618..f42d730f1b6 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -665,7 +665,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = PTR_ERR(req); goto out; } - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; req->r_args.setxattr.flags = cpu_to_le32(flags); @@ -795,7 +796,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); -- cgit v1.2.3-70-g09d2 From c3cd62839aaa2cdb2b99687c9e44f1b300a4aece Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 1 Jun 2011 16:08:44 -0700 Subject: ceph: fix short sync reads from the OSD If we get a short read from the OSD because the object is small, we need to zero the remainder of the buffer. For O_DIRECT reads, the attempted range is not trimmed to i_size by the VFS, so we were actually looping indefinitely. Fix by trimming by i_size, and the unconditionally zeroing the trailing range. Reported-by: Jeff Wu Signed-off-by: Sage Weil --- fs/ceph/file.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8c5ac4e7283..b654f403139 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -283,7 +283,7 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool align_to_pages, + int *checkeof, bool o_direct, unsigned long buf_align) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -308,7 +308,7 @@ static int striped_read(struct inode *inode, io_align = off & ~PAGE_MASK; more: - if (align_to_pages) + if (o_direct) page_align = (pos - io_align + buf_align) & ~PAGE_MASK; else page_align = pos & ~PAGE_MASK; @@ -346,20 +346,22 @@ more: } if (was_short) { - /* was original extent fully inside i_size? */ - if (pos + left <= inode->i_size) { - dout("zero tail\n"); - ceph_zero_page_vector_range(page_off + read, len - read, + /* did we bounce off eof? */ + if (pos + left > inode->i_size) + *checkeof = 1; + + /* zero trailing bytes (inside i_size) */ + if (left > 0 && pos < inode->i_size) { + if (pos + left > inode->i_size) + left = inode->i_size - pos; + + dout("zero tail %d\n", left); + ceph_zero_page_vector_range(page_off + read, left, pages); - read = len; - goto out; + read += left; } - - /* check i_size */ - *checkeof = 1; } -out: if (ret >= 0) ret = read; dout("striped_read returns %d\n", ret); @@ -659,7 +661,7 @@ out: /* hit EOF or hole? */ if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, reading more\n"); + dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); read += ret; base += ret; len -= ret; -- cgit v1.2.3-70-g09d2 From 0e98728fa32d338907631349a8cc2afa07c0cb9a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 7 Jun 2011 20:40:35 -0700 Subject: ceph: fix ENOENT logic in striped_read Getting ENOENT is equivalent to reading 0 bytes. Make that correction before setting up the hit_stripe and was_short flags. Fixes the following case: dd if=/dev/zero of=/mnt/fs_depot/dd3 bs=1 seek=1048576 count=0 dd if=/mnt/fs_depot/dd3 of=/root/ddout1 skip=8 bs=500 count=2 iflag=direct Reported-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b654f403139..9542f07d0b9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -318,10 +318,10 @@ more: ci->i_truncate_seq, ci->i_truncate_size, page_pos, pages_left, page_align); - hit_stripe = this_len < left; - was_short = ret >= 0 && ret < this_len; if (ret == -ENOENT) ret = 0; + hit_stripe = this_len < left; + was_short = ret >= 0 && ret < this_len; dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); -- cgit v1.2.3-70-g09d2 From 0c1f91f27140cf3b6e38dc4e892adac241c73a20 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 25 May 2011 14:56:12 -0700 Subject: ceph: unwind canceled flock state If we request a lock and then abort (e.g., ^C), we need to send a matching unlock request to the MDS to unwind our lock attempt to avoid indefinitely blocking other clients. Reported-by: Brian Chrisman Signed-off-by: Sage Weil --- fs/ceph/locks.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 7f0f72c53f7..80576d05d68 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -33,11 +33,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, length = fl->fl_end - fl->fl_start + 1; dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type`: %d", (int)lock_type, + "length: %llu, wait: %d, type: %d", (int)lock_type, (int)operation, (u64)fl->fl_pid, fl->fl_start, length, wait, fl->fl_type); - req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); @@ -71,7 +70,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, } ceph_mdsc_put_request(req); dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type, + "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, (int)operation, (u64)fl->fl_pid, fl->fl_start, length, wait, fl->fl_type, err); return err; @@ -110,16 +109,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { - /* undo! This should only happen if the kernel detects - * local deadlock. */ + /* undo! This should only happen if + * the kernel detects local + * deadlock. */ ceph_lock_message(CEPH_LOCK_FCNTL, op, file, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on posix_lock_file, undid lock", err); + dout("got %d on posix_lock_file, undid lock", + err); } } - } else { - dout("mds returned error code %d", err); + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); } return err; } @@ -156,8 +159,11 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) file, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on flock_lock_file_wait, undid lock", err); } - } else { - dout("mds error code %d", err); + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); } return err; } -- cgit v1.2.3-70-g09d2 From 83fb086e0ecd879a4676cf12fc7afc1f9ecd1784 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 8 Jun 2011 07:35:24 -0400 Subject: cifs: trivial: add space in fsc error message Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index fb31c2c1bb1..bb659eb7381 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1391,7 +1391,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, "/proc/fs/cifs/LookupCacheEnabled to 0\n"); } else if (strnicmp(data, "fsc", 3) == 0) { #ifndef CONFIG_CIFS_FSCACHE - cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE" + cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE " "kernel config option set"); goto cifs_parse_mount_err; #endif -- cgit v1.2.3-70-g09d2 From 86d4a77ba3dc4ace238a0556541a41df2bd71d49 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 25 May 2011 13:03:16 -0400 Subject: Btrfs: cache bitmaps when searching for a cluster If we are looking for a cluster in a particularly sparse or fragmented block group, we will do a lot of looping through the free space tree looking for various things, and if we need to look at bitmaps we will endup doing the whole dance twice. So instead add the bitmap entries to a temporary list so if we have to do the bitmap search we can just look through the list of entries we've found quickly instead of having to loop through the entire tree again. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 54 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ad144736a5f..930c07f79b3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2144,6 +2144,7 @@ again: */ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -2166,6 +2167,8 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * extent entry. */ while (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) return -ENOSPC; @@ -2185,8 +2188,12 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, return -ENOSPC; entry = rb_entry(node, struct btrfs_free_space, offset_index); - if (entry->bitmap) + if (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); continue; + } + /* * we haven't filled the empty size and the window is * very large. reset and try again @@ -2240,6 +2247,7 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, */ static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -2250,10 +2258,39 @@ static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, if (ctl->total_bitmaps == 0) return -ENOSPC; + /* + * First check our cached list of bitmaps and see if there is an entry + * here that will work. + */ + list_for_each_entry(entry, bitmaps, list) { + if (entry->bytes < min_bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, min_bytes); + if (!ret) + return 0; + } + + /* + * If we do have entries on our list and we are here then we didn't find + * anything, so go ahead and get the next entry after the last entry in + * this list and start the search from there. + */ + if (!list_empty(bitmaps)) { + entry = list_entry(bitmaps->prev, struct btrfs_free_space, + list); + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + goto search; + } + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); if (!entry) return -ENOSPC; +search: node = &entry->offset_index; do { entry = rb_entry(node, struct btrfs_free_space, offset_index); @@ -2284,6 +2321,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct list_head bitmaps; + struct btrfs_free_space *entry, *tmp; u64 min_bytes; int ret; @@ -2322,11 +2361,16 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes, - min_bytes); + INIT_LIST_HEAD(&bitmaps); + ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, + bytes, min_bytes); if (ret) - ret = setup_cluster_bitmap(block_group, cluster, offset, - bytes, min_bytes); + ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, + offset, bytes, min_bytes); + + /* Clear our temporary list */ + list_for_each_entry_safe(entry, tmp, &bitmaps, list) + list_del_init(&entry->list); if (!ret) { atomic_inc(&block_group->count); -- cgit v1.2.3-70-g09d2 From 3de85bb95cc50d0977cbb7a0c605e894be4c790d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 25 May 2011 13:07:37 -0400 Subject: Btrfs: noinline the cluster searching functions When profiling the find cluster code it's hard to tell where we are spending our time because the bitmap and non-bitmap functions get inlined by the compiler, so make that not happen. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 930c07f79b3..f56caacfd8a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2142,10 +2142,11 @@ again: /* * This searches the block group for just extents to fill the cluster with. */ -static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - struct list_head *bitmaps, - u64 offset, u64 bytes, u64 min_bytes) +static noinline int +setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; @@ -2245,10 +2246,11 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * This specifically looks for bitmaps that may work in the cluster, we assume * that we have already failed to find extents that will work. */ -static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - struct list_head *bitmaps, - u64 offset, u64 bytes, u64 min_bytes) +static noinline int +setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; -- cgit v1.2.3-70-g09d2 From f2bb8f5cfb3bce595b2de251ed7638047fc4e530 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 25 May 2011 13:10:16 -0400 Subject: Btrfs: don't commit the transaction if we dont have enough pinned bytes I noticed when running an enospc test that we would get stuck committing the transaction in check_data_space even though we truly didn't have enough space. So check to see if bytes_pinned is bigger than num_bytes, if it's not don't commit the transaction. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b9b6b6df24..0d0a3fe77bb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3089,6 +3089,13 @@ alloc: } goto again; } + + /* + * If we have less pinned bytes than we want to allocate then + * don't bother committing the transaction, it won't help us. + */ + if (data_sinfo->bytes_pinned < bytes) + committed = 1; spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ -- cgit v1.2.3-70-g09d2 From 2cdc342c204dba69ca3b2ec43d8e6ff41ed920b8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 27 May 2011 14:07:49 -0400 Subject: Btrfs: fix bitmap regression In cleaning up the clustering code I accidently introduced a regression by adding bitmap entries to the cluster rb tree. The problem is if we've maxed out the number of bitmaps we can have for the block group we can only add free space to the bitmaps, but since the bitmap is on the cluster we can't find it and we try to create another one. This would result in a panic because the total bitmaps was bigger than the max bitmaps that were allowed. This patch fixes this by checking to see if we have a cluster, and then looking at the cluster rb tree to see if it has a bitmap entry and if it does and that space belongs to that bitmap, go ahead and add it to that bitmap. I could hit this panic every time with an fs_mark test within a couple of minutes. With this patch I no longer hit the panic and fs_mark goes to completion. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 88 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f56caacfd8a..8258ccf85db 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1417,6 +1417,23 @@ again: return 0; } +static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + u64 bytes_to_set = 0; + u64 end; + + end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); + + bytes_to_set = min(end - offset, bytes); + + bitmap_set_bits(ctl, info, offset, bytes_to_set); + + return bytes_to_set; + +} + static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { @@ -1453,12 +1470,18 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } +static struct btrfs_free_space_op free_space_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_free_space *bitmap_info; + struct btrfs_block_group_cache *block_group = NULL; int added = 0; - u64 bytes, offset, end; + u64 bytes, offset, bytes_added; int ret; bytes = info->bytes; @@ -1467,6 +1490,47 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, if (!ctl->op->use_bitmap(ctl, info)) return 0; + if (ctl->op == &free_space_op) + block_group = ctl->private; + + /* + * Since we link bitmaps right into the cluster we need to see if we + * have a cluster here, and if so and it has our bitmap we need to add + * the free space to that bitmap. + */ + if (block_group && !list_empty(&block_group->cluster_list)) { + struct btrfs_free_cluster *cluster; + struct rb_node *node; + struct btrfs_free_space *entry; + + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + spin_lock(&cluster->lock); + node = rb_first(&cluster->root); + if (!node) { + spin_unlock(&cluster->lock); + goto again; + } + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (!entry->bitmap) { + spin_unlock(&cluster->lock); + goto again; + } + + if (entry->offset == offset_to_bitmap(ctl, offset)) { + bytes_added = add_bytes_to_bitmap(ctl, entry, + offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + } + spin_unlock(&cluster->lock); + if (!bytes) { + ret = 1; + goto out; + } + } again: bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); @@ -1475,19 +1539,10 @@ again: goto new_bitmap; } - end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); - - if (offset >= bitmap_info->offset && offset + bytes > end) { - bitmap_set_bits(ctl, bitmap_info, offset, end - offset); - bytes -= end - offset; - offset = end; - added = 0; - } else if (offset >= bitmap_info->offset && offset + bytes <= end) { - bitmap_set_bits(ctl, bitmap_info, offset, bytes); - bytes = 0; - } else { - BUG(); - } + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + added = 0; if (!bytes) { ret = 1; @@ -1766,11 +1821,6 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, "\n", count); } -static struct btrfs_free_space_op free_space_op = { - .recalc_thresholds = recalculate_thresholds, - .use_bitmap = use_bitmap, -}; - void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; -- cgit v1.2.3-70-g09d2 From 723bda2083d44edbd6be0f0b09f902120dc07442 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 27 May 2011 16:11:38 -0400 Subject: Btrfs: fix the allocator loop logic I was testing with empty_cluster = 0 to try and reproduce a problem and kept hitting early enospc panics. This was because our loop logic was a little confused. So this is what I did 1) Make the loop variable the ultimate decider on wether we should loop again isntead of checking to see if we had an uncached bg, empty size or empty cluster. 2) Increment loop before checking to see what we are on to make the loop definitions make more sense. 3) If we are on the chunk alloc loop don't set empty_size/empty_cluster to 0 unless we didn't actually allocate a chunk. If we did allocate a chunk we should be able to easily setup a new cluster so clearing empty_size/empty_cluster makes us less efficient. This kept me from hitting panics while trying to reproduce the other problem. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0d0a3fe77bb..b42efc2ded5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5218,9 +5218,7 @@ loop: * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ - if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && - (found_uncached_bg || empty_size || empty_cluster || - allowed_chunk_alloc)) { + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; @@ -5260,32 +5258,36 @@ loop: goto search; } - if (loop < LOOP_CACHING_WAIT) { - loop++; - goto search; - } + loop++; if (loop == LOOP_ALLOC_CHUNK) { - empty_size = 0; - empty_cluster = 0; - } + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, + CHUNK_ALLOC_LIMITED); + allowed_chunk_alloc = 0; + if (ret == 1) + done_chunk_alloc = 1; + } else if (!done_chunk_alloc && + space_info->force_alloc == + CHUNK_ALLOC_NO_FORCE) { + space_info->force_alloc = CHUNK_ALLOC_LIMITED; + } - if (allowed_chunk_alloc) { - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, - CHUNK_ALLOC_LIMITED); - allowed_chunk_alloc = 0; - done_chunk_alloc = 1; - } else if (!done_chunk_alloc && - space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) { - space_info->force_alloc = CHUNK_ALLOC_LIMITED; + /* + * We didn't allocate a chunk, go ahead and drop the + * empty size and loop again. + */ + if (!done_chunk_alloc) + loop = LOOP_NO_EMPTY_SIZE; } - if (loop < LOOP_NO_EMPTY_SIZE) { - loop++; - goto search; + if (loop == LOOP_NO_EMPTY_SIZE) { + empty_size = 0; + empty_cluster = 0; } - ret = -ENOSPC; + + goto search; } else if (!ins->objectid) { ret = -ENOSPC; } else if (ins->objectid) { -- cgit v1.2.3-70-g09d2 From f6a398298d34af66ec3a2d82a44a4dbc5277357d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 6 Jun 2011 10:50:35 -0400 Subject: Btrfs: fix duplicate checking logic When merging my code into the integration test the second check for duplicate entries got screwed up. This patch fixes it by dropping ret2 and just using ret for the return value, and checking if we got an error before adding the bitmap to the local list. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8258ccf85db..38f3fd92304 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -250,7 +250,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, pgoff_t index = 0; unsigned long first_page_offset; int num_checksums; - int ret = 0, ret2; + int ret = 0; INIT_LIST_HEAD(&bitmaps); @@ -421,11 +421,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, goto free_cache; } spin_lock(&ctl->tree_lock); - ret2 = link_free_space(ctl, e); + ret = link_free_space(ctl, e); ctl->total_bitmaps++; ctl->op->recalc_thresholds(ctl); spin_unlock(&ctl->tree_lock); - list_add_tail(&e->list, &bitmaps); if (ret) { printk(KERN_ERR "Duplicate entries in " "free space cache, dumping\n"); @@ -434,6 +433,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, page_cache_release(page); goto free_cache; } + list_add_tail(&e->list, &bitmaps); } num_entries--; -- cgit v1.2.3-70-g09d2 From 25b8b936ed44814a5ce6fc3b2a21401f33cd56f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 8 Jun 2011 14:36:54 -0400 Subject: Btrfs: don't map extent buffer if path->skip_locking is set Arne's scrub stuff exposed a problem with mapping the extent buffer in reada_for_search. He searches the commit root with multiple threads and with skip_locking set, so we can race and overwrite node->map_token since node isn't locked. So fix this so that we only map the extent buffer if we don't already have a map_token and skip_locking isn't set. Without this patch scrub would panic almost immediately, with the patch it doesn't panic anymore. Thanks, Reported-by: Arne Jansen Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d84089349c8..2e667868e0d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1228,6 +1228,7 @@ static void reada_for_search(struct btrfs_root *root, u32 nr; u32 blocksize; u32 nscan = 0; + bool map = true; if (level != 1) return; @@ -1249,8 +1250,11 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; + if (node->map_token || path->skip_locking) + map = false; + while (1) { - if (!node->map_token) { + if (map && !node->map_token) { unsigned long offset = btrfs_node_key_ptr_offset(nr); map_private_extent_buffer(node, offset, sizeof(struct btrfs_key_ptr), @@ -1277,7 +1281,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - if (node->map_token) { + if (map && node->map_token) { unmap_extent_buffer(node, node->map_token, KM_USER1); node->map_token = NULL; @@ -1289,7 +1293,7 @@ static void reada_for_search(struct btrfs_root *root, if ((nread > 65536 || nscan > 32)) break; } - if (node->map_token) { + if (map && node->map_token) { unmap_extent_buffer(node, node->map_token, KM_USER1); node->map_token = NULL; } -- cgit v1.2.3-70-g09d2 From 3473f3c06a36865ae05993041fff35ee928342a7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 9 Jun 2011 10:15:17 -0400 Subject: Btrfs: unlock the trans lock properly In btrfs_wait_for_commit if we came upon a transaction that had committed we just exited, but that's bad since we are holding the trans_lock. So break instead so that the lock is dropped. Thanks, Reported-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dd719662340..6b2e4786d18 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -349,7 +349,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) list) { if (t->in_commit) { if (t->commit_done) - goto out; + break; cur_trans = t; atomic_inc(&cur_trans->use_count); break; -- cgit v1.2.3-70-g09d2 From dac853ae89043f1b7752875300faf614de43c74b Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 9 Jun 2011 20:05:18 +0200 Subject: exec: delay address limit change until point of no return Unconditionally changing the address limit to USER_DS and not restoring it to its old value in the error path is wrong because it prevents us using kernel memory on repeated calls to this function. This, in fact, breaks the fallback of hard coded paths to the init program from being ever successful if the first candidate fails to load. With this patch applied switching to USER_DS is delayed until the point of no return is reached which makes it possible to have a multi-arch rootfs with one arch specific init binary for each of the (hard coded) probed paths. Since the address limit is already set to USER_DS when start_thread() will be invoked, this redundancy can be safely removed. Signed-off-by: Mathias Krause Cc: Al Viro Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/process_32.c | 1 - arch/x86/kernel/process_64.c | 1 - fs/exec.c | 5 +---- 3 files changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af4..a3d0dc59067 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -245,7 +245,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { set_user_gs(regs, 0); regs->fs = 0; - set_fs(USER_DS); regs->ds = __USER_DS; regs->es = __USER_DS; regs->ss = __USER_DS; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6c9dd922ac0..ca6f7ab8df3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -338,7 +338,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; - set_fs(USER_DS); /* * Free the old FP and other extended state */ diff --git a/fs/exec.c b/fs/exec.c index ea5f748906a..97e0d52d72f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1093,6 +1093,7 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ + set_fs(USER_DS); current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD); flush_thread(); current->personality &= ~bprm->per_clear; @@ -1357,10 +1358,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) if (retval) return retval; - /* kernel module loader fixup */ - /* so we don't try to load run modprobe in kernel space. */ - set_fs(USER_DS); - retval = audit_bprm(bprm); if (retval) return retval; -- cgit v1.2.3-70-g09d2 From ad3e34bba4b64ab8e1f5ea1a17768e1a0d9648ea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 8 Jun 2011 14:45:50 -0400 Subject: Btrfs: don't map extent buffer if path->skip_locking is set Arne's scrub stuff exposed a problem with mapping the extent buffer in reada_for_search. He searches the commit root with multiple threads and with skip_locking set, so we can race and overwrite node->map_token since node isn't locked. So fix this so that we only map the extent buffer if we don't already have a map_token and skip_locking isn't set. Without this patch scrub would panic almost immediately, with the patch it doesn't panic anymore. Thanks, Reported-by: Arne Jansen Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d84089349c8..2e667868e0d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1228,6 +1228,7 @@ static void reada_for_search(struct btrfs_root *root, u32 nr; u32 blocksize; u32 nscan = 0; + bool map = true; if (level != 1) return; @@ -1249,8 +1250,11 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; + if (node->map_token || path->skip_locking) + map = false; + while (1) { - if (!node->map_token) { + if (map && !node->map_token) { unsigned long offset = btrfs_node_key_ptr_offset(nr); map_private_extent_buffer(node, offset, sizeof(struct btrfs_key_ptr), @@ -1277,7 +1281,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - if (node->map_token) { + if (map && node->map_token) { unmap_extent_buffer(node, node->map_token, KM_USER1); node->map_token = NULL; @@ -1289,7 +1293,7 @@ static void reada_for_search(struct btrfs_root *root, if ((nread > 65536 || nscan > 32)) break; } - if (node->map_token) { + if (map && node->map_token) { unmap_extent_buffer(node, node->map_token, KM_USER1); node->map_token = NULL; } -- cgit v1.2.3-70-g09d2 From 8c51032f978bac5bec5dae0c5de4f85db97c1cc9 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 3 Jun 2011 10:09:26 +0200 Subject: btrfs: scrub: errors in tree enumeration due to the semantics of btrfs_search_slot the path can point to an invalid slot when ret > 0. This condition went unnoticed, which in turn could have led to an incomplete scrubbing. Signed-off-by: Arne Jansen --- fs/btrfs/scrub.c | 57 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index df50fd1eca8..d5a4108ceda 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -804,18 +804,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; - - l = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid != logical) { - ret = btrfs_previous_item(root, path, 0, - BTRFS_EXTENT_ITEM_KEY); - if (ret < 0) - goto out; - } + goto out_noplug; + /* + * we might miss half an extent here, but that doesn't matter, + * as it's only the prefetch + */ while (1) { l = path->nodes[0]; slot = path->slots[0]; @@ -824,7 +818,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (ret == 0) continue; if (ret < 0) - goto out; + goto out_noplug; break; } @@ -906,15 +900,20 @@ again: ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - - l = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid != logical) { + if (ret > 0) { ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY); if (ret < 0) goto out; + if (ret > 0) { + /* there's no smaller item, so stick with the + * larger one */ + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } } while (1) { @@ -989,6 +988,7 @@ next: out: blk_finish_plug(&plug); +out_noplug: btrfs_free_path(path); return ret < 0 ? ret : 0; } @@ -1064,8 +1064,15 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; - ret = 0; + break; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + } l = path->nodes[0]; slot = path->slots[0]; @@ -1075,7 +1082,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) if (found_key.objectid != sdev->dev->devid) break; - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) break; if (found_key.offset >= end) @@ -1104,7 +1111,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) cache = btrfs_lookup_block_group(fs_info, chunk_offset); if (!cache) { ret = -ENOENT; - goto out; + break; } ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, chunk_offset, length); @@ -1116,9 +1123,13 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) btrfs_release_path(path); } -out: btrfs_free_path(path); - return ret; + + /* + * ret can still be 1 from search_slot or next_leaf, + * that's not an error + */ + return ret < 0 ? ret : 0; } static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) -- cgit v1.2.3-70-g09d2 From 632dd772fcbde2ba37c0e8983bd38ef4a1eac906 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 12:07:07 +0200 Subject: btrfs: reinitialize scrub workers Scrub starts the workers each time a scrub starts and stops them after it finished. This patch adds an initialization for the workers before each start, otherwise the workers behave strangely. Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/scrub.c | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a203d363184..7bbbfebe47e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1668,8 +1668,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->scrub_pause_wait); init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, &fs_info->generic_worker); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d5a4108ceda..92cac19388e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1166,8 +1166,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; mutex_lock(&fs_info->scrub_lock); - if (fs_info->scrub_workers_refcnt == 0) + if (fs_info->scrub_workers_refcnt == 0) { + btrfs_init_workers(&fs_info->scrub_workers, "scrub", + fs_info->thread_pool_size, &fs_info->generic_worker); + fs_info->scrub_workers.idle_thresh = 4; btrfs_start_workers(&fs_info->scrub_workers, 1); + } ++fs_info->scrub_workers_refcnt; mutex_unlock(&fs_info->scrub_lock); -- cgit v1.2.3-70-g09d2 From 6eef3125886df260ca0e8758d141308152226f6a Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 13:04:58 +0200 Subject: btrfs: remove unneeded includes from scrub.c Signed-off-by: Arne Jansen --- fs/btrfs/scrub.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 92cac19388e..a8d03d5efb5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -16,13 +16,7 @@ * Boston, MA 021110-1307, USA. */ -#include -#include -#include #include -#include -#include -#include #include "ctree.h" #include "volumes.h" #include "disk-io.h" -- cgit v1.2.3-70-g09d2 From 38e87880666091fe9c572a7a2ed2e771d97ca5aa Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 10 Jun 2011 16:36:57 -0400 Subject: Btrfs: make sure to recheck for bitmaps in clusters Josef recently changed the free extent cache to look in the block group cluster for any bitmaps before trying to add a new bitmap for the same offset. This avoids BUG_ON()s due covering duplicate ranges. But it didn't go quite far enough. A given free range might span between one or more bitmaps or free space entries. The code has looping to cover this, but it doesn't check for clustered bitmaps every time. This shuffles our gotos to check for a bitmap in the cluster for every new bitmap entry we try to add. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 38f3fd92304..9f985a42987 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1492,7 +1492,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, if (ctl->op == &free_space_op) block_group = ctl->private; - +again: /* * Since we link bitmaps right into the cluster we need to see if we * have a cluster here, and if so and it has our bitmap we need to add @@ -1510,13 +1510,13 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, node = rb_first(&cluster->root); if (!node) { spin_unlock(&cluster->lock); - goto again; + goto no_cluster_bitmap; } entry = rb_entry(node, struct btrfs_free_space, offset_index); if (!entry->bitmap) { spin_unlock(&cluster->lock); - goto again; + goto no_cluster_bitmap; } if (entry->offset == offset_to_bitmap(ctl, offset)) { @@ -1531,7 +1531,8 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, goto out; } } -again: + +no_cluster_bitmap: bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!bitmap_info) { -- cgit v1.2.3-70-g09d2 From 38e880540f983045da7a00fbc50daad238207fc5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 10 Jun 2011 18:43:13 +0000 Subject: Btrfs: clear current->journal_info on async transaction commit Normally current->jouranl_info is cleared by commit_transaction. For an async snap or subvol creation, though, it runs in a work queue. Clear it in btrfs_commit_transaction_async() to avoid leaking a non-NULL journal_info when we return to userspace. When the actual commit runs in the other thread it won't care that it's current->journal_info is already NULL. Signed-off-by: Sage Weil Tested-by: Jim Schutt Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6b2e4786d18..2b3590b9fe9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1118,8 +1118,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, wait_current_trans_commit_start_and_unblock(root, cur_trans); else wait_current_trans_commit_start(root, cur_trans); - put_transaction(cur_trans); + if (current->journal_info == trans) + current->journal_info = NULL; + + put_transaction(cur_trans); return 0; } -- cgit v1.2.3-70-g09d2 From 9eb9104c665aae2401a1723c044669eb10240072 Mon Sep 17 00:00:00 2001 From: richard kennedy Date: Tue, 7 Jun 2011 10:46:32 +0000 Subject: btrfs: remove 64bit alignment padding to allow extent_buffer to fit into one fewer cacheline Reorder extent_buffer to remove 8 bytes of alignment padding on 64 bit builds. This shrinks its size to 128 bytes allowing it to fit into one fewer cache lines and allows more objects per slab in its kmem_cache. slabinfo extent_buffer reports :- before:- Sizes (bytes) Slabs ---------------------------------- Object : 136 Total : 123 SlabObj: 136 Full : 121 SlabSiz: 4096 Partial: 0 Loss : 0 CpuSlab: 2 Align : 8 Objects: 30 after :- Object : 128 Total : 4 SlabObj: 128 Full : 2 SlabSiz: 4096 Partial: 0 Loss : 0 CpuSlab: 2 Align : 8 Objects: 32 Signed-off-by: Richard Kennedy Signed-off-by: Chris Mason --- fs/btrfs/extent_io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4e8445a4757..a11a92ee2d3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -126,9 +126,9 @@ struct extent_buffer { unsigned long map_len; struct page *first_page; unsigned long bflags; - atomic_t refs; struct list_head leak_list; struct rcu_head rcu_head; + atomic_t refs; /* the spinlock is used to protect most operations */ spinlock_t lock; -- cgit v1.2.3-70-g09d2 From 027ed2f0044e95a97ed34db2d55a9ca95ba84385 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 8 Jun 2011 08:27:56 +0000 Subject: Btrfs: avoid stack bloat in btrfs_ioctl_fs_info() The size of struct btrfs_ioctl_fs_info_args is as big as 1KB, so don't declare the variable on stack. Signed-off-by: Li Zefan Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ac37040e426..b793d112d1f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2054,29 +2054,34 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { - struct btrfs_ioctl_fs_info_args fi_args; + struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; struct btrfs_device *next; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - fi_args.num_devices = fs_devices->num_devices; - fi_args.max_id = 0; - memcpy(&fi_args.fsid, root->fs_info->fsid, sizeof(fi_args.fsid)); + fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); + if (!fi_args) + return -ENOMEM; + + fi_args->num_devices = fs_devices->num_devices; + memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->devid > fi_args.max_id) - fi_args.max_id = device->devid; + if (device->devid > fi_args->max_id) + fi_args->max_id = device->devid; } mutex_unlock(&fs_devices->device_list_mutex); - if (copy_to_user(arg, &fi_args, sizeof(fi_args))) - return -EFAULT; + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) + ret = -EFAULT; - return 0; + kfree(fi_args); + return ret; } static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) -- cgit v1.2.3-70-g09d2 From 5be76758f35ec6578e5b9b150aa513ac26bd9c54 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jun 2011 10:02:51 +0000 Subject: btrfs: fix unlocked access of delalloc_inodes list_splice_init will make delalloc_inodes empty, but without a spinlock around, this may produce corrupted list head, accessed in many placess, The race window is very tight and nobody seems to have hit it so far. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a203d363184..33b744a5ac0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2911,9 +2911,8 @@ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) INIT_LIST_HEAD(&splice); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); - spin_lock(&root->fs_info->delalloc_lock); + list_splice_init(&root->fs_info->delalloc_inodes, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, -- cgit v1.2.3-70-g09d2 From 08d2f347e877e489ca098c87a6fd2e872fef9767 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 4 May 2011 16:18:50 +0200 Subject: Btrfs: fix extent state leak on failed nodatasum reads When encountering an EIO while reading from a nodatasum extent, we insert an error record into the inode's failure tree. btrfs_readpage_end_io_hook returns early for nodatasum inodes. We'd better clear the failure tree in that case, otherwise the kernel complains about BUG extent_state: Objects remaining on kmem_cache_close() on rmmod. Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 02ff4a1b968..113913ae36e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1986,7 +1986,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, } if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - return 0; + goto good; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { -- cgit v1.2.3-70-g09d2 From 22b63a2971c5657dfc1bf4514f9410fc90c8b2c2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Feb 2011 16:05:31 +0200 Subject: Btrfs - use %pU to print fsid Get rid of FIXME comment. Uuids from dmesg are now the same as uuids given by btrfs-progs. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index da541dfca2e..1efa56e18f9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -689,12 +689,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, transid = btrfs_super_generation(disk_super); if (disk_super->label[0]) printk(KERN_INFO "device label %s ", disk_super->label); - else { - /* FIXME, make a readl uuid parser */ - printk(KERN_INFO "device fsid %llx-%llx ", - *(unsigned long long *)disk_super->fsid, - *(unsigned long long *)(disk_super->fsid + 8)); - } + else + printk(KERN_INFO "device fsid %pU ", disk_super->fsid); printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); -- cgit v1.2.3-70-g09d2 From fe744fdb74f2417d8571faefa45f72b0ead25f89 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 25 May 2011 23:00:27 +0900 Subject: nilfs2: fix incorrect block address termination in node concatenation nilfs_btree_delete function wrongly terminates virtual block address of the btree node held by its parent at index 0. When concatenating the index-0 node with its right sibling node, nilfs_btree_delete terminates the block address of index-0 node instead of the right sibling node which should be deleted. This bug not only wears disk space in the long run, but also causes file system corruption. This will fix it. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/btree.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 7eafe468a29..fd4f05b9194 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1356,20 +1356,19 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; - int pindex, level, ncmin, ncmax, ncblk, ret; + int pindex, dindex, level, ncmin, ncmax, ncblk, ret; ret = 0; stats->bs_nblocks = 0; ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); ncblk = nilfs_btree_nchildren_per_block(btree); - for (level = NILFS_BTREE_LEVEL_NODE_MIN; + for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index; level < nilfs_btree_height(btree) - 1; level++) { node = nilfs_btree_get_nonroot_node(path, level); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(node, path[level].bp_index, - ncblk); + nilfs_btree_node_get_ptr(node, dindex, ncblk); ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); if (ret < 0) @@ -1383,6 +1382,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); pindex = path[level + 1].bp_index; + dindex = pindex; if (pindex > 0) { /* left sibling */ @@ -1421,6 +1421,14 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_concat_right; stats->bs_nblocks++; + /* + * When merging right sibling node + * into the current node, pointer to + * the right sibling node must be + * terminated instead. The adjustment + * below is required for that. + */ + dindex = pindex + 1; /* continue; */ } } else { @@ -1443,7 +1451,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, node = nilfs_btree_get_root(btree); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(node, path[level].bp_index, + nilfs_btree_node_get_ptr(node, dindex, NILFS_BTREE_ROOT_NCHILDREN_MAX); ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); -- cgit v1.2.3-70-g09d2 From d40990537c9ea85dfe75dbe0ffba5e1002dfdf3f Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 25 May 2011 23:00:27 +0900 Subject: nilfs2: fix missing block address termination in btree node shrinking nilfs_btree_delete function does not terminate part of virtual block addresses when shrinking the last remaining child node into the root node. The missing address termination causes that dead btree node blocks persist and chip away free disk space. This fixes the leak bug on the btree node deletion. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/btree.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index fd4f05b9194..b2e3ff34762 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1346,6 +1346,11 @@ static void nilfs_btree_shrink(struct nilfs_bmap *btree, path[level].bp_bh = NULL; } +static void nilfs_btree_nop(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ +} static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, @@ -1439,16 +1444,22 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_shrink; stats->bs_nblocks += 2; + level++; + path[level].bp_op = nilfs_btree_nop; + goto shrink_root_child; } else { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; + goto out; } - - goto out; - } } + /* child of the root node is deleted */ + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + +shrink_root_child: node = nilfs_btree_get_root(btree); path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(node, dindex, @@ -1458,10 +1469,6 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, if (ret < 0) goto err_out_child_node; - /* child of the root node is deleted */ - path[level].bp_op = nilfs_btree_do_delete; - stats->bs_nblocks++; - /* success */ out: *levelp = level; -- cgit v1.2.3-70-g09d2 From 071d73cfe5c38cf62338b952bd350ff3de541b75 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 10 Jun 2011 00:33:06 +0900 Subject: nilfs2: fix problem in setting checkpoint interval Checkpoint generation interval of nilfs goes wrong after user has changed the interval parameter with nilfs-tune tool. segctord starting. Construction interval = 5 seconds, CP frequency < 30 seconds segctord starting. Construction interval = 0 seconds, CP frequency < 30 seconds This turned out to be caused by a trivial bug in initialization code of log writer. This will fix it. Reported-by: Andrea Gelmini Signed-off-by: Ryusuke Konishi --- fs/nilfs2/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 141646e88fb..bb24ab6c282 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2573,7 +2573,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; if (nilfs->ns_interval) - sci->sc_interval = nilfs->ns_interval; + sci->sc_interval = HZ * nilfs->ns_interval; if (nilfs->ns_watermark) sci->sc_watermark = nilfs->ns_watermark; return sci; -- cgit v1.2.3-70-g09d2 From 30b4caf5d73af5c99cf1b2b46496d8bc35330992 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 8 Jun 2011 03:56:44 +0000 Subject: Btrfs: use join_transaction in btrfs_evict_inode() The WARN_ON() in start_transaction() was triggered while balancing. The cause is btrfs_relocate_chunk() started a transaction and then called iput() on the inode that stores free space cache, and iput() called btrfs_start_transaction() again. Reported-by: Tsutomu Itoh Signed-off-by: Li Zefan Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 113913ae36e..c15636b1787 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3646,7 +3646,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_i_size_write(inode, 0); while (1) { - trans = btrfs_start_transaction(root, 0); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = root->orphan_block_rsv; -- cgit v1.2.3-70-g09d2 From ff78fca2a03c08436535d3f7152a30752d8131d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 09:42:17 -0400 Subject: fix leak in proc_set_super() set_anon_super() can fail... Signed-off-by: Al Viro --- fs/proc/root.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/root.c b/fs/proc/root.c index a9000e9cfee..d6c3b416529 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -28,11 +28,12 @@ static int proc_test_super(struct super_block *sb, void *data) static int proc_set_super(struct super_block *sb, void *data) { - struct pid_namespace *ns; - - ns = (struct pid_namespace *)data; - sb->s_fs_info = get_pid_ns(ns); - return set_anon_super(sb, NULL); + int err = set_anon_super(sb, NULL); + if (!err) { + struct pid_namespace *ns = (struct pid_namespace *)data; + sb->s_fs_info = get_pid_ns(ns); + } + return err; } static struct dentry *proc_mount(struct file_system_type *fs_type, -- cgit v1.2.3-70-g09d2 From b1c27ab3f93daede979f804afc38b189c2f17c60 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 10:07:03 -0400 Subject: ubifs: split allocation of ubifs_info into a separate function preparation to ubifs sget() race fixes Signed-off-by: Al Viro --- fs/ubifs/super.c | 87 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b5aeb5a8ebe..ddc3b02e8cf 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1971,6 +1971,53 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) return ERR_PTR(-EINVAL); } +static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) +{ + struct ubifs_info *c; + + c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + if (c) { + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; + + c->highest_inum = UBIFS_FIRST_INO; + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + } + return c; +} + static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { struct ubi_volume_desc *ubi = sb->s_fs_info; @@ -1978,49 +2025,11 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; int err; - c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + c = alloc_ubifs_info(ubi); if (!c) return -ENOMEM; - spin_lock_init(&c->cnt_lock); - spin_lock_init(&c->cs_lock); - spin_lock_init(&c->buds_lock); - spin_lock_init(&c->space_lock); - spin_lock_init(&c->orphan_lock); - init_rwsem(&c->commit_sem); - mutex_init(&c->lp_mutex); - mutex_init(&c->tnc_mutex); - mutex_init(&c->log_mutex); - mutex_init(&c->mst_mutex); - mutex_init(&c->umount_mutex); - mutex_init(&c->bu_mutex); - mutex_init(&c->write_reserve_mutex); - init_waitqueue_head(&c->cmt_wq); - c->buds = RB_ROOT; - c->old_idx = RB_ROOT; - c->size_tree = RB_ROOT; - c->orph_tree = RB_ROOT; - INIT_LIST_HEAD(&c->infos_list); - INIT_LIST_HEAD(&c->idx_gc); - INIT_LIST_HEAD(&c->replay_list); - INIT_LIST_HEAD(&c->replay_buds); - INIT_LIST_HEAD(&c->uncat_list); - INIT_LIST_HEAD(&c->empty_list); - INIT_LIST_HEAD(&c->freeable_list); - INIT_LIST_HEAD(&c->frdi_idx_list); - INIT_LIST_HEAD(&c->unclean_leb_list); - INIT_LIST_HEAD(&c->old_buds); - INIT_LIST_HEAD(&c->orph_list); - INIT_LIST_HEAD(&c->orph_new); - c->no_chk_data_crc = 1; - c->vfs_sb = sb; - c->highest_inum = UBIFS_FIRST_INO; - c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; - - ubi_get_volume_info(ubi, &c->vi); - ubi_get_device_info(c->vi.ubi_num, &c->di); - /* Re-open the UBI device in read-write mode */ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); if (IS_ERR(c->ubi)) { -- cgit v1.2.3-70-g09d2 From d251ed271d528afb407cc2ede30923e34cb209a5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 10:24:33 -0400 Subject: ubifs: fix sget races * allocate ubifs_info in ->mount(), fill it enough for sb_test() and set ->s_fs_info to it in set() callback passed to sget(). * do *not* free it in ->put_super(); do that in ->kill_sb() after we'd done kill_anon_super(). * don't free it in ubifs_fill_super() either - deactivate_locked_super() done by caller when ubifs_fill_super() returns an error will take care of that sucker. * get rid of kludge with passing ubi to ubifs_fill_super() in ->s_fs_info; we only need it in alloc_ubifs_info(), so ubifs_fill_super() will need only ubifs_info. Which it will find in ->s_fs_info just fine, no need to reassign anything... As the result, sb_test() becomes safe to apply to all superblocks that can be found by sget() (and a kludge with temporary use of ->s_fs_info to store a pointer to very different structure goes away). Signed-off-by: Al Viro --- fs/ubifs/super.c | 54 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ddc3b02e8cf..8c892c2d530 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1848,7 +1848,6 @@ static void ubifs_put_super(struct super_block *sb) bdi_destroy(&c->bdi); ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); - kfree(c); } static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) @@ -2020,21 +2019,16 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { - struct ubi_volume_desc *ubi = sb->s_fs_info; - struct ubifs_info *c; + struct ubifs_info *c = sb->s_fs_info; struct inode *root; int err; - c = alloc_ubifs_info(ubi); - if (!c) - return -ENOMEM; - c->vfs_sb = sb; /* Re-open the UBI device in read-write mode */ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); if (IS_ERR(c->ubi)) { err = PTR_ERR(c->ubi); - goto out_free; + goto out; } /* @@ -2100,24 +2094,29 @@ out_bdi: bdi_destroy(&c->bdi); out_close: ubi_close_volume(c->ubi); -out_free: - kfree(c); +out: return err; } static int sb_test(struct super_block *sb, void *data) { - dev_t *dev = data; + struct ubifs_info *c1 = data; struct ubifs_info *c = sb->s_fs_info; - return c->vi.cdev == *dev; + return c->vi.cdev == c1->vi.cdev; +} + +static int sb_set(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); } static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, const char *name, void *data) { struct ubi_volume_desc *ubi; - struct ubi_volume_info vi; + struct ubifs_info *c; struct super_block *sb; int err; @@ -2134,19 +2133,24 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } - ubi_get_volume_info(ubi, &vi); - dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); + c = alloc_ubifs_info(ubi); + if (!c) { + err = -ENOMEM; + goto out_close; + } + + dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); - sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev); + sb = sget(fs_type, sb_test, sb_set, c); if (IS_ERR(sb)) { err = PTR_ERR(sb); - goto out_close; + kfree(c); } if (sb->s_root) { struct ubifs_info *c1 = sb->s_fs_info; - + kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); if (!!(flags & MS_RDONLY) != c1->ro_mount) { @@ -2155,11 +2159,6 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, } } else { sb->s_flags = flags; - /* - * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is - * replaced by 'c'. - */ - sb->s_fs_info = ubi; err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) goto out_deact; @@ -2179,11 +2178,18 @@ out_close: return ERR_PTR(err); } +static void kill_ubifs_super(struct super_block *s) +{ + struct ubifs_info *c = s->s_fs_info; + kill_anon_super(s); + kfree(c); +} + static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, .mount = ubifs_mount, - .kill_sb = kill_anon_super, + .kill_sb = kill_ubifs_super, }; /* -- cgit v1.2.3-70-g09d2 From dde194a64bb5c3fd05d965775dc92e8a4920a53a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 16:01:21 -0400 Subject: afs: fix sget() races, close leak on umount * set ->s_fs_info in set() callback passed to sget() * allocate the thing and set it up enough for afs_test_super() before making it visible * have it freed in ->kill_sb() (current tree simply leaks it) * have ->put_super() leave ->s_fs_info->volume alone; it's too early for dropping it; do that from ->kill_sb() after having called kill_anon_super(). Signed-off-by: Al Viro --- fs/afs/super.c | 73 +++++++++++++++++++++++++--------------------------------- 1 file changed, 32 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/afs/super.c b/fs/afs/super.c index fb240e8766d..b7d48d7eaa1 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -31,8 +31,8 @@ static void afs_i_init_once(void *foo); static struct dentry *afs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); +static void afs_kill_super(struct super_block *sb); static struct inode *afs_alloc_inode(struct super_block *sb); -static void afs_put_super(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); @@ -40,7 +40,7 @@ struct file_system_type afs_fs_type = { .owner = THIS_MODULE, .name = "afs", .mount = afs_mount, - .kill_sb = kill_anon_super, + .kill_sb = afs_kill_super, .fs_flags = 0, }; @@ -50,7 +50,6 @@ static const struct super_operations afs_super_ops = { .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .evict_inode = afs_evict_inode, - .put_super = afs_put_super, .show_options = generic_show_options, }; @@ -282,19 +281,25 @@ static int afs_parse_device_name(struct afs_mount_params *params, */ static int afs_test_super(struct super_block *sb, void *data) { - struct afs_mount_params *params = data; + struct afs_super_info *as1 = data; struct afs_super_info *as = sb->s_fs_info; - return as->volume == params->volume; + return as->volume == as1->volume; +} + +static int afs_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); } /* * fill in the superblock */ -static int afs_fill_super(struct super_block *sb, void *data) +static int afs_fill_super(struct super_block *sb, + struct afs_mount_params *params) { - struct afs_mount_params *params = data; - struct afs_super_info *as = NULL; + struct afs_super_info *as = sb->s_fs_info; struct afs_fid fid; struct dentry *root = NULL; struct inode *inode = NULL; @@ -302,22 +307,11 @@ static int afs_fill_super(struct super_block *sb, void *data) _enter(""); - /* allocate a superblock info record */ - as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); - if (!as) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - afs_get_volume(params->volume); - as->volume = params->volume; - /* fill in the superblock */ sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; - sb->s_fs_info = as; sb->s_bdi = &as->volume->bdi; /* allocate the root inode and dentry */ @@ -326,7 +320,7 @@ static int afs_fill_super(struct super_block *sb, void *data) fid.unique = 1; inode = afs_iget(sb, params->key, &fid, NULL, NULL); if (IS_ERR(inode)) - goto error_inode; + return PTR_ERR(inode); if (params->autocell) set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); @@ -342,16 +336,8 @@ static int afs_fill_super(struct super_block *sb, void *data) _leave(" = 0"); return 0; -error_inode: - ret = PTR_ERR(inode); - inode = NULL; error: iput(inode); - afs_put_volume(as->volume); - kfree(as); - - sb->s_fs_info = NULL; - _leave(" = %d", ret); return ret; } @@ -367,6 +353,7 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, struct afs_volume *vol; struct key *key; char *new_opts = kstrdup(options, GFP_KERNEL); + struct afs_super_info *as; int ret; _enter(",,%s,%p", dev_name, options); @@ -399,12 +386,22 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, ret = PTR_ERR(vol); goto error; } - params.volume = vol; + + /* allocate a superblock info record */ + as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); + if (!as) { + ret = -ENOMEM; + afs_put_volume(vol); + goto error; + } + as->volume = vol; /* allocate a deviceless superblock */ - sb = sget(fs_type, afs_test_super, set_anon_super, ¶ms); + sb = sget(fs_type, afs_test_super, afs_set_super, as); if (IS_ERR(sb)) { ret = PTR_ERR(sb); + afs_put_volume(vol); + kfree(as); goto error; } @@ -422,16 +419,16 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, } else { _debug("reuse"); ASSERTCMP(sb->s_flags, &, MS_ACTIVE); + afs_put_volume(vol); + kfree(as); } - afs_put_volume(params.volume); afs_put_cell(params.cell); kfree(new_opts); _leave(" = 0 [%p]", sb); return dget(sb->s_root); error: - afs_put_volume(params.volume); afs_put_cell(params.cell); key_put(params.key); kfree(new_opts); @@ -439,18 +436,12 @@ error: return ERR_PTR(ret); } -/* - * finish the unmounting process on the superblock - */ -static void afs_put_super(struct super_block *sb) +static void afs_kill_super(struct super_block *sb) { struct afs_super_info *as = sb->s_fs_info; - - _enter(""); - + kill_anon_super(sb); afs_put_volume(as->volume); - - _leave(""); + kfree(as); } /* -- cgit v1.2.3-70-g09d2 From a685e08987d1edf1995b76511d4c98ea0e905377 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Jun 2011 21:13:01 -0400 Subject: Delay struct net freeing while there's a sysfs instance refering to it * new refcount in struct net, controlling actual freeing of the memory * new method in kobj_ns_type_operations (->drop_ns()) * ->current_ns() semantics change - it's supposed to be followed by corresponding ->drop_ns(). For struct net in case of CONFIG_NET_NS it bumps the new refcount; net_drop_ns() decrements it and calls net_free() if the last reference has been dropped. Method renamed to ->grab_current_ns(). * old net_free() callers call net_drop_ns() instead. * sysfs_exit_ns() is gone, along with a large part of callchain leading to it; now that the references stored in ->ns[...] stay valid we do not need to hunt them down and replace them with NULL. That fixes problems in sysfs_lookup() and sysfs_readdir(), along with getting rid of sb->s_instances abuse. Note that struct net *shutdown* logics has not changed - net_cleanup() is called exactly when it used to be called. The only thing postponed by having a sysfs instance refering to that struct net is actual freeing of memory occupied by struct net. Signed-off-by: Al Viro --- fs/sysfs/mount.c | 37 +++++++++++-------------------------- fs/sysfs/sysfs.h | 2 +- include/linux/kobject_ns.h | 10 ++++++---- include/linux/sysfs.h | 7 ------- include/net/net_namespace.h | 10 +++++++++- lib/kobject.c | 26 +++++++++----------------- net/core/net-sysfs.c | 23 +++++++++-------------- net/core/net_namespace.c | 12 ++++++++++-- 8 files changed, 55 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 266895783b4..e34f0d99ea4 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -95,6 +95,14 @@ static int sysfs_set_super(struct super_block *sb, void *data) return error; } +static void free_sysfs_super_info(struct sysfs_super_info *info) +{ + int type; + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + kobj_ns_drop(type, info->ns[type]); + kfree(info); +} + static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -108,11 +116,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, return ERR_PTR(-ENOMEM); for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - info->ns[type] = kobj_ns_current(type); + info->ns[type] = kobj_ns_grab_current(type); sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); if (IS_ERR(sb) || sb->s_fs_info != info) - kfree(info); + free_sysfs_super_info(info); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { @@ -131,12 +139,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, static void sysfs_kill_sb(struct super_block *sb) { struct sysfs_super_info *info = sysfs_info(sb); - /* Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing sysfs_super_info. */ kill_anon_super(sb); - kfree(info); + free_sysfs_super_info(info); } static struct file_system_type sysfs_fs_type = { @@ -145,28 +152,6 @@ static struct file_system_type sysfs_fs_type = { .kill_sb = sysfs_kill_sb, }; -void sysfs_exit_ns(enum kobj_ns_type type, const void *ns) -{ - struct super_block *sb; - - mutex_lock(&sysfs_mutex); - spin_lock(&sb_lock); - list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) { - struct sysfs_super_info *info = sysfs_info(sb); - /* - * If we see a superblock on the fs_supers/s_instances - * list the unmount has not completed and sb->s_fs_info - * points to a valid struct sysfs_super_info. - */ - /* Ignore superblocks with the wrong ns */ - if (info->ns[type] != ns) - continue; - info->ns[type] = NULL; - } - spin_unlock(&sb_lock); - mutex_unlock(&sysfs_mutex); -} - int __init sysfs_init(void) { int err = -ENOMEM; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3d28af31d86..2ed2404f311 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -136,7 +136,7 @@ struct sysfs_addrm_cxt { * instance). */ struct sysfs_super_info { - const void *ns[KOBJ_NS_TYPES]; + void *ns[KOBJ_NS_TYPES]; }; #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent sysfs_root; diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index 82cb5bf461f..f66b065a8b5 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -32,15 +32,17 @@ enum kobj_ns_type { /* * Callbacks so sysfs can determine namespaces - * @current_ns: return calling task's namespace + * @grab_current_ns: return a new reference to calling task's namespace * @netlink_ns: return namespace to which a sock belongs (right?) * @initial_ns: return the initial namespace (i.e. init_net_ns) + * @drop_ns: drops a reference to namespace */ struct kobj_ns_type_operations { enum kobj_ns_type type; - const void *(*current_ns)(void); + void *(*grab_current_ns)(void); const void *(*netlink_ns)(struct sock *sk); const void *(*initial_ns)(void); + void (*drop_ns)(void *); }; int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); @@ -48,9 +50,9 @@ int kobj_ns_type_registered(enum kobj_ns_type type); const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); -const void *kobj_ns_current(enum kobj_ns_type type); +void *kobj_ns_grab_current(enum kobj_ns_type type); const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk); const void *kobj_ns_initial(enum kobj_ns_type type); -void kobj_ns_exit(enum kobj_ns_type type, const void *ns); +void kobj_ns_drop(enum kobj_ns_type type, void *ns); #endif /* _LINUX_KOBJECT_NS_H */ diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index c3acda60eee..e2696d76a59 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -177,9 +177,6 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd); void sysfs_put(struct sysfs_dirent *sd); -/* Called to clear a ns tag when it is no longer valid */ -void sysfs_exit_ns(enum kobj_ns_type type, const void *tag); - int __must_check sysfs_init(void); #else /* CONFIG_SYSFS */ @@ -338,10 +335,6 @@ static inline void sysfs_put(struct sysfs_dirent *sd) { } -static inline void sysfs_exit_ns(int type, const void *tag) -{ -} - static inline int __must_check sysfs_init(void) { return 0; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 2bf9ed9ef26..aef430d779b 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -35,8 +35,11 @@ struct netns_ipvs; #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) struct net { + atomic_t passive; /* To decided when the network + * namespace should be freed. + */ atomic_t count; /* To decided when the network - * namespace should be freed. + * namespace should be shut down. */ #ifdef NETNS_REFCNT_DEBUG atomic_t use_count; /* To track references we @@ -154,6 +157,9 @@ int net_eq(const struct net *net1, const struct net *net2) { return net1 == net2; } + +extern void net_drop_ns(void *); + #else static inline struct net *get_net(struct net *net) @@ -175,6 +181,8 @@ int net_eq(const struct net *net1, const struct net *net2) { return 1; } + +#define net_drop_ns NULL #endif diff --git a/lib/kobject.c b/lib/kobject.c index 82dc34c095c..640bd98a4c8 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -948,14 +948,14 @@ const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj) } -const void *kobj_ns_current(enum kobj_ns_type type) +void *kobj_ns_grab_current(enum kobj_ns_type type) { - const void *ns = NULL; + void *ns = NULL; spin_lock(&kobj_ns_type_lock); if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && kobj_ns_ops_tbl[type]) - ns = kobj_ns_ops_tbl[type]->current_ns(); + ns = kobj_ns_ops_tbl[type]->grab_current_ns(); spin_unlock(&kobj_ns_type_lock); return ns; @@ -987,23 +987,15 @@ const void *kobj_ns_initial(enum kobj_ns_type type) return ns; } -/* - * kobj_ns_exit - invalidate a namespace tag - * - * @type: the namespace type (i.e. KOBJ_NS_TYPE_NET) - * @ns: the actual namespace being invalidated - * - * This is called when a tag is no longer valid. For instance, - * when a network namespace exits, it uses this helper to - * make sure no sb's sysfs_info points to the now-invalidated - * netns. - */ -void kobj_ns_exit(enum kobj_ns_type type, const void *ns) +void kobj_ns_drop(enum kobj_ns_type type, void *ns) { - sysfs_exit_ns(type, ns); + spin_lock(&kobj_ns_type_lock); + if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && + kobj_ns_ops_tbl[type] && kobj_ns_ops_tbl[type]->drop_ns) + kobj_ns_ops_tbl[type]->drop_ns(ns); + spin_unlock(&kobj_ns_type_lock); } - EXPORT_SYMBOL(kobject_get); EXPORT_SYMBOL(kobject_put); EXPORT_SYMBOL(kobject_del); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 11b98bc2aa8..33d2a1fba13 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1179,9 +1179,14 @@ static void remove_queue_kobjects(struct net_device *net) #endif } -static const void *net_current_ns(void) +static void *net_grab_current_ns(void) { - return current->nsproxy->net_ns; + struct net *ns = current->nsproxy->net_ns; +#ifdef CONFIG_NET_NS + if (ns) + atomic_inc(&ns->passive); +#endif + return ns; } static const void *net_initial_ns(void) @@ -1196,22 +1201,13 @@ static const void *net_netlink_ns(struct sock *sk) struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, - .current_ns = net_current_ns, + .grab_current_ns = net_grab_current_ns, .netlink_ns = net_netlink_ns, .initial_ns = net_initial_ns, + .drop_ns = net_drop_ns, }; EXPORT_SYMBOL_GPL(net_ns_type_operations); -static void net_kobj_ns_exit(struct net *net) -{ - kobj_ns_exit(KOBJ_NS_TYPE_NET, net); -} - -static struct pernet_operations kobj_net_ops = { - .exit = net_kobj_ns_exit, -}; - - #ifdef CONFIG_HOTPLUG static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) { @@ -1339,6 +1335,5 @@ EXPORT_SYMBOL(netdev_class_remove_file); int netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); - register_pernet_subsys(&kobj_net_ops); return class_register(&net_class); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6c6b86d0da1..cdcbc3cb00a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -128,6 +128,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); atomic_set(&net->count, 1); + atomic_set(&net->passive, 1); #ifdef NETNS_REFCNT_DEBUG atomic_set(&net->use_count, 0); @@ -210,6 +211,13 @@ static void net_free(struct net *net) kmem_cache_free(net_cachep, net); } +void net_drop_ns(void *p) +{ + struct net *ns = p; + if (ns && atomic_dec_and_test(&ns->passive)) + net_free(ns); +} + struct net *copy_net_ns(unsigned long flags, struct net *old_net) { struct net *net; @@ -230,7 +238,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) } mutex_unlock(&net_mutex); if (rv < 0) { - net_free(net); + net_drop_ns(net); return ERR_PTR(rv); } return net; @@ -286,7 +294,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); - net_free(net); + net_drop_ns(net); } } static DECLARE_WORK(net_cleanup_work, cleanup_net); -- cgit v1.2.3-70-g09d2 From 1fb74cda1b5e9c6207225fda5ef7504e815ce0e0 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 12 Jun 2011 22:44:10 -0400 Subject: jbd2: Remove obsolete parameters in the comments for some jbd2 functions credits isn't a parameter for jbd2_journal_get_write_access and jbd2_journal_get_undo_access. So remove the corresponding comments. Acked-by: Jan Kara Cc: Randy Dunlap Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3eec82d32fd..547b101049e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -814,7 +814,6 @@ out: * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes - * @credits: variable that will receive credits for the buffer * * Returns an error code or 0 on success. * @@ -932,7 +931,6 @@ out: * non-rewindable consequences * @handle: transaction * @bh: buffer to undo - * @credits: store the number of taken credits here (if not NULL) * * Sometimes there is a need to distinguish between metadata which has * been committed to disk and that which has not. The ext3fs code uses -- cgit v1.2.3-70-g09d2 From d4c208b86b8be4254eba0e74071496e599f94639 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Jun 2011 12:45:48 +0200 Subject: block: use the passed in @bdev when claiming if partno is zero 6b4517a791 (block: implement bd_claiming and claiming block) introduced claiming block to support O_EXCL blkdev opens properly. bd_start_claiming() looks up the part 0 bdev and starts claiming block. The function assumed that there is only one part 0 bdev and always used bdget_disk(disk, 0) to look it up; unfortunately, this isn't true for some drivers (floppy) which use multiple block devices to denote different operating parameters for the same physical device. There can be multiple part 0 bdev's for the same device number. This incorrect assumption caused the wrong bdev to be used during claiming leading to unbalanced bd_holders as reported in the following bug. https://bugzilla.kernel.org/show_bug.cgi?id=28522 This patch updates bd_start_claiming() such that it uses the bdev specified as argument if its partno is zero. Note that this means that different bdev's can be used for the same device and O_EXCL check can be effectively bypassed. It has always been broken that way and floppy is fortunately on its way out. Leave that breakage alone. Signed-off-by: Tejun Heo Reported-by: Alex Villacis Lasso Tested-by: Alex Villacis Lasso Cc: stable@kernel.org # >= v2.6.36 Signed-off-by: Jens Axboe --- fs/block_dev.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a2421f908f..610e8e0b04b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -762,7 +762,19 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, if (!disk) return ERR_PTR(-ENXIO); - whole = bdget_disk(disk, 0); + /* + * Normally, @bdev should equal what's returned from bdget_disk() + * if partno is 0; however, some drivers (floppy) use multiple + * bdev's for the same physical device and @bdev may be one of the + * aliases. Keep @bdev if partno is 0. This means claimer + * tracking is broken for those devices but it has always been that + * way. + */ + if (partno) + whole = bdget_disk(disk, 0); + else + whole = bdgrab(bdev); + module_put(disk->fops->owner); put_disk(disk); if (!whole) -- cgit v1.2.3-70-g09d2 From ac08aedfa5d3de0dcb3825b598d16c2e57991f54 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 13 Jun 2011 11:28:50 -0400 Subject: Btrfs: check the return value from set_anon_super Al Viro noticed we weren't checking for set_anon_super failures. This adds the required checks. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9f68c689865..20c111b3fa0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1312,7 +1312,9 @@ again: spin_lock_init(&root->cache_lock); init_waitqueue_head(&root->cache_wait); - set_anon_super(&root->anon_super, NULL); + ret = set_anon_super(&root->anon_super, NULL); + if (ret) + goto fail; if (btrfs_root_refs(&root->root_item) == 0) { ret = -ENOENT; -- cgit v1.2.3-70-g09d2 From f4c44016218a6fce357715b9bbabbbbe1f69853c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 13 Jun 2011 11:30:47 -0400 Subject: Btrfs: drop the delalloc_bytes check in shrink_delalloc Even when delalloc_bytes is zero, we may need to sleep while waiting for delalloc space. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b42efc2ded5..1f61bf5b496 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3314,10 +3314,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (reserved == 0) return 0; - /* nothing to shrink - nothing to reclaim */ - if (root->fs_info->delalloc_bytes == 0) - return 0; - max_reclaim = min(reserved, to_reclaim); while (loops < 1024) { -- cgit v1.2.3-70-g09d2 From de1b794130b130e77ffa975bb58cb843744f9ae5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Jun 2011 15:38:22 -0400 Subject: jbd2: Fix oops in jbd2_journal_remove_journal_head() jbd2_journal_remove_journal_head() can oops when trying to access journal_head returned by bh2jh(). This is caused for example by the following race: TASK1 TASK2 jbd2_journal_commit_transaction() ... processing t_forget list __jbd2_journal_refile_buffer(jh); if (!jh->b_transaction) { jbd_unlock_bh_state(bh); jbd2_journal_try_to_free_buffers() jbd2_journal_grab_journal_head(bh) jbd_lock_bh_state(bh) __journal_try_to_free_buffer() jbd2_journal_put_journal_head(jh) jbd2_journal_remove_journal_head(bh); jbd2_journal_put_journal_head() in TASK2 sees that b_jcount == 0 and buffer is not part of any transaction and thus frees journal_head before TASK1 gets to doing so. Note that even buffer_head can be released by try_to_free_buffers() after jbd2_journal_put_journal_head() which adds even larger opportunity for oops (but I didn't see this happen in reality). Fix the problem by making transactions hold their own journal_head reference (in b_jcount). That way we don't have to remove journal_head explicitely via jbd2_journal_remove_journal_head() and instead just remove journal_head when b_jcount drops to zero. The result of this is that [__]jbd2_journal_refile_buffer(), [__]jbd2_journal_unfile_buffer(), and __jdb2_journal_remove_checkpoint() can free journal_head which needs modification of a few callers. Also we have to be careful because once journal_head is removed, buffer_head might be freed as well. So we have to get our own buffer_head reference where it matters. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/jbd2/checkpoint.c | 28 +++++++++------- fs/jbd2/commit.c | 33 +++++++++++-------- fs/jbd2/journal.c | 91 ++++++++++++++++----------------------------------- fs/jbd2/transaction.c | 67 +++++++++++++++++++------------------ include/linux/jbd2.h | 2 -- 5 files changed, 99 insertions(+), 122 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6a79fd0a1a3..2c62c5aae82 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -97,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); } else { @@ -223,8 +227,8 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + get_bh(bh); if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -243,7 +247,6 @@ restart: */ released = __jbd2_journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); __brelse(bh); } @@ -284,7 +287,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, int ret = 0; if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); + get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -316,12 +319,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; + get_bh(bh); J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); __brelse(bh); } else { /* @@ -554,7 +557,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* * journal_clean_one_cp_list * - * Find all the written-back checkpoint buffers in the given list and release them. + * Find all the written-back checkpoint buffers in the given list and + * release them. * * Called with the journal locked. * Called with j_list_lock held. @@ -663,8 +667,8 @@ out: * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. * - * This function is called with the journal locked. * This function is called with j_list_lock held. * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ @@ -684,13 +688,14 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) } journal = transaction->t_journal; + JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; + jbd2_journal_put_journal_head(jh); if (transaction->t_checkpoint_list != NULL || transaction->t_checkpoint_io_list != NULL) goto out; - JBUFFER_TRACE(jh, "transaction has no more buffers"); /* * There is one special case to worry about: if we have just pulled the @@ -701,10 +706,8 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) * The locking here around t_state is a bit sleazy. * See the comment at the end of jbd2_journal_commit_transaction(). */ - if (transaction->t_state != T_FINISHED) { - JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + if (transaction->t_state != T_FINISHED) goto out; - } /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ @@ -723,7 +726,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) wake_up(&journal->j_wait_logspace); ret = 1; out: - JBUFFER_TRACE(jh, "exit"); return ret; } @@ -742,6 +744,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + /* Get reference for checkpointing transaction */ + jbd2_journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7f21cf3aaf9..eef6979821a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -848,10 +848,16 @@ restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); @@ -914,28 +920,27 @@ restart_loop: __jbd2_journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); - JBUFFER_TRACE(jh, "refile for checkpoint writeback"); - __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* The buffer on BJ_Forget list and not jbddirty means + /* + * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget - * list. */ - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __jbd2_journal_refile_buffer(jh); - if (!jh->b_transaction) { - jbd_unlock_bh_state(bh); - /* needs a brelse */ - jbd2_journal_remove_journal_head(bh); - release_buffer_page(bh); - } else - jbd_unlock_bh_state(bh); + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; } + JBUFFER_TRACE(jh, "refile or unfile buffer"); + __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); /* Drops bh reference */ + else + __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9a782699030..0dfa5b598e6 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2078,10 +2078,9 @@ static void journal_free_journal_head(struct journal_head *jh) * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * - * A journal_head may be detached from its buffer_head when the journal_head's - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. - * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the - * journal_head can be dropped if needed. + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the @@ -2094,17 +2093,16 @@ static void journal_free_journal_head(struct journal_head *jh) * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = jbd2_journal_add_journal_head(bh); * ... + * (Get another reference for transaction) + * jbd2_journal_grab_journal_head(bh); * jh->b_transaction = xxx; + * (Put original reference) * jbd2_journal_put_journal_head(jh); - * - * Now, the journal_head's b_jcount is zero, but it is safe from being released - * because it has a non-zero b_transaction. */ /* * Give a buffer_head a journal_head. * - * Doesn't need the journal lock. * May sleep. */ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) @@ -2168,61 +2166,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh) struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_jcount >= 0); - - get_bh(bh); - if (jh->b_jcount == 0) { - if (jh->b_transaction == NULL && - jh->b_next_transaction == NULL && - jh->b_cp_transaction == NULL) { - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing " - "b_frozen_data\n", - __func__); - jbd2_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing " - "b_committed_data\n", - __func__); - jbd2_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - __brelse(bh); - journal_free_journal_head(jh); - } else { - BUFFER_TRACE(bh, "journal_head was locked"); - } + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd2_free(jh->b_frozen_data, bh->b_size); } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd2_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); } /* - * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction - * and has a zero b_jcount then remove and release its journal_head. If we did - * see that the buffer is not used by any transaction we also "logically" - * decrement ->b_count. - * - * We in fact take an additional increment on ->b_count as a convenience, - * because the caller usually wants to do additional things with the bh - * after calling here. - * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some - * time. Once the caller has run __brelse(), the buffer is eligible for - * reaping by try_to_free_buffers(). - */ -void jbd2_journal_remove_journal_head(struct buffer_head *bh) -{ - jbd_lock_bh_journal_head(bh); - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then try to + * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. */ void jbd2_journal_put_journal_head(struct journal_head *jh) @@ -2232,11 +2198,12 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; - if (!jh->b_jcount && !jh->b_transaction) { + if (!jh->b_jcount) { __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); __brelse(bh); - } - jbd_unlock_bh_journal_head(bh); + } else + jbd_unlock_bh_journal_head(bh); } /* diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 547b101049e..2d7109414cd 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -30,6 +30,7 @@ #include static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); +static void __jbd2_journal_unfile_buffer(struct journal_head *jh); /* * jbd2_get_transaction: obtain a new transaction_t object. @@ -764,7 +765,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); @@ -895,8 +895,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * committed and so it's safe to clear the dirty bit. */ clear_buffer_dirty(jh2bh(jh)); - jh->b_transaction = transaction; - /* first access by this transaction */ jh->b_modified = 0; @@ -1230,8 +1228,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); } else { __jbd2_journal_unfile_buffer(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1554,19 +1550,32 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } -void __jbd2_journal_unfile_buffer(struct journal_head *jh) +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ +static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + jbd2_journal_put_journal_head(jh); } void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __jbd2_journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1593,8 +1602,6 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __jbd2_journal_remove_checkpoint(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1657,7 +1664,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * jbd2_journal_remove_journal_head() and * jbd2_journal_put_journal_head(). */ jh = jbd2_journal_grab_journal_head(bh); @@ -1695,10 +1701,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __jbd2_journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __jbd2_journal_temp_unlink_buffer(jh); /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in @@ -1709,8 +1714,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); + __jbd2_journal_unfile_buffer(jh); } return may_free; } @@ -1988,6 +1992,8 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __jbd2_journal_temp_unlink_buffer(jh); + else + jbd2_journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2039,9 +2045,10 @@ void jbd2_journal_file_buffer(struct journal_head *jh, * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __jbd2_journal_refile_buffer(struct journal_head *jh) { @@ -2065,6 +2072,11 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __jbd2_journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __jbd2_journal_file_buffer() must not + * take a new one. + */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; if (buffer_freed(bh)) @@ -2081,30 +2093,21 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __jbd2_journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __jbd2_journal_refile_buffer() with necessary locking added. We take our + * bh reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. */ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __jbd2_journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 4ecb7b16b27..d087c2e7b2a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1024,7 +1024,6 @@ struct journal_s /* Filing buffers */ extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); -extern void __jbd2_journal_unfile_buffer(struct journal_head *); extern void __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); @@ -1165,7 +1164,6 @@ extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_in */ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh); struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh); -void jbd2_journal_remove_journal_head(struct buffer_head *bh); void jbd2_journal_put_journal_head(struct journal_head *jh); /* -- cgit v1.2.3-70-g09d2 From cd51875d53ae1459a2b09b4338166a218c0635a7 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 9 Jun 2011 12:58:53 +0400 Subject: CIFS: Fix sparse error cifs_sb_master_tlink was declared as inline, but without a definition. Remove the declaration and move the definition up. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index bb659eb7381..9190018f97e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2149,7 +2149,10 @@ cifs_put_tlink(struct tcon_link *tlink) } static inline struct tcon_link * -cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb); +cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) +{ + return cifs_sb->master_tlink; +} static int compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) @@ -3484,12 +3487,6 @@ out: return tcon; } -static inline struct tcon_link * -cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) -{ - return cifs_sb->master_tlink; -} - struct cifs_tcon * cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) { -- cgit v1.2.3-70-g09d2 From 7fdbaa1b8daa1009b705985b903e3d2ebccad456 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 10 Jun 2011 16:14:57 -0400 Subject: cifs: don't allow cifs_reconnect to exit with NULL socket pointer It's possible for the following set of events to happen: cifsd calls cifs_reconnect which reconnects the socket. A userspace process then calls cifs_negotiate_protocol to handle the NEGOTIATE and gets a reply. But, while processing the reply, cifsd calls cifs_reconnect again. Eventually the GlobalMid_Lock is dropped and the reply from the earlier NEGOTIATE completes and the tcpStatus is set to CifsGood. cifs_reconnect then goes through and closes the socket and sets the pointer to zero, but because the status is now CifsGood, the new socket is not created and cifs_reconnect exits with the socket pointer set to NULL. Fix this by only setting the tcpStatus to CifsGood if the tcpStatus is CifsNeedNegotiate, and by making sure that generic_ip_connect is always called at least once in cifs_reconnect. Note that this is not a perfect fix for this issue. It's still possible that the NEGOTIATE reply is handled after the socket has been closed and reconnected. In that case, the socket state will look correct but it no NEGOTIATE was performed on it be for the wrong socket. In that situation though the server should just shut down the socket on the next attempted send, rather than causing the oops that occurs today. Cc: # .38.x: fd88ce9: [CIFS] cifs: clarify the meaning of tcpStatus == CifsGood Reported-and-Tested-by: Ben Greear Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9190018f97e..fa73e2a51cf 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -152,7 +152,7 @@ cifs_reconnect(struct TCP_Server_Info *server) mid_entry->callback(mid_entry); } - while (server->tcpStatus == CifsNeedReconnect) { + do { try_to_freeze(); /* we should try only the port we connected to before */ @@ -167,7 +167,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); } - } + } while (server->tcpStatus == CifsNeedReconnect); return rc; } @@ -3374,7 +3374,7 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses) } if (rc == 0) { spin_lock(&GlobalMid_Lock); - if (server->tcpStatus != CifsExiting) + if (server->tcpStatus == CifsNeedNegotiate) server->tcpStatus = CifsGood; else rc = -EHOSTDOWN; -- cgit v1.2.3-70-g09d2 From 3e715513643f0207c8f3c22010b54954cd697474 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 13 Jun 2011 11:50:41 -0400 Subject: cifs: show sec= option in /proc/mounts Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 989442dcfb4..e9def996e38 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -352,6 +352,37 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) } } +static void +cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) +{ + seq_printf(s, ",sec="); + + switch (server->secType) { + case LANMAN: + seq_printf(s, "lanman"); + break; + case NTLMv2: + seq_printf(s, "ntlmv2"); + break; + case NTLM: + seq_printf(s, "ntlm"); + break; + case Kerberos: + seq_printf(s, "krb5"); + break; + case RawNTLMSSP: + seq_printf(s, "ntlmssp"); + break; + default: + /* shouldn't ever happen */ + seq_printf(s, "unknown"); + break; + } + + if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + seq_printf(s, "i"); +} + /* * cifs_show_options() is for displaying mount options in /proc/mounts. * Not all settable options are displayed but most of the important @@ -365,6 +396,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) struct sockaddr *srcaddr; srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; + cifs_show_security(s, tcon->ses->server); + seq_printf(s, ",unc=%s", tcon->treeName); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) -- cgit v1.2.3-70-g09d2 From 8d1bca328b7c17af33bcf966d799c556ecbf370f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 11 Jun 2011 21:17:10 -0400 Subject: cifs: correctly handle NULL tcon pointer in CIFSTCon Long ago (in commit 00e485b0), I added some code to handle share-level passwords in CIFSTCon. That code ignored the fact that it's legit to pass in a NULL tcon pointer when connecting to the IPC$ share on the server. This wasn't really a problem until recently as we only called CIFSTCon this way when the server returned -EREMOTE. With the introduction of commit c1508ca2 however, it gets called this way on every mount, causing an oops when share-level security is in effect. Fix this by simply treating a NULL tcon pointer as if user-level security were in effect. I'm not aware of any servers that protect the IPC$ share with a specific password anyway. Also, add a comment to the top of CIFSTCon to ensure that we don't make the same mistake again. Cc: Reported-by: Martijn Uffing Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index fa73e2a51cf..12cf72dd0c4 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3174,6 +3174,10 @@ out: return rc; } +/* + * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon + * pointer may be NULL. + */ int CIFSTCon(unsigned int xid, struct cifs_ses *ses, const char *tree, struct cifs_tcon *tcon, @@ -3208,7 +3212,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, pSMB->AndXCommand = 0xFF; pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); bcc_ptr = &pSMB->Password[0]; - if ((ses->server->sec_mode) & SECMODE_USER) { + if (!tcon || (ses->server->sec_mode & SECMODE_USER)) { pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ *bcc_ptr = 0; /* password is null byte */ bcc_ptr++; /* skip password */ -- cgit v1.2.3-70-g09d2 From 773e9b442693b250aa6c452cb0cf5a9343f51cef Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 7 Jun 2011 20:57:14 -0700 Subject: ceph: fix page alignment corrections dd if=/dev/urandom of=/mnt/fs_depot/dd10 bs=500 seek=8388 count=1 dd if=/mnt/fs_depot/dd10 of=/root/dd10out bs=500 skip=8388 count=1 Reported-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/file.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9542f07d0b9..2be0f35afdf 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -290,7 +290,6 @@ static int striped_read(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; int io_align, page_align; - int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ int left, pages_left; int read; struct page **page_pos; @@ -326,12 +325,11 @@ more: ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); if (ret > 0) { - int didpages = - ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT; + int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; if (read < pos - off) { dout(" zero gap %llu to %llu\n", off + read, pos); - ceph_zero_page_vector_range(page_off + read, + ceph_zero_page_vector_range(page_align + read, pos - off - read, pages); } pos += ret; @@ -356,7 +354,7 @@ more: left = inode->i_size - pos; dout("zero tail %d\n", left); - ceph_zero_page_vector_range(page_off + read, left, + ceph_zero_page_vector_range(page_align + read, left, pages); read += left; } -- cgit v1.2.3-70-g09d2 From d7f124f129a6aea99938e0d4172c741b56fefeda Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 13 Jun 2011 16:22:18 -0700 Subject: ceph: fix sync and dio writes across stripe boundaries We were iterating across stripe boundaries properly, but not moving the write buffer pointer forward. This caused us to rewrite the same data after the break. Fix by adjusting the data pointer forward, and recalculating the io and buffer alignment after the break. Signed-off-by: Sage Weil --- fs/ceph/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2be0f35afdf..4698a5c553d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -476,9 +476,6 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, else pos = *offset; - io_align = pos & ~PAGE_MASK; - buf_align = (unsigned long)data & ~PAGE_MASK; - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -502,6 +499,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, * boundary. this isn't atomic, unfortunately. :( */ more: + io_align = pos & ~PAGE_MASK; + buf_align = (unsigned long)data & ~PAGE_MASK; len = left; if (file->f_flags & O_DIRECT) { /* write from beginning of first page, regardless of @@ -591,6 +590,7 @@ out: pos += len; written += len; left -= len; + data += written; if (left) goto more; -- cgit v1.2.3-70-g09d2 From 1123d93963cbd2546449d4d9f0c568e323cb0ac6 Mon Sep 17 00:00:00 2001 From: Max Asbock Date: Mon, 13 Jun 2011 10:18:32 -0700 Subject: timerfd: Fix wakeup of processes when timer is cancelled on clock change Currently processes waiting with poll on cancelable timerfd timers are not woken up when the timers are canceled. When the system time is set the clock_was_set() function calls timerfd_clock_was_set() to cancel and wake up processes waiting on potential cancelable timerfd timers. However the wake up currently has no effect because in the case of timerfd_read it is dependent on ctx->ticks not being 0. timerfd_poll also requires ctx->ticks being non zero. As a consequence processes waiting on cancelable timers only get woken up when the timers expire. This patch fixes this by incrementing ctx->ticks before calling wake_up. Signed-off-by: Max Asbock Cc: kay.sievers@vrfy.org Cc: virtuoso@slind.org Cc: johnstul Link: http://lkml.kernel.org/r/1307985512.4710.41.camel@w-amax.beaverton.ibm.com Signed-off-by: Thomas Gleixner --- fs/timerfd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/timerfd.c b/fs/timerfd.c index f67acbdda5e..dffeb3795af 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -61,7 +61,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) /* * Called when the clock was set to cancel the timers in the cancel - * list. + * list. This will wake up processes waiting on these timers. The + * wake-up requires ctx->ticks to be non zero, therefore we increment + * it before calling wake_up_locked(). */ void timerfd_clock_was_set(void) { @@ -76,6 +78,7 @@ void timerfd_clock_was_set(void) spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->moffs.tv64 != moffs.tv64) { ctx->moffs.tv64 = KTIME_MAX; + ctx->ticks++; wake_up_locked(&ctx->wqh); } spin_unlock_irqrestore(&ctx->wqh.lock, flags); -- cgit v1.2.3-70-g09d2 From 040d15c86747cf44fcf6b8ee19d805d4ef20caf3 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 14 Jun 2011 15:51:18 +0000 Subject: [CIFS] trivial cleanup fscache cFYI and cERROR messages ... for uniformity and cleaner debug logs. Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/cache.c | 6 +++--- fs/cifs/fscache.c | 51 ++++++++++++++++++++++++--------------------------- 2 files changed, 27 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index dd8584d35a1..545509c3313 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -92,7 +92,7 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data, break; default: - cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family); + cERROR(1, "Unknown network family '%d'", sa->sa_family); key_len = 0; break; } @@ -152,7 +152,7 @@ static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, sharename = extract_sharename(tcon->treeName); if (IS_ERR(sharename)) { - cFYI(1, "CIFS: couldn't extract sharename\n"); + cFYI(1, "%s: couldn't extract sharename\n", __func__); sharename = NULL; return 0; } @@ -302,7 +302,7 @@ static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) pagevec_init(&pvec, 0); first = 0; - cFYI(1, "cifs inode 0x%p now uncached", cifsi); + cFYI(1, "%s: cifs inode 0x%p now uncached", __func__, cifsi); for (;;) { nr_pages = pagevec_lookup(&pvec, diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index d368a47ba5e..816696621ec 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -28,14 +28,14 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, &cifs_fscache_server_index_def, server); - cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server, - server->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server, + server->fscache); } void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) { - cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server, - server->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server, + server->fscache); fscache_relinquish_cookie(server->fscache, 0); server->fscache = NULL; } @@ -47,13 +47,13 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) tcon->fscache = fscache_acquire_cookie(server->fscache, &cifs_fscache_super_index_def, tcon); - cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)", - server->fscache, tcon->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server->fscache, + tcon->fscache); } void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) { - cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache); + cFYI(1, "%s: (0x%p)", __func__, tcon->fscache); fscache_relinquish_cookie(tcon->fscache, 0); tcon->fscache = NULL; } @@ -70,8 +70,8 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { cifsi->fscache = fscache_acquire_cookie(tcon->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, - cifsi->fscache); + cFYI(1, "%s: got FH cookie (0x%p/0x%p)", __func__, + tcon->fscache, cifsi->fscache); } } @@ -80,8 +80,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - cFYI(1, "CIFS releasing inode cookie (0x%p)", - cifsi->fscache); + cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); fscache_relinquish_cookie(cifsi->fscache, 0); cifsi->fscache = NULL; } @@ -92,8 +91,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - cFYI(1, "CIFS disabling inode cookie (0x%p)", - cifsi->fscache); + cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); fscache_relinquish_cookie(cifsi->fscache, 1); cifsi->fscache = NULL; } @@ -121,8 +119,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode) cifs_sb_master_tcon(cifs_sb)->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p", - cifsi->fscache, old); + cFYI(1, "%s: new cookie 0x%p oldcookie 0x%p", + __func__, cifsi->fscache, old); } } @@ -132,8 +130,8 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp) struct inode *inode = page->mapping->host; struct cifsInodeInfo *cifsi = CIFS_I(inode); - cFYI(1, "CIFS: fscache release page (0x%p/0x%p)", - page, cifsi->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, page, + cifsi->fscache); if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) return 0; } @@ -144,8 +142,7 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp) static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, int error) { - cFYI(1, "CFS: readpage_from_fscache_complete (0x%p/%d)", - page, error); + cFYI(1, "%s: (0x%p/%d)", __func__, page, error); if (!error) SetPageUptodate(page); unlock_page(page); @@ -158,7 +155,7 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) { int ret; - cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p", + cFYI(1, "%s: (fsc:%p, p:%p, i:0x%p", __func__, CIFS_I(inode)->fscache, page, inode); ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, cifs_readpage_from_fscache_complete, @@ -167,11 +164,11 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) switch (ret) { case 0: /* page found in fscache, read submitted */ - cFYI(1, "CIFS: readpage_from_fscache: submitted"); + cFYI(1, "%s: submitted", __func__); return ret; case -ENOBUFS: /* page won't be cached */ case -ENODATA: /* page not in cache */ - cFYI(1, "CIFS: readpage_from_fscache %d", ret); + cFYI(1, "%s: %d", __func__, ret); return 1; default: @@ -190,7 +187,7 @@ int __cifs_readpages_from_fscache(struct inode *inode, { int ret; - cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)", + cFYI(1, "%s: (0x%p/%u/0x%p)", __func__, CIFS_I(inode)->fscache, *nr_pages, inode); ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, pages, nr_pages, @@ -199,12 +196,12 @@ int __cifs_readpages_from_fscache(struct inode *inode, mapping_gfp_mask(mapping)); switch (ret) { case 0: /* read submitted to the cache for all pages */ - cFYI(1, "CIFS: readpages_from_fscache: submitted"); + cFYI(1, "%s: submitted", __func__); return ret; case -ENOBUFS: /* some pages are not cached and can't be */ case -ENODATA: /* some pages are not cached */ - cFYI(1, "CIFS: readpages_from_fscache: no page"); + cFYI(1, "%s: no page", __func__); return 1; default: @@ -218,7 +215,7 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { int ret; - cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p", + cFYI(1, "%s: (fsc: %p, p: %p, i: %p)", __func__, CIFS_I(inode)->fscache, page, inode); ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); if (ret != 0) @@ -230,7 +227,7 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); struct fscache_cookie *cookie = cifsi->fscache; - cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie); + cFYI(1, "%s: (0x%p/0x%p)", __func__, page, cookie); fscache_wait_on_page_write(cookie, page); fscache_uncache_page(cookie, page); } -- cgit v1.2.3-70-g09d2 From c46a131c0c0f4c2457e6b1e430c578a5cb057334 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 Jun 2011 11:12:31 +0000 Subject: xfs: fix ->mknod() return value on xfs_get_acl() failure ->mknod() should return negative on errors and PTR_ERR() gives already negative value... Signed-off-by: Al Viro Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_iops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index dd21784525a..d44d92cd12b 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -182,7 +182,7 @@ xfs_vn_mknod( if (IS_POSIXACL(dir)) { default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(default_acl)) - return -PTR_ERR(default_acl); + return PTR_ERR(default_acl); if (!default_acl) mode &= ~current_umask(); -- cgit v1.2.3-70-g09d2 From 1252b3013b790c77e1c4f077a40542f86df37fb4 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 14 Jun 2011 16:19:54 +0000 Subject: [CIFS] update cifs version to 1.73 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 64313f778eb..0900e1658c9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -129,5 +129,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.72" +#define CIFS_VERSION "1.73" #endif /* _CIFSFS_H */ -- cgit v1.2.3-70-g09d2 From 9e3bd4e24e94d60d2e0762e919aab6c9a7fc0c5b Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 31 May 2011 21:46:50 -0400 Subject: NFS: fix umount of pnfs filesystems Unmounting a pnfs filesystem hangs using filelayout and possibly others. This fixes the use of the rcu protected node by making use of a new 'tmpnode' for the temporary purge list. Also, the spinlock shouldn't be held when calling synchronize_rcu(). Signed-off-by: Weston Andros Adamson Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.h | 1 + fs/nfs/pnfs_dev.c | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 48d0a8e4d06..96bf4e6f45b 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -186,6 +186,7 @@ int pnfs_ld_read_done(struct nfs_read_data *); /* pnfs_dev.c */ struct nfs4_deviceid_node { struct hlist_node node; + struct hlist_node tmpnode; const struct pnfs_layoutdriver_type *ld; const struct nfs_client *nfs_client; struct nfs4_deviceid deviceid; diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index c65e133ce9c..5944d4b369a 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -174,6 +174,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, const struct nfs4_deviceid *id) { INIT_HLIST_NODE(&d->node); + INIT_HLIST_NODE(&d->tmpnode); d->ld = ld; d->nfs_client = nfs_client; d->deviceid = *id; @@ -238,24 +239,29 @@ static void _deviceid_purge_client(const struct nfs_client *clp, long hash) { struct nfs4_deviceid_node *d; - struct hlist_node *n, *next; + struct hlist_node *n; HLIST_HEAD(tmp); + spin_lock(&nfs4_deviceid_lock); rcu_read_lock(); hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) if (d->nfs_client == clp && atomic_read(&d->ref)) { hlist_del_init_rcu(&d->node); - hlist_add_head(&d->node, &tmp); + hlist_add_head(&d->tmpnode, &tmp); } rcu_read_unlock(); + spin_unlock(&nfs4_deviceid_lock); if (hlist_empty(&tmp)) return; synchronize_rcu(); - hlist_for_each_entry_safe(d, n, next, &tmp, node) + while (!hlist_empty(&tmp)) { + d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); + hlist_del(&d->tmpnode); if (atomic_dec_and_test(&d->ref)) d->ld->free_deviceid_node(d); + } } void @@ -263,8 +269,8 @@ nfs4_deviceid_purge_client(const struct nfs_client *clp) { long h; - spin_lock(&nfs4_deviceid_lock); + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) + return; for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) _deviceid_purge_client(clp, h); - spin_unlock(&nfs4_deviceid_lock); } -- cgit v1.2.3-70-g09d2 From 0b760113a3a155269a3fba93a409c640031dd68f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 31 May 2011 15:15:34 -0400 Subject: NLM: Don't hang forever on NLM unlock requests If the NLM daemon is killed on the NFS server, we can currently end up hanging forever on an 'unlock' request, instead of aborting. Basically, if the rpcbind request fails, or the server keeps returning garbage, we really want to quit instead of retrying. Tested-by: Vasily Averin Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/lockd/clntproc.c | 8 +++++++- include/linux/sunrpc/sched.h | 3 ++- net/sunrpc/clnt.c | 3 +++ net/sunrpc/sched.c | 1 + 4 files changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index adb45ec9038..e374050a911 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -708,7 +708,13 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) if (task->tk_status < 0) { dprintk("lockd: unlock failed (err = %d)\n", -task->tk_status); - goto retry_rebind; + switch (task->tk_status) { + case -EACCES: + case -EIO: + goto die; + default: + goto retry_rebind; + } } if (status == NLM_LCK_DENIED_GRACE_PERIOD) { rpc_delay(task, NLMCLNT_GRACE_WAIT); diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index f73c482ec9c..fe2d8e6b923 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -84,7 +84,8 @@ struct rpc_task { #endif unsigned char tk_priority : 2,/* Task priority */ tk_garb_retry : 2, - tk_cred_retry : 2; + tk_cred_retry : 2, + tk_rebind_retry : 2; }; #define tk_xprt tk_client->cl_xprt diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b84d7395535..566bcfd067f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1175,6 +1175,9 @@ call_bind_status(struct rpc_task *task) status = -EOPNOTSUPP; break; } + if (task->tk_rebind_retry == 0) + break; + task->tk_rebind_retry--; rpc_delay(task, 3*HZ); goto retry_timeout; case -ETIMEDOUT: diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6b43ee7221d..a27406b1654 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -792,6 +792,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta /* Initialize retry counters */ task->tk_garb_retry = 2; task->tk_cred_retry = 2; + task->tk_rebind_retry = 2; task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW; task->tk_owner = current->tgid; -- cgit v1.2.3-70-g09d2 From 0f66b5984df2fe1617c05900a39a7ef493ca9de9 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Sat, 16 Oct 2010 22:07:46 -0700 Subject: NFS41: do not update isize if inode needs layoutcommit nfs_update_inode will update isize if there is no queued pages. For pNFS, layoutcommit is supposed to change file size on server, the same effect as queued pages. nfs_update_inode may be called when dirty pages are written back (nfsi->npages==0) but layoutcommit is not sent, and it will change client file size according to server file size. Then client ends up losing what it just writes back in pNFS path. So we should skip updating client file size if file needs layoutcommit. Signed-off-by: Peng Tao Cc: stable@kernel.org [2.6.39] Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 144f2a3c718..3f1eb818037 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1294,7 +1294,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if (nfsi->npages == 0 || new_isize > cur_isize) { + if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || + new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } -- cgit v1.2.3-70-g09d2 From c9c30dd5f73dccaa326a54dfcf490316946aea87 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sat, 11 Jun 2011 17:08:39 -0400 Subject: NFSv4.1: deprecate headerpadsz in CREATE_SESSION We don't support header padding yet so better off ditching it Reported-by: Sid Moore Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 ---- fs/nfs/nfs4xdr.c | 10 ++++++---- include/linux/nfs_xdr.h | 1 - 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d2c4b59c896..3ca44978954 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5098,7 +5098,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) if (mxresp_sz == 0) mxresp_sz = NFS_MAX_FILE_IO_SIZE; /* Fore channel attributes */ - args->fc_attrs.headerpadsz = 0; args->fc_attrs.max_rqst_sz = mxrqst_sz; args->fc_attrs.max_resp_sz = mxresp_sz; args->fc_attrs.max_ops = NFS4_MAX_OPS; @@ -5111,7 +5110,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ - args->bc_attrs.headerpadsz = 0; args->bc_attrs.max_rqst_sz = PAGE_SIZE; args->bc_attrs.max_resp_sz = PAGE_SIZE; args->bc_attrs.max_resp_sz_cached = 0; @@ -5131,8 +5129,6 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args struct nfs4_channel_attrs *sent = &args->fc_attrs; struct nfs4_channel_attrs *rcvd = &session->fc_attrs; - if (rcvd->headerpadsz > sent->headerpadsz) - return -EINVAL; if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; /* diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d869a5e5464..c4b7d6c0494 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1725,7 +1725,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(args->flags); /*flags */ /* Fore Channel */ - *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(0); /* header padding size */ *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */ @@ -1734,7 +1734,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ /* Back Channel */ - *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(0); /* header padding size */ *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ @@ -4997,12 +4997,14 @@ static int decode_chan_attrs(struct xdr_stream *xdr, struct nfs4_channel_attrs *attrs) { __be32 *p; - u32 nr_attrs; + u32 nr_attrs, val; p = xdr_inline_decode(xdr, 28); if (unlikely(!p)) goto out_overflow; - attrs->headerpadsz = be32_to_cpup(p++); + val = be32_to_cpup(p++); /* headerpadsz */ + if (val) + return -EINVAL; /* no support for header padding yet */ attrs->max_rqst_sz = be32_to_cpup(p++); attrs->max_resp_sz = be32_to_cpup(p++); attrs->max_resp_sz_cached = be32_to_cpup(p++); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 5e8444a11ad..00848d86ffb 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -158,7 +158,6 @@ struct nfs_seqid; /* nfs41 sessions channel attributes */ struct nfs4_channel_attrs { - u32 headerpadsz; u32 max_rqst_sz; u32 max_resp_sz; u32 max_resp_sz_cached; -- cgit v1.2.3-70-g09d2 From 1d92a08da23848a38eece4df7eaa4e8ec0e6c699 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Jun 2011 12:07:38 -0400 Subject: NFSv4.1: Fix a refcounting issue in the pNFS device id cache When we add something to the global device id cache, we need to bump the reference count, so that the cache itself holds a reference. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 5944d4b369a..f0f8e1e22f6 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -209,6 +209,7 @@ nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); spin_unlock(&nfs4_deviceid_lock); + atomic_inc(&new->ref); return new; } -- cgit v1.2.3-70-g09d2 From 533eb4611c9eea53072eb6a61d5a6393b6a77ed7 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 13 Jun 2011 18:25:56 -0400 Subject: NFSv4.1: allow nfs_fhget to succeed with mounted on fileid Commit 28331a46d88459788c8fca72dbb0415cd7f514c9 "Ensure we request the ordinary fileid when doing readdirplus" changed the meaning of NFS_ATTR_FATTR_FILEID which used to be set when FATTR4_WORD1_MOUNTED_ON_FILED was requested. Allow nfs_fhget to succeed with only a mounted on fileid when crossing a mountpoint or a referral. Ask for the fileid of the absent file system if mounted_on_fileid is not supported. Signed-off-by: Andy Adamson cc:stable@kernel.org [2.6.39] Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 ++- fs/nfs/internal.h | 11 +++++++++++ fs/nfs/nfs4proc.c | 33 +++++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3f1eb818037..6f4850deb27 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -256,7 +256,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfs_attr_check_mountpoint(sb, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0 && (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) + if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) && + !nfs_attr_use_mounted_on_fileid(fattr)) goto out_no_inode; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b9056cbe68d..2a55347a2da 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -45,6 +45,17 @@ static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; } +static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) +{ + if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || + (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && + ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) + return 0; + + fattr->fileid = fattr->mounted_on_fileid; + return 1; +} + struct nfs_clone_mount { const struct super_block *sb; const struct dentry *dentry; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3ca44978954..289c539bd6b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2265,12 +2265,14 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, return nfs4_map_errors(status); } +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); /* * Get locations and (maybe) other attributes of a referral. * Note that we'll actually follow the referral later when * we detect fsid mismatch in inode revalidation */ -static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle) +static int nfs4_get_referral(struct inode *dir, const struct qstr *name, + struct nfs_fattr *fattr, struct nfs_fh *fhandle) { int status = -ENOMEM; struct page *page = NULL; @@ -2288,15 +2290,16 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct goto out; /* Make sure server returned a different fsid for the referral */ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { - dprintk("%s: server did not return a different fsid for a referral at %s\n", __func__, name->name); + dprintk("%s: server did not return a different fsid for" + " a referral at %s\n", __func__, name->name); status = -EIO; goto out; } + /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ + nfs_fixup_referral_attributes(&locations->fattr); + /* replace the lookup nfs_fattr with the locations nfs_fattr */ memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); - fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL; - if (!fattr->mode) - fattr->mode = S_IFDIR; memset(fhandle, 0, sizeof(struct nfs_fh)); out: if (page) @@ -4667,11 +4670,15 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, return len; } +/* + * nfs_fhget will use either the mounted_on_fileid or the fileid + */ static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) { - if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) && - (fattr->valid & NFS_ATTR_FATTR_FSID) && - (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) + if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) || + (fattr->valid & NFS_ATTR_FATTR_FILEID)) && + (fattr->valid & NFS_ATTR_FATTR_FSID) && + (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) return; fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | @@ -4686,7 +4693,6 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs_server *server = NFS_SERVER(dir); u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, - [1] = FATTR4_WORD1_MOUNTED_ON_FILEID, }; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), @@ -4705,11 +4711,18 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, int status; dprintk("%s: start\n", __func__); + + /* Ask for the fileid of the absent filesystem if mounted_on_fileid + * is not supported */ + if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + else + bitmask[0] |= FATTR4_WORD0_FILEID; + nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); - nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; } -- cgit v1.2.3-70-g09d2 From cec765cf5891c7fc3d905832b481bfb6fd55825d Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 13 Jun 2011 18:36:17 -0400 Subject: NFSv4.1: allow zero fh array in filelayout decode layout Signed-off-by: Andy Adamson cc:stable@kernel.org [2.6.39] Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 426908809c9..5d6f369b15d 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -552,13 +552,18 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, __func__, nfl_util, fl->num_fh, fl->first_stripe_index, fl->pattern_offset); - if (!fl->num_fh) + /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. + * Futher checking is done in filelayout_check_layout */ + if (fl->num_fh < 0 || fl->num_fh > + max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) goto out_err; - fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), - gfp_flags); - if (!fl->fh_array) - goto out_err; + if (fl->num_fh > 0) { + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), + gfp_flags); + if (!fl->fh_array) + goto out_err; + } for (i = 0; i < fl->num_fh; i++) { /* Do we want to use a mempool here? */ -- cgit v1.2.3-70-g09d2 From a2e1d4f2e5ed83850de92a491ef225824cb457bd Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Mon, 13 Jun 2011 18:54:53 -0400 Subject: nfs4.1: fix several problems with _pnfs_return_layout _pnfs_return_layout had the following problems: - it did not call pnfs_free_lseg_list on all paths - it unintentionally did a forgetful return when there was no outstanding io - it raced with concurrent LAYOUTGETS Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 ++++---- fs/nfs/pnfs.c | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 289c539bd6b..5879b23e0c9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5706,6 +5706,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct nfs_server *server; + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; dprintk("--> %s\n", __func__); @@ -5717,16 +5718,15 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) nfs_restart_rpc(task, lrp->clp); return; } + spin_lock(&lo->plh_inode->i_lock); if (task->tk_status == 0) { - struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; - if (lrp->res.lrs_present) { - spin_lock(&lo->plh_inode->i_lock); pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - spin_unlock(&lo->plh_inode->i_lock); } else BUG_ON(!list_empty(&lo->plh_segs)); } + lo->plh_block_lgets--; + spin_unlock(&lo->plh_inode->i_lock); dprintk("<-- %s\n", __func__); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8c1309d852a..25de6b27bdf 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -634,12 +634,14 @@ _pnfs_return_layout(struct inode *ino) spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) { + if (!lo) { spin_unlock(&ino->i_lock); - dprintk("%s: no layout segments to return\n", __func__); - goto out; + dprintk("%s: no layout to return\n", __func__); + return status; } stateid = nfsi->layout->plh_stateid; + mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + lo->plh_block_lgets++; /* Reference matched in nfs4_layoutreturn_release */ get_layout_hdr(lo); spin_unlock(&ino->i_lock); -- cgit v1.2.3-70-g09d2 From d771e3a43e23a37398b7e05a9d1b1036d698263c Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 14 Jun 2011 16:30:16 -0400 Subject: NFSv4.1: fix break condition in pnfs_find_lseg The break condition to skip out of the loop got broken when cmp_layout was change. Essentially, we want to stop looking once we know no layout on the remainder of the list can match the first byte of the looked-up range. Reported-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 25de6b27bdf..d066aad608a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -889,7 +889,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, ret = get_lseg(lseg); break; } - if (cmp_layout(range, &lseg->pls_range) > 0) + if (lseg->pls_range.offset > range->offset) break; } -- cgit v1.2.3-70-g09d2 From c7fd06228b994190d8369a2a0acf5224e4e13d1a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 15 Jun 2011 00:55:44 +0100 Subject: NFS: (d)printks should use %zd for ssize_t arguments (d)printks should use %zd for ssize_t arguments not %ld, otherwise they might get a warning. I see the following with MN10300. fs/nfs/objlayout/objlayout.c: In function 'objlayout_read_done': fs/nfs/objlayout/objlayout.c:294: warning: format '%ld' expects type 'long int', but argument 3 has type 'ssize_t' Signed-off-by: David Howells cc: Trond Myklebust cc: linux-nfs@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objlayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index dc3956c0de8..1d06f8e2ade 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -291,7 +291,7 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) struct nfs_read_data *rdata; state->status = status; - dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); + dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); rdata = state->rpcdata; rdata->task.tk_status = status; if (status >= 0) { -- cgit v1.2.3-70-g09d2 From 1ed3a8539af7b36aa5c977f304e80f7fc8d27bfc Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 15 Jun 2011 11:39:57 -0400 Subject: NFSv4.1: need to put_layout_hdr on _pnfs_return_layout error path We always get a reference on the layout header and we rely on nfs4_layoutreturn_release to put it. If we hit an allocation error before starting the rpc proc we bail out early without dereferncing the layout header properly. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d066aad608a..8f958228125 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -652,6 +652,7 @@ _pnfs_return_layout(struct inode *ino) lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; + put_layout_hdr(lo); goto out; } -- cgit v1.2.3-70-g09d2 From ea0ded748bdea78f9e2fefb571f7d6ce9edb4f89 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 15 Jun 2011 12:31:02 -0400 Subject: nfs4.1: prevent race that allowed use of freed layout in _pnfs_return_layout mark_matching_lsegs_invalid could put the last ref to the layout, so the get_layout_hdr needs to be called first. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8f958228125..730d4dbbaf6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -640,10 +640,10 @@ _pnfs_return_layout(struct inode *ino) return status; } stateid = nfsi->layout->plh_stateid; - mark_matching_lsegs_invalid(lo, &tmp_list, NULL); - lo->plh_block_lgets++; /* Reference matched in nfs4_layoutreturn_release */ get_layout_hdr(lo); + mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + lo->plh_block_lgets++; spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); -- cgit v1.2.3-70-g09d2 From 71d7aed014457147e8f71a843d5fbf03235e4a85 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 14 Jun 2011 14:24:32 -0400 Subject: Btrfs: fix path leakage on subvol deletion The delayed ref patch accidently removed the btrfs_free_path in btrfs_unlink_subvol, this puts it back and means we don't leak a path. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c15636b1787..5813dec5101 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3076,6 +3076,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); + btrfs_free_path(path); return 0; } -- cgit v1.2.3-70-g09d2 From 8351583e3f6e430ce8f71913909a96ad5cc6a2f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 14 Jun 2011 15:16:14 -0400 Subject: Btrfs: protect the pending_snapshots list with trans_lock Currently there is nothing protecting the pending_snapshots list on the transaction. We only hold the directory mutex that we are snapshotting and a read lock on the subvol_sem, so we could race with somebody else creating a snapshot in a different directory and end up with list corruption. So protect this list with the trans_lock. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b793d112d1f..a3c4751e07d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -482,8 +482,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); + spin_lock(&root->fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); + spin_unlock(&root->fs_info->trans_lock); if (async_transid) { *async_transid = trans->transid; ret = btrfs_commit_transaction_async(trans, -- cgit v1.2.3-70-g09d2 From ed0ca14021e5ae3147602128641aa7f742ab227c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 14 Jun 2011 16:22:15 -0400 Subject: Btrfs: set no_trans_join after trying to expand the transaction We can lockup if we try to allow new writers join the transaction and we have flushoncommit set or have a pending snapshot. This is because we set no_trans_join and then loop around and try to wait for ordered extents again. The problem is the ordered endio stuff needs to join the transaction, which it can't do because no_trans_join is set. So instead wait until after this loop to set no_trans_join and then make sure to wait for num_writers == 1 in case anybody got started in between us exiting the loop and setting no_trans_join. This could easily be reproduced by mounting -o flushoncommit and running xfstest 13. It cannot be reproduced with this patch. Thanks, Reported-by: Jim Schutt Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b3590b9fe9..56695595e03 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1241,12 +1241,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, schedule_timeout(1); finish_wait(&cur_trans->writer_wait, &wait); - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 1; - spin_unlock(&root->fs_info->trans_lock); } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); + /* + * Ok now we need to make sure to block out any other joins while we + * commit the transaction. We could have started a join before setting + * no_join so make sure to wait for num_writers to == 1 again. + */ + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); -- cgit v1.2.3-70-g09d2 From 9e2dfdb3081edfae66a49013517e80dd8a0469fa Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 15 Jun 2011 14:32:02 -0400 Subject: nfs4.1: mark layout as bad on error path in _pnfs_return_layout Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 730d4dbbaf6..539b94cb6c0 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -652,6 +652,8 @@ _pnfs_return_layout(struct inode *ino) lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; + set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); + set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); put_layout_hdr(lo); goto out; } -- cgit v1.2.3-70-g09d2 From 793925334f32e9026c22baee5c3c340f47d4ef7e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Jun 2011 12:47:04 -0700 Subject: proc: Fix Oops on stat of /proc//ns/net Don't call iput with the inode half setup to be a namespace filedescriptor. Instead rearrange the code so that we don't initialize ei->ns_ops until after I ns_ops->get succeeds, preventing us from invoking ns_ops->put when ns_ops->get failed. Reported-by: Ingo Saitz Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 781dec5bd68..be177f702ac 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -38,18 +38,21 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); + void *ns; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; + ns = ns_ops->get(task); + if (!ns) + goto out_iput; + ei = PROC_I(inode); inode->i_mode = S_IFREG|S_IRUSR; inode->i_fop = &ns_file_operations; ei->ns_ops = ns_ops; - ei->ns = ns_ops->get(task); - if (!ei->ns) - goto out_iput; + ei->ns = ns; dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); -- cgit v1.2.3-70-g09d2 From 7f81c8890c15a10f5220bebae3b6dfae4961962a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Jun 2011 15:08:11 -0700 Subject: fs/exec.c: use BUILD_BUG_ON for VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP Commit a8bef8ff6ea1 ("mm: migration: avoid race between shift_arg_pages() and rmap_walk() during migration by not migrating temporary stacks") introduced a BUG_ON() to ensure that VM_STACK_FLAGS and VM_STACK_INCOMPLETE_SETUP do not overlap. The check is a compile time one, so BUILD_BUG_ON is more appropriate. Signed-off-by: Michal Hocko Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 97e0d52d72f..b54f74f3cd8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -277,7 +277,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; -- cgit v1.2.3-70-g09d2 From 13fca640bb8ab611a50e0ba120b186faa2994d6c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 15 Jun 2011 21:53:52 -0700 Subject: Revert "fs/exec.c: use BUILD_BUG_ON for VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP" This reverts commit 7f81c8890c15a10f5220bebae3b6dfae4961962a. It turns out that it's not actually a build-time check on x86-64 UML, which does some seriously crazy stuff with VM_STACK_FLAGS. The VM_STACK_FLAGS define depends on the arch-supplied VM_STACK_DEFAULT_FLAGS value, and on x86-64 UML we have arch/um/sys-x86_64/shared/sysdep/vm-flags.h: #define VM_STACK_DEFAULT_FLAGS \ (test_thread_flag(TIF_IA32) ? vm_stack_flags32 : vm_stack_flags) #define VM_STACK_DEFAULT_FLAGS vm_stack_flags (yes, seriously: two different #define's for that thing, with the first one being inside an "#ifdef TIF_IA32") It's possible that it is UML that should just be fixed in this area, but for now let's just undo the (very small) optimization. Reported-by: Randy Dunlap Acked-by: Andrew Morton Cc: Michal Hocko Cc: Richard Weinberger Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index b54f74f3cd8..97e0d52d72f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -277,7 +277,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; -- cgit v1.2.3-70-g09d2 From 50338b889dc504c69e0cb316ac92d1b9e51f3c8a Mon Sep 17 00:00:00 2001 From: Török Edwin Date: Thu, 16 Jun 2011 00:06:14 +0300 Subject: fix wrong iput on d_inode introduced by e6bc45d65d MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Git bisection shows that commit e6bc45d65df8599fdbae73be9cec4ceed274db53 causes BUG_ONs under high I/O load: kernel BUG at fs/inode.c:1368! [ 2862.501007] Call Trace: [ 2862.501007] [] d_kill+0xf8/0x140 [ 2862.501007] [] dput+0xc9/0x190 [ 2862.501007] [] fput+0x15f/0x210 [ 2862.501007] [] filp_close+0x61/0x90 [ 2862.501007] [] sys_close+0xb1/0x110 [ 2862.501007] [] system_call_fastpath+0x16/0x1b A reliable way to reproduce this bug is: Login to KDE, run 'rsnapshot sync', and apt-get install openjdk-6-jdk, and apt-get remove openjdk-6-jdk. The buggy part of the patch is this: struct inode *inode = NULL; ..... - if (nd.last.name[nd.last.len]) - goto slashes; inode = dentry->d_inode; - if (inode) - ihold(inode); + if (nd.last.name[nd.last.len] || !inode) + goto slashes; + ihold(inode) ... if (inode) iput(inode); /* truncate the inode here */ If nd.last.name[nd.last.len] is nonzero (and thus goto slashes branch is taken), and dentry->d_inode is non-NULL, then this code now does an additional iput on the inode, which is wrong. Fix this by only setting the inode variable if nd.last.name[nd.last.len] is 0. Reference: https://lkml.org/lkml/2011/6/15/50 Reported-by: Norbert Preining Reported-by: Török Edwin Cc: "Theodore Ts'o" Cc: Al Viro Signed-off-by: Török Edwin Signed-off-by: Al Viro --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9802345df5e..6301963b161 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2713,8 +2713,10 @@ static long do_unlinkat(int dfd, const char __user *pathname) error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ + if (nd.last.name[nd.last.len]) + goto slashes; inode = dentry->d_inode; - if (nd.last.name[nd.last.len] || !inode) + if (!inode) goto slashes; ihold(inode); error = mnt_want_write(nd.path.mnt); -- cgit v1.2.3-70-g09d2 From 8aef18845266f5c05904c610088f2d1ed58f6be3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 16 Jun 2011 15:10:06 +0100 Subject: VFS: Fix vfsmount overput on simultaneous automount [Kudos to dhowells for tracking that crap down] If two processes attempt to cause automounting on the same mountpoint at the same time, the vfsmount holding the mountpoint will be left with one too few references on it, causing a BUG when the kernel tries to clean up. The problem is that lock_mount() drops the caller's reference to the mountpoint's vfsmount in the case where it finds something already mounted on the mountpoint as it transits to the mounted filesystem and replaces path->mnt with the new mountpoint vfsmount. During a pathwalk, however, we don't take a reference on the vfsmount if it is the same as the one in the nameidata struct, but do_add_mount() doesn't know this. The fix is to make sure we have a ref on the vfsmount of the mountpoint before calling do_add_mount(). However, if lock_mount() doesn't transit, we're then left with an extra ref on the mountpoint vfsmount which needs releasing. We can handle that in follow_managed() by not making assumptions about what we can and what we cannot get from lookup_mnt() as the current code does. The callers of follow_managed() expect that reference to path->mnt will be grabbed iff path->mnt has been changed. follow_managed() and follow_automount() keep track of whether such reference has been grabbed and assume that it'll happen in those and only those cases that'll have us return with changed path->mnt. That assumption is almost correct - it breaks in case of racing automounts and in even harder to hit race between following a mountpoint and a couple of mount --move. The thing is, we don't need to make that assumption at all - after the end of loop in follow_manage() we can check if path->mnt has ended up unchanged and do mntput() if needed. The BUG can be reproduced with the following test program: #include #include #include #include #include int main(int argc, char **argv) { int pid, ws; struct stat buf; pid = fork(); stat(argv[1], &buf); if (pid > 0) wait(&ws); return 0; } and the following procedure: (1) Mount an NFS volume that on the server has something else mounted on a subdirectory. For instance, I can mount / from my server: mount warthog:/ /mnt -t nfs4 -r On the server /data has another filesystem mounted on it, so NFS will see a change in FSID as it walks down the path, and will mark /mnt/data as being a mountpoint. This will cause the automount code to be triggered. !!! Do not look inside the mounted fs at this point !!! (2) Run the above program on a file within the submount to generate two simultaneous automount requests: /tmp/forkstat /mnt/data/testfile (3) Unmount the automounted submount: umount /mnt/data (4) Unmount the original mount: umount /mnt At this point the kernel should throw a BUG with something like the following: BUG: Dentry ffff880032e3c5c0{i=2,n=} still in use (1) [unmount of nfs4 0:12] Note that the bug appears on the root dentry of the original mount, not the mountpoint and not the submount because sys_umount() hasn't got to its final mntput_no_expire() yet, but this isn't so obvious from the call trace: [] shrink_dcache_for_umount+0x69/0x82 [] generic_shutdown_super+0x37/0x15b [] ? nfs_super_return_all_delegations+0x2e/0x1b1 [nfs] [] kill_anon_super+0x1d/0x7e [] nfs4_kill_super+0x60/0xb6 [nfs] [] deactivate_locked_super+0x34/0x83 [] deactivate_super+0x6f/0x7b [] mntput_no_expire+0x18d/0x199 [] mntput+0x3b/0x44 [] release_mounts+0xa2/0xbf [] sys_umount+0x47a/0x4ba [] ? trace_hardirqs_on_caller+0x1fd/0x22f [] system_call_fastpath+0x16/0x1b as do_umount() is inlined. However, you can see release_mounts() in there. Note also that it may be necessary to have multiple CPU cores to be able to trigger this bug. Tested-by: Jeff Layton Tested-by: Ian Kent Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/namei.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 6301963b161..9e425e7e6c8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -812,6 +812,11 @@ static int follow_automount(struct path *path, unsigned flags, if (!mnt) /* mount collision */ return 0; + if (!*need_mntput) { + /* lock_mount() may release path->mnt on error */ + mntget(path->mnt); + *need_mntput = true; + } err = finish_automount(mnt, path); switch (err) { @@ -819,12 +824,9 @@ static int follow_automount(struct path *path, unsigned flags, /* Someone else made a mount here whilst we were busy */ return 0; case 0: - dput(path->dentry); - if (*need_mntput) - mntput(path->mnt); + path_put(path); path->mnt = mnt; path->dentry = dget(mnt->mnt_root); - *need_mntput = true; return 0; default: return err; @@ -844,9 +846,10 @@ static int follow_automount(struct path *path, unsigned flags, */ static int follow_managed(struct path *path, unsigned flags) { + struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ unsigned managed; bool need_mntput = false; - int ret; + int ret = 0; /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see @@ -861,7 +864,7 @@ static int follow_managed(struct path *path, unsigned flags) BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path->dentry, false); if (ret < 0) - return ret == -EISDIR ? 0 : ret; + break; } /* Transit to a mounted filesystem. */ @@ -887,14 +890,19 @@ static int follow_managed(struct path *path, unsigned flags) if (managed & DCACHE_NEED_AUTOMOUNT) { ret = follow_automount(path, flags, &need_mntput); if (ret < 0) - return ret == -EISDIR ? 0 : ret; + break; continue; } /* We didn't change the current path point */ break; } - return 0; + + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (ret == -EISDIR) + ret = 0; + return ret; } int follow_down_one(struct path *path) -- cgit v1.2.3-70-g09d2 From 5e7f23373bf9a853e9256e81e86724cdd0a33c29 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Jun 2011 22:31:12 +0100 Subject: afs: afs_fill_page reads too much, or wrong data afs_fill_page should read the page that is about to be written but the current implementation has a number of issues. If we aren't extending the file we always read PAGE_CACHE_SIZE at offset 0. If we are extending the file we try to read the entire file. Change afs_fill_page to read PAGE_CACHE_SIZE at the right offset, clamped to i_size. While here, avoid calling afs_fill_page when we are doing a PAGE_CACHE_SIZE write. Signed-off-by: Anton Blanchard Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/write.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 789b3afb342..b806285ff85 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -84,23 +84,21 @@ void afs_put_writeback(struct afs_writeback *wb) * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - loff_t pos, unsigned len, struct page *page) + loff_t pos, struct page *page) { loff_t i_size; - unsigned eof; int ret; + int len; - _enter(",,%llu,%u", (unsigned long long)pos, len); - - ASSERTCMP(len, <=, PAGE_CACHE_SIZE); + _enter(",,%llu", (unsigned long long)pos); i_size = i_size_read(&vnode->vfs_inode); - if (pos + len > i_size) - eof = i_size; + if (pos + PAGE_CACHE_SIZE > i_size) + len = i_size - pos; else - eof = PAGE_CACHE_SIZE; + len = PAGE_CACHE_SIZE; - ret = afs_vnode_fetch_data(vnode, key, 0, eof, page); + ret = afs_vnode_fetch_data(vnode, key, pos, len, page); if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -153,9 +151,8 @@ int afs_write_begin(struct file *file, struct address_space *mapping, *pagep = page; /* page won't leak in error case: it eventually gets cleaned off LRU */ - if (!PageUptodate(page)) { - _debug("not up to date"); - ret = afs_fill_page(vnode, key, pos, len, page); + if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { + ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); -- cgit v1.2.3-70-g09d2 From f9f07b6c1372b1436aa6b45333445b443ffd8c95 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 14 Jun 2011 00:58:27 +0200 Subject: vfs: Fix data corruption after failed write in __block_write_begin() I've got a report of a file corruption from fsxlinux on ext3. The important operations to the page were: mapwrite to a hole partial write to the page read - found the page zeroed from the end of the normal write The culprit seems to be that if get_block() fails in __block_write_begin() (e.g. transient ENOSPC in ext3), the function does ClearPageUptodate(page). Thus when we retry the write, the logic in __block_write_begin() thinks zeroing of the page is needed and overwrites old data. In fact, I don't see why we should ever need to zero the uptodate bit here - either the page was uptodate when we entered __block_write_begin() and it should stay so when we leave it, or it was not uptodate and noone had right to set it uptodate during __block_write_begin() so it remains !uptodate when we leave as well. So just remove clearing of the bit. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/buffer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 49c9aada037..1a80b048ade 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1902,10 +1902,8 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, if (!buffer_uptodate(*wait_bh)) err = -EIO; } - if (unlikely(err)) { + if (unlikely(err)) page_zero_new_buffers(page, from, to); - ClearPageUptodate(page); - } return err; } EXPORT_SYMBOL(__block_write_begin); -- cgit v1.2.3-70-g09d2 From 2e41ae225f742ded5b7d9847cd8bd605f27daba8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jun 2011 00:38:44 +0100 Subject: AFS: Set s_id in the superblock to the volume name Set s_id in the superblock to the name of the AFS volume that this superblock corresponds to. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/afs/super.c b/fs/afs/super.c index b7d48d7eaa1..356dcf0929e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -313,6 +313,7 @@ static int afs_fill_super(struct super_block *sb, sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; sb->s_bdi = &as->volume->bdi; + strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); /* allocate the root inode and dentry */ fid.vid = as->volume->vid; -- cgit v1.2.3-70-g09d2 From d6e43f751f252c68ca69fa6d18665d88d69ef8b7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jun 2011 00:45:44 +0100 Subject: AFS: Use i_generation not i_version for the vnode uniquifier Store the AFS vnode uniquifier in the i_generation field, not the i_version field of the inode struct. i_version can then be given the AFS data version number. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/dir.c | 8 ++++---- fs/afs/fsclient.c | 3 ++- fs/afs/inode.c | 10 +++++----- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 20c106f2492..1b0b1955001 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -584,11 +584,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, success: d_add(dentry, inode); - _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", + _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", fid.vnode, fid.unique, dentry->d_inode->i_ino, - (unsigned long long)dentry->d_inode->i_version); + dentry->d_inode->i_generation); return NULL; } @@ -671,10 +671,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) * been deleted and replaced, and the original vnode ID has * been reused */ if (fid.unique != vnode->fid.unique) { - _debug("%s: file deleted (uq %u -> %u I:%llu)", + _debug("%s: file deleted (uq %u -> %u I:%u)", dentry->d_name.name, fid.unique, vnode->fid.unique, - (unsigned long long)dentry->d_inode->i_version); + dentry->d_inode->i_generation); spin_lock(&vnode->lock); set_bit(AFS_VNODE_DELETED, &vnode->flags); spin_unlock(&vnode->lock); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 4bd0218473a..346e3289abd 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -89,7 +89,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, i_size_write(&vnode->vfs_inode, size); vnode->vfs_inode.i_uid = status->owner; vnode->vfs_inode.i_gid = status->group; - vnode->vfs_inode.i_version = vnode->fid.unique; + vnode->vfs_inode.i_generation = vnode->fid.unique; vnode->vfs_inode.i_nlink = status->nlink; mode = vnode->vfs_inode.i_mode; @@ -102,6 +102,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server; vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; + vnode->vfs_inode.i_version = data_version; } expected_version = status->data_version; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index db66c520147..0fdab6e03d8 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -75,7 +75,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_ctime.tv_nsec = 0; inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; - inode->i_version = vnode->fid.unique; + inode->i_generation = vnode->fid.unique; + inode->i_version = vnode->status.data_version; inode->i_mapping->a_ops = &afs_fs_aops; /* check to see whether a symbolic link is really a mountpoint */ @@ -100,7 +101,7 @@ static int afs_iget5_test(struct inode *inode, void *opaque) struct afs_iget_data *data = opaque; return inode->i_ino == data->fid.vnode && - inode->i_version == data->fid.unique; + inode->i_generation == data->fid.unique; } /* @@ -122,7 +123,7 @@ static int afs_iget5_set(struct inode *inode, void *opaque) struct afs_vnode *vnode = AFS_FS_I(inode); inode->i_ino = data->fid.vnode; - inode->i_version = data->fid.unique; + inode->i_generation = data->fid.unique; vnode->fid = data->fid; vnode->volume = data->volume; @@ -380,8 +381,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, inode = dentry->d_inode; - _enter("{ ino=%lu v=%llu }", inode->i_ino, - (unsigned long long)inode->i_version); + _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); generic_fillattr(inode, stat); return 0; -- cgit v1.2.3-70-g09d2 From a27a263bae072a499acc77b632238a6dacccf888 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jun 2011 12:02:23 +0000 Subject: xfs: make log devices with write back caches work There's no reason not to support cache flushing on external log devices. The only thing this really requires is flushing the data device first both in fsync and log commits. A side effect is that we also have to remove the barrier write test during mount, which has been superflous since the new FLUSH+FUA code anyway. Also use the chance to flush the RT subvolume write cache before the fsync commit, which is required for correct semantics. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_file.c | 50 ++++++++++++++++++----------- fs/xfs/linux-2.6/xfs_super.c | 75 -------------------------------------------- fs/xfs/xfs_log.c | 11 ++++++- 3 files changed, 41 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index f4213ba1ff8..7f782af286b 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -131,19 +131,34 @@ xfs_file_fsync( { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = 0; int log_flushed = 0; trace_xfs_file_fsync(ip); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); xfs_iflags_clear(ip, XFS_ITRUNCATED); xfs_ioend_wait(ip); + if (mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an RT and/or log subvolume we need to make sure + * to flush the write cache the device used for file data + * first. This is to ensure newly written file data make + * it to disk before logging the new inode size in case of + * an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + } + /* * We always need to make sure that the required inode state is safe on * disk. The inode might be clean but we still might need to force the @@ -175,9 +190,9 @@ xfs_file_fsync( * updates. The sync transaction will also force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); + XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return -error; @@ -209,28 +224,25 @@ xfs_file_fsync( * force the log. */ if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(ip->i_mount, + error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, XFS_LOG_SYNC, &log_flushed); } xfs_iunlock(ip, XFS_ILOCK_SHARED); } - if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { - /* - * If the log write didn't issue an ordered tag we need - * to flush the disk cache for the data device now. - */ - if (!log_flushed) - xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); - - /* - * If this inode is on the RT dev we need to flush that - * cache as well. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); - } + /* + * If we only have a single device, and the log force about was + * a no-op we might have to flush the data device cache here. + * This can only happen for fdatasync/O_DSYNC if we were overwriting + * an already allocated file and thus do not have any metadata to + * commit. + */ + if ((mp->m_flags & XFS_MOUNT_BARRIER) && + mp->m_logdev_targp == mp->m_ddev_targp && + !XFS_IS_REALTIME_INODE(ip) && + !log_flushed) + xfs_blkdev_issue_flush(mp->m_ddev_targp); return -error; } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1e3a7ce804d..a1a881e68a9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -627,68 +627,6 @@ xfs_blkdev_put( blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -/* - * Try to write out the superblock using barriers. - */ -STATIC int -xfs_barrier_test( - xfs_mount_t *mp) -{ - xfs_buf_t *sbp = xfs_getsb(mp, 0); - int error; - - XFS_BUF_UNDONE(sbp); - XFS_BUF_UNREAD(sbp); - XFS_BUF_UNDELAYWRITE(sbp); - XFS_BUF_WRITE(sbp); - XFS_BUF_UNASYNC(sbp); - XFS_BUF_ORDERED(sbp); - - xfsbdstrat(mp, sbp); - error = xfs_buf_iowait(sbp); - - /* - * Clear all the flags we set and possible error state in the - * buffer. We only did the write to try out whether barriers - * worked and shouldn't leave any traces in the superblock - * buffer. - */ - XFS_BUF_DONE(sbp); - XFS_BUF_ERROR(sbp, 0); - XFS_BUF_UNORDERED(sbp); - - xfs_buf_relse(sbp); - return error; -} - -STATIC void -xfs_mountfs_check_barriers(xfs_mount_t *mp) -{ - int error; - - if (mp->m_logdev_targp != mp->m_ddev_targp) { - xfs_notice(mp, - "Disabling barriers, not supported with external log device"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - - if (xfs_readonly_buftarg(mp->m_ddev_targp)) { - xfs_notice(mp, - "Disabling barriers, underlying device is readonly"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - - error = xfs_barrier_test(mp); - if (error) { - xfs_notice(mp, - "Disabling barriers, trial barrier write failed"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } -} - void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) @@ -1240,14 +1178,6 @@ xfs_fs_remount( switch (token) { case Opt_barrier: mp->m_flags |= XFS_MOUNT_BARRIER; - - /* - * Test if barriers are actually working if we can, - * else delay this check until the filesystem is - * marked writeable. - */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) - xfs_mountfs_check_barriers(mp); break; case Opt_nobarrier: mp->m_flags &= ~XFS_MOUNT_BARRIER; @@ -1282,8 +1212,6 @@ xfs_fs_remount( /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { mp->m_flags &= ~XFS_MOUNT_RDONLY; - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_mountfs_check_barriers(mp); /* * If this is the first remount to writeable state we @@ -1465,9 +1393,6 @@ xfs_fs_fill_super( if (error) goto out_free_sb; - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_mountfs_check_barriers(mp); - error = xfs_filestream_mount(mp); if (error) goto out_free_sb; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 211930246f2..41d5b8f2bf9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1372,8 +1372,17 @@ xlog_sync(xlog_t *log, XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_LOG_BUFFER; - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an external log device, flush the data device + * before flushing the log to make sure all meta data + * written back from the AIL actually made it to disk + * before writing out the new log tail LSN in the log buffer. + */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); XFS_BUF_ORDERED(bp); + } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); -- cgit v1.2.3-70-g09d2 From ee7b75fc4f3ae49e1f25bf56219bb5de3c29afaf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 16 Jun 2011 13:15:41 -0400 Subject: NFSv4: Fix a readdir regression Commit 7ebb9315 (NFS: use secinfo when crossing mountpoints) introduces a regression when decoding an NFSv4 readdir entry that sets the rdattr_error field. By treating the resulting value as if it is a decoding error, the current code may cause us to skip valid readdir entries. Reported-by: Andy Adamson Cc: stable@kernel.org [2.6.39] Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c4b7d6c0494..5db44a8dddf 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3098,7 +3098,7 @@ out_overflow: return -EIO; } -static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) +static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) { __be32 *p; @@ -3109,7 +3109,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) if (unlikely(!p)) goto out_overflow; bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; - return -be32_to_cpup(p); + *res = -be32_to_cpup(p); } return 0; out_overflow: @@ -4070,6 +4070,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, int status; umode_t fmode = 0; uint32_t type; + int32_t err; status = decode_attr_type(xdr, bitmap, &type); if (status < 0) @@ -4095,13 +4096,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_error(xdr, bitmap); - if (status == -NFS4ERR_WRONGSEC) { - nfs_fixup_secinfo_attributes(fattr, fh); - status = 0; - } + err = 0; + status = decode_attr_error(xdr, bitmap, &err); if (status < 0) goto xdr_error; + if (err == -NFS4ERR_WRONGSEC) + nfs_fixup_secinfo_attributes(fattr, fh); status = decode_attr_filehandle(xdr, bitmap, fh); if (status < 0) -- cgit v1.2.3-70-g09d2 From 879669961b11e7f40b518784863a259f735a72bf Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Jun 2011 11:25:59 +0100 Subject: KEYS/DNS: Fix ____call_usermodehelper() to not lose the session keyring ____call_usermodehelper() now erases any credentials set by the subprocess_inf::init() function. The problem is that commit 17f60a7da150 ("capabilites: allow the application of capability limits to usermode helpers") creates and commits new credentials with prepare_kernel_cred() after the call to the init() function. This wipes all keyrings after umh_keys_init() is called. The best way to deal with this is to put the init() call just prior to the commit_creds() call, and pass the cred pointer to init(). That means that umh_keys_init() and suchlike can modify the credentials _before_ they are published and potentially in use by the rest of the system. This prevents request_key() from working as it is prevented from passing the session keyring it set up with the authorisation token to /sbin/request-key, and so the latter can't assume the authority to instantiate the key. This causes the in-kernel DNS resolver to fail with ENOKEY unconditionally. Signed-off-by: David Howells Acked-by: Eric Paris Tested-by: Jeff Layton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/kmod.h | 8 ++++---- kernel/kmod.c | 16 +++++++++------- security/keys/request_key.c | 3 +-- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 97e0d52d72f..6075a1e727a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1996,7 +1996,7 @@ static void wait_for_dump_helpers(struct file *file) * is a special value that we use to trap recursive * core dumps */ -static int umh_pipe_setup(struct subprocess_info *info) +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *rp, *wp; struct fdtable *fdt; diff --git a/include/linux/kmod.h b/include/linux/kmod.h index d4a5c84c503..0da38cf7db7 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -45,7 +45,7 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; #endif -struct key; +struct cred; struct file; enum umh_wait { @@ -62,7 +62,7 @@ struct subprocess_info { char **envp; enum umh_wait wait; int retval; - int (*init)(struct subprocess_info *info); + int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; }; @@ -73,7 +73,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, /* Set various pieces of state into the subprocess_info structure */ void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info), + int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data); @@ -87,7 +87,7 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info); static inline int call_usermodehelper_fns(char *path, char **argv, char **envp, enum umh_wait wait, - int (*init)(struct subprocess_info *info), + int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data) { struct subprocess_info *info; diff --git a/kernel/kmod.c b/kernel/kmod.c index ad6a81c58b4..47613dfb7b2 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -156,12 +156,6 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - if (sub_info->init) { - retval = sub_info->init(sub_info); - if (retval) - goto fail; - } - retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) @@ -173,6 +167,14 @@ static int ____call_usermodehelper(void *data) new->cap_inheritable); spin_unlock(&umh_sysctl_lock); + if (sub_info->init) { + retval = sub_info->init(sub_info, new); + if (retval) { + abort_creds(new); + goto fail; + } + } + commit_creds(new); retval = kernel_execve(sub_info->path, @@ -388,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * context in which call_usermodehelper_exec is called. */ void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info), + int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data) { diff --git a/security/keys/request_key.c b/security/keys/request_key.c index d31862e0aa1..8e319a416ee 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -71,9 +71,8 @@ EXPORT_SYMBOL(complete_request_key); * This is called in context of freshly forked kthread before kernel_execve(), * so we can simply install the desired session_keyring at this point. */ -static int umh_keys_init(struct subprocess_info *info) +static int umh_keys_init(struct subprocess_info *info, struct cred *cred) { - struct cred *cred = (struct cred*)current_cred(); struct key *keyring = info->data; return install_session_keyring_to_cred(cred, keyring); -- cgit v1.2.3-70-g09d2 From 7585717f304f5ed005cc4ad933a69aab3efbd136 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 13 Jun 2011 20:00:16 -0400 Subject: Btrfs: fix relocation races The recent commit to get rid of our trans_mutex introduced some races with block group relocation. The problem is that relocation needs to do some record keeping about each root, and it was relying on the transaction mutex to coordinate things in subtle ways. This fix adds a mutex just for the relocation code and makes sure it doesn't have a big impact on normal operations. The race is really fixed in btrfs_record_root_in_trans, which is where we step back and wait for the relocation code to finish accounting setup. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 14 ++++++++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/relocation.c | 30 ++++++++++++++------- fs/btrfs/transaction.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 105 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8490ee06370..a2c91a102b7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -967,6 +967,12 @@ struct btrfs_fs_info { struct srcu_struct subvol_srcu; spinlock_t trans_lock; + /* + * the reloc mutex goes with the trans lock, it is taken + * during commit to protect us from the relocation code + */ + struct mutex reloc_mutex; + struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -1172,6 +1178,14 @@ struct btrfs_root { u32 type; u64 highest_objectid; + + /* btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So in_trans_setup + * is used to tell us when more checks are required + */ + unsigned long in_trans_setup; int ref_cows; int track_dirty; int in_radix; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 20c111b3fa0..0b2b4b75913 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1620,6 +1620,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f25b10a22a0..086b1e6b861 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1368,7 +1368,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int ret; if (!root->reloc_root) - return 0; + goto out; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -1390,6 +1390,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, root->fs_info->tree_root, &reloc_root->root_key, root_item); BUG_ON(ret); + +out: return 0; } @@ -2142,10 +2144,11 @@ int prepare_to_merge(struct reloc_control *rc, int err) u64 num_bytes = 0; int ret; - spin_lock(&root->fs_info->trans_lock); + mutex_lock(&root->fs_info->reloc_mutex); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; rc->merging_rsv_size += rc->nodes_relocated * 2; - spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); + again: if (!err) { num_bytes = rc->merging_rsv_size; @@ -2214,9 +2217,16 @@ int merge_reloc_roots(struct reloc_control *rc) int ret; again: root = rc->extent_root; - spin_lock(&root->fs_info->trans_lock); + + /* + * this serializes us with btrfs_record_root_in_transaction, + * we have to make sure nobody is in the middle of + * adding their roots to the list while we are + * doing this splice + */ + mutex_lock(&root->fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); - spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); while (!list_empty(&reloc_roots)) { found = 1; @@ -3590,17 +3600,19 @@ next: static void set_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - spin_lock(&fs_info->trans_lock); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = rc; - spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); } static void unset_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - spin_lock(&fs_info->trans_lock); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = NULL; - spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); } static int check_extent_flags(u64 flags) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b3590b9fe9..833996a0c62 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -126,28 +126,85 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, +static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); + /* + * see below for in_trans_setup usage rules + * we have the reloc mutex held now, so there + * is only one writer in this function + */ + root->in_trans_setup = 1; + + /* make sure readers find in_trans_setup before + * they find our root->last_trans update + */ + smp_wmb(); + spin_lock(&root->fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid) { spin_unlock(&root->fs_info->fs_roots_radix_lock); return 0; } - root->last_trans = trans->transid; radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); spin_unlock(&root->fs_info->fs_roots_radix_lock); + root->last_trans = trans->transid; + + /* this is pretty tricky. We don't want to + * take the relocation lock in btrfs_record_root_in_trans + * unless we're really doing the first setup for this root in + * this transaction. + * + * Normally we'd use root->last_trans as a flag to decide + * if we want to take the expensive mutex. + * + * But, we have to set root->last_trans before we + * init the relocation root, otherwise, we trip over warnings + * in ctree.c. The solution used here is to flag ourselves + * with root->in_trans_setup. When this is 1, we're still + * fixing up the reloc trees and everyone must wait. + * + * When this is zero, they can trust root->last_trans and fly + * through btrfs_record_root_in_trans without having to take the + * lock. smp_wmb() makes sure that all the writes above are + * done before we pop in the zero below + */ btrfs_init_reloc_root(trans, root); + smp_wmb(); + root->in_trans_setup = 0; } return 0; } + +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; + + /* + * see record_root_in_trans for comments about in_trans_setup usage + * and barriers + */ + smp_rmb(); + if (root->last_trans == trans->transid && + !root->in_trans_setup) + return 0; + + mutex_lock(&root->fs_info->reloc_mutex); + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->reloc_mutex); + + return 0; +} + /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -882,7 +939,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent = dget_parent(dentry); parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; - btrfs_record_root_in_trans(trans, parent_root); + record_root_in_trans(trans, parent_root); /* * insert the directory item @@ -900,7 +957,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); - btrfs_record_root_in_trans(trans, root); + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1247,6 +1304,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); + /* + * the reloc mutex makes sure that we stop + * the balancing code from coming in and moving + * extents around in the middle of the commit + */ + mutex_lock(&root->fs_info->reloc_mutex); + ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); @@ -1312,6 +1376,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, root->fs_info->running_transaction = NULL; root->fs_info->trans_no_join = 0; spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); wake_up(&root->fs_info->transaction_wait); -- cgit v1.2.3-70-g09d2 From 3ed4498caf381a73d6259d3ffacc914b17a507ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 13 Jun 2011 17:54:22 +0000 Subject: btrfs: fix dereference of ERR_PTR value smatch reports: btrfs_recover_log_trees error: 'wc.replay_dest' dereferencing possible ERR_PTR() Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 592396c6dc4..4ce8a9f41d1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3177,7 +3177,7 @@ again: tmp_key.offset = (u64)-1; wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); - BUG_ON(!wc.replay_dest); + BUG_ON(IS_ERR_OR_NULL(wc.replay_dest)); wc.replay_dest->log_root = log; btrfs_record_root_in_trans(trans, wc.replay_dest); -- cgit v1.2.3-70-g09d2 From 9fe6a50fb764f508dd2de47a66e62e51388791fb Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 16 Jun 2011 09:04:57 +0000 Subject: btrfs: Remove unused sysfs code Removes code no longer used. The sysfs file itself is kept, because the btrfs developers expressed interest in putting new entries to sysfs. Signed-off-by: Maarten Lankhorst Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 1 - fs/btrfs/sysfs.c | 146 ----------------------------------------------------- 3 files changed, 148 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a2c91a102b7..8e948ec1ee6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1195,7 +1195,6 @@ struct btrfs_root { struct btrfs_key defrag_max; int defrag_running; char *name; - int in_sysfs; /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0b2b4b75913..c25ef5a0ccd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1044,7 +1044,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->last_trans = 0; root->highest_objectid = 0; root->name = NULL; - root->in_sysfs = 0; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c3c223ae669..daac9ae6d73 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -28,152 +28,6 @@ #include "disk-io.h" #include "transaction.h" -static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_used(&root->root_item)); -} - -static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_limit(&root->root_item)); -} - -static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) -{ - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); -} - -static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); -} - -static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); -} - -/* this is for root attrs (subvols/snapshots) */ -struct btrfs_root_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_root *, char *); - ssize_t (*store)(struct btrfs_root *, const char *, size_t); -}; - -#define ROOT_ATTR(name, mode, show, store) \ -static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ - show, store) - -ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); -ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); - -static struct attribute *btrfs_root_attrs[] = { - &btrfs_root_attr_blocks_used.attr, - &btrfs_root_attr_block_limit.attr, - NULL, -}; - -/* this is for super attrs (actual full fs) */ -struct btrfs_super_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_fs_info *, char *); - ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); -}; - -#define SUPER_ATTR(name, mode, show, store) \ -static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ - show, store) - -SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); -SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); -SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); - -static struct attribute *btrfs_super_attrs[] = { - &btrfs_super_attr_blocks_used.attr, - &btrfs_super_attr_total_blocks.attr, - &btrfs_super_attr_blocksize.attr, - NULL, -}; - -static ssize_t btrfs_super_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); - - return a->show ? a->show(fs, buf) : 0; -} - -static ssize_t btrfs_super_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); - - return a->store ? a->store(fs, buf, len) : 0; -} - -static ssize_t btrfs_root_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - - return a->show ? a->show(root, buf) : 0; -} - -static ssize_t btrfs_root_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - return a->store ? a->store(root, buf, len) : 0; -} - -static void btrfs_super_release(struct kobject *kobj) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - complete(&fs->kobj_unregister); -} - -static void btrfs_root_release(struct kobject *kobj) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - complete(&root->kobj_unregister); -} - -static const struct sysfs_ops btrfs_super_attr_ops = { - .show = btrfs_super_attr_show, - .store = btrfs_super_attr_store, -}; - -static const struct sysfs_ops btrfs_root_attr_ops = { - .show = btrfs_root_attr_show, - .store = btrfs_root_attr_store, -}; - /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; -- cgit v1.2.3-70-g09d2 From 19fd294957e426bfdd8e19085096467ec18df5c4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jun 2011 10:47:30 +0000 Subject: btrfs: fix wrong reservation when doing delayed inode operations We have migrated the space for the delayed inode items from trans_block_rsv to global_block_rsv, but we forgot to set trans->block_rsv to global_block_rsv when we doing delayed inode operations, and the following Oops happened: [ 9792.654889] ------------[ cut here ]------------ [ 9792.654898] WARNING: at fs/btrfs/extent-tree.c:5681 btrfs_alloc_free_block+0xca/0x27c [btrfs]() [ 9792.654899] Hardware name: To Be Filled By O.E.M. [ 9792.654900] Modules linked in: btrfs zlib_deflate libcrc32c ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables arc4 rt61pci rt2x00pci rt2x00lib snd_hda_codec_hdmi mac80211 snd_hda_codec_realtek cfg80211 snd_hda_intel edac_core snd_seq rfkill pcspkr serio_raw snd_hda_codec eeprom_93cx6 edac_mce_amd sp5100_tco i2c_piix4 k10temp snd_hwdep snd_seq_device snd_pcm floppy r8169 xhci_hcd mii snd_timer snd soundcore snd_page_alloc ipv6 firewire_ohci pata_acpi ata_generic firewire_core pata_via crc_itu_t radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core [last unloaded: scsi_wait_scan] [ 9792.654919] Pid: 2762, comm: rm Tainted: G W 2.6.39+ #1 [ 9792.654920] Call Trace: [ 9792.654922] [] warn_slowpath_common+0x83/0x9b [ 9792.654925] [] warn_slowpath_null+0x1a/0x1c [ 9792.654933] [] btrfs_alloc_free_block+0xca/0x27c [btrfs] [ 9792.654945] [] ? map_extent_buffer+0x6e/0xa8 [btrfs] [ 9792.654953] [] __btrfs_cow_block+0xfc/0x30c [btrfs] [ 9792.654963] [] ? btrfs_buffer_uptodate+0x47/0x58 [btrfs] [ 9792.654970] [] ? read_block_for_search+0x94/0x368 [btrfs] [ 9792.654978] [] btrfs_cow_block+0xfe/0x146 [btrfs] [ 9792.654986] [] btrfs_search_slot+0x14d/0x4b6 [btrfs] [ 9792.654997] [] ? map_extent_buffer+0x6e/0xa8 [btrfs] [ 9792.655022] [] btrfs_lookup_inode+0x2f/0x8f [btrfs] [ 9792.655025] [] ? _cond_resched+0xe/0x22 [ 9792.655027] [] ? mutex_lock+0x29/0x50 [ 9792.655039] [] btrfs_update_delayed_inode+0x72/0x137 [btrfs] [ 9792.655051] [] btrfs_run_delayed_items+0x90/0xdb [btrfs] [ 9792.655062] [] btrfs_commit_transaction+0x228/0x654 [btrfs] [ 9792.655064] [] ? remove_wait_queue+0x3a/0x3a [ 9792.655075] [] btrfs_evict_inode+0x14d/0x202 [btrfs] [ 9792.655077] [] evict+0x71/0x111 [ 9792.655079] [] iput+0x12a/0x132 [ 9792.655081] [] do_unlinkat+0x106/0x155 [ 9792.655083] [] ? path_put+0x1f/0x23 [ 9792.655085] [] ? audit_syscall_entry+0x145/0x171 [ 9792.655087] [] ? putname+0x34/0x36 [ 9792.655090] [] sys_unlinkat+0x29/0x2b [ 9792.655092] [] system_call_fastpath+0x16/0x1b [ 9792.655093] ---[ end trace 02b696eb02b3f768 ]--- This patch fix it by setting the reservation of the transaction handle to the correct one. Reported-by: Josef Bacik Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 25 ++++++++++++++++++++----- fs/btrfs/delayed-inode.h | 1 - 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6462c29d2d3..fc515b787e8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -297,7 +297,6 @@ struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) item->data_len = data_len; item->ins_or_del = 0; item->bytes_reserved = 0; - item->block_rsv = NULL; item->delayed_node = NULL; atomic_set(&item->refs, 1); } @@ -593,10 +592,8 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) { + if (!ret) item->bytes_reserved = num_bytes; - item->block_rsv = dst_rsv; - } return ret; } @@ -604,10 +601,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, struct btrfs_delayed_item *item) { + struct btrfs_block_rsv *rsv; + if (!item->bytes_reserved) return; - btrfs_block_rsv_release(root, item->block_rsv, + rsv = &root->fs_info->global_block_rsv; + btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -1014,6 +1014,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret = 0; path = btrfs_alloc_path(); @@ -1021,6 +1022,9 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + delayed_root = btrfs_get_delayed_root(root); curr_node = btrfs_first_delayed_node(delayed_root); @@ -1045,6 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, } btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1052,6 +1057,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret; path = btrfs_alloc_path(); @@ -1059,6 +1065,9 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &node->root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, node->root, node); @@ -1066,6 +1075,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = btrfs_update_delayed_inode(trans, node->root, path, node); btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1116,6 +1126,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; unsigned long nr = 0; int need_requeue = 0; int ret; @@ -1134,6 +1145,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) if (IS_ERR(trans)) goto free_path; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, root, @@ -1176,6 +1190,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) nr = trans->blocks_used; + trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); __btrfs_btree_balance_dirty(root, nr); free_path: diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index eb7d240aa64..cb79b6771e8 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -75,7 +75,6 @@ struct btrfs_delayed_item { struct list_head tree_list; /* used for batch insert/delete items */ struct list_head readdir_list; /* used for readdir items */ u64 bytes_reserved; - struct btrfs_block_rsv *block_rsv; struct btrfs_delayed_node *delayed_node; atomic_t refs; int ins_or_del; -- cgit v1.2.3-70-g09d2 From 35a30d7ce54e087d8025a725d4e5a2fdee723a9f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 13 Jun 2011 15:18:23 +0000 Subject: btrfs: fix uninitialized return value When allocation fails in btrfs_read_fs_root_no_name, ret is not set although it is returned, holding a garbage value. Signed-off-by: David Sterba Reviewed-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c25ef5a0ccd..1ac8db5dc0a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1299,12 +1299,12 @@ again: return root; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - if (!root->free_ino_ctl) - goto fail; root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), GFP_NOFS); - if (!root->free_ino_pinned) + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; goto fail; + } btrfs_init_free_ino_ctl(root); mutex_init(&root->fs_commit_mutex); -- cgit v1.2.3-70-g09d2 From e999376f094162aa425ae749aa1df95ab928d010 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 17 Jun 2011 16:14:09 -0400 Subject: Btrfs: avoid delayed metadata items during commits Snapshot creation has two phases. One is the initial snapshot setup, and the second is done during commit, while nobody is allowed to modify the root we are snapshotting. The delayed metadata insertion code can break that rule, it does a delayed inode update on the inode of the parent of the snapshot, and delayed directory item insertion. This makes sure to run the pending delayed operations before we record the snapshot root, which avoids corruptions. Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 7 +++++++ fs/btrfs/delayed-inode.h | 4 ++++ fs/btrfs/transaction.c | 27 +++++++++++++++++---------- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fc515b787e8..f1cbd028f7b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1237,6 +1237,13 @@ again: return 0; } +void btrfs_assert_delayed_root_empty(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + delayed_root = btrfs_get_delayed_root(root); + WARN_ON(btrfs_first_delayed_node(delayed_root)); +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index cb79b6771e8..d1a6a2915c6 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -137,4 +137,8 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, /* for init */ int __init btrfs_delayed_inode_init(void); void btrfs_delayed_inode_exit(void); + +/* for debugging */ +void btrfs_assert_delayed_root_empty(struct btrfs_root *root); + #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c073d85e14f..51dcec86757 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -957,6 +957,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); + /* + * pull in the delayed directory update + * and the delayed inode item + * otherwise we corrupt the FS during + * snapshot + */ + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); @@ -1018,14 +1027,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, int ret; list_for_each_entry(pending, head, list) { - /* - * We must deal with the delayed items before creating - * snapshots, or we will create a snapthot with inconsistent - * information. - */ - ret = btrfs_run_delayed_items(trans, fs_info->fs_root); - BUG_ON(ret); - ret = create_pending_snapshot(trans, fs_info, pending); BUG_ON(ret); } @@ -1319,15 +1320,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ mutex_lock(&root->fs_info->reloc_mutex); - ret = create_pending_snapshots(trans, root->fs_info); + ret = btrfs_run_delayed_items(trans, root); BUG_ON(ret); - ret = btrfs_run_delayed_items(trans, root); + ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); + /* + * make sure none of the code above managed to slip in a + * delayed item + */ + btrfs_assert_delayed_root_empty(root); + WARN_ON(cur_trans != trans->transaction); btrfs_scrub_pause(root); -- cgit v1.2.3-70-g09d2 From c11760c6d80ab6aa20e383cf378a7287305f591c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jun 2011 10:30:03 -0700 Subject: isofs: fix bh leak in isofs_fill_super() error case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In isofs_fill_super(), when an iso_primary_descriptor is found, it is kept in pri_bh. The error cases don't properly release it. Fix it. Reported-and-tested-by: 김원석 Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 3db5ba4568f..b3cc8586984 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -974,7 +974,7 @@ out_no_inode: out_no_read: printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", __func__, s->s_id, iso_blknum, block); - goto out_freesbi; + goto out_freebh; out_bad_zone_size: printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", sbi->s_log_zone_size); @@ -989,6 +989,7 @@ out_unknown_format: out_freebh: brelse(bh); + brelse(pri_bh); out_freesbi: kfree(opt.iocharset); kfree(sbi); -- cgit v1.2.3-70-g09d2 From df18d127f4fed7a0284bcfa8d2843800cdb63b72 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 17 Jun 2011 16:25:51 -0400 Subject: pnfs-obj: No longer needed to take an extra ref at add_device Andy's last device_cache patches, already take an extra reference on the newly inserted device_id. So we can remove it from obj-io. Without this patch the device_ids are leaked. Andy's patches are not in Linus tree yet. So I'm not sure if they are scheduled for this Kernel or the next. This patch should be added as part of these. CC: Andy Adamson Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 9cf208df1f2..eb4aafa9f52 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -108,7 +108,6 @@ _dev_list_add(const struct nfs_server *nfss, de = n; } - atomic_inc(&de->id_node.ref); return de; } -- cgit v1.2.3-70-g09d2 From 105f4622104848ff1ee1f644d661bef9dec3eb27 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 7 Jun 2011 11:50:23 -0400 Subject: nfsd4: fix break_lease flags on nfsd open Thanks to Casey Bodley for pointing out that on a read open we pass 0, instead of O_RDONLY, to break_lease, with the result that a read open is treated like a write open for the purposes of lease breaking! Reported-by: Casey Bodley Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f3fb61b69a1..fd0acca5370 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -696,7 +696,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor } #endif /* CONFIG_NFSD_V3 */ +static int nfsd_open_break_lease(struct inode *inode, int access) +{ + unsigned int mode; + if (access & NFSD_MAY_NOT_BREAK_LEASE) + return 0; + mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY; + return break_lease(inode, mode | O_NONBLOCK); +} /* * Open an existing file or directory. @@ -744,12 +752,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!inode->i_fop) goto out; - /* - * Check to see if there are any leases on this file. - * This may block while leases are broken. - */ - if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) - host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); + host_err = nfsd_open_break_lease(inode, access); if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; -- cgit v1.2.3-70-g09d2 From 185bf87393afe6b966881e36c459949d90930a7a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 20 Jun 2011 10:10:24 +0300 Subject: ubifs: dereferencing an ERR_PTR in ubifs_mount() d251ed271d5 "ubifs: fix sget races" left out the goto from this error path so the static checkers complain that we're dereferencing "sb" when it's an ERR_PTR. Signed-off-by: Dan Carpenter Signed-off-by: Al Viro --- fs/ubifs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 8c892c2d530..529be058202 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2146,6 +2146,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); + goto out_close; } if (sb->s_root) { -- cgit v1.2.3-70-g09d2 From 1712c20dae7b770b62b2e3272100b3b40af0157c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 19:59:04 -0400 Subject: bad_inode_permission() is safe from RCU mode return -EIO; is *not* a blocking operation, thank you very much. Nick, what the hell have you been smoking? Signed-off-by: Al Viro --- fs/bad_inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9ad2369d9e3..bfcb18feb1d 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -231,9 +231,6 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - return -EIO; } -- cgit v1.2.3-70-g09d2 From ec12781f192568f7ea860f440f890389ba393df7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:03:36 -0400 Subject: cifs_permission() doesn't need to bail out in RCU mode nothing potentially blocking except generic_permission(), which will DTRT Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e9def996e38..2f0c58646c1 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -257,9 +257,6 @@ static int cifs_permission(struct inode *inode, int mask, unsigned int flags) { struct cifs_sb_info *cifs_sb; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - cifs_sb = CIFS_SB(inode->i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { -- cgit v1.2.3-70-g09d2 From 6b419951f1e44c8a46fccfc6551eca9a9980acd6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:11:43 -0400 Subject: coda_ioctl_permission() is safe in RCU mode return (mask & MAY_EXEC) ? -EACCES : 0; is non-blocking... Signed-off-by: Al Viro --- fs/coda/pioctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 6cbb3afb36d..cb140ef293e 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -43,8 +43,6 @@ const struct file_operations coda_ioctl_operations = { /* the coda pioctl inode ops */ static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; return (mask & MAY_EXEC) ? -EACCES : 0; } -- cgit v1.2.3-70-g09d2 From a63ab94d67879bc0630ea9821c582ddf58ba5527 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:17:22 -0400 Subject: logfs doesn't need ->permission() at all ... and never did, what with its ->permission() being what we do by default when ->permission is NULL... Signed-off-by: Al Viro --- fs/logfs/dir.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 9ed89d1663f..1afae26cf23 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -555,13 +555,6 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, return __logfs_create(dir, dentry, inode, target, destlen); } -static int logfs_permission(struct inode *inode, int mask, unsigned int flags) -{ - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - return generic_permission(inode, mask, flags, NULL); -} - static int logfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { @@ -820,7 +813,6 @@ const struct inode_operations logfs_dir_iops = { .mknod = logfs_mknod, .rename = logfs_rename, .rmdir = logfs_rmdir, - .permission = logfs_permission, .symlink = logfs_symlink, .unlink = logfs_unlink, }; -- cgit v1.2.3-70-g09d2 From 730e908f3539066d4aa01f4720ebfc750ce4d045 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:21:44 -0400 Subject: nilfs2_permission() doesn't need to bail out in RCU mode Nothing blocking except for generic_permission(). Which will DTRT. Signed-off-by: Al Viro --- fs/nilfs2/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b954878ad6c..b9b45fc2903 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -801,12 +801,7 @@ out_err: int nilfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct nilfs_root *root; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - - root = NILFS_I(inode)->i_root; + struct nilfs_root *root = NILFS_I(inode)->i_root; if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ -- cgit v1.2.3-70-g09d2 From cf1279111686d9742cbc4145bc9d526c83f59fea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:35:23 -0400 Subject: proc_fd_permission() is doesn't need to bail out in RCU mode nothing blocking except generic_permission() Signed-off-by: Al Viro --- fs/proc/base.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 14def991d9d..8a84210ca08 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2169,11 +2169,7 @@ static const struct file_operations proc_fd_operations = { */ static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) { - int rv; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - rv = generic_permission(inode, mask, flags, NULL); + int rv = generic_permission(inode, mask, flags, NULL); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode)) -- cgit v1.2.3-70-g09d2 From 1d29b5a2ed7eb8862c9b66daf475f3e4c1a40299 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:37:33 -0400 Subject: reiserfs_permission() doesn't need to bail out in RCU mode nothing blocking other than generic_permission() (and check_acl callback does bail out in RCU mode). Signed-off-by: Al Viro --- fs/reiserfs/xattr.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e8a62f41b45..d7808969096 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -954,8 +954,6 @@ static int xattr_mount_check(struct super_block *s) int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; /* * We don't do permission checks on the internal objects. * Permissions are determined by the "owning" object. -- cgit v1.2.3-70-g09d2 From 1aec7036d0c2996c86ce483ca0a28f3b20807b43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:42:00 -0400 Subject: proc_sys_permission() is OK in RCU mode nothing blocking there, since all instances of sysctl ->permissions() method are non-blocking - both of them, that is. Signed-off-by: Al Viro --- fs/proc/proc_sysctl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f50133c11c2..d167de365a8 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -304,9 +304,6 @@ static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) struct ctl_table *table; int error; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; -- cgit v1.2.3-70-g09d2 From 6291176bcd71a2766a19a10cbd9bab07d289e1d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 19:20:48 -0400 Subject: kill obsolete comment for follow_down() Signed-off-by: Al Viro --- fs/namei.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9e425e7e6c8..975c40620fe 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1011,9 +1011,6 @@ failed: * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. - * - * Care must be taken as namespace_sem may be held (indicated by mounting_here - * being true). */ int follow_down(struct path *path) { -- cgit v1.2.3-70-g09d2 From 8e833fd2e1f0107ee7a4b6bc4de3c9f0e9b0ed41 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jun 2011 01:56:53 -0400 Subject: fix comment in generic_permission() CAP_DAC_OVERRIDE is enough for MAY_EXEC on directory, even if no exec bits are set. Signed-off-by: Al Viro --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 975c40620fe..0223c41fb11 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -238,7 +238,8 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags, /* * Read/write DACs are always overridable. - * Executable DACs are overridable if at least one exec bit is set. + * Executable DACs are overridable for all directories and + * for non-directories that have least one exec bit set. */ if (!(mask & MAY_EXEC) || execute_ok(inode)) if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) -- cgit v1.2.3-70-g09d2 From 206b6310fd0268a6ca50cf36f03b0f4eee5602ec Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Mon, 20 Jun 2011 10:30:04 -0500 Subject: jfs: old_agsize should be 64 bits in jfs_extendfs Signed-off-by: Dave Kleikamp --- fs/jfs/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 8ea5efb5a34..8d0c1c7c082 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -80,7 +80,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) int log_formatted = 0; struct inode *iplist[1]; struct jfs_superblock *j_sb, *j_sb2; - uint old_agsize; + s64 old_agsize; int agsizechanged = 0; struct buffer_head *bh, *bh2; -- cgit v1.2.3-70-g09d2 From 28e0fa894cd5996d3007ce82f07226f79beb7286 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Mon, 20 Jun 2011 10:32:46 -0500 Subject: jfs: Update agstart when resizing volume A comment indicates that the IAG's agstart does not need to be updated since it will always point to a block in the same aggregate group, but jfs_fsck isn't so forgiving and reports it as an error. I'm fixing this in jfsutils as well, so either a new kernel or new utilities will be sufficient to fix the problem. Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_imap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index ed53a474016..0533e8f3d19 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -2921,10 +2921,9 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) continue; } - /* agstart that computes to the same ag is treated as same; */ agstart = le64_to_cpu(iagp->agstart); - /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */ n = agstart >> mp->db_agl2size; + iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size); /* compute backed inodes */ numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) -- cgit v1.2.3-70-g09d2 From d31b53e3cd069e02290ed8a648aa8c7618d6fe77 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Mon, 20 Jun 2011 10:53:46 -0500 Subject: JFS: Don't save agno in the inode Resizing the file system can result in an in-memory inode being remapped to a different aggregate group (AG). A cached AG number can cause problems when trying to free or allocate inodes. Instead, save the IAG's agstart address and calculate the agno when we need it. Signed-off-by: Dave Kleikamp --- fs/jfs/file.c | 6 +++--- fs/jfs/jfs_imap.c | 9 ++++----- fs/jfs/jfs_incore.h | 3 ++- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jfs/file.c b/fs/jfs/file.c index c5ce6c1d1ff..2f3f531f360 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -66,9 +66,9 @@ static int jfs_open(struct inode *inode, struct file *file) struct jfs_inode_info *ji = JFS_IP(inode); spin_lock_irq(&ji->ag_lock); if (ji->active_ag == -1) { - ji->active_ag = ji->agno; - atomic_inc( - &JFS_SBI(inode->i_sb)->bmap->db_active[ji->agno]); + struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); + ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); + atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); } spin_unlock_irq(&ji->ag_lock); } diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 0533e8f3d19..b78b2f978f0 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -397,7 +397,7 @@ int diRead(struct inode *ip) release_metapage(mp); /* set the ag for the inode */ - JFS_IP(ip)->agno = BLKTOAG(agstart, sbi); + JFS_IP(ip)->agstart = agstart; JFS_IP(ip)->active_ag = -1; return (rc); @@ -901,7 +901,7 @@ int diFree(struct inode *ip) /* get the allocation group for this ino. */ - agno = JFS_IP(ip)->agno; + agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb)); /* Lock the AG specific inode map information */ @@ -1315,12 +1315,11 @@ int diFree(struct inode *ip) static inline void diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) { - struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); struct jfs_inode_info *jfs_ip = JFS_IP(ip); ip->i_ino = (iagno << L2INOSPERIAG) + ino; jfs_ip->ixpxd = iagp->inoext[extno]; - jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); + jfs_ip->agstart = le64_to_cpu(iagp->agstart); jfs_ip->active_ag = -1; } @@ -1379,7 +1378,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) */ /* get the ag number of this iag */ - agno = JFS_IP(pip)->agno; + agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb)); if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { /* diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 1439f119ec8..5097839b770 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -50,8 +50,9 @@ struct jfs_inode_info { short btindex; /* btpage entry index*/ struct inode *ipimap; /* inode map */ unsigned long cflag; /* commit flags */ + long agstart; /* agstart of the containing IAG */ u16 bxflag; /* xflag of pseudo buffer? */ - unchar agno; /* ag number */ + unchar pad; signed char active_ag; /* ag currently allocating from */ lid_t blid; /* lid of pseudo buffer? */ lid_t atlhead; /* anonymous tlock list head */ -- cgit v1.2.3-70-g09d2 From 19345cb299e8234006c5125151ab723e851a1d24 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jun 2011 18:33:46 -0400 Subject: NFSv4.1: file layout must consider pg_bsize for coalescing Otherwise we end up overflowing the rpc buffer size on the receive end. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 6 ++++-- fs/nfs/pagelist.c | 3 ++- include/linux/nfs_page.h | 3 +++ 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 5d6f369b15d..0bafcc91c27 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -30,6 +30,7 @@ */ #include +#include #include "internal.h" #include "nfs4filelayout.h" @@ -666,8 +667,9 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, u64 p_stripe, r_stripe; u32 stripe_unit; - if (!pnfs_generic_pg_test(pgio, prev, req)) - return 0; + if (!pnfs_generic_pg_test(pgio, prev, req) || + !nfs_generic_pg_test(pgio, prev, req)) + return false; if (!pgio->pg_lseg) return 1; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7913961aff2..00985571628 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -204,7 +204,7 @@ nfs_wait_on_request(struct nfs_page *req) TASK_UNINTERRUPTIBLE); } -static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) { /* * FIXME: ideally we should be able to coalesce all requests @@ -218,6 +218,7 @@ static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_p return desc->pg_count + req->wb_bytes <= desc->pg_bsize; } +EXPORT_SYMBOL_GPL(nfs_generic_pg_test); /** * nfs_pageio_init - initialise a page io descriptor diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 3a34e80ae92..25311b3bedf 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -92,6 +92,9 @@ extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc); extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t); +extern bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, + struct nfs_page *prev, + struct nfs_page *req); extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern int nfs_set_page_tag_locked(struct nfs_page *req); -- cgit v1.2.3-70-g09d2 From ecc90462b428db2ad2ee5081c45496ed10f3a633 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Mon, 20 Jun 2011 17:53:24 -0500 Subject: jfs: agstart field must be 64 bits The previous patch added the agstart field to jfs_ip, but declared it a long. We need to make sure its 64 bits on every platform. Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_incore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 5097839b770..584a4a1a6e8 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -50,7 +50,7 @@ struct jfs_inode_info { short btindex; /* btpage entry index*/ struct inode *ipimap; /* inode map */ unsigned long cflag; /* commit flags */ - long agstart; /* agstart of the containing IAG */ + u64 agstart; /* agstart of the containing IAG */ u16 bxflag; /* xflag of pseudo buffer? */ unchar pad; signed char active_ag; /* ag currently allocating from */ -- cgit v1.2.3-70-g09d2 From 8f7d5efbef8718a774ac5e347b4ec069f17fd9b4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 10 Jun 2011 13:30:22 -0400 Subject: NFSv4.1: Fix some issues with pnfs_generic_pg_test 1. If the intention is to coalesce requests 'prev' and 'req' then we have to ensure at least that we have a layout starting at req_offset(prev). 2. If we're only requesting a minimal layout of length desc->pg_count, we need to test the length actually returned by the server before we allow the coalescing to occur. 3. We need to deal correctly with (pgio->lseg == NULL) 4. Fixup the test guarding the pnfs_update_layout. Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 3 +++ fs/nfs/pnfs.c | 12 +++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index eb4aafa9f52..8ff2ea3f10e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -1000,6 +1000,9 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, if (!pnfs_generic_pg_test(pgio, prev, req)) return false; + if (pgio->pg_lseg == NULL) + return true; + return pgio->pg_count + req->wb_bytes <= OBJIO_LSEG(pgio->pg_lseg)->max_io_size; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 539b94cb6c0..0f42e02436e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1064,19 +1064,21 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, gfp_flags = GFP_NOFS; } - if (pgio->pg_count == prev->wb_bytes) { + if (pgio->pg_lseg == NULL) { + if (pgio->pg_count != prev->wb_bytes) + return true; /* This is first coelesce call for a series of nfs_pages */ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, prev->wb_context, - req_offset(req), + req_offset(prev), pgio->pg_count, access_type, gfp_flags); - return true; + if (pgio->pg_lseg == NULL) + return true; } - if (pgio->pg_lseg && - req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset, + if (req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset, pgio->pg_lseg->pls_range.length)) return false; -- cgit v1.2.3-70-g09d2 From 19982ba8562e33083cb5bbb59a74855d8a9624ea Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 10 Jun 2011 13:30:23 -0400 Subject: NFSv4.1: Fix an off-by-one error in pnfs_generic_pg_test And document what is going on there... Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0f42e02436e..29c0ca7fc34 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1078,11 +1078,22 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return true; } - if (req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset, - pgio->pg_lseg->pls_range.length)) - return false; - - return true; + /* + * Test if a nfs_page is fully contained in the pnfs_layout_range. + * Note that this test makes several assumptions: + * - that the previous nfs_page in the struct nfs_pageio_descriptor + * is known to lie within the range. + * - that the nfs_page being tested is known to be contiguous with the + * previous nfs_page. + * - Layout ranges are page aligned, so we only have to test the + * start offset of the request. + * + * Please also note that 'end_offset' is actually the offset of the + * first byte that lies outside the pnfs_layout_range. FIXME? + * + */ + return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); -- cgit v1.2.3-70-g09d2 From 1650add23578b5ca35c1f1e863987180a8c03779 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 2 Jun 2011 15:07:35 -0400 Subject: NFS: Fix decode_secinfo_maxsz I initially did the calculation in bytes, and not words Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5db44a8dddf..6870bc61cee 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -255,7 +255,7 @@ static int nfs4_stat_to_errno(int); #define decode_fs_locations_maxsz \ (0) #define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) -#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 4 + (NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN))) +#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) #if defined(CONFIG_NFS_V4_1) #define NFS4_MAX_MACHINE_NAME_LEN (64) -- cgit v1.2.3-70-g09d2 From 446b23a75804d7ffa4cca2d4d8f0afb822108c7e Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 20 Jun 2011 12:33:16 +0400 Subject: CIFS: Fix problem with 3.0-rc1 null user mount failure Figured it out: it was broken by b946845a9dc523c759cae2b6a0f6827486c3221a commit - "cifs: cifs_parse_mount_options: do not tokenize mount options in-place". So, as a quick fix I suggest to apply this patch. [PATCH] CIFS: Fix kfree() with constant string in a null user case Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 12cf72dd0c4..19fdbda7aa9 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2937,7 +2937,11 @@ int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, if (volume_info->nullauth) { cFYI(1, "null user"); - volume_info->username = ""; + volume_info->username = kzalloc(1, GFP_KERNEL); + if (volume_info->username == NULL) { + rc = -ENOMEM; + goto out; + } } else if (volume_info->username) { /* BB fixme parse for domain name here */ cFYI(1, "Username: %s", volume_info->username); -- cgit v1.2.3-70-g09d2 From 1190f6a067bf27b2ee7e06ec0776a17fe0f6c4d8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 22 Jun 2011 17:33:57 -0400 Subject: cifs: fix wsize negotiation to respect max buffer size and active signing (try #4) Hopefully last version. Base signing check on CAP_UNIX instead of tcon->unix_ext, also clean up the comments a bit more. According to Hongwei Sun's blog posting here: http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx CAP_LARGE_WRITEX is ignored when signing is active. Also, the maximum size for a write without CAP_LARGE_WRITEX should be the maxBuf that the server sent in the NEGOTIATE request. Fix the wsize negotiation to take this into account. While we're at it, alter the other wsize definitions to use sizeof(WRITE_REQ) to allow for slightly larger amounts of data to potentially be written per request. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 19fdbda7aa9..c761935eab8 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2750,21 +2750,21 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, /* * When the server supports very large writes via POSIX extensions, we can - * allow up to 2^24 - PAGE_CACHE_SIZE. + * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including + * the RFC1001 length. * * Note that this might make for "interesting" allocation problems during - * writeback however (as we have to allocate an array of pointers for the - * pages). A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * writeback however as we have to allocate an array of pointers for the + * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. */ -#define CIFS_MAX_WSIZE ((1<<24) - PAGE_CACHE_SIZE) +#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) /* - * When the server doesn't allow large posix writes, default to a wsize of - * 128k - PAGE_CACHE_SIZE -- one page less than the largest frame size - * described in RFC1001. This allows space for the header without going over - * that by default. + * When the server doesn't allow large posix writes, only allow a wsize of + * 128k minus the size of the WRITE_AND_X header. That allows for a write up + * to the maximum size described by RFC1002. */ -#define CIFS_MAX_RFC1001_WSIZE (128 * 1024 - PAGE_CACHE_SIZE) +#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 @@ -2783,11 +2783,18 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) /* can server support 24-bit write sizes? (via UNIX extensions) */ if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) - wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1001_WSIZE); + wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE); - /* no CAP_LARGE_WRITE_X? Limit it to 16 bits */ - if (!(server->capabilities & CAP_LARGE_WRITE_X)) - wsize = min_t(unsigned int, wsize, USHRT_MAX); + /* + * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set? + * Limit it to max buffer offered by the server, minus the size of the + * WRITEX header, not including the 4 byte RFC1001 length. + */ + if (!(server->capabilities & CAP_LARGE_WRITE_X) || + (!(server->capabilities & CAP_UNIX) && + (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)))) + wsize = min_t(unsigned int, wsize, + server->maxBuf - sizeof(WRITE_REQ) + 4); /* hard limit of CIFS_MAX_WSIZE */ wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE); -- cgit v1.2.3-70-g09d2 From 778e24bb6dd8682318bb496d4bfdc32b501a6420 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 23 Jun 2011 01:34:59 +0000 Subject: xfs: reset inode per-lifetime state when recycling it XFS inodes has several per-lifetime state fields that determine the behaviour of the inode. These state fields are not all reset when an inode is reused from the reclaimable state. This can lead to unexpected behaviour of the new inode such as speculative preallocation not being truncated away in the expected manner for local files until the inode is subsequently truncated, freed or cycles out of the cache. It can also lead to an inode being considered to be a filestream inode or having been truncated when that is not the case. Rework the reinitialisation of the inode when it is recycled to ensure that it is pristine before it is reused. While there, also fix the resetting of state flags in the recycling error paths so the inode does not become unreclaimable. Signed-off-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_iget.c | 13 +++++++++---- fs/xfs/xfs_inode.h | 10 ++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index cb9b6d1469f..3631783b2b5 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -253,16 +253,21 @@ xfs_iget_cache_hit( rcu_read_lock(); spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~XFS_INEW; - ip->i_flags |= XFS_IRECLAIMABLE; - __xfs_inode_set_reclaim_tag(pag, ip); + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); trace_xfs_iget_reclaim_fail(ip); goto out_error; } spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; ip->i_flags |= XFS_INEW; __xfs_inode_clear_reclaim_tag(mp, pag, ip); inode->i_state = I_NEW; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3ae6d58e547..964cfea7768 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -383,6 +383,16 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ +/* + * Per-lifetime flags need to be reset when re-using a reclaimable inode during + * inode lookup. Thi prevents unintended behaviour on the new inode from + * ocurring. + */ +#define XFS_IRECLAIM_RESET_FLAGS \ + (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \ + XFS_IFILESTREAM); + /* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) -- cgit v1.2.3-70-g09d2 From df4368a146d2b350b8398babfe11e2088f741d67 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 23 Jun 2011 01:35:00 +0000 Subject: xfs: clear XFS_IDIRTY_RELEASE on truncate down When an inode is truncated down, speculative preallocation is removed from the inode. This should also reset the state bits for controlling whether preallocation is subsequently removed when the file is next closed. The flag is not being cleared, so repeated operations on a file that first involve a truncate (e.g. multiple repeated dd invocations on a file) give different file layouts for the second and subsequent invocations. Fix this by clearing the XFS_IDIRTY_RELEASE state bit when the XFS_ITRUNCATED bit is detected in xfs_release() and hence ensure that speculative delalloc is removed on files that have been truncated down. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_vnodeops.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index b7a5fe7c52c..619720705bc 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -960,8 +960,11 @@ xfs_release( * be exposed to that problem. */ truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); - if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) - xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); + if (truncated) { + xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) + xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); + } } if (ip->i_d.di_nlink == 0) -- cgit v1.2.3-70-g09d2 From 4a33821236f2ef3af0081e8a5eec1301cbed3125 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 23 Jun 2011 01:35:01 +0000 Subject: xfs: prevent bogus assert when trying to remove non-existent attribute If the attribute fork on an inode is in btree format and has multiple levels (i.e node format rather than leaf format), then a lookup failure will trigger an assert failure in xfs_da_path_shift if the flag XFS_DA_OP_OKNOENT is not set. This flag is used to indicate to the directory btree code that not finding an entry is not a fatal error. In the case of doing a lookup for a directory name removal, this is valid as a user cannot insert an arbitrary name to remove from the directory btree. However, in the case of the attribute tree, a user has direct control over the attribute name and can ask for any random name to be removed without any validation. In this case, fsstress is asking for a non-existent user.selinux attribute to be removed, and that is causing xfs_da_path_shift() to fall off the bottom of the tree where it asserts that a lookup failure is allowed. Because the flag is not set, we die a horrible death on a debug enable kernel. Prevent this assert from firing on attribute removes by adding the op_flag XFS_DA_OP_OKNOENT to atribute removal operations. Discovered when testing on a SELinux enabled system by fsstress in test 070 by trying to remove a non-existent user.selinux attribute. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index c8637537881..01d2072fb6d 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -489,6 +489,13 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) args.total = 0; args.whichfork = XFS_ATTR_FORK; + /* + * we have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail. + */ + args.op_flags = XFS_DA_OP_OKNOENT; + /* * Attach the dquots to the inode. */ -- cgit v1.2.3-70-g09d2 From 46e4edbf7ea9cf26665eb9f90c0fc7688d1a51ed Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 23 Jun 2011 23:59:32 +0200 Subject: Remove unneeded version.h includes from fs/ It was pointed out by 'make versioncheck' that some includes of linux/version.h were not needed in fs/ (fs/btrfs/ctree.h and fs/omfs/file.c). This patch removes them. Signed-off-by: Jesper Juhl Acked-by: Bob Copeland Signed-off-by: Linus Torvalds --- fs/btrfs/ctree.h | 1 - fs/omfs/file.c | 1 - 2 files changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 300628795fd..f30ac05dbda 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -19,7 +19,6 @@ #ifndef __BTRFS_CTREE__ #define __BTRFS_CTREE__ -#include #include #include #include diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d738a7e493d..2c6d95257a4 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -4,7 +4,6 @@ * Released under GPL v2. */ -#include #include #include #include -- cgit v1.2.3-70-g09d2 From e4fb0edb7c03e5ec19b6f732f1dfbe911212dbde Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 20 Jun 2011 14:33:16 -0400 Subject: cifs: free blkcipher in smbhash This is currently leaked in the rc == 0 case. Reported-by: J. Bruce Fields Signed-off-by: Jeff Layton Reviewed-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/smbencrypt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 1525d5e662b..1c5b770c314 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -90,12 +90,10 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) sg_init_one(&sgout, out, 8); rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); - if (rc) { + if (rc) cERROR(1, "could not encrypt crypt key rc: %d\n", rc); - crypto_free_blkcipher(tfm_des); - goto smbhash_err; - } + crypto_free_blkcipher(tfm_des); smbhash_err: return rc; } -- cgit v1.2.3-70-g09d2 From 1973f0faeb4a5f35597793c65d3c94d8fd386e10 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 24 Jun 2011 13:13:29 -0400 Subject: Btrfs: make sure to record the transid in new inodes When we create a new inode, we aren't filling in the field that records the transaction that last changed this inode. If we then go to fsync that inode, it will be skipped because the field isn't filled in. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5813dec5101..87f1e0cf26f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4520,6 +4520,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_tree_add(inode); trace_btrfs_inode_new(inode); + btrfs_set_inode_last_trans(trans, inode); return inode; fail: -- cgit v1.2.3-70-g09d2 From 9b8e072a31180eb5cd6991d08524d9c4fa235ade Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 21 Jun 2011 07:18:26 -0400 Subject: cifs: mark CONFIG_CIFS_NFSD_EXPORT as BROKEN This does not work properly with CIFS as current servers do not enable support for the FILE_OPEN_BY_FILE_ID on SMB NTCreateX and not all NFS clients handle ESTALE. For now, it just plain doesn't work. Mark it BROKEN to discourage distros from enabling it. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 53ed1ad2c11..f66cc162515 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -156,6 +156,6 @@ config CIFS_ACL config CIFS_NFSD_EXPORT bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL + depends on CIFS && EXPERIMENTAL && BROKEN help Allows NFS server to export a CIFS mounted share (nfsd over cifs) -- cgit v1.2.3-70-g09d2 From dd8544661947ad6d8d87b3c9d4333bfa1583d1bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 08:24:42 -0400 Subject: take bdi setup/destruction into cifs_mount/cifs_umount Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 12 +----------- fs/cifs/connect.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2f0c58646c1..5d3c4fa4b54 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -116,18 +116,12 @@ cifs_read_super(struct super_block *sb, struct smb_vol *volume_info, spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; - rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); - if (rc) - return rc; - - cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; - rc = cifs_mount(sb, cifs_sb, volume_info, devname); if (rc) { if (!silent) cERROR(1, "cifs_mount failed w/return code = %d", rc); - goto out_mount_failed; + return rc; } sb->s_magic = CIFS_MAGIC_NUMBER; @@ -171,9 +165,6 @@ out_no_root: iput(inode); cifs_umount(sb, cifs_sb); - -out_mount_failed: - bdi_destroy(&cifs_sb->bdi); return rc; } @@ -199,7 +190,6 @@ cifs_put_super(struct super_block *sb) } unload_nls(cifs_sb->local_nls); - bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb); } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 12cf72dd0c4..78fd7557e35 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2983,6 +2983,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, struct tcon_link *tlink; #ifdef CONFIG_CIFS_DFS_UPCALL int referral_walks_count = 0; + + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + if (rc) + return rc; + + cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; + try_mount_again: /* cleanup activities if we're chasing a referral */ if (referral_walks_count) { @@ -3007,6 +3014,7 @@ try_mount_again: srvTcp = cifs_get_tcp_session(volume_info); if (IS_ERR(srvTcp)) { rc = PTR_ERR(srvTcp); + bdi_destroy(&cifs_sb->bdi); goto out; } @@ -3161,6 +3169,7 @@ mount_fail_check: cifs_put_smb_ses(pSesInfo); else cifs_put_tcp_session(srvTcp); + bdi_destroy(&cifs_sb->bdi); goto out; } @@ -3357,6 +3366,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) } spin_unlock(&cifs_sb->tlink_tree_lock); + bdi_destroy(&cifs_sb->bdi); return 0; } -- cgit v1.2.3-70-g09d2 From 6d6861757dfadb7d6aec6bb34acd471210a755f9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 08:34:57 -0400 Subject: cifs: double free on mount failure if we get to out_super with ->s_root already set (e.g. with cifs_get_root() failure), we'll end up with cifs_put_super() called and ->mountdata freed twice. We'll also get cifs_sb freed twice and cifs_sb->local_nls dropped twice. The problem is, we can get to out_super both with and without ->s_root, which makes ->put_super() a bad place for such work. Switch to ->kill_sb(), have all that work done there after kill_anon_super(). Unlike ->put_super(), ->kill_sb() is called by deactivate_locked_super() whether we have ->s_root or not. Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5d3c4fa4b54..dc76c7bccb1 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -184,11 +184,13 @@ cifs_put_super(struct super_block *sb) rc = cifs_umount(sb, cifs_sb); if (rc) cERROR(1, "cifs_umount failed with return code %d", rc); - if (cifs_sb->mountdata) { - kfree(cifs_sb->mountdata); - cifs_sb->mountdata = NULL; - } +} +static void cifs_kill_sb(struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + kill_anon_super(sb); + kfree(cifs_sb->mountdata); unload_nls(cifs_sb->local_nls); kfree(cifs_sb); } @@ -729,8 +731,8 @@ out_shared: goto out; out_super: - kfree(cifs_sb->mountdata); deactivate_locked_super(sb); + goto out; out_cifs_sb: unload_nls(cifs_sb->local_nls); @@ -827,7 +829,7 @@ struct file_system_type cifs_fs_type = { .owner = THIS_MODULE, .name = "cifs", .mount = cifs_do_mount, - .kill_sb = kill_anon_super, + .kill_sb = cifs_kill_sb, /* .fs_flags */ }; const struct inode_operations cifs_dir_inode_ops = { -- cgit v1.2.3-70-g09d2 From ca171baaad1420a29cca98be5bdf5596cd70b294 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 08:49:12 -0400 Subject: cifs: don't leak nls on mount failure if cifs_sb allocation fails, we still need to drop nls we'd stashed into volume_info - the one we would've copied to cifs_sb if we could allocate the latter. Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index dc76c7bccb1..bfab2bc8372 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -672,6 +672,7 @@ cifs_do_mount(struct file_system_type *fs_type, cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); if (cifs_sb == NULL) { root = ERR_PTR(-ENOMEM); + unload_nls(volume_info->local_nls); goto out; } -- cgit v1.2.3-70-g09d2 From 2c6292ae4be00454882246d07f38cdf15a823c2a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:05:48 -0400 Subject: cifs: don't pass superblock to cifs_mount() To close sget() races we'll need to be able to set cifs_sb up before we get the superblock, so we'll want to be able to do cifs_mount() earlier. Fortunately, it's easy to do - setting ->s_maxbytes can be done in cifs_read_super(), ditto for ->s_time_gran and as for putting MS_POSIXACL into ->s_flags, we can mirror it in ->mnt_cifs_flags until cifs_read_super() is called. Kill unused 'devname' argument, while we are at it... Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifs_fs_sb.h | 1 + fs/cifs/cifsfs.c | 13 ++++++++++++- fs/cifs/cifsproto.h | 6 +++--- fs/cifs/connect.c | 28 ++++++++++------------------ 4 files changed, 26 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index ffb1459dc6e..7260e11e21f 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -42,6 +42,7 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ +#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index bfab2bc8372..8f7451f3c8e 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -116,7 +116,7 @@ cifs_read_super(struct super_block *sb, struct smb_vol *volume_info, spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; - rc = cifs_mount(sb, cifs_sb, volume_info, devname); + rc = cifs_mount(cifs_sb, volume_info); if (rc) { if (!silent) @@ -124,6 +124,17 @@ cifs_read_super(struct super_block *sb, struct smb_vol *volume_info, return rc; } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) + sb->s_flags |= MS_POSIXACL; + + if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; + + /* BB FIXME fix time_gran to be larger for LANMAN sessions */ + sb->s_time_gran = 100; + sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; sb->s_bdi = &cifs_sb->bdi; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 953f84413c7..5814fe543f9 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -157,8 +157,7 @@ extern int cifs_match_super(struct super_block *, void *); extern void cifs_cleanup_volume_info(struct smb_vol **pvolume_info); extern int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, const char *devname); -extern int cifs_mount(struct super_block *, struct cifs_sb_info *, - struct smb_vol *, const char *); +extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); extern int cifs_umount(struct super_block *, struct cifs_sb_info *); extern void cifs_dfs_release_automount_timer(void); void cifs_proc_init(void); @@ -218,7 +217,8 @@ extern int get_dfs_path(int xid, struct cifs_ses *pSesInfo, struct dfs_info3_param **preferrals, int remap); extern void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, - struct super_block *sb, struct smb_vol *vol); + struct cifs_sb_info *cifs_sb, + struct smb_vol *vol); extern int CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData); extern int SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 78fd7557e35..3011ac8c924 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2546,7 +2546,7 @@ ip_connect(struct TCP_Server_Info *server) } void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, - struct super_block *sb, struct smb_vol *vol_info) + struct cifs_sb_info *cifs_sb, struct smb_vol *vol_info) { /* if we are reconnecting then should we check to see if * any requested capabilities changed locally e.g. via @@ -2600,22 +2600,23 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, cap &= ~CIFS_UNIX_POSIX_ACL_CAP; else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { cFYI(1, "negotiated posix acl support"); - if (sb) - sb->s_flags |= MS_POSIXACL; + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= + CIFS_MOUNT_POSIXACL; } if (vol_info && vol_info->posix_paths == 0) cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { cFYI(1, "negotiate posix pathnames"); - if (sb) - CIFS_SB(sb)->mnt_cifs_flags |= + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; } - if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) { + if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { - CIFS_SB(sb)->rsize = 127 * 1024; + cifs_sb->rsize = 127 * 1024; cFYI(DBG2, "larger reads not supported by srv"); } } @@ -2971,8 +2972,7 @@ out: } int -cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, - struct smb_vol *volume_info, const char *devname) +cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { int rc = 0; int xid; @@ -3026,14 +3026,6 @@ try_mount_again: goto mount_fail_check; } - if (pSesInfo->capabilities & CAP_LARGE_FILES) - sb->s_maxbytes = MAX_LFS_FILESIZE; - else - sb->s_maxbytes = MAX_NON_LFS; - - /* BB FIXME fix time_gran to be larger for LANMAN sessions */ - sb->s_time_gran = 100; - /* search for existing tcon to this server share */ tcon = cifs_get_tcon(pSesInfo, volume_info); if (IS_ERR(tcon)) { @@ -3046,7 +3038,7 @@ try_mount_again: if (tcon->ses->capabilities & CAP_UNIX) { /* reset of caps checks mount to see if unix extensions disabled for just this mount */ - reset_cifs_unix_caps(xid, tcon, sb, volume_info); + reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info); if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && (le64_to_cpu(tcon->fsUnixInfo.Capability) & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) { -- cgit v1.2.3-70-g09d2 From d687ca380f1a8f3043f42efd2403cbe58c846e70 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:14:27 -0400 Subject: cifs: leak on mount if we share superblock cifs_sb and nls end up leaked... Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8f7451f3c8e..4162ee45d04 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -701,6 +701,8 @@ cifs_do_mount(struct file_system_type *fs_type, if (sb->s_fs_info) { cFYI(1, "Use existing superblock"); + unload_nls(cifs_sb->local_nls); + kfree(cifs_sb); goto out_shared; } -- cgit v1.2.3-70-g09d2 From 5d3bc605cafe3f367b1c43b673bf643245c81626 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:17:28 -0400 Subject: cifs: allocate mountdata earlier pull mountdata allocation up, so that it won't stand in the way when we lift cifs_mount() to location before sget(). Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 4162ee45d04..ec19161dd27 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -687,6 +687,14 @@ cifs_do_mount(struct file_system_type *fs_type, goto out; } + cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); + if (cifs_sb->mountdata == NULL) { + root = ERR_PTR(-ENOMEM); + unload_nls(volume_info->local_nls); + kfree(cifs_sb); + goto out; + } + cifs_setup_cifs_sb(volume_info, cifs_sb); mnt_data.vol = volume_info; @@ -701,22 +709,12 @@ cifs_do_mount(struct file_system_type *fs_type, if (sb->s_fs_info) { cFYI(1, "Use existing superblock"); + kfree(cifs_sb->mountdata); unload_nls(cifs_sb->local_nls); kfree(cifs_sb); goto out_shared; } - /* - * Copy mount params for use in submounts. Better to do - * the copy here and deal with the error before cleanup gets - * complicated post-mount. - */ - cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); - if (cifs_sb->mountdata == NULL) { - root = ERR_PTR(-ENOMEM); - goto out_super; - } - sb->s_flags = flags; /* BB should we make this contingent on mount parm? */ sb->s_flags |= MS_NODIRATIME | MS_NOATIME; @@ -749,6 +747,7 @@ out_super: goto out; out_cifs_sb: + kfree(cifs_sb->mountdata); unload_nls(cifs_sb->local_nls); kfree(cifs_sb); -- cgit v1.2.3-70-g09d2 From 2ced6f693581357b2a5bf8b031a702c624b12d0d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:20:04 -0400 Subject: cifs: initialize ->tlink_tree in cifs_setup_cifs_sb() no need to wait until cifs_read_super() and we need it done by the time cifs_mount() will be called. Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 3 --- fs/cifs/connect.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ec19161dd27..61c7aa870f1 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -113,9 +113,6 @@ cifs_read_super(struct super_block *sb, struct smb_vol *volume_info, cifs_sb = CIFS_SB(sb); - spin_lock_init(&cifs_sb->tlink_tree_lock); - cifs_sb->tlink_tree = RB_ROOT; - rc = cifs_mount(cifs_sb, volume_info); if (rc) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 3011ac8c924..9f09adf51ed 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2663,6 +2663,9 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, { INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); + spin_lock_init(&cifs_sb->tlink_tree_lock); + cifs_sb->tlink_tree = RB_ROOT; + if (pvolume_info->rsize > CIFSMaxBufSize) { cERROR(1, "rsize %d too large, using MaxBufSize", pvolume_info->rsize); -- cgit v1.2.3-70-g09d2 From 2a9b99516c662d1713d58648e4a4c9aef72051bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:27:16 -0400 Subject: sanitize cifs_umount() prototype a) superblock argument is unused b) it always returns 0 Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 6 ++---- fs/cifs/cifsproto.h | 2 +- fs/cifs/connect.c | 5 ++--- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 61c7aa870f1..2af14d4577a 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -172,7 +172,7 @@ out_no_root: if (inode) iput(inode); - cifs_umount(sb, cifs_sb); + cifs_umount(cifs_sb); return rc; } @@ -189,9 +189,7 @@ cifs_put_super(struct super_block *sb) return; } - rc = cifs_umount(sb, cifs_sb); - if (rc) - cERROR(1, "cifs_umount failed with return code %d", rc); + cifs_umount(cifs_sb); } static void cifs_kill_sb(struct super_block *sb) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 5814fe543f9..257f312ede4 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -158,7 +158,7 @@ extern void cifs_cleanup_volume_info(struct smb_vol **pvolume_info); extern int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, const char *devname); extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); -extern int cifs_umount(struct super_block *, struct cifs_sb_info *); +extern void cifs_umount(struct cifs_sb_info *); extern void cifs_dfs_release_automount_timer(void); void cifs_proc_init(void); void cifs_proc_clean(void); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9f09adf51ed..b2702226634 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3339,8 +3339,8 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, return rc; } -int -cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) +void +cifs_umount(struct cifs_sb_info *cifs_sb) { struct rb_root *root = &cifs_sb->tlink_tree; struct rb_node *node; @@ -3362,7 +3362,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) spin_unlock(&cifs_sb->tlink_tree_lock); bdi_destroy(&cifs_sb->bdi); - return 0; } int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses) -- cgit v1.2.3-70-g09d2 From 97d1152acec0647b72f8c6ecc57da0d6fed574de Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:29:57 -0400 Subject: cifs: pull cifs_mount() call up ... to the point prior to sget(). Now we have cifs_sb set up early enough. Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2af14d4577a..4004bc647a7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -104,8 +104,7 @@ cifs_sb_deactive(struct super_block *sb) } static int -cifs_read_super(struct super_block *sb, struct smb_vol *volume_info, - const char *devname, int silent) +cifs_read_super(struct super_block *sb) { struct inode *inode; struct cifs_sb_info *cifs_sb; @@ -113,14 +112,6 @@ cifs_read_super(struct super_block *sb, struct smb_vol *volume_info, cifs_sb = CIFS_SB(sb); - rc = cifs_mount(cifs_sb, volume_info); - - if (rc) { - if (!silent) - cERROR(1, "cifs_mount failed w/return code = %d", rc); - return rc; - } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) sb->s_flags |= MS_POSIXACL; @@ -692,6 +683,17 @@ cifs_do_mount(struct file_system_type *fs_type, cifs_setup_cifs_sb(volume_info, cifs_sb); + rc = cifs_mount(cifs_sb, volume_info); + if (rc) { + if (!(flags & MS_SILENT)) + cERROR(1, "cifs_mount failed w/return code = %d", rc); + root = ERR_PTR(rc); + unload_nls(volume_info->local_nls); + kfree(cifs_sb->mountdata); + kfree(cifs_sb); + goto out; + } + mnt_data.vol = volume_info; mnt_data.cifs_sb = cifs_sb; mnt_data.flags = flags; @@ -699,11 +701,13 @@ cifs_do_mount(struct file_system_type *fs_type, sb = sget(fs_type, cifs_match_super, set_anon_super, &mnt_data); if (IS_ERR(sb)) { root = ERR_CAST(sb); + cifs_umount(cifs_sb); goto out_cifs_sb; } if (sb->s_fs_info) { cFYI(1, "Use existing superblock"); + cifs_umount(cifs_sb); kfree(cifs_sb->mountdata); unload_nls(cifs_sb->local_nls); kfree(cifs_sb); @@ -715,8 +719,7 @@ cifs_do_mount(struct file_system_type *fs_type, sb->s_flags |= MS_NODIRATIME | MS_NOATIME; sb->s_fs_info = cifs_sb; - rc = cifs_read_super(sb, volume_info, dev_name, - flags & MS_SILENT ? 1 : 0); + rc = cifs_read_super(sb); if (rc) { root = ERR_PTR(rc); goto out_super; -- cgit v1.2.3-70-g09d2 From 98ab494dd1d25388981114057cf9446250cc7dc7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:32:10 -0400 Subject: cifs: move cifs_umount() call into ->kill_sb() instead of calling it manually in case if cifs_read_super() fails to set ->s_root, just call it from ->kill_sb(). cifs_put_super() is gone now *and* we have cifs_sb shutdown and destruction done after the superblock is gone from ->s_instances. Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 4004bc647a7..15de4561dbc 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -163,30 +163,14 @@ out_no_root: if (inode) iput(inode); - cifs_umount(cifs_sb); return rc; } -static void -cifs_put_super(struct super_block *sb) -{ - int rc = 0; - struct cifs_sb_info *cifs_sb; - - cFYI(1, "In cifs_put_super"); - cifs_sb = CIFS_SB(sb); - if (cifs_sb == NULL) { - cFYI(1, "Empty cifs superblock info passed to unmount"); - return; - } - - cifs_umount(cifs_sb); -} - static void cifs_kill_sb(struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); kill_anon_super(sb); + cifs_umount(cifs_sb); kfree(cifs_sb->mountdata); unload_nls(cifs_sb->local_nls); kfree(cifs_sb); @@ -537,7 +521,6 @@ static int cifs_drop_inode(struct inode *inode) } static const struct super_operations cifs_super_ops = { - .put_super = cifs_put_super, .statfs = cifs_statfs, .alloc_inode = cifs_alloc_inode, .destroy_inode = cifs_destroy_inode, -- cgit v1.2.3-70-g09d2 From d757d71bfc30669a500b72792067e8d1c5d401a5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:42:43 -0400 Subject: cifs: pull freeing mountdata/dropping nls/freeing cifs_sb into cifs_umount() all callers of cifs_umount() proceed to do the same thing; pull it into cifs_umount() itself. Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 15 +-------------- fs/cifs/connect.c | 3 +++ 2 files changed, 4 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 15de4561dbc..46960b7ee43 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -171,9 +171,6 @@ static void cifs_kill_sb(struct super_block *sb) struct cifs_sb_info *cifs_sb = CIFS_SB(sb); kill_anon_super(sb); cifs_umount(cifs_sb); - kfree(cifs_sb->mountdata); - unload_nls(cifs_sb->local_nls); - kfree(cifs_sb); } static int @@ -685,15 +682,12 @@ cifs_do_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) { root = ERR_CAST(sb); cifs_umount(cifs_sb); - goto out_cifs_sb; + goto out; } if (sb->s_fs_info) { cFYI(1, "Use existing superblock"); cifs_umount(cifs_sb); - kfree(cifs_sb->mountdata); - unload_nls(cifs_sb->local_nls); - kfree(cifs_sb); goto out_shared; } @@ -725,13 +719,6 @@ out_shared: out_super: deactivate_locked_super(sb); - goto out; - -out_cifs_sb: - kfree(cifs_sb->mountdata); - unload_nls(cifs_sb->local_nls); - kfree(cifs_sb); - out: cifs_cleanup_volume_info(&volume_info); return root; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b2702226634..ca7fbe3d51a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3362,6 +3362,9 @@ cifs_umount(struct cifs_sb_info *cifs_sb) spin_unlock(&cifs_sb->tlink_tree_lock); bdi_destroy(&cifs_sb->bdi); + kfree(cifs_sb->mountdata); + unload_nls(cifs_sb->local_nls); + kfree(cifs_sb); } int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses) -- cgit v1.2.3-70-g09d2 From ee01a14d9ddcf3f832f9ceb837888501cb496e27 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:47:23 -0400 Subject: cifs: close sget() races have ->s_fs_info set by the set() callback passed to sget() Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 46960b7ee43..ba2b2da360d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -629,6 +629,13 @@ out: return dparent; } +static int cifs_set_super(struct super_block *sb, void *data) +{ + struct cifs_mnt_data *mnt_data = data; + sb->s_fs_info = mnt_data->cifs_sb; + return set_anon_super(sb, NULL); +} + static struct dentry * cifs_do_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) @@ -678,14 +685,14 @@ cifs_do_mount(struct file_system_type *fs_type, mnt_data.cifs_sb = cifs_sb; mnt_data.flags = flags; - sb = sget(fs_type, cifs_match_super, set_anon_super, &mnt_data); + sb = sget(fs_type, cifs_match_super, cifs_set_super, &mnt_data); if (IS_ERR(sb)) { root = ERR_CAST(sb); cifs_umount(cifs_sb); goto out; } - if (sb->s_fs_info) { + if (sb->s_root) { cFYI(1, "Use existing superblock"); cifs_umount(cifs_sb); goto out_shared; @@ -694,7 +701,6 @@ cifs_do_mount(struct file_system_type *fs_type, sb->s_flags = flags; /* BB should we make this contingent on mount parm? */ sb->s_flags |= MS_NODIRATIME | MS_NOATIME; - sb->s_fs_info = cifs_sb; rc = cifs_read_super(sb); if (rc) { -- cgit v1.2.3-70-g09d2 From fa18f1bdce898f0efd0c8639c901d826d01be04f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:50:44 -0400 Subject: cifs: more breakage on mount failures if cifs_get_root() fails, we end up with ->mount() returning NULL, which is not what callers expect. Moreover, in case of superblock reuse we end up leaking a superblock reference... Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ba2b2da360d..234e9d08db7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -710,19 +710,16 @@ cifs_do_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; +out_shared: root = cifs_get_root(volume_info, sb); - if (root == NULL) + if (root == NULL) { + root = ERR_PTR(-EINVAL); /* XXX */ goto out_super; + } cFYI(1, "dentry root is: %p", root); goto out; -out_shared: - root = cifs_get_root(volume_info, sb); - if (root) - cFYI(1, "dentry root is: %p", root); - goto out; - out_super: deactivate_locked_super(sb); out: -- cgit v1.2.3-70-g09d2 From 5c4f1ad7c6aa3b729bd3a93b80f9417d7e978c32 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 09:56:55 -0400 Subject: cifs: tidy cifs_do_mount() up a bit Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 234e9d08db7..9a6696a5eb7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -656,16 +656,13 @@ cifs_do_mount(struct file_system_type *fs_type, cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); if (cifs_sb == NULL) { root = ERR_PTR(-ENOMEM); - unload_nls(volume_info->local_nls); - goto out; + goto out_nls; } cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); if (cifs_sb->mountdata == NULL) { root = ERR_PTR(-ENOMEM); - unload_nls(volume_info->local_nls); - kfree(cifs_sb); - goto out; + goto out_cifs_sb; } cifs_setup_cifs_sb(volume_info, cifs_sb); @@ -675,10 +672,7 @@ cifs_do_mount(struct file_system_type *fs_type, if (!(flags & MS_SILENT)) cERROR(1, "cifs_mount failed w/return code = %d", rc); root = ERR_PTR(rc); - unload_nls(volume_info->local_nls); - kfree(cifs_sb->mountdata); - kfree(cifs_sb); - goto out; + goto out_mountdata; } mnt_data.vol = volume_info; @@ -695,22 +689,20 @@ cifs_do_mount(struct file_system_type *fs_type, if (sb->s_root) { cFYI(1, "Use existing superblock"); cifs_umount(cifs_sb); - goto out_shared; - } - - sb->s_flags = flags; - /* BB should we make this contingent on mount parm? */ - sb->s_flags |= MS_NODIRATIME | MS_NOATIME; + } else { + sb->s_flags = flags; + /* BB should we make this contingent on mount parm? */ + sb->s_flags |= MS_NODIRATIME | MS_NOATIME; + + rc = cifs_read_super(sb); + if (rc) { + root = ERR_PTR(rc); + goto out_super; + } - rc = cifs_read_super(sb); - if (rc) { - root = ERR_PTR(rc); - goto out_super; + sb->s_flags |= MS_ACTIVE; } - sb->s_flags |= MS_ACTIVE; - -out_shared: root = cifs_get_root(volume_info, sb); if (root == NULL) { root = ERR_PTR(-EINVAL); /* XXX */ @@ -725,6 +717,14 @@ out_super: out: cifs_cleanup_volume_info(&volume_info); return root; + +out_mountdata: + kfree(cifs_sb->mountdata); +out_cifs_sb: + kfree(cifs_sb); +out_nls: + unload_nls(volume_info->local_nls); + goto out; } static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, -- cgit v1.2.3-70-g09d2 From 9403c9c598e91d473c0582066e47ed2289292e45 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 10:02:59 -0400 Subject: cifs: propagate errors from cifs_get_root() to mount(2) ... instead of just failing with -EINVAL Acked-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 9a6696a5eb7..35f9154615f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -554,7 +554,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); if (full_path == NULL) - return NULL; + return ERR_PTR(-ENOMEM); cFYI(1, "Get root dentry for %s", full_path); @@ -583,7 +583,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dchild = d_alloc(dparent, &name); if (dchild == NULL) { dput(dparent); - dparent = NULL; + dparent = ERR_PTR(-ENOMEM); goto out; } } @@ -601,7 +601,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) if (rc) { dput(dchild); dput(dparent); - dparent = NULL; + dparent = ERR_PTR(rc); goto out; } alias = d_materialise_unique(dchild, inode); @@ -609,7 +609,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dput(dchild); if (IS_ERR(alias)) { dput(dparent); - dparent = NULL; + dparent = ERR_PTR(-EINVAL); /* XXX */ goto out; } dchild = alias; @@ -704,10 +704,8 @@ cifs_do_mount(struct file_system_type *fs_type, } root = cifs_get_root(volume_info, sb); - if (root == NULL) { - root = ERR_PTR(-EINVAL); /* XXX */ + if (IS_ERR(root)) goto out_super; - } cFYI(1, "dentry root is: %p", root); goto out; -- cgit v1.2.3-70-g09d2 From e0f5406727f1dfdc47b8ba4a0ff6eae4b0b5ed4c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 18 Jun 2011 20:26:38 +0000 Subject: Btrfs: fix type mismatch in find_free_extent() data parameter should be u64 because a full-sized chunk flags field is passed instead of 0/1 for distinguishing data from metadata. All underlying functions expect u64. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1f61bf5b496..71cd456fdb6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4842,7 +4842,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, u64 num_bytes, u64 empty_size, u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, - int data) + u64 data) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -4869,7 +4869,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, space_info = __find_space_info(root->fs_info, data); if (!space_info) { - printk(KERN_ERR "No space info for %d\n", data); + printk(KERN_ERR "No space info for %llu\n", data); return -ENOSPC; } -- cgit v1.2.3-70-g09d2 From 9b90f5135320bc74dc6c9a8c74d69fd4821d9282 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jun 2011 16:02:51 +0000 Subject: Btrfs: make sure to update total_bitmaps when freeing cache V3 A user reported this bug again where we have more bitmaps than we are supposed to. This is because we failed to load the free space cache, but don't update the ctl->total_bitmaps counter when we remove entries from the tree. This patch fixes this problem and we should be good to go again. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9f985a42987..bf0d61567f3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1893,9 +1893,12 @@ void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl) while ((node = rb_last(&ctl->free_space_offset)) != NULL) { info = rb_entry(node, struct btrfs_free_space, offset_index); - unlink_free_space(ctl, info); - kfree(info->bitmap); - kmem_cache_free(btrfs_free_space_cachep, info); + if (!info->bitmap) { + unlink_free_space(ctl, info); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } if (need_resched()) { spin_unlock(&ctl->tree_lock); cond_resched(); -- cgit v1.2.3-70-g09d2 From 2f7e33d432d097a2a7f467b031bf18be91cb3d49 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 23 Jun 2011 07:27:13 +0000 Subject: btrfs: fix inconsonant inode information When iputting the inode, We may leave the delayed nodes if they have some delayed items that have not been dealt with. So when the inode is read again, we must look up the relative delayed node, and use the information in it to initialize the inode. Or we will get inconsonant inode information, it may cause that the same directory index number is allocated again, and hit the following oops: [ 5447.554187] err add delayed dir index item(name: pglog_0.965_0) into the insertion tree of the delayed node(root id: 262, inode id: 258, errno: -17) [ 5447.569766] ------------[ cut here ]------------ [ 5447.575361] kernel BUG at fs/btrfs/delayed-inode.c:1301! [SNIP] [ 5447.790721] Call Trace: [ 5447.793191] [] btrfs_insert_dir_item+0x189/0x1bb [btrfs] [ 5447.800156] [] btrfs_add_link+0x12b/0x191 [btrfs] [ 5447.806517] [] btrfs_add_nondir+0x31/0x58 [btrfs] [ 5447.812876] [] btrfs_create+0xf9/0x197 [btrfs] [ 5447.818961] [] vfs_create+0x72/0x92 [ 5447.824090] [] do_last+0x22c/0x40b [ 5447.829133] [] path_openat+0xc0/0x2ef [ 5447.834438] [] ? __perf_event_task_sched_out+0x24/0x44 [ 5447.841216] [] ? perf_event_task_sched_out+0x59/0x67 [ 5447.847846] [] do_filp_open+0x3d/0x87 [ 5447.853156] [] ? strncpy_from_user+0x43/0x4d [ 5447.859072] [] ? getname_flags+0x2e/0x80 [ 5447.864636] [] ? do_getname+0x14b/0x173 [ 5447.870112] [] ? audit_getname+0x16/0x26 [ 5447.875682] [] ? spin_lock+0xe/0x10 [ 5447.880882] [] do_sys_open+0x69/0xae [ 5447.886153] [] sys_open+0x20/0x22 [ 5447.891114] [] system_call_fastpath+0x16/0x1b Fix it by reusing the old delayed node. Reported-by: Jim Schutt Signed-off-by: Miao Xie Tested-by: Jim Schutt Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 104 ++++++++++++++++++++++++++++++++++++----------- fs/btrfs/delayed-inode.h | 1 + fs/btrfs/inode.c | 12 +++++- 3 files changed, 91 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f1cbd028f7b..98c68e658a9 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -82,19 +82,16 @@ static inline struct btrfs_delayed_root *btrfs_get_delayed_root( return root->fs_info->delayed_root; } -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( - struct inode *inode) +static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) { - struct btrfs_delayed_node *node; struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(inode); - int ret; + struct btrfs_delayed_node *node; -again: node = ACCESS_ONCE(btrfs_inode->delayed_node); if (node) { - atomic_inc(&node->refs); /* can be accessed */ + atomic_inc(&node->refs); return node; } @@ -102,8 +99,10 @@ again: node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { + atomic_inc(&node->refs); /* can be accessed */ + BUG_ON(btrfs_inode->delayed_node != node); spin_unlock(&root->inode_lock); - goto again; + return node; } btrfs_inode->delayed_node = node; atomic_inc(&node->refs); /* can be accessed */ @@ -113,6 +112,23 @@ again: } spin_unlock(&root->inode_lock); + return NULL; +} + +static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( + struct inode *inode) +{ + struct btrfs_delayed_node *node; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + int ret; + +again: + node = btrfs_get_delayed_node(inode); + if (node) + return node; + node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); if (!node) return ERR_PTR(-ENOMEM); @@ -548,19 +564,6 @@ struct btrfs_delayed_item *__btrfs_next_delayed_item( return next; } -static inline struct btrfs_delayed_node *btrfs_get_delayed_node( - struct inode *inode) -{ - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); - struct btrfs_delayed_node *delayed_node; - - delayed_node = btrfs_inode->delayed_node; - if (delayed_node) - atomic_inc(&delayed_node->refs); - - return delayed_node; -} - static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root, u64 root_id) { @@ -1404,8 +1407,7 @@ end: int btrfs_inode_delayed_dir_index_count(struct inode *inode) { - struct btrfs_delayed_node *delayed_node = BTRFS_I(inode)->delayed_node; - int ret = 0; + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return -ENOENT; @@ -1415,11 +1417,14 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode) * a new directory index is added into the delayed node and index_cnt * is updated now. So we needn't lock the delayed node. */ - if (!delayed_node->index_cnt) + if (!delayed_node->index_cnt) { + btrfs_release_delayed_node(delayed_node); return -EINVAL; + } BTRFS_I(inode)->index_cnt = delayed_node->index_cnt; - return ret; + btrfs_release_delayed_node(delayed_node); + return 0; } void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, @@ -1613,6 +1618,57 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, inode->i_ctime.tv_nsec); } +int btrfs_fill_inode(struct inode *inode, u32 *rdev) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return -ENOENT; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->inode_dirty) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return -ENOENT; + } + + inode_item = &delayed_node->inode_item; + + inode->i_uid = btrfs_stack_inode_uid(inode_item); + inode->i_gid = btrfs_stack_inode_gid(inode_item); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + inode->i_mode = btrfs_stack_inode_mode(inode_item); + inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); + BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); + inode->i_rdev = 0; + *rdev = btrfs_stack_inode_rdev(inode_item); + BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + inode->i_generation = BTRFS_I(inode)->generation; + BTRFS_I(inode)->index_cnt = (u64)-1; + + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index d1a6a2915c6..8d27af4bd8b 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -119,6 +119,7 @@ void btrfs_kill_delayed_inode_items(struct inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); +int btrfs_fill_inode(struct inode *inode, u32 *rdev); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 87f1e0cf26f..447612d3a16 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2509,6 +2509,11 @@ static void btrfs_read_locked_inode(struct inode *inode) int maybe_acls; u32 rdev; int ret; + bool filled = false; + + ret = btrfs_fill_inode(inode, &rdev); + if (!ret) + filled = true; path = btrfs_alloc_path(); BUG_ON(!path); @@ -2520,6 +2525,10 @@ static void btrfs_read_locked_inode(struct inode *inode) goto make_bad; leaf = path->nodes[0]; + + if (filled) + goto cache_acl; + inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); if (!leaf->map_token) @@ -2556,7 +2565,7 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - +cache_acl: /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls @@ -2572,7 +2581,6 @@ static void btrfs_read_locked_inode(struct inode *inode) } btrfs_free_path(path); - inode_item = NULL; switch (inode->i_mode & S_IFMT) { case S_IFREG: -- cgit v1.2.3-70-g09d2 From 2b4b2482e70eba10dd98653a3a5ac68126565e24 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Mon, 27 Jun 2011 16:18:06 -0700 Subject: romfs: fix romfs_get_unmapped_area() argument check romfs_get_unmapped_area() checks argument `len' without considering PAGE_ALIGN which will cause do_mmap_pgoff() return -EINVAL error after commit f67d9b1576c ("nommu: add page_align to mmap"). Fix the check by changing it in same way ramfs_nommu_get_unmapped_area() was changed in ramfs/file-nommu.c. Signed-off-by: Bob Liu Cc: David Howells Cc: Paul Mundt Acked-by: Greg Ungerer Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/romfs/mmap-nommu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index f0511e81696..eed99428f10 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -27,14 +27,18 @@ static unsigned long romfs_get_unmapped_area(struct file *file, { struct inode *inode = file->f_mapping->host; struct mtd_info *mtd = inode->i_sb->s_mtd; - unsigned long isize, offset; + unsigned long isize, offset, maxpages, lpages; if (!mtd) goto cant_map_directly; + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; isize = i_size_read(inode); offset = pgoff << PAGE_SHIFT; - if (offset > isize || len > isize || offset > isize - len) + + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) return (unsigned long) -EINVAL; /* we need to call down to the MTD layer to do the actual mapping */ -- cgit v1.2.3-70-g09d2 From 08142579b6ca35883c1ed066a2681de6f6917062 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Jun 2011 16:18:10 -0700 Subject: mm: fix assertion mapping->nrpages == 0 in end_writeback() Under heavy memory and filesystem load, users observe the assertion mapping->nrpages == 0 in end_writeback() trigger. This can be caused by page reclaim reclaiming the last page from a mapping in the following race: CPU0 CPU1 ... shrink_page_list() __remove_mapping() __delete_from_page_cache() radix_tree_delete() evict_inode() truncate_inode_pages() truncate_inode_pages_range() pagevec_lookup() - finds nothing end_writeback() mapping->nrpages != 0 -> BUG page->mapping = NULL mapping->nrpages-- Fix the problem by doing a reliable check of mapping->nrpages under mapping->tree_lock in end_writeback(). Analyzed by Jay , lost in LKML, and dug out by Miklos Szeredi . Cc: Jay Cc: Miklos Szeredi Signed-off-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 7 +++++++ include/linux/fs.h | 1 + mm/truncate.c | 5 +++++ 3 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 0f7e88a7803..43566d17d1b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -423,7 +423,14 @@ EXPORT_SYMBOL(remove_inode_hash); void end_writeback(struct inode *inode) { might_sleep(); + /* + * We have to cycle tree_lock here because reclaim can be still in the + * process of removing the last page (in __delete_from_page_cache()) + * and we must not free mapping under it. + */ + spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); + spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e73e2e9ae3..b5b97924786 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -639,6 +639,7 @@ struct address_space { struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct mutex i_mmap_mutex; /* protect tree, count, list */ + /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ diff --git a/mm/truncate.c b/mm/truncate.c index 29a9b8a5a31..e13f22efaad 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -304,6 +304,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range); * @lstart: offset from which to truncate * * Called under (and serialised by) inode->i_mutex. + * + * Note: When this function returns, there can be a page in the process of + * deletion (inside __delete_from_page_cache()) in the specified range. Thus + * mapping->nrpages can be non-zero when this function returns even after + * truncation of the whole mapping. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { -- cgit v1.2.3-70-g09d2 From 1d1221f375c94ef961ba8574ac4f85c8870ddd51 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Fri, 24 Jun 2011 16:08:38 +0400 Subject: proc: restrict access to /proc/PID/io /proc/PID/io may be used for gathering private information. E.g. for openssh and vsftpd daemons wchars/rchars may be used to learn the precise password length. Restrict it to processes being able to ptrace the target process. ptrace_may_access() is needed to prevent keeping open file descriptor of "io" file, executing setuid binary and gathering io information of the setuid'ed process. Signed-off-by: Vasiliy Kulikov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 8a84210ca08..fc5bc276769 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2708,6 +2708,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) struct task_io_accounting acct = task->ioac; unsigned long flags; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + return -EACCES; + if (whole && lock_task_sighand(task, &flags)) { struct task_struct *t = task; @@ -2839,7 +2842,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tgid_io_accounting), + INF("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), @@ -3181,7 +3184,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tid_io_accounting), + INF("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), -- cgit v1.2.3-70-g09d2 From 2bea038c52e88badbbd5b420c1de918f7f2579e9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 16 Jun 2011 11:35:46 -0400 Subject: pnfs: write: Set mds_offset in the generic layer - it is needed by all LDs In current pnfs tree, all the layouts set mds_offset in their .write_pagelist member. mds_offset is only used by generic layer and should be handled by it. This patch is for upstream. It is needed in this -rc series to fix a bug in objects layout_commit. I'll send patches for objects and blocks to be squashed into current pnfs tree. TODO: It looks like the read path needs the same patch. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 1 - fs/nfs/write.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 0bafcc91c27..f9d03abcd04 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -398,7 +398,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) * this offset and save the original offset. */ data->args.offset = filelayout_get_dserver_offset(lseg, offset); - data->mds_offset = offset; /* Perform an asynchronous write */ status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e268e3b2349..72716805968 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -864,6 +864,8 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; + /* pnfs_set_layoutcommit needs this */ + data->mds_offset = data->args.offset; data->args.pgbase = req->wb_pgbase + offset; data->args.pages = data->pagevec; data->args.count = count; -- cgit v1.2.3-70-g09d2 From 50176ddefa4a942419cb693dd2d8345bfdcde67c Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 31 May 2011 16:35:50 -0500 Subject: hfsplus: add missing call to bio_put() hfsplus leaks bio objects by failing to call bio_put() on the bios it allocates. Add the missing call to fix the leak. Signed-off-by: Seth Forshee Cc: # .38.x, .39.x Signed-off-by: Christoph Hellwig --- fs/hfsplus/wrapper.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 3031d81f5f0..4ac88ff79aa 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -36,6 +36,7 @@ int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, { DECLARE_COMPLETION_ONSTACK(wait); struct bio *bio; + int ret = 0; bio = bio_alloc(GFP_NOIO, 1); bio->bi_sector = sector; @@ -54,8 +55,10 @@ int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, wait_for_completion(&wait); if (!bio_flagged(bio, BIO_UPTODATE)) - return -EIO; - return 0; + ret = -EIO; + + bio_put(bio); + return ret; } static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) -- cgit v1.2.3-70-g09d2 From 032016a56a1e9c83646435b32e4416d499e1f1ce Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 24 Jun 2011 01:15:02 +0400 Subject: hfsplus: Fix double iput of the same inode in hfsplus_fill_super() There is a misprint in resource deallocation code on error path in hfsplus_fill_super(): the sbi->alloc_file inode is iput twice, while the root inode in not iput at all. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: Christoph Hellwig --- fs/hfsplus/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index b49b55584c8..84a47b709f5 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -500,7 +500,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) out_put_hidden_dir: iput(sbi->hidden_dir); out_put_root: - iput(sbi->alloc_file); + iput(root); out_put_alloc_file: iput(sbi->alloc_file); out_close_cat_tree: -- cgit v1.2.3-70-g09d2 From ee1b3ea9e6171d7a842527a44873f9f51e6f239b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 21 Jun 2011 07:18:26 -0400 Subject: cifs: set socket send and receive timeouts before attempting connect Benjamin S. reported that he was unable to suspend his machine while it had a cifs share mounted. The freezer caused this to spew when he tried it: -----------------------[snip]------------------ PM: Syncing filesystems ... done. Freezing user space processes ... (elapsed 0.01 seconds) done. Freezing remaining freezable tasks ... Freezing of tasks failed after 20.01 seconds (1 tasks refusing to freeze, wq_busy=0): cifsd S ffff880127f7b1b0 0 1821 2 0x00800000 ffff880127f7b1b0 0000000000000046 ffff88005fe008a8 ffff8800ffffffff ffff880127cee6b0 0000000000011100 ffff880127737fd8 0000000000004000 ffff880127737fd8 0000000000011100 ffff880127f7b1b0 ffff880127736010 Call Trace: [] ? sk_reset_timer+0xf/0x19 [] ? tcp_connect+0x43c/0x445 [] ? tcp_v4_connect+0x40d/0x47f [] ? schedule_timeout+0x21/0x1ad [] ? _raw_spin_lock_bh+0x9/0x1f [] ? release_sock+0x19/0xef [] ? inet_stream_connect+0x14c/0x24a [] ? autoremove_wake_function+0x0/0x2a [] ? ipv4_connect+0x39c/0x3b5 [cifs] [] ? cifs_reconnect+0x1fc/0x28a [cifs] [] ? cifs_demultiplex_thread+0x397/0xb9f [cifs] [] ? perf_event_exit_task+0xb9/0x1bf [] ? cifs_demultiplex_thread+0x0/0xb9f [cifs] [] ? cifs_demultiplex_thread+0x0/0xb9f [cifs] [] ? kthread+0x7a/0x82 [] ? kernel_thread_helper+0x4/0x10 [] ? kthread+0x0/0x82 [] ? kernel_thread_helper+0x0/0x10 Restarting tasks ... done. -----------------------[snip]------------------ We do attempt to perform a try_to_freeze in cifs_reconnect, but the connection attempt itself seems to be taking longer than 20s to time out. The connect timeout is governed by the socket send and receive timeouts, so we can shorten that period by setting those timeouts before attempting the connect instead of after. Adam Williamson tested the patch and said that it seems to have fixed suspending on his laptop when a cifs share is mounted. Reported-by: Benjamin S Tested-by: Adam Williamson Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7f540df5252..c8cb83ef6f6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2474,14 +2474,6 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) return rc; - rc = socket->ops->connect(socket, saddr, slen, 0); - if (rc < 0) { - cFYI(1, "Error %d connecting to server", rc); - sock_release(socket); - server->ssocket = NULL; - return rc; - } - /* * Eventually check for other socket options to change from * the default. sock_setsockopt not used because it expects @@ -2510,6 +2502,14 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); + rc = socket->ops->connect(socket, saddr, slen, 0); + if (rc < 0) { + cFYI(1, "Error %d connecting to server", rc); + sock_release(socket); + server->ssocket = NULL; + return rc; + } + if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); -- cgit v1.2.3-70-g09d2 From a51cb91d81f8e6fc4e5e08b772cc3ceb13ac9d37 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 6 Jul 2011 12:33:55 +0200 Subject: fs: fix lock initialization locks_alloc_lock() assumed that the allocated struct file_lock is already initialized to zero members. This is only true for the first allocation of the structure, after reuse some of the members will have random values. This will for example result in passing random fl_start values to userspace in fuse for FL_FLOCK locks, which is an information leak at best. Fix by reinitializing those members which may be non-zero after freeing. Signed-off-by: Miklos Szeredi CC: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/locks.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 0a4f50dfadf..b286539d547 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -160,10 +160,28 @@ EXPORT_SYMBOL_GPL(unlock_flocks); static struct kmem_cache *filelock_cache __read_mostly; +static void locks_init_lock_always(struct file_lock *fl) +{ + fl->fl_next = NULL; + fl->fl_fasync = NULL; + fl->fl_owner = NULL; + fl->fl_pid = 0; + fl->fl_nspid = NULL; + fl->fl_file = NULL; + fl->fl_flags = 0; + fl->fl_type = 0; + fl->fl_start = fl->fl_end = 0; +} + /* Allocate an empty lock structure. */ struct file_lock *locks_alloc_lock(void) { - return kmem_cache_alloc(filelock_cache, GFP_KERNEL); + struct file_lock *fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); + + if (fl) + locks_init_lock_always(fl); + + return fl; } EXPORT_SYMBOL_GPL(locks_alloc_lock); @@ -200,17 +218,9 @@ void locks_init_lock(struct file_lock *fl) INIT_LIST_HEAD(&fl->fl_link); INIT_LIST_HEAD(&fl->fl_block); init_waitqueue_head(&fl->fl_wait); - fl->fl_next = NULL; - fl->fl_fasync = NULL; - fl->fl_owner = NULL; - fl->fl_pid = 0; - fl->fl_nspid = NULL; - fl->fl_file = NULL; - fl->fl_flags = 0; - fl->fl_type = 0; - fl->fl_start = fl->fl_end = 0; fl->fl_ops = NULL; fl->fl_lmops = NULL; + locks_init_lock_always(fl); } EXPORT_SYMBOL(locks_init_lock); -- cgit v1.2.3-70-g09d2 From bcb65a797eb7c51e4f227dd19295f14d38738fee Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Jul 2011 12:26:05 +0100 Subject: FDPIC: Fix memory leak The shdr4extnum variable isn't being freed in the cleanup process of elf_fdpic_core_dump(). Signed-off-by: Davidlohr Bueso Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 63039ed9576..2bc5dc644b4 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1864,6 +1864,7 @@ cleanup: kfree(psinfo); kfree(notes); kfree(fpu); + kfree(shdr4extnum); #ifdef ELF_CORE_COPY_XFPREGS kfree(xfpu); #endif -- cgit v1.2.3-70-g09d2 From 677d8537d875832019fa989186f084ba47ecd93d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 5 Jul 2011 17:37:37 -0400 Subject: cifs: remove bogus call to cifs_cleanup_volume_info This call to cifs_cleanup_volume_info is clearly wrong. As soon as it's called the following call to cifs_get_tcp_session will oops as the volume_info pointer will then be NULL. The caller of cifs_mount should clean up this data since it passed it in. There's no need for us to call this here. Regression introduced by commit 724d9f1cfba. Reported-by: Adam Williamson Cc: Pavel Shilovsky Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index c8cb83ef6f6..545e8546574 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3012,7 +3012,6 @@ try_mount_again: else if (pSesInfo) cifs_put_smb_ses(pSesInfo); - cifs_cleanup_volume_info(&volume_info); FreeXid(xid); } #endif -- cgit v1.2.3-70-g09d2 From b2a0fa152072f0085fa8d8eb0dbf9b3b0c5952fc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 6 Jul 2011 08:10:36 -0400 Subject: cifs: fix build_unc_path_to_root to account for a prefixpath Regression introduced by commit f87d39d9513. Signed-off-by: Jeff Layton Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/connect.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 545e8546574..44376ce41e4 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2855,19 +2855,28 @@ cifs_cleanup_volume_info(struct smb_vol **pvolume_info) /* build_path_to_root returns full path to root when * we do not have an exiting connection (tcon) */ static char * -build_unc_path_to_root(const struct smb_vol *volume_info, +build_unc_path_to_root(const struct smb_vol *vol, const struct cifs_sb_info *cifs_sb) { - char *full_path; + char *full_path, *pos; + unsigned int pplen = vol->prepath ? strlen(vol->prepath) : 0; + unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); - int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1); - full_path = kmalloc(unc_len + 1, GFP_KERNEL); + full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); if (full_path == NULL) return ERR_PTR(-ENOMEM); - strncpy(full_path, volume_info->UNC, unc_len); - full_path[unc_len] = 0; /* add trailing null */ + strncpy(full_path, vol->UNC, unc_len); + pos = full_path + unc_len; + + if (pplen) { + strncpy(pos, vol->prepath, pplen); + pos += pplen; + } + + *pos = '\0'; /* add trailing null */ convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + cFYI(1, "%s: full_path=%s", __func__, full_path); return full_path; } -- cgit v1.2.3-70-g09d2 From f9e59bcba2cff580a3ccf62e89460f9eed295d89 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 6 Jul 2011 08:10:37 -0400 Subject: cifs: have cifs_cleanup_volume_info not take a double pointer ...as that makes for a cumbersome interface. Make it take a regular smb_vol pointer and rely on the caller to zero it out if needed. Signed-off-by: Jeff Layton Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 +- fs/cifs/cifsproto.h | 2 +- fs/cifs/connect.c | 10 +++------- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 35f9154615f..e11b8314983 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -713,7 +713,7 @@ cifs_do_mount(struct file_system_type *fs_type, out_super: deactivate_locked_super(sb); out: - cifs_cleanup_volume_info(&volume_info); + cifs_cleanup_volume_info(volume_info); return root; out_mountdata: diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 257f312ede4..53b1b13581c 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -154,7 +154,7 @@ extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); -extern void cifs_cleanup_volume_info(struct smb_vol **pvolume_info); +extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); extern int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, const char *devname); extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 44376ce41e4..dd064075ddf 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2831,14 +2831,11 @@ is_path_accessible(int xid, struct cifs_tcon *tcon, } void -cifs_cleanup_volume_info(struct smb_vol **pvolume_info) +cifs_cleanup_volume_info(struct smb_vol *volume_info) { - struct smb_vol *volume_info; - - if (!pvolume_info || !*pvolume_info) + if (!volume_info) return; - volume_info = *pvolume_info; kfree(volume_info->username); kzfree(volume_info->password); kfree(volume_info->UNC); @@ -2847,7 +2844,6 @@ cifs_cleanup_volume_info(struct smb_vol **pvolume_info) kfree(volume_info->iocharset); kfree(volume_info->prepath); kfree(volume_info); - *pvolume_info = NULL; return; } @@ -2990,7 +2986,7 @@ int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, *pvolume_info = volume_info; return rc; out: - cifs_cleanup_volume_info(&volume_info); + cifs_cleanup_volume_info(volume_info); return rc; } -- cgit v1.2.3-70-g09d2 From 1316d4da3f632d5843d5a446203e73067dc40f09 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Jul 2011 05:27:36 +0000 Subject: xfs: unpin stale inodes directly in IOP_COMMITTED When inodes are marked stale in a transaction, they are treated specially when the inode log item is being inserted into the AIL. It tries to avoid moving the log item forward in the AIL due to a race condition with the writing the underlying buffer back to disk. The was "fixed" in commit de25c18 ("xfs: avoid moving stale inodes in the AIL"). To avoid moving the item forward, we return a LSN smaller than the commit_lsn of the completing transaction, thereby trying to trick the commit code into not moving the inode forward at all. I'm not sure this ever worked as intended - it assumes the inode is already in the AIL, but I don't think the returned LSN would have been small enough to prevent moving the inode. It appears that the reason it worked is that the lower LSN of the inodes meant they were inserted into the AIL and flushed before the inode buffer (which was moved to the commit_lsn of the transaction). The big problem is that with delayed logging, the returning of the different LSN means insertion takes the slow, non-bulk path. Worse yet is that insertion is to a position -before- the commit_lsn so it is doing a AIL traversal on every insertion, and has to walk over all the items that have already been inserted into the AIL. It's expensive. To compound the matter further, with delayed logging inodes are likely to go from clean to stale in a single checkpoint, which means they aren't even in the AIL at all when we come across them at AIL insertion time. Hence these were all getting inserted into the AIL when they simply do not need to be as inodes marked XFS_ISTALE are never written back. Transactional/recovery integrity is maintained in this case by the other items in the unlink transaction that were modified (e.g. the AGI btree blocks) and committed in the same checkpoint. So to fix this, simply unpin the stale inodes directly in xfs_inode_item_committed() and return -1 to indicate that the AIL insertion code does not need to do any further processing of these inodes. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_inode_item.c | 14 ++++++++------ fs/xfs/xfs_trans.c | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 09983a3344a..b1e88d56069 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -681,15 +681,15 @@ xfs_inode_item_unlock( * where the cluster buffer may be unpinned before the inode is inserted into * the AIL during transaction committed processing. If the buffer is unpinned * before the inode item has been committed and inserted, then it is possible - * for the buffer to be written and IO completions before the inode is inserted + * for the buffer to be written and IO completes before the inode is inserted * into the AIL. In that case, we'd be inserting a clean, stale inode into the * AIL which will never get removed. It will, however, get reclaimed which * triggers an assert in xfs_inode_free() complaining about freein an inode * still in the AIL. * - * To avoid this, return a lower LSN than the one passed in so that the - * transaction committed code will not move the inode forward in the AIL but - * will still unpin it properly. + * To avoid this, just unpin the inode directly and return a LSN of -1 so the + * transaction committed code knows that it does not need to do any further + * processing on the item. */ STATIC xfs_lsn_t xfs_inode_item_committed( @@ -699,8 +699,10 @@ xfs_inode_item_committed( struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - if (xfs_iflags_test(ip, XFS_ISTALE)) - return lsn - 1; + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_inode_item_unpin(lip, 0); + return -1; + } return lsn; } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 7c7bc2b786b..c83f63b33aa 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1361,7 +1361,7 @@ xfs_trans_item_committed( lip->li_flags |= XFS_LI_ABORTED; item_lsn = IOP_COMMITTED(lip, commit_lsn); - /* If the committed routine returns -1, item has been freed. */ + /* item_lsn of -1 means the item needs no further processing */ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) return; @@ -1474,7 +1474,7 @@ xfs_trans_committed_bulk( lip->li_flags |= XFS_LI_ABORTED; item_lsn = IOP_COMMITTED(lip, commit_lsn); - /* item_lsn of -1 means the item was freed */ + /* item_lsn of -1 means the item needs no further processing */ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) continue; -- cgit v1.2.3-70-g09d2 From 0942caa373c676dca614ea8352ac77e0270aba73 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 28 Jun 2011 15:10:37 +0000 Subject: btrfs: add missing options displayed in mount output There are three missed mount options settable by user which are not currently displayed in mount output. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/super.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8e948ec1ee6..60e13ef23a5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1336,6 +1336,11 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_STRING_ITEM_KEY 253 +/* + * Flags for mount options. + * + * Note: don't forget to add new options to btrfs_show_options() + */ #define BTRFS_MOUNT_NODATASUM (1 << 0) #define BTRFS_MOUNT_NODATACOW (1 << 1) #define BTRFS_MOUNT_NOBARRIER (1 << 2) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3559d0b3518..5746081199e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -721,6 +721,12 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) seq_puts(seq, ",user_subvol_rm_allowed"); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) + seq_puts(seq, ",enospc_debug"); + if (btrfs_test_opt(root, AUTO_DEFRAG)) + seq_puts(seq, ",autodefrag"); + if (btrfs_test_opt(root, INODE_MAP_CACHE)) + seq_puts(seq, ",inode_cache"); return 0; } -- cgit v1.2.3-70-g09d2 From 508794eb5ec2a2b832742e78c6766844b10c0c94 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 2 Jul 2011 21:24:41 +0000 Subject: Btrfs: don't panic if we get an error while balancing V2 A user reported an error where if we try to balance an fs after a device has been removed it will blow up. This is because we get an EIO back and this is where BUG_ON(ret) bites us in the ass. To fix we just exit. Thanks, Reported-by: Anand Jain Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1efa56e18f9..19450bc5363 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2098,7 +2098,8 @@ int btrfs_balance(struct btrfs_root *dev_root) chunk_root->root_key.objectid, found_key.objectid, found_key.offset); - BUG_ON(ret && ret != -ENOSPC); + if (ret && ret != -ENOSPC) + goto error; key.offset = found_key.offset - 1; } ret = 0; -- cgit v1.2.3-70-g09d2 From 149e2d76b4886c4c7ff5e077646a8ba3563c8026 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 6 Jul 2011 18:51:53 -0400 Subject: btrfs: fix oops when doing space balance We need to make sure the data relocation inode doesn't go through the delayed metadata updates, otherwise we get an oops during balance: kernel BUG at fs/btrfs/relocation.c:4303! [SNIP] Call Trace: [] ? update_ref_for_cow+0x22d/0x330 [btrfs] [] __btrfs_cow_block+0x451/0x5e0 [btrfs] [] ? read_block_for_search+0x14d/0x4d0 [btrfs] [] btrfs_cow_block+0x10b/0x240 [btrfs] [] btrfs_search_slot+0x49e/0x7a0 [btrfs] [] btrfs_lookup_inode+0x2f/0xa0 [btrfs] [] ? mutex_lock+0x1e/0x50 [] btrfs_update_delayed_inode+0x71/0x160 [btrfs] [] ? __btrfs_release_delayed_node+0x67/0x190 [btrfs] [] btrfs_run_delayed_items+0xe8/0x120 [btrfs] [] btrfs_commit_transaction+0x250/0x850 [btrfs] [] ? find_get_pages+0x39/0x130 [] ? join_transaction+0x25/0x250 [btrfs] [] ? wake_up_bit+0x40/0x40 [] prepare_to_relocate+0xda/0xf0 [btrfs] [] relocate_block_group+0x4b/0x620 [btrfs] [] ? btrfs_clean_old_snapshots+0x35/0x150 [btrfs] [] btrfs_relocate_block_group+0x1b3/0x2e0 [btrfs] [] ? btrfs_tree_unlock+0x50/0x50 [btrfs] [] btrfs_relocate_chunk+0x8b/0x670 [btrfs] [] ? btrfs_set_path_blocking+0x3d/0x50 [btrfs] [] ? read_extent_buffer+0xd8/0x1d0 [btrfs] [] ? btrfs_previous_item+0xb1/0x150 [btrfs] [] ? read_extent_buffer+0xd8/0x1d0 [btrfs] [] btrfs_balance+0x21a/0x2b0 [btrfs] [] btrfs_ioctl+0x798/0xd20 [btrfs] [] ? handle_mm_fault+0x148/0x270 [] ? do_page_fault+0x1d8/0x4b0 [] do_vfs_ioctl+0x9a/0x540 [] sys_ioctl+0xa1/0xb0 [] system_call_fastpath+0x16/0x1b [SNIP] RIP [] btrfs_reloc_cow_block+0x22c/0x270 [btrfs] Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 447612d3a16..4a137308374 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2678,12 +2678,14 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, int ret; /* - * If root is tree root, it means this inode is used to - * store free space information. And these inodes are updated - * when committing the transaction, so they needn't delaye to - * be updated, or deadlock will occured. + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. + * + * The data relocation inode should also be directly updated + * without delay */ - if (!is_free_space_inode(root, inode)) { + if (!is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); -- cgit v1.2.3-70-g09d2 From c902ce1bfb40d8b049bd2319b388b4b68b04bc27 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 7 Jul 2011 12:19:48 +0100 Subject: FS-Cache: Add a helper to bulk uncache pages on an inode Add an FS-Cache helper to bulk uncache pages on an inode. This will only work for the circumstance where the pages in the cache correspond 1:1 with the pages attached to an inode's page cache. This is required for CIFS and NFS: When disabling inode cookie, we were returning the cookie and setting cifsi->fscache to NULL but failed to invalidate any previously mapped pages. This resulted in "Bad page state" errors and manifested in other kind of errors when running fsstress. Fix it by uncaching mapped pages when we disable the inode cookie. This patch should fix the following oops and "Bad page state" errors seen during fsstress testing. ------------[ cut here ]------------ kernel BUG at fs/cachefiles/namei.c:201! invalid opcode: 0000 [#1] SMP Pid: 5, comm: kworker/u:0 Not tainted 2.6.38.7-30.fc15.x86_64 #1 Bochs Bochs RIP: 0010: cachefiles_walk_to_object+0x436/0x745 [cachefiles] RSP: 0018:ffff88002ce6dd00 EFLAGS: 00010282 RAX: ffff88002ef165f0 RBX: ffff88001811f500 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000100 RDI: 0000000000000282 RBP: ffff88002ce6dda0 R08: 0000000000000100 R09: ffffffff81b3a300 R10: 0000ffff00066c0a R11: 0000000000000003 R12: ffff88002ae54840 R13: ffff88002ae54840 R14: ffff880029c29c00 R15: ffff88001811f4b0 FS: 00007f394dd32720(0000) GS:ffff88002ef00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007fffcb62ddf8 CR3: 000000001825f000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/u:0 (pid: 5, threadinfo ffff88002ce6c000, task ffff88002ce55cc0) Stack: 0000000000000246 ffff88002ce55cc0 ffff88002ce6dd58 ffff88001815dc00 ffff8800185246c0 ffff88001811f618 ffff880029c29d18 ffff88001811f380 ffff88002ce6dd50 ffffffff814757e4 ffff88002ce6dda0 ffffffff8106ac56 Call Trace: cachefiles_lookup_object+0x78/0xd4 [cachefiles] fscache_lookup_object+0x131/0x16d [fscache] fscache_object_work_func+0x1bc/0x669 [fscache] process_one_work+0x186/0x298 worker_thread+0xda/0x15d kthread+0x84/0x8c kernel_thread_helper+0x4/0x10 RIP cachefiles_walk_to_object+0x436/0x745 [cachefiles] ---[ end trace 1d481c9af1804caa ]--- I tested the uncaching by the following means: (1) Create a big file on my NFS server (104857600 bytes). (2) Read the file into the cache with md5sum on the NFS client. Look in /proc/fs/fscache/stats: Pages : mrk=25601 unc=0 (3) Open the file for read/write ("bash 5<>/warthog/bigfile"). Look in proc again: Pages : mrk=25601 unc=25601 Reported-by: Jeff Layton Signed-off-by: David Howells Reviewed-and-Tested-by: Suresh Jayaraman cc: stable@kernel.org Signed-off-by: Linus Torvalds --- Documentation/filesystems/caching/netfs-api.txt | 16 +++++++++ fs/cifs/fscache.c | 1 + fs/fscache/page.c | 44 +++++++++++++++++++++++++ fs/nfs/fscache.c | 8 ++--- include/linux/fscache.h | 21 ++++++++++++ 5 files changed, 85 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt index a167ab876c3..7cc6bf2871e 100644 --- a/Documentation/filesystems/caching/netfs-api.txt +++ b/Documentation/filesystems/caching/netfs-api.txt @@ -673,6 +673,22 @@ storage request to complete, or it may attempt to cancel the storage request - in which case the page will not be stored in the cache this time. +BULK INODE PAGE UNCACHE +----------------------- + +A convenience routine is provided to perform an uncache on all the pages +attached to an inode. This assumes that the pages on the inode correspond on a +1:1 basis with the pages in the cache. + + void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, + struct inode *inode); + +This takes the netfs cookie that the pages were cached with and the inode that +the pages are attached to. This function will wait for pages to finish being +written to the cache and for the cache to finish with the page generally. No +error is returned. + + ========================== INDEX AND DATA FILE UPDATE ========================== diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 816696621ec..42e5363b410 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -92,6 +92,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode) if (cifsi->fscache) { cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + fscache_uncache_all_inode_pages(cifsi->fscache, inode); fscache_relinquish_cookie(cifsi->fscache, 1); cifsi->fscache = NULL; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index a2a5d19ece6..2f343b4d7a7 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -954,3 +954,47 @@ void fscache_mark_pages_cached(struct fscache_retrieval *op, pagevec_reinit(pagevec); } EXPORT_SYMBOL(fscache_mark_pages_cached); + +/* + * Uncache all the pages in an inode that are marked PG_fscache, assuming them + * to be associated with the given cookie. + */ +void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, + struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t next; + int i; + + _enter("%p,%p", cookie, inode); + + if (!mapping || mapping->nrpages == 0) { + _leave(" [no pages]"); + return; + } + + pagevec_init(&pvec, 0); + next = 0; + while (next <= (loff_t)-1 && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE) + ) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + + ASSERTCMP(page_index, >=, next); + next = page_index + 1; + + if (PageFsCache(page)) { + __fscache_wait_on_page_write(cookie, page); + __fscache_uncache_page(cookie, page); + } + } + pagevec_release(&pvec); + cond_resched(); + } + + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_all_inode_pages); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index ce153a6b3ae..419119c371b 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -259,12 +259,10 @@ static void nfs_fscache_disable_inode_cookie(struct inode *inode) dfprintk(FSCACHE, "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); - /* Need to invalidate any mapped pages that were read in before - * turning off the cache. + /* Need to uncache any pages attached to this inode that + * fscache knows about before turning off the cache. */ - if (inode->i_mapping && inode->i_mapping->nrpages) - invalidate_inode_pages2(inode->i_mapping); - + fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode); nfs_fscache_zap_inode_cookie(inode); } } diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 7c4d72f5581..9ec20dec335 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -204,6 +204,8 @@ extern bool __fscache_check_page_write(struct fscache_cookie *, struct page *); extern void __fscache_wait_on_page_write(struct fscache_cookie *, struct page *); extern bool __fscache_maybe_release_page(struct fscache_cookie *, struct page *, gfp_t); +extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *, + struct inode *); /** * fscache_register_netfs - Register a filesystem as desiring caching services @@ -643,4 +645,23 @@ bool fscache_maybe_release_page(struct fscache_cookie *cookie, return false; } +/** + * fscache_uncache_all_inode_pages - Uncache all an inode's pages + * @cookie: The cookie representing the inode's cache object. + * @inode: The inode to uncache pages from. + * + * Uncache all the pages in an inode that are marked PG_fscache, assuming them + * to be associated with the given cookie. + * + * This function may sleep. It will wait for pages that are being written out + * and will wait whilst the PG_fscache mark is removed by the cache. + */ +static inline +void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, + struct inode *inode) +{ + if (fscache_cookie_valid(cookie)) + __fscache_uncache_all_inode_pages(cookie, inode); +} + #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3-70-g09d2 From 04db79b015dafcb79371fda6b5c32ffdbd31a2ff Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 6 Jul 2011 08:10:38 -0400 Subject: cifs: factor smb_vol allocation out of cifs_setup_volume_info Signed-off-by: Jeff Layton Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 6 +++--- fs/cifs/cifsproto.h | 4 ++-- fs/cifs/connect.c | 56 ++++++++++++++++++++++++++--------------------------- 3 files changed, 33 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e11b8314983..3e298997629 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -649,9 +649,9 @@ cifs_do_mount(struct file_system_type *fs_type, cFYI(1, "Devname: %s flags: %d ", dev_name, flags); - rc = cifs_setup_volume_info(&volume_info, (char *)data, dev_name); - if (rc) - return ERR_PTR(rc); + volume_info = cifs_get_volume_info((char *)data, dev_name); + if (IS_ERR(volume_info)) + return ERR_CAST(volume_info); cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); if (cifs_sb == NULL) { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 53b1b13581c..8df28e925e5 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -155,8 +155,8 @@ extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); -extern int cifs_setup_volume_info(struct smb_vol **pvolume_info, - char *mount_data, const char *devname); +extern struct smb_vol *cifs_get_volume_info(char *mount_data, + const char *devname); extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); extern void cifs_umount(struct cifs_sb_info *); extern void cifs_dfs_release_automount_timer(void); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index dd064075ddf..46cc0ad0348 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2931,33 +2931,20 @@ expand_dfs_referral(int xid, struct cifs_ses *pSesInfo, } #endif -int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, - const char *devname) +static int +cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname) { - struct smb_vol *volume_info; int rc = 0; - *pvolume_info = NULL; - - volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); - if (!volume_info) { - rc = -ENOMEM; - goto out; - } - - if (cifs_parse_mount_options(mount_data, devname, - volume_info)) { - rc = -EINVAL; - goto out; - } + if (cifs_parse_mount_options(mount_data, devname, volume_info)) + return -EINVAL; if (volume_info->nullauth) { cFYI(1, "null user"); volume_info->username = kzalloc(1, GFP_KERNEL); - if (volume_info->username == NULL) { - rc = -ENOMEM; - goto out; - } + if (volume_info->username == NULL) + return -ENOMEM; } else if (volume_info->username) { /* BB fixme parse for domain name here */ cFYI(1, "Username: %s", volume_info->username); @@ -2965,8 +2952,7 @@ int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, cifserror("No username specified"); /* In userspace mount helper we can get user name from alternate locations such as env variables and files on disk */ - rc = -EINVAL; - goto out; + return -EINVAL; } /* this is needed for ASCII cp to Unicode converts */ @@ -2978,18 +2964,32 @@ int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, if (volume_info->local_nls == NULL) { cERROR(1, "CIFS mount error: iocharset %s not found", volume_info->iocharset); - rc = -ELIBACC; - goto out; + return -ELIBACC; } } - *pvolume_info = volume_info; - return rc; -out: - cifs_cleanup_volume_info(volume_info); return rc; } +struct smb_vol * +cifs_get_volume_info(char *mount_data, const char *devname) +{ + int rc; + struct smb_vol *volume_info; + + volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); + if (!volume_info) + return ERR_PTR(-ENOMEM); + + rc = cifs_setup_volume_info(volume_info, mount_data, devname); + if (rc) { + cifs_cleanup_volume_info(volume_info); + volume_info = ERR_PTR(rc); + } + + return volume_info; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { -- cgit v1.2.3-70-g09d2 From 20547490c12b0ee3d32152b85e9f9bd183aa7224 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 9 Jul 2011 12:21:07 -0400 Subject: cifs: move bdi_setup_and_register outside of CONFIG_CIFS_DFS_UPCALL This needs to be done regardless of whether that KConfig option is set or not. Reported-by: Sven-Haegar Koch Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 46cc0ad0348..511176903e5 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3002,6 +3002,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) struct tcon_link *tlink; #ifdef CONFIG_CIFS_DFS_UPCALL int referral_walks_count = 0; +#endif rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); if (rc) @@ -3009,6 +3010,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; +#ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ if (referral_walks_count) { -- cgit v1.2.3-70-g09d2 From b9bce2e9f9936cfd12fbc62ead11edcdd46dec7e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 6 Jul 2011 08:10:39 -0400 Subject: cifs: fix expand_dfs_referral Regression introduced in commit 724d9f1cfba. Prior to that, expand_dfs_referral would regenerate the mount data string and then call cifs_parse_mount_options to re-parse it (klunky, but it worked). The above commit moved cifs_parse_mount_options out of cifs_mount, so the re-parsing of the new mount options no longer occurred. Fix it by making expand_dfs_referral re-parse the mount options. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 511176903e5..6ec366ff28b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -65,6 +65,8 @@ static int ip_connect(struct TCP_Server_Info *server); static int generic_ip_connect(struct TCP_Server_Info *server); static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); +static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname); /* * cifs tcp session reconnection @@ -2830,12 +2832,9 @@ is_path_accessible(int xid, struct cifs_tcon *tcon, return rc; } -void -cifs_cleanup_volume_info(struct smb_vol *volume_info) +static void +cleanup_volume_info_contents(struct smb_vol *volume_info) { - if (!volume_info) - return; - kfree(volume_info->username); kzfree(volume_info->password); kfree(volume_info->UNC); @@ -2843,10 +2842,18 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info) kfree(volume_info->domainname); kfree(volume_info->iocharset); kfree(volume_info->prepath); +} + +void +cifs_cleanup_volume_info(struct smb_vol *volume_info) +{ + if (!volume_info) + return; + cleanup_volume_info_contents(volume_info); kfree(volume_info); - return; } + #ifdef CONFIG_CIFS_DFS_UPCALL /* build_path_to_root returns full path to root when * we do not have an exiting connection (tcon) */ @@ -2915,15 +2922,18 @@ expand_dfs_referral(int xid, struct cifs_ses *pSesInfo, &fake_devname); free_dfs_info_array(referrals, num_referrals); - kfree(fake_devname); - - if (cifs_sb->mountdata != NULL) - kfree(cifs_sb->mountdata); if (IS_ERR(mdata)) { rc = PTR_ERR(mdata); mdata = NULL; + } else { + cleanup_volume_info_contents(volume_info); + memset(volume_info, '\0', sizeof(*volume_info)); + rc = cifs_setup_volume_info(volume_info, mdata, + fake_devname); } + kfree(fake_devname); + kfree(cifs_sb->mountdata); cifs_sb->mountdata = mdata; } kfree(full_path); -- cgit v1.2.3-70-g09d2 From f484b5d001a972a42129570e98086a2a6d216ce0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Jul 2011 10:16:34 -0400 Subject: cifs: drop spinlock before calling cifs_put_tlink ...as that function can sleep. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6ec366ff28b..dbd669cc5bc 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2242,8 +2242,8 @@ cifs_match_super(struct super_block *sb, void *data) rc = compare_mount_options(sb, mnt_data); out: - cifs_put_tlink(tlink); spin_unlock(&cifs_tcp_ses_lock); + cifs_put_tlink(tlink); return rc; } -- cgit v1.2.3-70-g09d2 From e5012d1f3861d18c7f3814e757c1c3ab3741dbcd Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 11 Jul 2011 17:17:42 -0400 Subject: NFSv4.1: update nfs4_fattr_bitmap_maxsz Attribute IDs assigned in RFC 5661 now require three bitmaps. Fixes hitting a BUG_ON in xdr_shrink_bufhead when getting ACLs. Signed-off-by: Andy Adamson Cc:stable@kernel.org [2.6.39] Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 6870bc61cee..e6e8f3b9a1d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -91,7 +91,7 @@ static int nfs4_stat_to_errno(int); #define encode_getfh_maxsz (op_encode_hdr_maxsz) #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ ((3+NFS4_FHSIZE) >> 2)) -#define nfs4_fattr_bitmap_maxsz 3 +#define nfs4_fattr_bitmap_maxsz 4 #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) -- cgit v1.2.3-70-g09d2 From 1ce533686c7d40bf900dc346a7279c17a9ee8e0e Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Mon, 13 Jun 2011 14:27:40 -0500 Subject: GFS2: force a log flush when invalidating the rindex glock Right now, there is nothing that forces the log to get flushed when a node drops its rindex glock so that another node can grow the filesystem. If the log doesn't get flushed, GFS2 can corrupt the sd_log_le_rg list in the following way. A node puts an rgd on the list in rg_lo_add(), and then the rindex glock is dropped so the other node can grow the filesystem. When the node reacquires the rindex glock, that rgd gets deleted in clear_rgrpdi() before ever being removed from the list by gfs2_log_flush(). This code simply forces a log flush when the rindex glock is invalidated, solving the problem. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 8ef70f46473..712b722030a 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -221,8 +221,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) } } - if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) + if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) { + gfs2_log_flush(gl->gl_sbd, NULL); gl->gl_sbd->sd_rindex_uptodate = 0; + } if (ip && S_ISREG(ip->i_inode.i_mode)) truncate_inode_pages(ip->i_inode.i_mapping, 0); } -- cgit v1.2.3-70-g09d2 From 3942ae5319640ced5844b75f44884e4bcb8a2f16 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 11 Jul 2011 08:53:30 +0100 Subject: GFS2: Fix race during filesystem mount There is a potential race during filesystem mounting which has recently been reported. It occurs when the userland gfs_controld is able to process requests fast enough that it tries to use the sysfs interface before the lock module is properly initialised. This is a pretty unusual case as normally the lock module initialisation is very quick compared with gfs_controld. This patch adds an interruptible completion which is used to ensure that userland will wait for the initialisation of the lock module to complete. There are other potential solutions to this problem, but this is the quickest at this stage and has been tested both with and without mount.gfs2 present in the system. Signed-off-by: Steven Whitehouse Reported-by: David Booher --- fs/gfs2/incore.h | 2 ++ fs/gfs2/ops_fstype.c | 3 +++ fs/gfs2/sys.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0a064e91ac7..81206e70cbf 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -17,6 +17,7 @@ #include #include #include +#include #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -546,6 +547,7 @@ struct gfs2_sbd { struct gfs2_glock *sd_trans_gl; wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; + struct completion sd_locking_init; /* Inode Stuff */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 8ac9ae189b5..2a77071fb7b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -72,6 +72,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_glock_wait); atomic_set(&sdp->sd_glock_disposal, 0); + init_completion(&sdp->sd_locking_init); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -1017,11 +1018,13 @@ hostdata_error: fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); + complete(&sdp->sd_locking_init); return 0; } ret = lm->lm_mount(sdp, fsname); if (ret == 0) fs_info(sdp, "Joined cluster. Now mounting FS...\n"); + complete(&sdp->sd_locking_init); return ret; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index e20eab37bc8..443cabcfcd2 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -338,6 +338,9 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) rv = sscanf(buf, "%u", &first); if (rv != 1 || first > 1) return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; spin_lock(&sdp->sd_jindex_spin); rv = -EBUSY; if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) @@ -414,7 +417,9 @@ static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) rv = sscanf(buf, "%d", &jid); if (rv != 1) return -EINVAL; - + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; spin_lock(&sdp->sd_jindex_spin); rv = -EINVAL; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) -- cgit v1.2.3-70-g09d2 From 62411ab2fe5f002dff27417630ddf02cc40ca404 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Sun, 10 Jul 2011 06:55:32 -0500 Subject: cifs: Fix signing failure when server mandates signing for NTLMSSP When using NTLMSSP authentication mechanism, if server mandates signing, keep the flags in type 3 messages of the NTLMSSP exchange same as in type 1 messages (i.e. keep the indicated capabilities same). Some of the servers such as Samba, expect the flags such as Negotiate_Key_Exchange in type 3 message of NTLMSSP exchange as well. Some servers like Windows do not. https://bugzilla.samba.org/show_bug.cgi?id=8212 Signed-off-by: Shirish Pargaonkar Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/sess.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3892ab817a3..d3e619692ee 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -428,8 +428,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { flags |= NTLMSSP_NEGOTIATE_SIGN; if (!ses->server->session_estab) - flags |= NTLMSSP_NEGOTIATE_KEY_XCH | - NTLMSSP_NEGOTIATE_EXTENDED_SEC; + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; } sec_blob->NegotiateFlags = cpu_to_le32(flags); @@ -465,10 +464,11 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { flags |= NTLMSSP_NEGOTIATE_SIGN; - if (ses->server->sec_mode & SECMODE_SIGN_REQUIRED) - flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; + if (!ses->server->session_estab) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + } tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); sec_blob->NegotiateFlags = cpu_to_le32(flags); -- cgit v1.2.3-70-g09d2 From ea1be1a3c3c4b2bbc32aeb7995e18336c8060e0e Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 12 Jul 2011 18:24:43 +0000 Subject: [CIFS] update limit for snprintf in cifs_construct_tcon In 34c87901e113 "Shrink stack space usage in cifs_construct_tcon" we change the size of the username name buffer from MAX_USERNAME_SIZE (256) to 28. This call to snprintf() needs to be updated as well. Reported by Dan Carpenter. Reviewed-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index dbd669cc5bc..ccc1afa0bf3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3485,7 +3485,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) goto out; } - snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid); + snprintf(username, sizeof(username), "krb50x%x", fsuid); vol_info->username = username; vol_info->local_nls = cifs_sb->local_nls; vol_info->linux_uid = fsuid; -- cgit v1.2.3-70-g09d2 From c2ec9471b5b1307429aef1cfaa2b3ae453a61d6f Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 12 Jul 2011 19:15:02 +0000 Subject: [CIFS] update cifs to version 1.74 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 0900e1658c9..036ca83e5f4 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -129,5 +129,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.73" +#define CIFS_VERSION "1.74" #endif /* _CIFSFS_H */ -- cgit v1.2.3-70-g09d2 From 94c0d4ecbe7f9fe56e052b26b2ab484e246c07b4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 12 Jul 2011 21:40:23 -0400 Subject: Fix ->d_lock locking order in unlazy_walk() Make sure that child is still a child of parent before nested locking of child->d_lock in unlazy_walk(); otherwise we are risking a violation of locking order and deadlocks. Signed-off-by: Al Viro --- fs/namei.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 0223c41fb11..5c867dd1c0b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -433,6 +433,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) goto err_parent; BUG_ON(nd->inode != parent->d_inode); } else { + if (dentry->d_parent != parent) + goto err_parent; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (!__d_rcu_to_refcount(dentry, nd->seq)) goto err_child; -- cgit v1.2.3-70-g09d2 From 380f7c65a7eb3288e4b6812acf3474a1de230707 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 14 Jul 2011 08:59:44 +0100 Subject: GFS2: Resolve inode eviction and ail list interaction bug This patch contains a few misc fixes which resolve a recently reported issue. This patch has been a real team effort and has received a lot of testing. The first issue is that the ail lock needs to be held over a few more operations. The lock thats added into gfs2_releasepage() may possibly be a candidate for replacing with RCU at some future point, but at this stage we've gone for the obvious fix. The second issue is that gfs2_write_inode() can end up calling a glock recursively when called from gfs2_evict_inode() via the syncing code, so it needs a guard added. The third issue is that we either need to not truncate the metadata pages of inodes which have zero link count, but which we cannot deallocate due to them still being in use by other nodes, or we need to ensure that those pages have all made it through the journal and ail lists first. This patch takes the former approach, but the latter has also been tested and there is nothing to choose between them performance-wise. So again, we could revise that decision in the future. Also, the inode eviction process is now better documented. Signed-off-by: Steven Whitehouse Tested-by: Bob Peterson Tested-by: Abhijith Das Reported-by: Barry J. Marson Reported-by: David Teigland --- fs/gfs2/aops.c | 3 +++ fs/gfs2/glops.c | 4 ++-- fs/gfs2/log.c | 1 + fs/gfs2/super.c | 36 ++++++++++++++++++++++++++++++------ 4 files changed, 36 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 802ac5eeba2..f9fbbe96c22 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1069,6 +1069,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return 0; gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { if (atomic_read(&bh->b_count)) @@ -1080,6 +1081,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) goto not_possible; bh = bh->b_this_page; } while(bh != head); + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); head = bh = page_buffers(page); @@ -1112,6 +1114,7 @@ not_possible: /* Should never happen */ WARN_ON(buffer_dirty(bh)); WARN_ON(buffer_pinned(bh)); cannot_release: + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); return 0; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 712b722030a..2cca29316bd 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -47,10 +47,10 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl) bd_ail_gl_list); bh = bd->bd_bh; gfs2_remove_from_ail(bd); - spin_unlock(&sdp->sd_ail_lock); - bd->bd_bh = NULL; bh->b_private = NULL; + spin_unlock(&sdp->sd_ail_lock); + bd->bd_blkno = bh->b_blocknr; gfs2_log_lock(sdp); gfs2_assert_withdraw(sdp, !buffer_busy(bh)); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 903115f2bb3..85c62923ee2 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -903,6 +903,7 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) if (gfs2_ail1_empty(sdp)) break; } + gfs2_log_flush(sdp, NULL); } static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ed540e7018b..fb0edf73548 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -757,13 +757,17 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct timespec atime; struct gfs2_dinode *di; int ret = -EAGAIN; + int unlock_required = 0; /* Skip timestamp update, if this is from a memalloc */ if (current->flags & PF_MEMALLOC) goto do_flush; - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - goto do_flush; + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto do_flush; + unlock_required = 1; + } ret = gfs2_trans_begin(sdp, RES_DINODE, 0); if (ret) goto do_unlock; @@ -780,7 +784,8 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) } gfs2_trans_end(sdp); do_unlock: - gfs2_glock_dq_uninit(&gh); + if (unlock_required) + gfs2_glock_dq_uninit(&gh); do_flush: if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); @@ -1427,7 +1432,20 @@ out: return error; } -/* +/** + * gfs2_evict_inode - Remove an inode from cache + * @inode: The inode to evict + * + * There are three cases to consider: + * 1. i_nlink == 0, we are final opener (and must deallocate) + * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) + * 3. i_nlink > 0 + * + * If the fs is read only, then we have to treat all cases as per #3 + * since we are unable to do any deallocation. The inode will be + * deallocated by the next read/write node to attempt an allocation + * in the same resource group + * * We have to (at the moment) hold the inodes main lock to cover * the gap between unlocking the shared lock on the iopen lock and * taking the exclusive lock. I'd rather do a shared -> exclusive @@ -1470,6 +1488,8 @@ static void gfs2_evict_inode(struct inode *inode) if (error) goto out_truncate; + /* Case 1 starts here */ + if (S_ISDIR(inode->i_mode) && (ip->i_diskflags & GFS2_DIF_EXHASH)) { error = gfs2_dir_exhash_dealloc(ip); @@ -1493,13 +1513,16 @@ static void gfs2_evict_inode(struct inode *inode) goto out_unlock; out_truncate: + /* Case 2 starts here */ error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) goto out_unlock; - gfs2_final_release_pages(ip); + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); gfs2_trans_end(sdp); out_unlock: + /* Error path for case 1 */ if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) gfs2_glock_dq(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); @@ -1507,6 +1530,7 @@ out_unlock: if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: + /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); -- cgit v1.2.3-70-g09d2 From 1836750115f20b774e55c032a3893e8c5bdf41ed Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 12 Jul 2011 21:42:24 -0400 Subject: fix loop checks in d_materialise_unique() Both __d_unalias() and __d_materialise_dentry() need loop prevention. Grab rename_lock in caller, check for loops there... As a side benefit, we have dentry_lock_for_move() called only under rename_lock, which seriously reduces deadlock potential of the execrable "locking order" used for ->d_lock. Signed-off-by: Al Viro --- fs/dcache.c | 51 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 37f72ee5bf7..6e4ea6d8777 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2213,14 +2213,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * The hash value has to match the hash queue that the dentry is on.. */ /* - * d_move - move a dentry + * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. + * dcache entries should not be moved in this way. Caller hold + * rename_lock. */ -void d_move(struct dentry * dentry, struct dentry * target) +static void __d_move(struct dentry * dentry, struct dentry * target) { if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -2228,8 +2229,6 @@ void d_move(struct dentry * dentry, struct dentry * target) BUG_ON(d_ancestor(dentry, target)); BUG_ON(d_ancestor(target, dentry)); - write_seqlock(&rename_lock); - dentry_lock_for_move(dentry, target); write_seqcount_begin(&dentry->d_seq); @@ -2275,6 +2274,20 @@ void d_move(struct dentry * dentry, struct dentry * target) spin_unlock(&target->d_lock); fsnotify_d_move(dentry); spin_unlock(&dentry->d_lock); +} + +/* + * d_move - move a dentry + * @dentry: entry to move + * @target: new dentry + * + * Update the dcache to reflect the move of a file name. Negative + * dcache entries should not be moved in this way. + */ +void d_move(struct dentry *dentry, struct dentry *target) +{ + write_seqlock(&rename_lock); + __d_move(dentry, target); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); @@ -2302,7 +2315,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex and the inode->i_lock + * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... @@ -2317,11 +2330,6 @@ static struct dentry *__d_unalias(struct inode *inode, if (alias->d_parent == dentry->d_parent) goto out_unalias; - /* Check for loops */ - ret = ERR_PTR(-ELOOP); - if (d_ancestor(alias, dentry)) - goto out_err; - /* See lock_rename() */ ret = ERR_PTR(-EBUSY); if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) @@ -2331,7 +2339,7 @@ static struct dentry *__d_unalias(struct inode *inode, goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: - d_move(alias, dentry); + __d_move(alias, dentry); ret = alias; out_err: spin_unlock(&inode->i_lock); @@ -2416,15 +2424,24 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) alias = __d_find_alias(inode, 0); if (alias) { actual = alias; - /* Is this an anonymous mountpoint that we could splice - * into our tree? */ - if (IS_ROOT(alias)) { + write_seqlock(&rename_lock); + + if (d_ancestor(alias, dentry)) { + /* Check for loops */ + actual = ERR_PTR(-ELOOP); + } else if (IS_ROOT(alias)) { + /* Is this an anonymous mountpoint that we + * could splice into our tree? */ __d_materialise_dentry(dentry, alias); + write_sequnlock(&rename_lock); __d_drop(alias); goto found; + } else { + /* Nope, but we must(!) avoid directory + * aliasing */ + actual = __d_unalias(inode, dentry, alias); } - /* Nope, but we must(!) avoid directory aliasing */ - actual = __d_unalias(inode, dentry, alias); + write_sequnlock(&rename_lock); if (IS_ERR(actual)) dput(alias); goto out_nolock; -- cgit v1.2.3-70-g09d2 From dc137bf553dbb6855bd7efc34fedcd03102455f7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 16 Jul 2011 23:37:20 -0400 Subject: cifs: build_path_from_dentry() race fix deal with d_move() races properly; rename_lock read-retry loop, rcu_read_lock() held while walking to root, d_lock held over subtraction from namelen and copying the component to stabilize ->d_name. Signed-off-by: Al Viro --- fs/cifs/dir.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 81914df47ef..fa8c21d913b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -55,6 +55,7 @@ build_path_from_dentry(struct dentry *direntry) char dirsep; struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + unsigned seq; if (direntry == NULL) return NULL; /* not much we can do if dentry is freed and @@ -68,22 +69,29 @@ build_path_from_dentry(struct dentry *direntry) dfsplen = 0; cifs_bp_rename_retry: namelen = dfsplen; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); for (temp = direntry; !IS_ROOT(temp);) { namelen += (1 + temp->d_name.len); temp = temp->d_parent; if (temp == NULL) { cERROR(1, "corrupt dentry"); + rcu_read_unlock(); return NULL; } } + rcu_read_unlock(); full_path = kmalloc(namelen+1, GFP_KERNEL); if (full_path == NULL) return full_path; full_path[namelen] = 0; /* trailing null */ + rcu_read_lock(); for (temp = direntry; !IS_ROOT(temp);) { + spin_lock(&temp->d_lock); namelen -= 1 + temp->d_name.len; if (namelen < 0) { + spin_unlock(&temp->d_lock); break; } else { full_path[namelen] = dirsep; @@ -91,14 +99,17 @@ cifs_bp_rename_retry: temp->d_name.len); cFYI(0, "name: %s", full_path + namelen); } + spin_unlock(&temp->d_lock); temp = temp->d_parent; if (temp == NULL) { cERROR(1, "corrupt dentry"); + rcu_read_unlock(); kfree(full_path); return NULL; } } - if (namelen != dfsplen) { + rcu_read_unlock(); + if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { cERROR(1, "did not end path lookup where expected namelen is %d", namelen); /* presumably this is only possible if racing with a rename -- cgit v1.2.3-70-g09d2 From 1b71fe2efa31cd18c865db474a4cd473b6ab5281 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 16 Jul 2011 23:43:58 -0400 Subject: ceph analog of cifs build_path_from_dentry() race fix ... unfortunately, cifs bug got copied. Fix is essentially the same. Signed-off-by: Al Viro --- fs/ceph/mds_client.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 79743d146be..0c1d9175652 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1438,12 +1438,15 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, struct dentry *temp; char *path; int len, pos; + unsigned seq; if (dentry == NULL) return ERR_PTR(-EINVAL); retry: len = 0; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); for (temp = dentry; !IS_ROOT(temp);) { struct inode *inode = temp->d_inode; if (inode && ceph_snap(inode) == CEPH_SNAPDIR) @@ -1455,10 +1458,12 @@ retry: len += 1 + temp->d_name.len; temp = temp->d_parent; if (temp == NULL) { + rcu_read_unlock(); pr_err("build_path corrupt dentry %p\n", dentry); return ERR_PTR(-EINVAL); } } + rcu_read_unlock(); if (len) len--; /* no leading '/' */ @@ -1467,9 +1472,12 @@ retry: return ERR_PTR(-ENOMEM); pos = len; path[pos] = 0; /* trailing null */ + rcu_read_lock(); for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { - struct inode *inode = temp->d_inode; + struct inode *inode; + spin_lock(&temp->d_lock); + inode = temp->d_inode; if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { dout("build_path path+%d: %p SNAPDIR\n", pos, temp); @@ -1478,21 +1486,26 @@ retry: break; } else { pos -= temp->d_name.len; - if (pos < 0) + if (pos < 0) { + spin_unlock(&temp->d_lock); break; + } strncpy(path + pos, temp->d_name.name, temp->d_name.len); } + spin_unlock(&temp->d_lock); if (pos) path[--pos] = '/'; temp = temp->d_parent; if (temp == NULL) { + rcu_read_unlock(); pr_err("build_path corrupt dentry\n"); kfree(path); return ERR_PTR(-EINVAL); } } - if (pos != 0) { + rcu_read_unlock(); + if (pos != 0 || read_seqretry(&rename_lock, seq)) { pr_err("build_path did not end path lookup where " "expected, namelen is %d, pos is %d\n", len, pos); /* presumably this is only possible if racing with a -- cgit v1.2.3-70-g09d2 From a803b8067e317832d6a251c5b0486e36a4f81922 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 Jul 2011 20:56:55 -0400 Subject: fix exofs ->get_parent() NULL is not a possible return value for that method, TYVM... Signed-off-by: Al Viro --- fs/exofs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 06065bd37fc..c57beddcc21 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -913,7 +913,7 @@ struct dentry *exofs_get_parent(struct dentry *child) unsigned long ino = exofs_parent_ino(child); if (!ino) - return NULL; + return ERR_PTR(-ESTALE); return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino)); } -- cgit v1.2.3-70-g09d2 From 642c937b4ed2e51d2f2e4c46ab7cd8b5bddf268b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 17 Jul 2011 10:07:34 -0400 Subject: ufs should use d_splice_alias() it's NFS-exportable, so... Signed-off-by: Al Viro --- fs/ufs/namei.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 29309e25417..b57aab9a118 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -56,16 +56,12 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru lock_ufs(dir->i_sb); ino = ufs_inode_by_name(dir, &dentry->d_name); - if (ino) { + if (ino) inode = ufs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) { - unlock_ufs(dir->i_sb); - return ERR_CAST(inode); - } - } unlock_ufs(dir->i_sb); - d_add(dentry, inode); - return NULL; + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_splice_alias(inode, dentry); } /* -- cgit v1.2.3-70-g09d2 From 0577d1ba411f9c40693b8b3e4aa7e0892cd03091 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 17 Jul 2011 19:04:14 -0400 Subject: cramfs: get_cramfs_inode() returns ERR_PTR() on failure ... and we want to report these failures in ->lookup() anyway. Signed-off-by: Al Viro --- fs/cramfs/inode.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index e141939080f..739fb59bcdc 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -37,7 +37,7 @@ static DEFINE_MUTEX(read_mutex); /* These macros may change in future, to provide better st_ino semantics. */ #define OFFSET(x) ((x)->i_ino) -static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset) +static unsigned long cramino(const struct cramfs_inode *cino, unsigned int offset) { if (!cino->offset) return offset + 1; @@ -61,7 +61,7 @@ static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset) } static struct inode *get_cramfs_inode(struct super_block *sb, - struct cramfs_inode *cramfs_inode, unsigned int offset) + const struct cramfs_inode *cramfs_inode, unsigned int offset) { struct inode *inode; static struct timespec zerotime; @@ -317,7 +317,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* Set it all up.. */ sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, &super.root, 0); - if (!root) + if (IS_ERR(root)) goto out; sb->s_root = d_alloc_root(root); if (!sb->s_root) { @@ -423,6 +423,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { unsigned int offset = 0; + struct inode *inode = NULL; int sorted; mutex_lock(&read_mutex); @@ -449,8 +450,8 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s for (;;) { if (!namelen) { - mutex_unlock(&read_mutex); - return ERR_PTR(-EIO); + inode = ERR_PTR(-EIO); + goto out; } if (name[namelen-1]) break; @@ -462,17 +463,18 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s if (retval > 0) continue; if (!retval) { - struct cramfs_inode entry = *de; - mutex_unlock(&read_mutex); - d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off)); - return NULL; + inode = get_cramfs_inode(dir->i_sb, de, dir_off); + break; } /* else (retval < 0) */ if (sorted) break; } +out: mutex_unlock(&read_mutex); - d_add(dentry, NULL); + if (IS_ERR(inode)) + return ERR_CAST(inode); + d_add(dentry, inode); return NULL; } -- cgit v1.2.3-70-g09d2 From 3cc0658e35124ace881f6942839dcae877c3eaed Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 17 Jul 2011 22:24:15 -0400 Subject: hppfs: fix dentry leak Signed-off-by: Al Viro --- fs/hppfs/hppfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 87ed48e0343..7d6a0e92bcf 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -174,13 +174,11 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, err = -ENOMEM; inode = get_inode(ino->i_sb, proc_dentry); if (!inode) - goto out_dput; + goto out; d_add(dentry, inode); return NULL; - out_dput: - dput(proc_dentry); out: return ERR_PTR(err); } @@ -690,8 +688,10 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) struct inode *proc_ino = dentry->d_inode; struct inode *inode = new_inode(sb); - if (!inode) + if (!inode) { + dput(dentry); return ERR_PTR(-ENOMEM); + } if (S_ISDIR(dentry->d_inode->i_mode)) { inode->i_op = &hppfs_dir_iops; @@ -704,7 +704,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) inode->i_fop = &hppfs_file_fops; } - HPPFS_I(inode)->proc_dentry = dget(dentry); + HPPFS_I(inode)->proc_dentry = dentry; inode->i_uid = proc_ino->i_uid; inode->i_gid = proc_ino->i_gid; @@ -737,7 +737,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) sb->s_fs_info = proc_mnt; err = -ENOMEM; - root_inode = get_inode(sb, proc_mnt->mnt_sb->s_root); + root_inode = get_inode(sb, dget(proc_mnt->mnt_sb->s_root)); if (!root_inode) goto out_mntput; -- cgit v1.2.3-70-g09d2 From 0916a5e45fbd2604a303c8cc18e6b2b7c815e4c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 17 Jul 2011 22:27:22 -0400 Subject: hppfs_lookup(): don't open-code lookup_one_len() ... and it's getting it wrong, too - missing ->d_revalidate() calls when it's dealing with filesystem (procfs) that has non-trivial ->d_revalidate()... Signed-off-by: Al Viro --- fs/hppfs/hppfs.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 7d6a0e92bcf..85c098a499f 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -139,7 +139,8 @@ static int file_removed(struct dentry *dentry, const char *file) static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, struct nameidata *nd) { - struct dentry *proc_dentry, *new, *parent; + struct dentry *proc_dentry, *parent; + struct qstr *name = &dentry->d_name; struct inode *inode; int err, deleted; @@ -149,23 +150,9 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, else if (deleted) return ERR_PTR(-ENOENT); - err = -ENOMEM; parent = HPPFS_I(ino)->proc_dentry; mutex_lock(&parent->d_inode->i_mutex); - proc_dentry = d_lookup(parent, &dentry->d_name); - if (proc_dentry == NULL) { - proc_dentry = d_alloc(parent, &dentry->d_name); - if (proc_dentry == NULL) { - mutex_unlock(&parent->d_inode->i_mutex); - goto out; - } - new = (*parent->d_inode->i_op->lookup)(parent->d_inode, - proc_dentry, NULL); - if (new) { - dput(proc_dentry); - proc_dentry = new; - } - } + proc_dentry = lookup_one_len(name->name, parent, name->len); mutex_unlock(&parent->d_inode->i_mutex); if (IS_ERR(proc_dentry)) -- cgit v1.2.3-70-g09d2 From fec11dd9a0109fe52fd631e5c510778d6cbff6cc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 18 Jul 2011 13:50:40 -0400 Subject: Fix cifs_get_root() Add missing ->i_mutex, convert to lookup_one_len() instead of (broken) open-coded analog, cope with getting something like a//b as relative pathname. Simplify the hell out of it, while we are there... Signed-off-by: Al Viro Reviewed-by: Jeff Layton --- fs/cifs/cifsfs.c | 100 ++++++++++++++++--------------------------------------- 1 file changed, 29 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 3e298997629..bc4b12ca537 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "cifsfs.h" #include "cifspdu.h" @@ -542,14 +543,12 @@ static const struct super_operations cifs_super_ops = { static struct dentry * cifs_get_root(struct smb_vol *vol, struct super_block *sb) { - int xid, rc; - struct inode *inode; - struct qstr name; - struct dentry *dparent = NULL, *dchild = NULL, *alias; + struct dentry *dentry; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - unsigned int i, full_len, len; - char *full_path = NULL, *pstart; + char *full_path = NULL; + char *s, *p; char sep; + int xid; full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); @@ -560,73 +559,32 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) xid = GetXid(); sep = CIFS_DIR_SEP(cifs_sb); - dparent = dget(sb->s_root); - full_len = strlen(full_path); - full_path[full_len] = sep; - pstart = full_path + 1; - - for (i = 1, len = 0; i <= full_len; i++) { - if (full_path[i] != sep || !len) { - len++; - continue; - } - - full_path[i] = 0; - cFYI(1, "get dentry for %s", pstart); - - name.name = pstart; - name.len = len; - name.hash = full_name_hash(pstart, len); - dchild = d_lookup(dparent, &name); - if (dchild == NULL) { - cFYI(1, "not exists"); - dchild = d_alloc(dparent, &name); - if (dchild == NULL) { - dput(dparent); - dparent = ERR_PTR(-ENOMEM); - goto out; - } - } - - cFYI(1, "get inode"); - if (dchild->d_inode == NULL) { - cFYI(1, "not exists"); - inode = NULL; - if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) - rc = cifs_get_inode_info_unix(&inode, full_path, - sb, xid); - else - rc = cifs_get_inode_info(&inode, full_path, - NULL, sb, xid, NULL); - if (rc) { - dput(dchild); - dput(dparent); - dparent = ERR_PTR(rc); - goto out; - } - alias = d_materialise_unique(dchild, inode); - if (alias != NULL) { - dput(dchild); - if (IS_ERR(alias)) { - dput(dparent); - dparent = ERR_PTR(-EINVAL); /* XXX */ - goto out; - } - dchild = alias; - } - } - cFYI(1, "parent %p, child %p", dparent, dchild); - - dput(dparent); - dparent = dchild; - len = 0; - pstart = full_path + i + 1; - full_path[i] = sep; - } -out: + dentry = dget(sb->s_root); + p = s = full_path; + + do { + struct inode *dir = dentry->d_inode; + struct dentry *child; + + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + p = s++; + /* next separator */ + while (*s && *s != sep) + s++; + + mutex_lock(&dir->i_mutex); + child = lookup_one_len(p, dentry, s - p); + mutex_unlock(&dir->i_mutex); + dput(dentry); + dentry = child; + } while (!IS_ERR(dentry)); _FreeXid(xid); kfree(full_path); - return dparent; + return dentry; } static int cifs_set_super(struct super_block *sb, void *data) -- cgit v1.2.3-70-g09d2 From 59430262401bec02d415179c43dbe5b8819c09ce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 18 Jul 2011 15:43:29 -0700 Subject: vfs: fix race in rcu lookup of pruned dentry Don't update *inode in __follow_mount_rcu() until we'd verified that there is mountpoint there. Kudos to Hugh Dickins for catching that one in the first place and eventually figuring out the solution (and catching a braino in the earlier version of patch). Signed-off-by: Linus Torvalds Signed-off-by: Al Viro --- fs/namei.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 5c867dd1c0b..14ab8d3f2f0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -942,7 +942,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - *inode = path->dentry->d_inode; if (unlikely(managed_dentry_might_block(path->dentry))) return false; @@ -955,6 +954,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, path->mnt = mounted; path->dentry = mounted->mnt_root; nd->seq = read_seqcount_begin(&path->dentry->d_seq); + /* + * Update the inode too. We don't need to re-check the + * dentry sequence number here after this d_inode read, + * because a mount-point is always pinned. + */ + *inode = path->dentry->d_inode; } return true; } -- cgit v1.2.3-70-g09d2 From f7b88631a89757d70192044c9d9f2e8d2fc02f2c Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 19 Jul 2011 08:49:25 -0700 Subject: fs/libfs.c: fix simple_attr_write() on 32bit machines Assume that /sys/kernel/debug/dummy64 is debugfs file created by debugfs_create_x64(). # cd /sys/kernel/debug # echo 0x1234567812345678 > dummy64 # cat dummy64 0x0000000012345678 # echo 0x80000000 > dummy64 # cat dummy64 0xffffffff80000000 A value larger than INT_MAX cannot be written to the debugfs file created by debugfs_create_u64 or debugfs_create_x64 on 32bit machine. Because simple_attr_write() uses simple_strtol() for the conversion. To fix this, use simple_strtoll() instead. Signed-off-by: Akinobu Mita Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/libfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index c88eab55aec..275ca4749a2 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -822,7 +822,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, goto out; attr->set_buf[size] = '\0'; - val = simple_strtol(attr->set_buf, NULL, 0); + val = simple_strtoll(attr->set_buf, NULL, 0); ret = attr->set(attr->data, val); if (ret == 0) ret = len; /* on success, claim we got the whole input */ -- cgit v1.2.3-70-g09d2 From 2cebaa58b7de775386732bbd6cd11c3f5b73faf0 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 20 Jul 2011 18:24:09 +0400 Subject: CIFS: Fix wrong length in cifs_iovec_read Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index bb71471a4d9..a9b4a24f2a1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1737,7 +1737,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, io_parms.pid = pid; io_parms.tcon = pTcon; io_parms.offset = *poffset; - io_parms.length = len; + io_parms.length = cur_len; rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &read_data, &buf_type); pSMBr = (struct smb_com_read_rsp *)read_data; -- cgit v1.2.3-70-g09d2 From b307d4655a71749ac3f91c6dbe33d28cc026ceeb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 21 Jul 2011 15:02:43 +0100 Subject: FS-Cache: Fix __fscache_uncache_all_inode_pages()'s outer loop The compiler, at least for ix86 and m68k, validly warns that the comparison: next <= (loff_t)-1 is always true (and it's always true also for x86-64 and probably all other arches - as long as pgoff_t isn't wider than loff_t). The intention appears to be to avoid wrapping of "next", so rather than eliminating the pointless comparison, fix the loop to indeed get exited when "next" would otherwise wrap. On m68k the following warning is observed: fs/fscache/page.c: In function '__fscache_uncache_all_inode_pages': fs/fscache/page.c:979: warning: comparison is always false due to limited range of data type Reported-by: Geert Uytterhoeven Reported-by: Jan Beulich Signed-off-by: Jan Beulich Signed-off-by: David Howells Cc: Suresh Jayaraman Cc: Geert Uytterhoeven Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/fscache/page.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 2f343b4d7a7..3f7a59bfa7a 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -976,16 +976,12 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, pagevec_init(&pvec, 0); next = 0; - while (next <= (loff_t)-1 && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE) - ) { + do { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + break; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - - ASSERTCMP(page_index, >=, next); - next = page_index + 1; - + next = page->index; if (PageFsCache(page)) { __fscache_wait_on_page_write(cookie, page); __fscache_uncache_page(cookie, page); @@ -993,7 +989,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, } pagevec_release(&pvec); cond_resched(); - } + } while (++next); _leave(""); } -- cgit v1.2.3-70-g09d2 From b91da88fed84843313a1b6fd1b1c834a24bbcf9e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 21 Jul 2011 11:01:42 -0700 Subject: vfs: drop conditional inode prefetch in __do_lookup_rcu It seems to hurt performance in real life. Yes, the inode will be used later, but the conditional doesn't seem to predict all that well (negative dentries are not uncommon) and it looks like the cost of prefetching is simply higher than depending on the cache doing the right thing. As usual. Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 6e4ea6d8777..fbdcbca4072 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1813,8 +1813,6 @@ seqretry: tname = dentry->d_name.name; i = dentry->d_inode; prefetch(tname); - if (i) - prefetch(i); /* * This seqcount check is required to ensure name and * len are loaded atomically, so as not to walk off the -- cgit v1.2.3-70-g09d2