From 31978b5cc66b8ba8a7e8eef60b12395d41b7b890 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 31 Oct 2013 21:00:10 +0300 Subject: xfs: underflow bug in xfs_attrlist_by_handle() If we allocate less than sizeof(struct attrlist) then we end up corrupting memory or doing a ZERO_SIZE_PTR dereference. This can only be triggered with CAP_SYS_ADMIN. Reported-by: Nico Golde Reported-by: Fabian Yamaguchi Signed-off-by: Dan Carpenter Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit 071c529eb672648ee8ca3f90944bcbcc730b4c06) --- fs/xfs/xfs_ioctl.c | 3 ++- fs/xfs/xfs_ioctl32.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4d613401a5e..33ad9a77791 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -442,7 +442,8 @@ xfs_attrlist_by_handle( return -XFS_ERROR(EPERM); if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index e8fb1231db8..a7992f8de9d 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -356,7 +356,8 @@ xfs_compat_attrlist_by_handle( if (copy_from_user(&al_hreq, arg, sizeof(compat_xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* -- cgit v1.2.3-70-g09d2 From 2f42d612e7d4c4fb1819ea7b2b6e18938714ae7a Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 20 Nov 2013 16:08:53 +0800 Subject: xfs: don't perform discard if the given range length is less than block size For a discard operation, we should return EINVAL if the given range length is less than a block size; otherwise it will go through the file system to discard data blocks, as the end range might be evaluated to -1, e.g.,
# fstrim -v -o 0 -l 100 /xfs7
/xfs7: 9811378176 bytes were trimmed
This issue can be triggered via xfstests/generic/288. Also, getting the request queue pointer via bdev_get_queue() instead of the hard-coded pointer dereference is not a bad thing. Signed-off-by: Jie Liu Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers (cherry picked from commit f9fd0135610084abef6867d984e9951c3099950d) --- fs/xfs/xfs_discard.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8367d6dc18c..4f11ef01113 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -157,7 +157,7 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; xfs_daddr_t start, end, minlen; @@ -180,7 +180,8 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface.
*/ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || - range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.len < mp->m_sb.sb_blocksize) return -XFS_ERROR(EINVAL); start = BTOBB(range.start); -- cgit v1.2.3-70-g09d2 From f94c44573e7c22860e2c3dfe349c45f72ba35ad3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Nov 2013 15:41:06 +1100 Subject: xfs: growfs overruns AGFL buffer on V4 filesystems This loop in xfs_growfs_data_private() is incorrect for V4 superblock filesystems:
for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
	agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
For V4 filesystems, we don't have an agfl header structure, and so XFS_AGFL_SIZE() returns an entire sector's worth of entries, which we then index from an offset into the sector. Hence: buffer overrun. This problem was introduced in 3.10 by commit 77c95bba ("xfs: add CRC checks to the AGFL"), which changed the AGFL structure but failed to update the growfs code to handle the different structures. Fix it by using the correct offset into the buffer for both V4 and V5 filesystems. Cc: stable@vger.kernel.org Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Ben Myers (cherry picked from commit b7d961b35b3ab69609aeea93f870269cb6e7ba4d) --- fs/xfs/xfs_fsops.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a6e54b3319b..02fb943cbf2 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -220,6 +220,8 @@ xfs_growfs_data_private( */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + __be32 *agfl_bno; + /* * AG freespace header block */ @@ -279,8 +281,10 @@ xfs_growfs_data_private( agfl->agfl_seqno = cpu_to_be32(agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) - agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(bp); xfs_buf_relse(bp); -- cgit v1.2.3-70-g09d2 From 781c2a5a5f75eacc04663aced0f0f1a648d4f308 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 2 Dec 2013 15:26:19 -0500 Subject: nfsd: when reusing an existing repcache entry, unhash it first The DRC code will attempt to reuse an existing, expired cache entry in preference to allocating a new one. It'll then search the cache, and if it gets a hit it'll then free the cache entry that it was going to reuse. The cache code doesn't unhash the entry that it's going to reuse, however, so it's possible for it to end up designating an entry for reuse and then subsequently freeing the same entry after it finds it. This leads to a use-after-free situation later, and usually some list corruption warnings or an oops. Fix this by simply unhashing the entry that we intend to reuse. That will mean that it's not findable via a search and should prevent this situation from occurring. Cc: stable@vger.kernel.org # v3.10+ Reported-by: Christoph Hellwig Reported-by: g. artim Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields
--- fs/nfsd/nfscache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 9186c7ce0b1..b6af150c96b 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -131,6 +131,13 @@ nfsd_reply_cache_alloc(void) return rp; } +static void +nfsd_reply_cache_unhash(struct svc_cacherep *rp) +{ + hlist_del_init(&rp->c_hash); + list_del_init(&rp->c_lru); +} + static void nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { @@ -417,7 +424,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); if (nfsd_cache_entry_expired(rp) || num_drc_entries >= max_drc_entries) { - lru_put_end(rp); + nfsd_reply_cache_unhash(rp); prune_cache_entries(); goto search_cache; } -- cgit v1.2.3-70-g09d2 From a7e252af5a819eb71c9494e62b2bfca982e92f84 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 22 Nov 2013 18:47:59 +0800 Subject: Btrfs: don't clear the default compression type We hit an oops caused by the wrong compression type: [ 556.512356] BUG: unable to handle kernel NULL pointer dereference at (null) [ 556.512370] IP: [] __list_del_entry+0x1/0x98 [SNIP] [ 556.512490] [] ? list_del+0xd/0x2b [ 556.512539] [] find_workspace+0x97/0x175 [btrfs] [ 556.512546] [] ? _raw_spin_lock+0xe/0x10 [ 556.512576] [] btrfs_compress_pages+0x2d/0xa2 [btrfs] [ 556.512601] [] compress_file_range.constprop.54+0x1f2/0x4e8 [btrfs] [ 556.512627] [] async_cow_start+0x32/0x4d [btrfs] [ 556.512655] [] worker_loop+0x144/0x4c3 [btrfs] [ 556.512661] [] ? finish_task_switch+0x80/0xb8 [ 556.512689] [] ? btrfs_queue_worker+0x244/0x244 [btrfs] [ 556.512695] [] kthread+0x8d/0x95 [ 556.512699] [] ? bit_waitqueue+0x34/0x7d [ 556.512704] [] ? __kthread_parkme+0x65/0x65 [ 556.512709] [] ret_from_fork+0x7c/0xb0 [ 556.512713] [] ? __kthread_parkme+0x65/0x65 Steps to reproduce:
# mkfs.btrfs -f <dev>
# mount -o nodatacow <dev> <mnt>
# touch <mnt>/<file>
# chattr =c <mnt>/<file>
# dd if=/dev/zero of=<mnt>/<file> bs=1M count=10
This is because we cleared the default compression type when setting nodatacow. In fact, we needn't do that, because we already use the COMPRESS flag to indicate whether the file data needs to be compressed; there is no need for the compress_type variable in btrfs_fs_info to do the same thing, and we can just use it to hold the default compression type. Otherwise we would get a wrong compress type for a file whose own compress flag is set but whose filesystem's compress flag is not set.
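The intended division of labour can be sketched as follows (an illustrative sketch only, not the committed code; should_compress() is a hypothetical helper): the COMPRESS mount option and the per-inode compress flag decide WHETHER data gets compressed, while the compress_type field in btrfs_fs_info only records WHICH algorithm is the default, so it must not be clobbered by nodatacow or compress=no:

	/* Hypothetical helper sketching the intended split. */
	static int should_compress(struct btrfs_root *root, struct inode *inode)
	{
		/* WHETHER to compress is purely a flag decision... */
		return btrfs_test_opt(root, COMPRESS) ||
		       (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS);
	}

	/* ...WHICH algorithm to use comes from the preserved default:
	 *	type = root->fs_info->compress_type;
	 */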
Reported-by: Tsutomu Itoh Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2d8ac1bf0cf..d71a11d13df 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -432,7 +432,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } else { printk(KERN_INFO "btrfs: setting nodatacow\n"); } - info->compress_type = BTRFS_COMPRESS_NONE; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); btrfs_set_opt(info->mount_opt, NODATACOW); @@ -461,7 +460,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_fs_incompat(info, COMPRESS_LZO); } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; - info->compress_type = BTRFS_COMPRESS_NONE; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; @@ -474,9 +472,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); pr_info("btrfs: force %s compression\n", compress_type); - } else + } else if (btrfs_test_opt(root, COMPRESS)) { pr_info("btrfs: use %s compression\n", compress_type); + } break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); -- cgit v1.2.3-70-g09d2 From e43f998e47bae27e37e159915625e8d4b130153b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 6 Dec 2013 17:51:32 +0100 Subject: btrfs: call mnt_drop_write after interrupted subvol deletion If btrfs_ioctl_snap_destroy blocks on the mutex and the process is killed, the mnt_write count is unbalanced and leads to an unmountable filesystem. CC: stable@vger.kernel.org Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a111622598b..21da5762b0b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2121,7 +2121,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); if (err == -EINTR) - goto out; + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2284,6 +2284,7 @@ out_dput: dput(dentry); out_unlock_dir: mutex_unlock(&dir->i_mutex); +out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); -- cgit v1.2.3-70-g09d2 From 639eefc8afd1b8b839b1fd5605378d869d3612a1 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sun, 8 Dec 2013 00:26:29 +0000 Subject: Btrfs: don't miss skinny extent items on delayed ref head contention Currently extent-tree.c:btrfs_lookup_extent_info() can miss the lookup of skinny extent items.
This can happen when the execution flow is the following:
* We do an extent tree lookup and fail to find a skinny extent item;
* As a result, we attempt to see if a non-skinny extent item exists, either by looking at the previous item in the leaf or by doing another full extent tree search;
* We have a transaction, and then we check for a matching delayed ref head in the transaction's delayed refs rbtree;
* We find such a delayed ref head and then we try to lock it with a call to mutex_trylock();
* The lock was contended, so we jump to the label "again", which repeats the extent tree search, but for a non-skinny extent item, because we previously set the metadata variable to 0 and set the search key to look for a non-skinny extent item;
* After the jump (and after releasing the transaction's delayed refs lock), a skinny extent item might have been added to the extent tree, but we will miss it because metadata is set to 0 and the search key is set for a non-skinny extent item.
The fix here is to not reset metadata to 0 and to jump to the initial search key setup if the delayed ref head is contended, instead of jumping directly to the extent tree search label ("again"). This issue was found while investigating the issue reported at Bugzilla 64961. David Sterba suspected this function was missing extent items, and that this could be caused by the last change to this function, which was made in the following patch: [PATCH] Btrfs: optimize btrfs_lookup_extent_info() (commit 74be9510876a66ad9826613ac8a526d26f9e7f01) But in fact this issue already existed before, because after failing to find a skinny extent item, the code set the search key for a non-skinny extent item, and on contention of a matching delayed ref head it would not search the extent tree for a skinny extent item anymore.
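Distilled, the corrected control flow looks like this (a structural sketch, not the verbatim patch below; delayed_ref_head_contended stands in for the mutex_trylock() failure path):

	search_again:
		key.objectid = bytenr;
		key.offset = offset;
		if (metadata)			/* still allowed to be skinny */
			key.type = BTRFS_METADATA_ITEM_KEY;
		else
			key.type = BTRFS_EXTENT_ITEM_KEY;
	again:
		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
		/* ... on a skinny miss, fall back to a non-skinny search via
		 * "goto again", but without clearing the metadata variable ... */
		if (delayed_ref_head_contended)
			goto search_again;	/* re-derive the key; not "goto again" */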
Signed-off-by: Filipe David Borba Manana Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 45d98d01028..9c01509dd8a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -767,20 +767,19 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - if (metadata) { - key.objectid = bytenr; - key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = offset; - } else { - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = offset; - } - if (!trans) { path->skip_locking = 1; path->search_commit_root = 1; } + +search_again: key.objectid = bytenr; key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + again: ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 0); @@ -788,7 +787,6 @@ again: goto out_free; if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { - metadata = 0; if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -855,7 +853,7 @@ again: mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - goto again; + goto search_again; } if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; -- cgit v1.2.3-70-g09d2 From c974c4642fee8c58239c7753d0bf548b23799923 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 11 Dec 2013 19:29:51 +0800 Subject: Btrfs: fix an oops when doing balance relocation I hit an oops when inserting a reloc root into @reloc_root_tree (it can be easily triggered when forcing cow for the relocation root) [ 866.494539] [] btrfs_init_reloc_root+0x79/0xb0 [btrfs] [ 866.495321] [] record_root_in_trans+0xb0/0x110 [btrfs] [ 866.496109] [] btrfs_record_root_in_trans+0x48/0x80 [btrfs] [ 866.496908] [] select_reloc_root+0xa8/0x210 [btrfs] [ 866.497703] [] do_relocation+0x16a/0x540 [btrfs] This is because the reloc root inserted into @reloc_root_tree is not within one transaction; the reloc root may be cowed and the root block bytenr will be reused, and then the oops happens. We should update the reloc root in @reloc_root_tree when we cow the reloc root node. Fix it.
Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 70 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ce459a7cb16..7cdc76007c6 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1264,10 +1264,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) } /* - * helper to update/delete the 'address of tree root -> reloc tree' + * helper to delete the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, int del) +static void __del_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node = NULL; @@ -1275,7 +1275,7 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); + root->node->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1283,23 +1283,45 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_unlock(&rc->reloc_root_tree.lock); if (!node) - return 0; + return; BUG_ON((struct btrfs_root *)node->data != root); - if (!del) { - spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = root->node->start; - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, node->bytenr); - } else { - spin_lock(&root->fs_info->trans_lock); - list_del_init(&root->root_list); - spin_unlock(&root->fs_info->trans_lock); - kfree(node); + spin_lock(&root->fs_info->trans_lock); + list_del_init(&root->root_list); + spin_unlock(&root->fs_info->trans_lock); + kfree(node); +} + +/* + * helper to update the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); } + spin_unlock(&rc->reloc_root_tree.lock); + + if (!node) + return 0; + BUG_ON((struct btrfs_root *)node->data != root); + + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = new_bytenr; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); return 0; } @@ -1420,7 +1442,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, { struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; - int del = 0; int ret; if (!root->reloc_root) @@ -1432,11 +1453,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, if (root->fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { root->reloc_root = NULL; - del = 1; + __del_reloc_root(reloc_root); } - __update_reloc_root(reloc_root, del); - if (reloc_root->commit_root != reloc_root->node) { btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); @@ -2287,7 +2306,7 @@ void free_reloc_roots(struct 
list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); - __update_reloc_root(reloc_root, 1); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); kfree(reloc_root); @@ -2332,7 +2351,7 @@ again: ret = merge_reloc_root(rc, root); if (ret) { - __update_reloc_root(reloc_root, 1); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); kfree(reloc_root); @@ -4522,6 +4541,11 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (buf == root->node) + __update_reloc_root(root, cow->start); + } + level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) -- cgit v1.2.3-70-g09d2 From 6646374863508e24da7c7d21527f8dadc8687ff9 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 10 Dec 2013 00:14:34 +0800 Subject: Btrfs: skip building backref tree for uuid and quota tree when doing balance relocation The quota tree and the UUID tree are only cowed; they cannot be snapshotted. Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7cdc76007c6..0a3f0256205 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -571,7 +571,9 @@ static int is_cowonly_root(u64 root_objectid) root_objectid == BTRFS_CHUNK_TREE_OBJECTID || root_objectid == BTRFS_DEV_TREE_OBJECTID || root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) + root_objectid == BTRFS_CSUM_TREE_OBJECTID || + root_objectid == BTRFS_UUID_TREE_OBJECTID || + root_objectid == BTRFS_QUOTA_TREE_OBJECTID) return 1; return 0; } -- cgit v1.2.3-70-g09d2 From 467bb1d27c0b783b73e6349304c0d90b5b4f431b Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 11 Dec 2013 19:29:52 +0800 Subject: Btrfs: make sure we cleanup all reloc roots if error happens I hit an oops when merging reloc roots fails; the reason is that new reloc roots may be added, so we should make sure we clean up all reloc roots. Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0a3f0256205..429c73c374b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2409,6 +2409,13 @@ out: btrfs_std_error(root->fs_info, ret); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); + + /* new reloc root may be added */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); } BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); -- cgit v1.2.3-70-g09d2 From 700ff4f095d78af0998953e922e041d75254518b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 10 Jan 2013 03:57:25 -0500 Subject: Btrfs: fix access_ok() check in btrfs_ioctl_send() The closing parenthesis is in the wrong place. We want to check "sizeof(*arg->clone_sources) * arg->clone_sources_count" instead of "sizeof(*arg->clone_sources * arg->clone_sources_count)".
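The bug is easy to demonstrate in a stand-alone program, since the operand of sizeof is never evaluated and only its type matters; here clone_sources is a stand-in for the __u64 array referenced by the ioctl arguments:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t *clone_sources = NULL;	/* safe: sizeof() never
						 * evaluates its operand */
		uint64_t count = 1000;

		/* Misplaced parenthesis: the type of (u64 * u64) is u64,
		 * so this is always 8, no matter how large count is. */
		printf("%zu\n", sizeof(*clone_sources * count));

		/* Intended check: the real size of the user array (8000). */
		printf("%zu\n", (size_t)(sizeof(*clone_sources) * count));
		return 0;
	}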
Signed-off-by: Dan Carpenter Reviewed-by: Jie Liu Signed-off-by: Chris Mason cc: stable@vger.kernel.org --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6837fe87f3a..945d1db98f2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4723,8 +4723,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } if (!access_ok(VERIFY_READ, arg->clone_sources, - sizeof(*arg->clone_sources * - arg->clone_sources_count))) { + sizeof(*arg->clone_sources) * + arg->clone_sources_count)) { ret = -EFAULT; goto out; } -- cgit v1.2.3-70-g09d2 From a5c21dcefa1c3d759457a604b3cfc4af29c8713f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Dec 2013 17:40:21 +0000 Subject: dcache: allow word-at-a-time name hashing with big-endian CPUs When explicitly hashing the end of a string with the word-at-a-time interface, we have to be careful which end of the word we pick up. On big-endian CPUs, the upper-bits will contain the data we're after, so ensure we generate our masks accordingly (and avoid hashing whatever random junk may have been sitting after the string). This patch adds a new dcache helper, bytemask_from_count, which creates a mask appropriate for the CPU endianness. Cc: Al Viro Signed-off-by: Will Deacon Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 +- fs/namei.c | 7 +------ include/linux/dcache.h | 2 ++ 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 4bdb300b16e..6055d61811d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -192,7 +192,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char if (!tcount) return 0; } - mask = ~(~0ul << tcount*8); + mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } diff --git a/fs/namei.c b/fs/namei.c index c53d3a9547f..3531deebad3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1598,11 +1598,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) * do a "get_unaligned()" if this helps and is sufficiently * fast. * - * - Little-endian machines (so that we can generate the mask - * of low bytes efficiently). Again, we *could* do a byte - * swapping load on big-endian architectures if that is not - * expensive enough to make the optimization worthless. - * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. 
@@ -1646,7 +1641,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) if (!len) goto done; } - mask = ~(~0ul << len*8); + mask = bytemask_from_count(len); hash += mask & a; done: return fold_hash(hash); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 57e87e749a4..bf72e9ac6de 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -29,8 +29,10 @@ struct vfsmount; /* The hash is always the low bits of hash_len */ #ifdef __LITTLE_ENDIAN #define HASH_LEN_DECLARE u32 hash; u32 len; + #define bytemask_from_count(cnt) (~(~0ul << (cnt)*8)) #else #define HASH_LEN_DECLARE u32 len; u32 hash; + #define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8)) #endif /* -- cgit v1.2.3-70-g09d2 From ae5758a1a7b7bdfdcfaf2d78a5a0f8675149dd79 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Dec 2013 17:12:22 -0800 Subject: procfs: also fix proc_reg_get_unmapped_area() for !MMU case Commit fad1a86e25e0 ("procfs: call default get_unmapped_area on MMU-present architectures"), as its title says, took care of only the MMU case, leaving the !MMU side still in the regressed state (returning -EIO in all cases where pde->proc_fops->get_unmapped_area is NULL). From the fad1a86e25e0 changelog: "Commit c4fe24485729 ("sparc: fix PCI device proc file mmap(2)") added proc_reg_get_unmapped_area in proc_reg_file_ops and proc_reg_file_ops_no_compat, by which now mmap always returns EIO if get_unmapped_area method is not defined for the target procfs file, which causes regression of mmap on /proc/vmcore. To address this issue, like get_unmapped_area(), call default current->mm->get_unmapped_area on MMU-present architectures if pde->proc_fops->get_unmapped_area, i.e. the one in actual file operation in the procfs file, is not defined" Signed-off-by: Jan Beulich Cc: HATAYAMA Daisuke Cc: Alexey Dobriyan Cc: David S. Miller Cc: [3.12.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 28955d4b721..124fc43c709 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -292,16 +292,20 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long) = NULL; + if (use_pde(pde)) { + typeof(proc_reg_get_unmapped_area) *get_area; + + get_area = pde->proc_fops->get_unmapped_area; #ifdef CONFIG_MMU - get_area = current->mm->get_unmapped_area; + if (!get_area) + get_area = current->mm->get_unmapped_area; #endif - if (pde->proc_fops->get_unmapped_area) - get_area = pde->proc_fops->get_unmapped_area; + if (get_area) rv = get_area(file, orig_addr, len, pgoff, flags); + else + rv = orig_addr; unuse_pde(pde); } return rv; -- cgit v1.2.3-70-g09d2 From 86b58d13134ef14f09f8c8f37797ccc37cf823a3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 5 Dec 2013 12:38:59 +0800 Subject: ceph: initialize inode before instantiating dentry commit b18825a7c8 (Put a small type field into struct dentry::d_flags) put a type field into struct dentry::d_flags. __d_instantiate() set the field by checking inode->i_mode. So we should initialize inode before instantiating dentry when handling mds reply. 
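The required ordering, roughly (a sketch of the idea rather than the full patch below): fill the inode before splicing it into the dentry, so that __d_instantiate() sees a valid i_mode when it derives the dentry type flags:

	in = ceph_get_inode(sb, vino);
	err = fill_inode(in, &rinfo->targeti, ...);	/* i_mode valid from here */
	if (err < 0)
		goto done;
	/* ... only now attach it ... */
	dn = splice_dentry(dn, in, ...);	/* __d_instantiate() reads i_mode */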
Fixes: http://tracker.ceph.com/issues/6930 Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/inode.c | 136 ++++++++++++++++++++++++-------------------------------- 1 file changed, 58 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 9a8e396aed8..278fd289128 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -978,7 +978,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); - int i = 0; int err = 0; dout("fill_trace %p is_dentry %d is_target %d\n", req, @@ -1039,6 +1038,29 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, } } + if (rinfo->head->is_target) { + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + req->r_target_inode = in; + + err = fill_inode(in, &rinfo->targeti, NULL, + session, req->r_request_started, + (le32_to_cpu(rinfo->head->result) == 0) ? + req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + goto done; + } + } + /* * ignore null lease/binding on snapdir ENOENT, or else we * will have trouble splicing in the virtual snapdir later @@ -1108,7 +1130,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ceph_dentry(req->r_old_dentry)->offset); dn = req->r_old_dentry; /* use old_dentry */ - in = dn->d_inode; } /* null dentry? */ @@ -1130,44 +1151,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, } /* attach proper inode */ - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = dn->d_inode; - if (!in) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_trace bad get_inode " - "%llx.%llx\n", vino.ino, vino.snap); - err = PTR_ERR(in); - d_drop(dn); - goto done; - } + if (!dn->d_inode) { + ihold(in); dn = splice_dentry(dn, in, &have_lease, true); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - ihold(in); - } else if (ceph_ino(in) == vino.ino && - ceph_snap(in) == vino.snap) { - ihold(in); - } else { + } else if (dn->d_inode && dn->d_inode != in) { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, in, ceph_ino(in), ceph_snap(in), - vino.ino, vino.snap); + dn, dn->d_inode, ceph_vinop(dn->d_inode), + ceph_vinop(in)); have_lease = false; - in = NULL; } if (have_lease) update_dentry_lease(dn, rinfo->dlease, session, req->r_request_started); dout(" final dn %p\n", dn); - i++; - } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) { + } else if (!req->r_aborted && + (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; /* fill out a snapdir LOOKUPSNAP dentry */ @@ -1177,52 +1182,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ininfo = rinfo->targeti.in; vino.ino = le64_to_cpu(ininfo->ino); vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_inode get_inode badness %llx.%llx\n", - vino.ino, vino.snap); - err = PTR_ERR(in); - d_delete(dn); - goto done; - } dout(" linking snapped dir %p to dn %p\n", in, dn); + ihold(in); dn = splice_dentry(dn, in, NULL, 
true); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - ihold(in); - rinfo->head->is_dentry = 1; /* fool notrace handlers */ - } - - if (rinfo->head->is_target) { - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - - if (in == NULL || ceph_ino(in) != vino.ino || - ceph_snap(in) != vino.snap) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - err = PTR_ERR(in); - goto done; - } - } - req->r_target_inode = in; - - err = fill_inode(in, - &rinfo->targeti, NULL, - session, req->r_request_started, - (le32_to_cpu(rinfo->head->result) == 0) ? - req->r_fmode : -1, - &req->r_caps_reservation); - if (err < 0) { - pr_err("fill_inode badness %p %llx.%llx\n", - in, ceph_vinop(in)); - goto done; - } } - done: dout("fill_trace done err=%d\n", err); return err; @@ -1272,7 +1240,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct qstr dname; struct dentry *dn; struct inode *in; - int err = 0, i; + int err = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_dentry_info *di; @@ -1305,6 +1273,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); } + /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_vino vino; @@ -1329,9 +1298,10 @@ retry_lookup: err = -ENOMEM; goto out; } - err = ceph_init_dentry(dn); - if (err < 0) { + ret = ceph_init_dentry(dn); + if (ret < 0) { dput(dn); + err = ret; goto out; } } else if (dn->d_inode && @@ -1351,9 +1321,6 @@ retry_lookup: spin_unlock(&parent->d_lock); } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + r_readdir_offset); - /* inode */ if (dn->d_inode) { in = dn->d_inode; @@ -1366,26 +1333,39 @@ retry_lookup: err = PTR_ERR(in); goto out; } - dn = splice_dentry(dn, in, NULL, false); - if (IS_ERR(dn)) - dn = NULL; } if (fill_inode(in, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); + if (!dn->d_inode) + iput(in); + d_drop(dn); goto next_item; } - if (dn) - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, - req->r_request_started); + + if (!dn->d_inode) { + dn = splice_dentry(dn, in, NULL, false); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + dn = NULL; + goto next_item; + } + } + + di = dn->d_fsdata; + di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); next_item: if (dn) dput(dn); } - req->r_did_prepopulate = true; + if (err == 0) + req->r_did_prepopulate = true; out: if (snapdir) { -- cgit v1.2.3-70-g09d2 From 56f91aad69444d650237295f68c195b74d888d95 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Wed, 13 Nov 2013 15:22:14 +0800 Subject: ceph: Avoid data inconsistency due to d-cache aliasing in readpage() If the length of data to be read in readpage() is exactly PAGE_CACHE_SIZE, the original code does not flush the d-cache for data consistency after finishing the read. This patch fixes this.
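Note that the short-read path was already safe in practice, since zero_user_segment() ends with a flush_dcache_page() of its own; only an exact PAGE_CACHE_SIZE read skipped the flush entirely. In outline, the fixed logic is (a sketch of the patch below):

	if (err < PAGE_CACHE_SIZE)
		zero_user_segment(page, err, PAGE_CACHE_SIZE);	/* flushes, too */
	else
		flush_dcache_page(page);	/* full-page read: flush explicitly */
	SetPageUptodate(page);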
Signed-off-by: Li Wang Signed-off-by: Sage Weil --- fs/ceph/addr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1e561c05953..ec3ba43b9fa 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -210,9 +210,13 @@ static int readpage_nounlock(struct file *filp, struct page *page) if (err < 0) { SetPageError(page); goto out; - } else if (err < PAGE_CACHE_SIZE) { + } else { + if (err < PAGE_CACHE_SIZE) { /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_CACHE_SIZE); + zero_user_segment(page, err, PAGE_CACHE_SIZE); + } else { + flush_dcache_page(page); + } } SetPageUptodate(page); -- cgit v1.2.3-70-g09d2 From 3a8c92086d1c05ae1cf6a52ffd8dde2851e6b1d3 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Sat, 5 Oct 2013 21:48:25 -0500 Subject: xfs: fix memory leak in xfs_dir2_node_removename Fix the leak of kernel memory in xfs_dir2_node_removename() when xfs_dir2_leafn_remove() returns an error code. Signed-off-by: Mark Tinguely Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit ef701600fd26cace9d513ee174688a2b83832126) --- fs/xfs/xfs_dir2_node.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 56369d4509d..48c7d18f68c 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -2067,12 +2067,12 @@ xfs_dir2_node_lookup( */ int /* error */ xfs_dir2_node_removename( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { - xfs_da_state_blk_t *blk; /* leaf block */ + struct xfs_da_state_blk *blk; /* leaf block */ int error; /* error return value */ int rval; /* operation return value */ - xfs_da_state_t *state; /* btree cursor */ + struct xfs_da_state *state; /* btree cursor */ trace_xfs_dir2_node_removename(args); @@ -2084,19 +2084,18 @@ xfs_dir2_node_removename( state->mp = args->dp->i_mount; state->blocksize = state->mp->m_dirblksize; state->node_ents = state->mp->m_dir_node_ents; - /* - * Look up the entry we're deleting, set up the cursor. - */ + + /* Look up the entry we're deleting, set up the cursor. */ error = xfs_da3_node_lookup_int(state, &rval); if (error) - rval = error; - /* - * Didn't find it, upper layer screwed up. - */ + goto out_free; + + /* Didn't find it, upper layer screwed up. */ if (rval != EEXIST) { - xfs_da_state_free(state); - return rval; + error = rval; + goto out_free; } + blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(state->extravalid); @@ -2107,7 +2106,7 @@ xfs_dir2_node_removename( error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, &state->extrablk, &rval); if (error) - return error; + goto out_free; /* * Fix the hash values up the btree. */ @@ -2122,6 +2121,7 @@ xfs_dir2_node_removename( */ if (!error) error = xfs_dir2_node_to_leaf(state); +out_free: xfs_da_state_free(state); return error; } -- cgit v1.2.3-70-g09d2 From 30d161c9aacb4b719789c30623b9eec7d1aa1d08 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 26 Nov 2013 21:38:54 +0800 Subject: xfs: fix false assertion at xfs_qm_vop_create_dqattach After the previous fix, there is still another ASSERT failure if we turn off any type of quota while fsstress is running at the same time. Backtrace in this case: [ 50.867897] XFS: Assertion failed: XFS_IS_GQUOTA_ON(mp), file: fs/xfs/xfs_qm.c, line: 2118 [ 50.867924] ------------[ cut here ]------------ ...
[ 50.867957] Kernel BUG at ffffffffa0b55a32 [verbose debug info unavailable] [ 50.867999] invalid opcode: 0000 [#1] SMP [ 50.869407] Call Trace: [ 50.869446] [] xfs_qm_vop_create_dqattach+0x19a/0x2d0 [xfs] [ 50.869512] [] xfs_create+0x5c5/0x6a0 [xfs] [ 50.869564] [] xfs_vn_mknod+0xac/0x1d0 [xfs] [ 50.869615] [] xfs_vn_mkdir+0x16/0x20 [xfs] [ 50.869655] [] vfs_mkdir+0x95/0x130 [ 50.869689] [] SyS_mkdirat+0xaa/0xe0 [ 50.869723] [] SyS_mkdir+0x19/0x20 [ 50.869757] [] system_call_fastpath+0x1a/0x1f [ 50.869793] Code: 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 [ 50.870003] RIP [] assfail+0x22/0x30 [xfs] [ 50.870050] RSP [ 50.879251] ---[ end trace c93a2b342341c65b ]--- We're hitting the ASSERT(XFS_IS_*QUOTA_ON(mp)) in xfs_qm_vop_create_dqattach(); however, the assertion itself is not right IMHO. While performing quota off, we first clear the XFS_*QUOTA_ACTIVE bit(s) from struct xfs_mount without taking any special locks, see xfs_qm_scall_quotaoff(). Hence there is no guarantee that the desired quota is still active. Signed-off-by: Jie Liu Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers (cherry picked from commit 37eb9706ebf5b99d14c6086cdeef2c2f73f9c9fb) --- fs/xfs/xfs_qm.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 14a4996cfec..588e4909c58 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -2082,24 +2082,21 @@ xfs_qm_vop_create_dqattach( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - if (udqp) { + if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(XFS_IS_UQUOTA_ON(mp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (gdqp) { + if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(XFS_IS_GQUOTA_ON(mp)); ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (pdqp) { + if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(XFS_IS_PQUOTA_ON(mp)); ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); ip->i_pdquot = xfs_qm_dqhold(pdqp); -- cgit v1.2.3-70-g09d2 From 5c22727895bf61cb851835be0d30260fb36de648 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 26 Nov 2013 21:38:34 +0800 Subject: xfs: fix assertion failure at xfs_setattr_nonsize For a CRC-enabled v5 superblock, changing a file's ownership can simply trigger an ASSERT failure at xfs_setattr_nonsize() if both group and project quota are enabled, i.e., [ 305.337609] XFS: Assertion failed: !XFS_IS_PQUOTA_ON(mp), file: fs/xfs/xfs_iops.c, line: 621 [ 305.339250] Kernel BUG at ffffffffa0a7fa32 [verbose debug info unavailable] [ 305.383939] Call Trace: [ 305.385536] [] xfs_setattr_nonsize+0x69a/0x720 [xfs] [ 305.387142] [] xfs_vn_setattr+0x29/0x70 [xfs] [ 305.388727] [] notify_change+0x1a8/0x350 [ 305.390298] [] chown_common+0xfd/0x110 [ 305.391868] [] SyS_fchownat+0xaf/0x110 [ 305.393440] [] SyS_lchown+0x20/0x30 [ 305.394995] [] system_call_fastpath+0x1a/0x1f [ 305.399870] RIP [] assfail+0x22/0x30 [xfs] This fix adjusts the assertion to check whether the superblock supports both quota inodes or not.
Signed-off-by: Jie Liu Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers (cherry picked from commit 5a01dd54f4a7fb513062070c5acef20d13cad980) --- fs/xfs/xfs_iops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 27e0e544e96..104455b8046 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -618,7 +618,8 @@ xfs_setattr_nonsize( } if (!gid_eq(igid, gid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { - ASSERT(!XFS_IS_PQUOTA_ON(mp)); + ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + !XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = xfs_qm_vop_chown(tp, ip, -- cgit v1.2.3-70-g09d2 From 718cc6f88cbfc4fbd39609f28c4c86883945f90d Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 26 Nov 2013 21:38:49 +0800 Subject: xfs: fix infinite loop by detaching the group/project hints from user dquot xfs_quota(8) will hang up if trying to turn group/project quota off before the user quota is off; this can be 100% reproduced by:
# mount -ouquota,gquota /dev/sda7 /xfs
# mkdir /xfs/test
# xfs_quota -xc 'off -g' /xfs <-- hangs up
# echo w > /proc/sysrq-trigger
# dmesg
SysRq : Show Blocked State task PC stack pid father xfs_quota D 0000000000000000 0 27574 2551 0x00000000 [snip] Call Trace: [] schedule+0xad/0xc0 [] schedule_timeout+0x35e/0x3c0 [] ? mark_held_locks+0x176/0x1c0 [] ? call_timer_fn+0x2c0/0x2c0 [] ? xfs_qm_shrink_count+0x30/0x30 [xfs] [] schedule_timeout_uninterruptible+0x26/0x30 [] xfs_qm_dquot_walk+0x235/0x260 [xfs] [] ? xfs_perag_get+0x1d8/0x2d0 [xfs] [] ? xfs_perag_get+0x5/0x2d0 [xfs] [] ? xfs_inode_ag_iterator+0xae/0xf0 [xfs] [] ? xfs_trans_free_dqinfo+0x50/0x50 [xfs] [] ? xfs_inode_ag_iterator+0xcf/0xf0 [xfs] [] xfs_qm_dqpurge_all+0x66/0xb0 [xfs] [] xfs_qm_scall_quotaoff+0x20a/0x5f0 [xfs] [] xfs_fs_set_xstate+0x136/0x180 [xfs] [] do_quotactl+0x53a/0x6b0 [] ? iput+0x5b/0x90 [] SyS_quotactl+0x167/0x1d0 [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] system_call_fastpath+0x16/0x1b It's fine if we turn the user quota off first and then turn off the other kinds of quotas if they are enabled, since the group/project dquot refcount is decreased to zero once the user quota is off. Otherwise, those dquots' refcounts are non-zero because the user dquots might refer to them as hints. Hence, the above operation causes an infinite loop at xfs_qm_dquot_walk() while trying to purge the dquot cache. This problem has been around since Linux 3.4; it was introduced by: [ b84a3a9675 xfs: remove the per-filesystem list of dquots ] Originally we would release the group dquot pointers that the user dquots may be carrying around as a hint via xfs_qm_detach_gdquots(). However, with the above change, there is no such work to be done before purging the group/project dquot cache. In order to solve this problem, this patch introduces a special routine xfs_qm_dqpurge_hints(), which releases the group/project dquot pointers the user dquots may be carrying around as hints, and then proceeds to purge the user dquot cache if requested.
Cc: stable@vger.kernel.org Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit df8052e7dae00bde6f21b40b6e3e1099770f3afc) --- fs/xfs/xfs_qm.c | 71 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 588e4909c58..dd88f0e27bd 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -134,8 +134,6 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_dquot *gdqp = NULL; - struct xfs_dquot *pdqp = NULL; xfs_dqlock(dqp); if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { @@ -143,21 +141,6 @@ xfs_qm_dqpurge( return EAGAIN; } - /* - * If this quota has a hint attached, prepare for releasing it now. - */ - gdqp = dqp->q_gdquot; - if (gdqp) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - - pdqp = dqp->q_pdquot; - if (pdqp) { - xfs_dqlock(pdqp); - dqp->q_pdquot = NULL; - } - dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqflock(dqp); @@ -206,11 +189,47 @@ xfs_qm_dqpurge( XFS_STATS_DEC(xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); + return 0; +} + +/* + * Release the group or project dquot pointers the user dquots maybe carrying + * around as a hint, and proceed to purge the user dquot cache if requested. +*/ +STATIC int +xfs_qm_dqpurge_hints( + struct xfs_dquot *dqp, + void *data) +{ + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + uint flags = *((uint *)data); + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + return EAGAIN; + } + + /* If this quota has a hint attached, prepare for releasing it now */ + gdqp = dqp->q_gdquot; + if (gdqp) + dqp->q_gdquot = NULL; + + pdqp = dqp->q_pdquot; + if (pdqp) + dqp->q_pdquot = NULL; + + xfs_dqunlock(dqp); if (gdqp) - xfs_qm_dqput(gdqp); + xfs_qm_dqrele(gdqp); if (pdqp) - xfs_qm_dqput(pdqp); + xfs_qm_dqrele(pdqp); + + if (flags & XFS_QMOPT_UQUOTA) + return xfs_qm_dqpurge(dqp, NULL); + return 0; } @@ -222,8 +241,18 @@ xfs_qm_dqpurge_all( struct xfs_mount *mp, uint flags) { - if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + /* + * We have to release group/project dquot hint(s) from the user dquot + * at first if they are there, otherwise we would run into an infinite + * loop while walking through radix tree to purge other type of dquots + * since their refcount is not zero if the user dquot refers to them + * as hint. + * + * Call the special xfs_qm_dqpurge_hints() will end up go through the + * general xfs_qm_dqpurge() against user dquot cache if requested. + */ + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge_hints, &flags); + if (flags & XFS_QMOPT_GQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) -- cgit v1.2.3-70-g09d2 From 6e708bcf6583f663da9fe7bc5292cc62f0a8410d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Nov 2013 10:41:16 +1100 Subject: xfs: align initial file allocations correctly The function xfs_bmap_isaeof() is used to indicate that an allocation is occurring at or past the end of file, and as such should be aligned to the underlying storage geometry if possible. Commit 27a3f8f ("xfs: introduce xfs_bmap_last_extent") changed the behaviour of this function for empty files - it turned off allocation alignment for this case accidentally. 
Hence large initial allocations from direct IO are not getting correctly aligned to the underlying geometry, and that causes write performance to drop in alignment sensitive configurations. Fix it by considering allocation into empty files as requiring aligned allocation again. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers (cherry picked from commit f9b395a8ef8f34d19cae2cde361e19c96e097fad) --- fs/xfs/xfs_bmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3ef11b22e75..8401f11f378 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1635,7 +1635,7 @@ xfs_bmap_last_extent( * blocks at the end of the file which do not start at the previous data block, * we will try to align the new blocks at stripe unit boundaries. * - * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be * at, or past the EOF. */ STATIC int @@ -1650,9 +1650,14 @@ xfs_bmap_isaeof( bma->aeof = 0; error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, &is_empty); - if (error || is_empty) + if (error) return error; + if (is_empty) { + bma->aeof = 1; + return 0; + } + /* * Check if we are allocation or past the last extent, or at least into * the last delayed allocated extent. -- cgit v1.2.3-70-g09d2 From 83a0adc3f93aae4ab9c59113e3145c7bdb2b4a8c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Dec 2013 00:03:52 -0800 Subject: xfs: remove xfsbdstrat error The xfsbdstrat helper is a small but useless wrapper for xfs_buf_iorequest that handles the case of a shut down filesystem. Most of the users have private, uncached buffers that can just be freed in this case, but the complex error handling in xfs_bioerror_relse messes up the case when it's called without a locked buffer. Remove xfsbdstrat and opencode the error handling in the callers. All but one can simply return an error and don't need to deal with buffer state, and the one caller that cares about the buffer state could do with a major cleanup as well, but we'll defer that to later.
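At the simple call sites the open-coded replacement boils down to the following pattern (a sketch; each caller in the diff below varies slightly):

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);	/* private, uncached buffer: just bail */
	xfs_buf_iorequest(bp);
	error = xfs_buf_iowait(bp);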
Signed-off-by: Christoph Hellwig
Reviewed-by: Ben Myers
Signed-off-by: Ben Myers
---
 fs/xfs/xfs_bmap_util.c   | 14 ++++++++++++--
 fs/xfs/xfs_buf.c         | 27 ++++++---------------------
 fs/xfs/xfs_buf.h         |  5 ++---
 fs/xfs/xfs_log_recover.c | 13 +++++++++++--
 fs/xfs/xfs_trans_buf.c   | 13 ++++++++++++-
 5 files changed, 43 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 5887e41c032..1394106ed22 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1187,7 +1187,12 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNWRITE(bp);
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
-		xfsbdstrat(mp, bp);
+
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			error = XFS_ERROR(EIO);
+			break;
+		}
+		xfs_buf_iorequest(bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp,
@@ -1200,7 +1205,12 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNDONE(bp);
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
-		xfsbdstrat(mp, bp);
+
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			error = XFS_ERROR(EIO);
+			break;
+		}
+		xfs_buf_iorequest(bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c7f0b77dcb0..9fa9c430461 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -698,7 +698,11 @@ xfs_buf_read_uncached(
 	bp->b_flags |= XBF_READ;
 	bp->b_ops = ops;

-	xfsbdstrat(target->bt_mount, bp);
+	if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+		xfs_buf_relse(bp);
+		return NULL;
+	}
+	xfs_buf_iorequest(bp);
 	xfs_buf_iowait(bp);
 	return bp;
 }
@@ -1089,7 +1093,7 @@ xfs_bioerror(
  * This is meant for userdata errors; metadata bufs come with
  * iodone functions attached, so that we can track down errors.
  */
-STATIC int
+int
 xfs_bioerror_relse(
 	struct xfs_buf	*bp)
 {
@@ -1164,25 +1168,6 @@ xfs_bwrite(
 	return error;
 }

-/*
- * Wrapper around bdstrat so that we can stop data from going to disk in case
- * we are shutting down the filesystem.  Typically user data goes thru this
- * path; one of the exceptions is the superblock.
- */
-void
-xfsbdstrat(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	if (XFS_FORCED_SHUTDOWN(mp)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		xfs_bioerror_relse(bp);
-		return;
-	}
-
-	xfs_buf_iorequest(bp);
-}
-
 STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t	*bp,
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e6568336101..7e41b08017f 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -269,9 +269,6 @@ extern void xfs_buf_unlock(xfs_buf_t *);

 /* Buffer Read and Write Routines */
 extern int xfs_bwrite(struct xfs_buf *bp);
-
-extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-
 extern void xfs_buf_ioend(xfs_buf_t *, int);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
@@ -282,6 +279,8 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 #define xfs_buf_zero(bp, off, len) \
 	xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)

+extern int xfs_bioerror_relse(struct xfs_buf *);
+
 static inline int xfs_buf_geterror(xfs_buf_t *bp)
 {
 	return bp ? bp->b_error : ENOMEM;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b6b669df40f..eae16920655 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -193,7 +193,10 @@ xlog_bread_noalign(
 	bp->b_io_length = nbblks;
 	bp->b_error = 0;

-	xfsbdstrat(log->l_mp, bp);
+	if (XFS_FORCED_SHUTDOWN(log->l_mp))
+		return XFS_ERROR(EIO);
+
+	xfs_buf_iorequest(bp);
 	error = xfs_buf_iowait(bp);
 	if (error)
 		xfs_buf_ioerror_alert(bp, __func__);
@@ -4397,7 +4400,13 @@ xlog_do_recover(
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
 	bp->b_ops = &xfs_sb_buf_ops;
-	xfsbdstrat(log->l_mp, bp);
+
+	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		xfs_buf_relse(bp);
+		return XFS_ERROR(EIO);
+	}
+
+	xfs_buf_iorequest(bp);
 	error = xfs_buf_iowait(bp);
 	if (error) {
 		xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c035d11b773..647b6f1d892 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -314,7 +314,18 @@ xfs_trans_read_buf_map(
 		ASSERT(bp->b_iodone == NULL);
 		XFS_BUF_READ(bp);
 		bp->b_ops = ops;
-		xfsbdstrat(tp->t_mountp, bp);
+
+		/*
+		 * XXX(hch): clean up the error handling here to be less
+		 * of a mess..
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			trace_xfs_bdstrat_shut(bp, _RET_IP_);
+			xfs_bioerror_relse(bp);
+		} else {
+			xfs_buf_iorequest(bp);
+		}
+
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp, __func__);
--
cgit v1.2.3-70-g09d2

From 33177f05364c6cd13b06d0f3500dad07cf4647c2 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Thu, 12 Dec 2013 16:34:36 +1100
Subject: xfs: swalloc doesn't align allocations properly

When swalloc is specified as a mount option, allocations are supposed to
be aligned to the stripe width rather than the stripe unit of the
underlying filesystem. However, it does not do this.

What the implementation does is round up the allocation size to a stripe
width, hence ensuring that all allocations span a full stripe width. It
does not, however, ensure that the allocation is aligned to a stripe
width, and hence the allocations can span multiple underlying stripes and
so still see RMW cycles for things like direct IO on MD RAID.

So, if the swalloc mount option is set, change the allocation alignment
in xfs_bmap_btalloc() to use the stripe width rather than the stripe
unit.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Reviewed-by: Ben Myers
Signed-off-by: Ben Myers
---
 fs/xfs/xfs_bmap.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8401f11f378..3b2c14b6f0f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3648,10 +3648,19 @@ xfs_bmap_btalloc(
 	int		isaligned;
 	int		tryagain;
 	int		error;
+	int		stripe_align;

 	ASSERT(ap->length);

 	mp = ap->ip->i_mount;
+
+	/* stripe alignment for allocation is determined by mount parameters */
+	stripe_align = 0;
+	if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+		stripe_align = mp->m_swidth;
+	else if (mp->m_dalign)
+		stripe_align = mp->m_dalign;
+
 	align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
 	if (unlikely(align)) {
 		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -3660,6 +3669,8 @@ xfs_bmap_btalloc(
 		ASSERT(!error);
 		ASSERT(ap->length);
 	}
+
+
 	nullfb = *ap->firstblock == NULLFSBLOCK;
 	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
 	if (nullfb) {
@@ -3735,7 +3746,7 @@ xfs_bmap_btalloc(
 	 */
 	if (!ap->flist->xbf_low && ap->aeof) {
 		if (!ap->offset) {
-			args.alignment = mp->m_dalign;
+			args.alignment = stripe_align;
 			atype = args.type;
 			isaligned = 1;
 			/*
@@ -3760,13 +3771,13 @@ xfs_bmap_btalloc(
 			 * of minlen+alignment+slop doesn't go up
 			 * between the calls.
 			 */
-			if (blen > mp->m_dalign && blen <= args.maxlen)
-				nextminlen = blen - mp->m_dalign;
+			if (blen > stripe_align && blen <= args.maxlen)
+				nextminlen = blen - stripe_align;
 			else
 				nextminlen = args.minlen;
-			if (nextminlen + mp->m_dalign > args.minlen + 1)
+			if (nextminlen + stripe_align > args.minlen + 1)
 				args.minalignslop =
-					nextminlen + mp->m_dalign -
+					nextminlen + stripe_align -
 					args.minlen - 1;
 			else
 				args.minalignslop = 0;
@@ -3788,7 +3799,7 @@ xfs_bmap_btalloc(
 		 */
 		args.type = atype;
 		args.fsbno = ap->blkno;
-		args.alignment = mp->m_dalign;
+		args.alignment = stripe_align;
 		args.minlen = nextminlen;
 		args.minalignslop = 0;
 		isaligned = 1;
--
cgit v1.2.3-70-g09d2

From ac8809f9ab01a73de1a47b5a37bd8dcca8712fb3 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Thu, 12 Dec 2013 16:34:38 +1100
Subject: xfs: abort metadata writeback on permanent errors

If we are doing async writeback of metadata, we can get write errors but
have nobody to report them to. At the moment, we simply attempt to
reissue the write from io completion in the hope that it's a transient
error.

When it's not a transient error, the buffer is stuck forever in this
loop, and we cannot break out of it. Eventually, unmount will hang
because the AIL cannot be emptied and everything goes downhill from
there.

To solve this problem, only retry the write IO once before aborting it.
We don't throw the buffer away because some transient errors can last
minutes (e.g. FC path failover) or even hours (thin provisioned devices
that have run out of backing space) before they go away. Hence we really
want to keep trying until we can't try any more.

Because the buffer was not cleaned, however, it does not get removed from
the AIL and hence the next pass across the AIL will start IO on it again.
As such, we still get the "retry forever" semantics that we currently
have, but we allow other access to the buffer in the meantime. The
filesystem can continue to modify the buffer and relog it, so the IO
errors won't hang the log or the filesystem.

Now when we are pushing the AIL, we can see all these "permanent IO
error" buffers and we can issue a warning about failures before we retry
the IO. We can also catch these buffers when unmounting and issue a
corruption warning, too.
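In outline, the retry-once policy reads like the following condensed C
sketch. This is not the patch itself: the helper name is invented for
illustration, and the locking, tracing, and log item bookkeeping of the
real xfs_buf_iodone_callbacks() and xfs_buf_item_push() hunks below are
omitted.

/*
 * Sketch of the retry-once decision for a failed async metadata write.
 * XBF_WRITE_FAIL records that one internal retry has already been issued.
 * (Illustrative only; the real logic is split across
 * xfs_buf_iodone_callbacks() and xfs_buf_item_push() in the diff below.)
 */
static void xfs_buf_async_write_fail_sketch(struct xfs_buf *bp)
{
	if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
		/* First failure: mark the buffer and resubmit the write once. */
		bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE | XBF_WRITE_FAIL;
		xfs_buf_iorequest(bp);
	} else {
		/*
		 * The retry failed too (or the buffer is stale): release the
		 * buffer instead of resubmitting from IO completion. It stays
		 * dirty in the AIL, so the next AIL push warns (rate limited)
		 * about the XBF_WRITE_FAIL state and queues the write again.
		 */
		xfs_buf_relse(bp);
	}
}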
Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Signed-off-by: Ben Myers
---
 fs/xfs/xfs_buf.c      | 10 ++++++++--
 fs/xfs/xfs_buf.h      |  6 +++++-
 fs/xfs/xfs_buf_item.c | 21 +++++++++++++++++++--
 3 files changed, 32 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9fa9c430461..afe7645e4b2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1156,7 +1156,7 @@ xfs_bwrite(
 	ASSERT(xfs_buf_islocked(bp));

 	bp->b_flags |= XBF_WRITE;
-	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);

 	xfs_bdstrat_cb(bp);
@@ -1501,6 +1501,12 @@ xfs_wait_buftarg(
 			struct xfs_buf *bp;
 			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
 			list_del_init(&bp->b_lru);
+			if (bp->b_flags & XBF_WRITE_FAIL) {
+				xfs_alert(btp->bt_mount,
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
+"Please run xfs_repair to determine the extent of the problem.",
+					(long long)bp->b_bn);
+			}
 			xfs_buf_rele(bp);
 		}
 		if (loop++ != 0)
@@ -1784,7 +1790,7 @@ __xfs_buf_delwri_submit(
 	blk_start_plug(&plug);
 	list_for_each_entry_safe(bp, n, io_list, b_list) {
-		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
+		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
 		bp->b_flags |= XBF_WRITE;

 		if (!wait) {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 7e41b08017f..1cf21a4a9f2 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -45,6 +45,7 @@ typedef enum {
 #define XBF_ASYNC	 (1 << 4) /* initiator will not wait for completion */
 #define XBF_DONE	 (1 << 5) /* all pages in the buffer uptodate */
 #define XBF_STALE	 (1 << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL	 (1 << 24)/* async writes have failed on this buffer */

 /* I/O hints for the BIO layer */
 #define XBF_SYNCIO	 (1 << 10)/* treat this buffer as synchronous I/O */
@@ -70,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_ASYNC,		"ASYNC" }, \
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_STALE,		"STALE" }, \
+	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
 	{ XBF_SYNCIO,		"SYNCIO" }, \
 	{ XBF_FUA,		"FUA" }, \
 	{ XBF_FLUSH,		"FLUSH" }, \
@@ -80,6 +82,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
 	{ _XBF_COMPOUND,	"COMPOUND" }

+
 /*
  * Internal state flags.
  */
@@ -300,7 +303,8 @@ extern void xfs_buf_terminate(void);

 #define XFS_BUF_ZEROFLAGS(bp) \
 	((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
-			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
+			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
+			    XBF_WRITE_FAIL))

 void xfs_buf_stale(struct xfs_buf *bp);
 #define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a64f67ba25d..2227b9b050b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -496,6 +496,14 @@ xfs_buf_item_unpin(
 	}
 }

+/*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
+ * seconds so as to not spam logs too much on repeated detection of the same
+ * buffer being bad..
+ */
+
+DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+
 STATIC uint
 xfs_buf_item_push(
 	struct xfs_log_item	*lip,
@@ -524,6 +532,14 @@ xfs_buf_item_push(

 	trace_xfs_buf_item_push(bip);

+	/* has a previous flush failed due to IO errors? */
+	if ((bp->b_flags & XBF_WRITE_FAIL) &&
+	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
+		xfs_warn(bp->b_target->bt_mount,
+"Detected failing async write on buffer block 0x%llx. Retrying async write.\n",
+			(long long)bp->b_bn);
+	}
+
 	if (!xfs_buf_delwri_queue(bp, buffer_list))
 		rval = XFS_ITEM_FLUSHING;
 	xfs_buf_unlock(bp);
@@ -1096,8 +1112,9 @@ xfs_buf_iodone_callbacks(

 	xfs_buf_ioerror(bp, 0);	/* errno of 0 unsets the flag */

-	if (!XFS_BUF_ISSTALE(bp)) {
-		bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
+	if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
+		bp->b_flags |= XBF_WRITE | XBF_ASYNC |
+			       XBF_DONE | XBF_WRITE_FAIL;
 		xfs_buf_iorequest(bp);
 	} else {
 		xfs_buf_relse(bp);
--
cgit v1.2.3-70-g09d2

From df36ac1bc2a166eef90785d584e4cfed6f52bd32 Mon Sep 17 00:00:00 2001
From: "Luck, Tony"
Date: Wed, 18 Dec 2013 15:17:10 -0800
Subject: pstore: Don't allow high traffic options on fragile devices

Some pstore backing devices use on board flash as persistent storage.
These have limited numbers of write cycles so it is a poor idea to use
them for high-frequency operations.

Signed-off-by: Tony Luck
Signed-off-by: Linus Torvalds
---
 drivers/acpi/apei/erst.c          | 1 +
 drivers/firmware/efi/efi-pstore.c | 1 +
 fs/pstore/platform.c              | 7 +++++--
 include/linux/pstore.h            | 3 +++
 4 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 26311f23c82..cb1d557fc22 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -942,6 +942,7 @@ static int erst_clearer(enum pstore_type_id type, u64 id, int count,
 static struct pstore_info erst_info = {
 	.owner		= THIS_MODULE,
 	.name		= "erst",
+	.flags		= PSTORE_FLAGS_FRAGILE,
 	.open		= erst_open_pstore,
 	.close		= erst_close_pstore,
 	.read		= erst_reader,
diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c
index 743fd426f21..4b9dc836dcf 100644
--- a/drivers/firmware/efi/efi-pstore.c
+++ b/drivers/firmware/efi/efi-pstore.c
@@ -356,6 +356,7 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count,
 static struct pstore_info efi_pstore_info = {
 	.owner		= THIS_MODULE,
 	.name		= "efi",
+	.flags		= PSTORE_FLAGS_FRAGILE,
 	.open		= efi_pstore_open,
 	.close		= efi_pstore_close,
 	.read		= efi_pstore_read,
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b8e93a40a5d..78c3c209778 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -443,8 +443,11 @@ int pstore_register(struct pstore_info *psi)
 	pstore_get_records(0);

 	kmsg_dump_register(&pstore_dumper);
-	pstore_register_console();
-	pstore_register_ftrace();
+
+	if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
+		pstore_register_console();
+		pstore_register_ftrace();
+	}

 	if (pstore_update_ms >= 0) {
 		pstore_timer.expires = jiffies +
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index abd437d0a8a..ece0c6bbfcc 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -51,6 +51,7 @@ struct pstore_info {
 	char		*buf;
 	size_t		bufsize;
 	struct mutex	read_mutex;	/* serialize open/read/close */
+	int		flags;
 	int		(*open)(struct pstore_info *psi);
 	int		(*close)(struct pstore_info *psi);
 	ssize_t		(*read)(u64 *id, enum pstore_type_id *type,
@@ -70,6 +71,8 @@ struct pstore_info {
 	void		*data;
 };

+#define	PSTORE_FLAGS_FRAGILE	1
+
 #ifdef CONFIG_PSTORE
 extern int pstore_register(struct pstore_info *);
 extern bool pstore_cannot_block_path(enum kmsg_dump_reason reason);
--
cgit v1.2.3-70-g09d2

From 1881686f842065d2f92ec9c6424830ffc17d23b0 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise
Date: Sat, 21 Dec 2013 15:49:28 -0500
Subject: aio: fix kioctx leak introduced by "aio: Fix a trinity splat"

e34ecee2ae791df674dfb466ce40692ca6218e43 reworked the percpu reference
counting to correct a bug trinity found. Unfortunately, the change led to
kioctxes being leaked because there was no final reference count to put.
Add that reference count back in to fix things.

Signed-off-by: Benjamin LaHaise
Cc: stable@vger.kernel.org
---
 fs/aio.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 6efb7f6cb22..fd1c0baf15b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -652,7 +652,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);

-	percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */
+	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
+	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */

 	err = ioctx_add_table(ctx, mm);
 	if (err)
--
cgit v1.2.3-70-g09d2

From 8e321fefb0e60bae4e2a28d20fc4fa30758d27c6 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise
Date: Sat, 21 Dec 2013 17:56:08 -0500
Subject: aio/migratepages: make aio migrate pages sane

The arbitrary restriction on page counts offered by the core
migrate_page_move_mapping() code results in rather suspicious-looking
fiddling with page reference counts in the aio_migratepage() operation.
To fix this, make migrate_page_move_mapping() take an extra_count
parameter that allows aio to tell the code about its own reference count
on the page being migrated.

While cleaning up aio_migratepage(), make it validate that the old page
being passed in is actually what aio_migratepage() expects to prevent
misbehaviour in the case of races.

Signed-off-by: Benjamin LaHaise
---
 fs/aio.c                | 52 +++++++++++++++++++++++++++++++++++++++++--------
 include/linux/migrate.h |  3 ++-
 mm/migrate.c            | 13 +++++++------
 3 files changed, 53 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index fd1c0baf15b..efa708b2905 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -244,9 +244,14 @@ static void aio_free_ring(struct kioctx *ctx)
 	int i;

 	for (i = 0; i < ctx->nr_pages; i++) {
+		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
 			 page_count(ctx->ring_pages[i]));
-		put_page(ctx->ring_pages[i]);
+		page = ctx->ring_pages[i];
+		if (!page)
+			continue;
+		ctx->ring_pages[i] = NULL;
+		put_page(page);
 	}

 	put_aio_ring_file(ctx);
@@ -280,18 +285,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	unsigned long flags;
 	int rc;

+	rc = 0;
+
+	/* Make sure the old page hasn't already been changed */
+	spin_lock(&mapping->private_lock);
+	ctx = mapping->private_data;
+	if (ctx) {
+		pgoff_t idx;
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		idx = old->index;
+		if (idx < (pgoff_t)ctx->nr_pages) {
+			if (ctx->ring_pages[idx] != old)
+				rc = -EAGAIN;
+		} else
+			rc = -EINVAL;
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	} else
+		rc = -EINVAL;
+	spin_unlock(&mapping->private_lock);
+
+	if (rc != 0)
+		return rc;
+
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
-	put_page(old);
+	get_page(new);
-	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
-		get_page(old);
+		put_page(new);
 		return rc;
 	}
-	get_page(new);
-
 	/* We can potentially race against kioctx teardown here.  Use the
 	 * address_space's private data lock to protect the mapping's
 	 * private_data. */
@@ -303,13 +328,24 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 		spin_lock_irqsave(&ctx->completion_lock, flags);
 		migrate_page_copy(new, old);
 		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages)
-			ctx->ring_pages[idx] = new;
+		if (idx < (pgoff_t)ctx->nr_pages) {
+			/* And only do the move if things haven't changed */
+			if (ctx->ring_pages[idx] == old)
+				ctx->ring_pages[idx] = new;
+			else
+				rc = -EAGAIN;
+		} else
+			rc = -EINVAL;
 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	} else
 		rc = -EBUSY;
 	spin_unlock(&mapping->private_lock);

+	if (rc == MIGRATEPAGE_SUCCESS)
+		put_page(old);
+	else
+		put_page(new);
+
 	return rc;
 }
 #endif
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index b7717d74da7..f015c059e15 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -55,7 +55,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page,
-		struct buffer_head *head, enum migrate_mode mode);
+		struct buffer_head *head, enum migrate_mode mode,
+		int extra_count);
 #else

 static inline void putback_lru_pages(struct list_head *l) {}
diff --git a/mm/migrate.c b/mm/migrate.c
index e9b71020133..9194375b230 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -317,14 +317,15 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
  */
 int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page,
-		struct buffer_head *head, enum migrate_mode mode)
+		struct buffer_head *head, enum migrate_mode mode,
+		int extra_count)
 {
-	int expected_count = 0;
+	int expected_count = 1 + extra_count;
 	void **pslot;

 	if (!mapping) {
 		/* Anonymous page without mapping */
-		if (page_count(page) != 1)
+		if (page_count(page) != expected_count)
 			return -EAGAIN;
 		return MIGRATEPAGE_SUCCESS;
 	}
@@ -334,7 +335,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
 					page_index(page));

-	expected_count = 2 + page_has_private(page);
+	expected_count += 1 + page_has_private(page);
 	if (page_count(page) != expected_count ||
 		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 		spin_unlock_irq(&mapping->tree_lock);
@@ -584,7 +585,7 @@ int migrate_page(struct address_space *mapping,

 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

-	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
+	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
@@ -611,7 +612,7 @@ int buffer_migrate_page(struct address_space *mapping,

 	head = page_buffers(page);

-	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
+	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
--
cgit v1.2.3-70-g09d2

From 3dc9acb67600393249a795934ccdfc291a200e6b Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Fri, 20 Dec 2013 05:11:12 +0900
Subject: aio: clean up and fix aio_setup_ring page mapping

Since commit 36bc08cc01709 ("fs/aio: Add support to aio ring pages
migration") the aio ring setup code has used a special per-ring backing
inode for the page allocations, rather than just using random anonymous
pages.

However, rather than remembering the pages as it allocated them, it would
allocate the pages, insert them into the file mapping (dirty, so that
they couldn't be free'd), and then forget about them. And then to look
them up again, it would mmap the mapping, and then use
"get_user_pages()" to get back an array of the pages we just created.

Now, not only is that incredibly inefficient, it also leaked all the
pages if the mmap failed (which could happen due to an excessive number
of mappings, for example).

So clean it all up, making it much more straightforward. Also remove some
left-overs of the previous (broken) mm_populate() usage that was removed
in commit d6c355c7dabc ("aio: fix race in ring buffer page lookup
introduced by page migration support") but left the pointless and now
misleading MAP_POPULATE flag around.

Tested-and-acked-by: Benjamin LaHaise
Signed-off-by: Linus Torvalds
---
 fs/aio.c | 58 +++++++++++++++++++++++-----------------------------------
 1 file changed, 23 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 6efb7f6cb22..643db8fc43c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -326,7 +326,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct aio_ring *ring;
 	unsigned nr_events = ctx->max_reqs;
 	struct mm_struct *mm = current->mm;
-	unsigned long size, populate;
+	unsigned long size, unused;
 	int nr_pages;
 	int i;
 	struct file *file;
@@ -347,6 +347,20 @@ static int aio_setup_ring(struct kioctx *ctx)
 		return -EAGAIN;
 	}

+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);
+
+	ctx->ring_pages = ctx->internal_pages;
+	if (nr_pages > AIO_RING_PAGES) {
+		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+					  GFP_KERNEL);
+		if (!ctx->ring_pages) {
+			put_aio_ring_file(ctx);
+			return -ENOMEM;
+		}
+	}
+
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
 		page = find_or_create_page(file->f_inode->i_mapping,
@@ -358,19 +372,14 @@ static int aio_setup_ring(struct kioctx *ctx)
 		SetPageUptodate(page);
 		SetPageDirty(page);
 		unlock_page(page);
+
+		ctx->ring_pages[i] = page;
 	}
+	ctx->nr_pages = i;

-	ctx->aio_ring_file = file;
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
-			/ sizeof(struct io_event);
-
-	ctx->ring_pages = ctx->internal_pages;
-	if (nr_pages > AIO_RING_PAGES) {
-		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
-					  GFP_KERNEL);
-		if (!ctx->ring_pages) {
-			put_aio_ring_file(ctx);
-			return -ENOMEM;
-		}
+	if (unlikely(i != nr_pages)) {
+		aio_free_ring(ctx);
+		return -EAGAIN;
 	}

 	ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -379,9 +388,9 @@ static int aio_setup_ring(struct kioctx *ctx)
 	down_write(&mm->mmap_sem);
 	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
 				       PROT_READ | PROT_WRITE,
-				       MAP_SHARED | MAP_POPULATE, 0, &populate);
+				       MAP_SHARED, 0, &unused);
+	up_write(&mm->mmap_sem);
 	if (IS_ERR((void *)ctx->mmap_base)) {
-		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
@@ -389,27 +398,6 @@ static int aio_setup_ring(struct kioctx *ctx)

 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);

-	/* We must do this while still holding mmap_sem for write, as we
-	 * need to be protected against userspace attempting to mremap()
-	 * or munmap() the ring buffer.
-	 */
-	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
-				       1, 0, ctx->ring_pages, NULL);
-
-	/* Dropping the reference here is safe as the page cache will hold
-	 * onto the pages for us.  It is also required so that page migration
-	 * can unmap the pages and get the right reference count.
-	 */
-	for (i = 0; i < ctx->nr_pages; i++)
-		put_page(ctx->ring_pages[i]);
-
-	up_write(&mm->mmap_sem);
-
-	if (unlikely(ctx->nr_pages != nr_pages)) {
-		aio_free_ring(ctx);
-		return -EAGAIN;
-	}
-
 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events;	/* trusted copy */
--
cgit v1.2.3-70-g09d2
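Taken together, the reworked aio_setup_ring() allocates each ring page,
records it in ctx->ring_pages[] immediately, and only then maps the ring
file, so there is no get_user_pages() round trip and no extra page
reference to drop. The following is a condensed C sketch of that flow,
assuming the 3.13-era structures and helpers shown in the diff above; the
function name is invented, error paths are abbreviated, and the real code
takes mm->mmap_sem for write around do_mmap_pgoff().

/* Condensed sketch of the reworked aio_setup_ring() flow (illustrative). */
static int aio_setup_ring_sketch(struct kioctx *ctx, struct file *file,
				 int nr_pages)
{
	unsigned long unused;
	int i;

	/* Allocate the ring pages and remember them as we go... */
	for (i = 0; i < nr_pages; i++) {
		struct page *page;

		page = find_or_create_page(file->f_inode->i_mapping, i,
					   GFP_HIGHUSER | __GFP_ZERO);
		if (!page)
			break;
		SetPageUptodate(page);
		SetPageDirty(page);
		unlock_page(page);
		ctx->ring_pages[i] = page;	/* ...instead of re-deriving them */
	}
	ctx->nr_pages = i;
	if (i != nr_pages) {
		aio_free_ring(ctx);		/* frees the pages we did get */
		return -EAGAIN;
	}

	/* Map the file; no MAP_POPULATE and no get_user_pages() round trip. */
	ctx->mmap_size = nr_pages * PAGE_SIZE;
	ctx->mmap_base = do_mmap_pgoff(file, 0, ctx->mmap_size,
				       PROT_READ | PROT_WRITE, MAP_SHARED,
				       0, &unused);
	if (IS_ERR((void *)ctx->mmap_base)) {
		ctx->mmap_size = 0;
		aio_free_ring(ctx);
		return -EAGAIN;
	}
	return 0;
}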