From 718360c59f34b80d9878429300c1c688f7c2031d Mon Sep 17 00:00:00 2001 From: Noah Massey Date: Thu, 30 Jan 2014 21:31:12 -0500 Subject: nfs: fix setting of ACLs on file creation. nfs3_get_acl() tries to skip posix equivalent ACLs, but misinterprets the return value of posix_acl_equiv_mode(). Fix it. This is a regression introduced by "nfs: use generic posix ACL infrastructure for v3 Posix ACLs" CC: Christoph Hellwig CC: linux-nfs@vger.kernel.org CC: linux-fsdevel@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9a5ca03fa53..0851f852568 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -80,7 +80,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) } if (res.acl_access != NULL) { - if (posix_acl_equiv_mode(res.acl_access, NULL) || + if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) || res.acl_access->a_count == 0) { posix_acl_release(res.acl_access); res.acl_access = NULL; -- cgit v1.2.3-70-g09d2 From 17ead6c85c3d0ef57a14d1373f1f1cee2ce60ea8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2014 14:53:23 -0500 Subject: NFSv4: Fix memory corruption in nfs4_proc_open_confirm nfs41_wake_and_assign_slot() relies on the task->tk_msg.rpc_argp and task->tk_msg.rpc_resp always pointing to the session sequence arguments. nfs4_proc_open_confirm tries to pull a fast one by reusing the open sequence structure, thus causing corruption of the NFSv4 slot table. Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 ++++---- include/linux/nfs_xdr.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 42da6af7758..2da6a698b8f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1620,15 +1620,15 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, - &data->o_res.seq_res, task); + nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, + &data->c_res.seq_res, task); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_sequence_done(task, &data->o_res.seq_res); + nfs40_sequence_done(task, &data->c_res.seq_res); data->rpc_status = task->tk_status; if (data->rpc_status == 0) { @@ -1686,7 +1686,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) }; int status; - nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1); + nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1); kref_get(&data->kref); data->rpc_done = 0; data->rpc_status = 0; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 3ccfcecf899..b2fb167b2e6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -379,12 +379,14 @@ struct nfs_openres { * Arguments to the open_confirm call. */ struct nfs_open_confirmargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; nfs4_stateid * stateid; struct nfs_seqid * seqid; }; struct nfs_open_confirmres { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_seqid * seqid; }; -- cgit v1.2.3-70-g09d2 From 20b9a9024540a775395d5d1f41eec0ec6ec41f9b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2014 13:47:06 -0500 Subject: NFSv4.1: nfs4_destroy_session must call rpc_destroy_waitqueue There may still be timers active on the session waitqueues. Make sure that we kill them before freeing the memory. Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 2 +- fs/nfs/nfs4session.c | 25 ++++++++++++++++++++----- fs/nfs/nfs4session.h | 2 +- 3 files changed, 22 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index dbb3e1f30c6..860ad26a559 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -170,7 +170,7 @@ void nfs41_shutdown_client(struct nfs_client *clp) void nfs40_shutdown_client(struct nfs_client *clp) { if (clp->cl_slot_tbl) { - nfs4_release_slot_table(clp->cl_slot_tbl); + nfs4_shutdown_slot_table(clp->cl_slot_tbl); kfree(clp->cl_slot_tbl); } } diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index cf883c7ae05..e799dc3c3b1 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -231,14 +231,23 @@ out: return ret; } +/* + * nfs4_release_slot_table - release all slot table entries + */ +static void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +{ + nfs4_shrink_slot_table(tbl, 0); +} + /** - * nfs4_release_slot_table - release resources attached to a slot table + * nfs4_shutdown_slot_table - release resources attached to a slot table * @tbl: slot table to shut down * */ -void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl) { - nfs4_shrink_slot_table(tbl, 0); + nfs4_release_slot_table(tbl); + rpc_destroy_wait_queue(&tbl->slot_tbl_waitq); } /** @@ -422,7 +431,7 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, spin_unlock(&tbl->slot_tbl_lock); } -static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +static void nfs4_release_session_slot_tables(struct nfs4_session *session) { nfs4_release_slot_table(&session->fc_slot_table); nfs4_release_slot_table(&session->bc_slot_table); @@ -450,7 +459,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses) if (status && tbl->slots == NULL) /* Fore and back channel share a connection so get * both slot tables or neither */ - nfs4_destroy_session_slot_tables(ses); + nfs4_release_session_slot_tables(ses); return status; } @@ -470,6 +479,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) return session; } +static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +{ + nfs4_shutdown_slot_table(&session->fc_slot_table); + nfs4_shutdown_slot_table(&session->bc_slot_table); +} + void nfs4_destroy_session(struct nfs4_session *session) { struct rpc_xprt *xprt; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 23230610065..b34ada9bc6a 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -74,7 +74,7 @@ enum nfs4_session_state { extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs, const char *queue); -extern void nfs4_release_slot_table(struct nfs4_slot_table *tbl); +extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); -- cgit v1.2.3-70-g09d2 From 8101c8dbf6243ba517aab58d69bf1bc37d8b7b9c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Jan 2014 16:05:30 -0500 Subject: Btrfs: disable snapshot aware defrag for now It's just broken and it's taking a lot of effort to fix it, so for now just disable it so people can defrag in peace. Thanks, Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fb74a536add..1af34d0c744 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2629,7 +2629,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) EXTENT_DEFRAG, 1, cached_state); if (ret) { u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); - if (last_snapshot >= BTRFS_I(inode)->generation) + if (0 && last_snapshot >= BTRFS_I(inode)->generation) /* the inode is shared */ new = record_old_file_extents(inode, ordered_extent); -- cgit v1.2.3-70-g09d2 From 0b947aff1599afbbd2ec07ada87b05af0f94cf10 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Wed, 29 Jan 2014 21:06:04 +0000 Subject: Btrfs: use btrfs_crc32c everywhere instead of libcrc32c After the commit titled "Btrfs: fix btrfs boot when compiled as built-in", LIBCRC32C requirement was removed from btrfs' Kconfig. This made it not possible to build a kernel with btrfs enabled (either as module or built-in) if libcrc32c is not enabled as well. So just replace all uses of libcrc32c with the equivalent function in btrfs hash.h - btrfs_crc32c. Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 4 ++-- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/send.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 160fb509d72..39bfd56a1f2 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -92,11 +92,11 @@ #include #include #include -#include #include #include #include "ctree.h" #include "disk-io.h" +#include "hash.h" #include "transaction.h" #include "extent_io.h" #include "volumes.h" @@ -1823,7 +1823,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, size_t sublen = i ? PAGE_CACHE_SIZE : (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); - crc = crc32c(crc, data, sublen); + crc = btrfs_crc32c(crc, data, sublen); } btrfs_csum_final(crc, csum); if (memcmp(csum, h->csum, state->csum_size)) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7619147da38..3903bd3f8d2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include #include "ctree.h" #include "disk-io.h" +#include "hash.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" @@ -244,7 +244,7 @@ out: u32 btrfs_csum_data(char *data, u32 seed, size_t len) { - return crc32c(seed, data, len); + return btrfs_crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, char *result) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 730dce39585..cf9107a6420 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -24,12 +24,12 @@ #include #include #include -#include #include #include #include "send.h" #include "backref.h" +#include "hash.h" #include "locking.h" #include "disk-io.h" #include "btrfs_inode.h" @@ -620,7 +620,7 @@ static int send_cmd(struct send_ctx *sctx) hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); hdr->crc = 0; - crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, -- cgit v1.2.3-70-g09d2 From 60efa5eb2e886852a0d5f9e1ffa7c896a1099da8 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sat, 1 Feb 2014 21:27:56 +0000 Subject: Btrfs: use late_initcall instead of module_init It seems that when init_btrfs_fs() is called, crc32c/crc32c-intel might not always be already initialized, which results in the call to crypto_alloc_shash() returning -ENOENT, as experienced by Ahmet who reported this. Therefore make sure init_btrfs_fs() is called after crc32c is initialized (which is at initialization level 6, module_init), by using late_initcall (which is at initialization level 7) instead of module_init for btrfs. Reported-and-Tested-by: Ahmet Inan Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c02f6335689..97cc2419855 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1996,7 +1996,7 @@ static void __exit exit_btrfs_fs(void) btrfs_hash_exit(); } -module_init(init_btrfs_fs) +late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From d4c42fb493e018e9240810bb6dc5334ae0505145 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 2 Feb 2014 14:41:42 -0500 Subject: NFSv3: Remove unused function nfs3_proc_set_default_acl Cc: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 0851f852568..9271a6bb9a4 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -233,25 +233,6 @@ fail: return PTR_ERR(alloc); } -int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, - umode_t mode) -{ - struct posix_acl *default_acl, *acl; - int error; - - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) - return (error == -EOPNOTSUPP) ? 0 : error; - - error = nfs3_proc_setacls(inode, acl, default_acl); - - if (acl) - posix_acl_release(acl); - if (default_acl) - posix_acl_release(default_acl); - return error; -} - const struct xattr_handler *nfs3_xattr_handlers[] = { &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, -- cgit v1.2.3-70-g09d2 From 8f493b9cfcd8941c6b27d6ce8e3b4a78c094b3c1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 2 Feb 2014 14:36:42 -0500 Subject: NFSv3: Fix return value of nfs3_proc_setacls nfs3_proc_setacls is used internally by the NFSv3 create operations to set the acl after the file has been created. If the operation fails because the server doesn't support acls, then it must return '0', not -EOPNOTSUPP. Reported-by: Russell King Link: http://lkml.kernel.org/r/20140201010328.GI15937@n2100.arm.linux.org.uk Cc: Christoph Hellwig Tested-by: Takashi Iwai Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9271a6bb9a4..871d6eda8db 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -113,7 +113,7 @@ getout: return ERR_PTR(status); } -int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, +static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl) { struct nfs_server *server = NFS_SERVER(inode); @@ -198,6 +198,15 @@ out: return status; } +int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + int ret; + ret = __nfs3_proc_setacls(inode, acl, dfacl); + return (ret == -EOPNOTSUPP) ? 0 : ret; + +} + int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct posix_acl *alloc = NULL, *dfacl = NULL; @@ -225,7 +234,7 @@ int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (IS_ERR(alloc)) goto fail; } - status = nfs3_proc_setacls(inode, acl, dfacl); + status = __nfs3_proc_setacls(inode, acl, dfacl); posix_acl_release(alloc); return status; -- cgit v1.2.3-70-g09d2 From 789b663ae3d427ea9c50505339a13276e7228c9d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 31 Jan 2014 14:25:19 -0500 Subject: fs: get_acl() must be allowed to return EOPNOTSUPP posix_acl_xattr_get requires get_acl() to return EOPNOTSUPP if the filesystem cannot support acls. This is needed for NFS, which can't know whether or not the server supports acls until it tries to get/set one. This patch converts posix_acl_chmod and posix_acl_create to deal with EOPNOTSUPP return values from get_acl(). Reported-by: Russell King Link: http://lkml.kernel.org/r/20140130140834.GW15937@n2100.arm.linux.org.uk Cc: Al Viro viro@zeniv.linux.org.uk> Reviewed-by: Christoph Hellwig Tested-by: Takashi Iwai Signed-off-by: Trond Myklebust --- fs/posix_acl.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 38bae5a0ea2..11c54fd51e1 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -521,8 +521,11 @@ posix_acl_chmod(struct inode *inode, umode_t mode) return -EOPNOTSUPP; acl = get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR_OR_NULL(acl)) + if (IS_ERR_OR_NULL(acl)) { + if (acl == ERR_PTR(-EOPNOTSUPP)) + return 0; return PTR_ERR(acl); + } ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) @@ -544,14 +547,15 @@ posix_acl_create(struct inode *dir, umode_t *mode, goto no_acl; p = get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(p)) + if (IS_ERR(p)) { + if (p == ERR_PTR(-EOPNOTSUPP)) + goto apply_umask; return PTR_ERR(p); - - if (!p) { - *mode &= ~current_umask(); - goto no_acl; } + if (!p) + goto apply_umask; + *acl = posix_acl_clone(p, GFP_NOFS); if (!*acl) return -ENOMEM; @@ -575,6 +579,8 @@ posix_acl_create(struct inode *dir, umode_t *mode, } return 0; +apply_umask: + *mode &= ~current_umask(); no_acl: *default_acl = NULL; *acl = NULL; -- cgit v1.2.3-70-g09d2 From da9846ae15186d491d6e21ebbb5051e1d3c7f652 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 29 Jan 2014 12:04:03 -0500 Subject: kernfs: make kernfs_deactivate() honor KERNFS_LOCKDEP flag kernfs_deactivate() forgot to check whether KERNFS_LOCKDEP is set before performing lockdep annotations and ends up feeding uninitialized lockdep_map to lockdep triggering warning like the following on USB stick hotunplug. usb 1-2: USB disconnect, device number 2 INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 1 PID: 62 Comm: khubd Not tainted 3.13.0-work+ #82 Hardware name: empty empty/S3992, BIOS 080011 10/26/2007 ffff880065ca7f60 ffff88013a4ffa08 ffffffff81cfb6bd 0000000000000002 ffff88013a4ffac8 ffffffff810f8530 ffff88013a4fc710 0000000000000002 ffff880100000000 ffffffff82a3db50 0000000000000001 ffff88013a4fc710 Call Trace: [] dump_stack+0x4e/0x7a [] __lock_acquire+0x1910/0x1e70 [] lock_acquire+0x9a/0x1d0 [] kernfs_deactivate+0xee/0x130 [] kernfs_addrm_finish+0x38/0x60 [] kernfs_remove_by_name_ns+0x51/0xa0 [] remove_files.isra.1+0x41/0x80 [] sysfs_remove_group+0x47/0xa0 [] sysfs_remove_groups+0x33/0x50 [] device_remove_attrs+0x4d/0x80 [] device_del+0x12e/0x1d0 [] usb_disconnect+0x122/0x1a0 [] hub_thread+0x3c5/0x1290 [] kthread+0xed/0x110 [] ret_from_fork+0x7c/0xb0 Fix it by making kernfs_deactivate() perform lockdep annotations only if KERNFS_LOCKDEP is set. Signed-off-by: Tejun Heo Reported-by: Fabio Estevam Reported-by: Alan Stern Reported-by: Jiri Kosina Reported-by: Dave Jones Tested-by: Fabio Estevam Tested-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5104cf5d25c..bd6e18be6e1 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -187,19 +187,23 @@ static void kernfs_deactivate(struct kernfs_node *kn) kn->u.completion = (void *)&wait; - rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + if (kn->flags & KERNFS_LOCKDEP) + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see * the updated kn->u.completion. */ v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active); if (v != KN_DEACTIVATED_BIAS) { - lock_contended(&kn->dep_map, _RET_IP_); + if (kn->flags & KERNFS_LOCKDEP) + lock_contended(&kn->dep_map, _RET_IP_); wait_for_completion(&wait); } - lock_acquired(&kn->dep_map, _RET_IP_); - rwsem_release(&kn->dep_map, 1, _RET_IP_); + if (kn->flags & KERNFS_LOCKDEP) { + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); + } } /** -- cgit v1.2.3-70-g09d2 From c4ad8f98bef77c7356aa6a9ad9188a6acc6b849d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Feb 2014 12:54:53 -0800 Subject: execve: use 'struct filename *' for executable name passing This changes 'do_execve()' to get the executable name as a 'struct filename', and to free it when it is done. This is what the normal users want, and it simplifies and streamlines their error handling. The controlled lifetime of the executable name also fixes a use-after-free problem with the trace_sched_process_exec tracepoint: the lifetime of the passed-in string for kernel users was not at all obvious, and the user-mode helper code used UMH_WAIT_EXEC to serialize the pathname allocation lifetime with the execve() having finished, which in turn meant that the trace point that happened after mm_release() of the old process VM ended up using already free'd memory. To solve the kernel string lifetime issue, this simply introduces "getname_kernel()" that works like the normal user-space getname() function, except with the source coming from kernel memory. As Oleg points out, this also means that we could drop the tcomm[] array from 'struct linux_binprm', since the pathname lifetime now covers setup_new_exec(). That would be a separate cleanup. Reported-by: Igor Zhbanov Tested-by: Steven Rostedt Cc: Oleg Nesterov Cc: Al Viro Signed-off-by: Linus Torvalds --- arch/parisc/hpux/fs.c | 15 +-------------- fs/exec.c | 45 +++++++++++++++++++++------------------------ fs/namei.c | 30 ++++++++++++++++++++++++++++++ include/linux/binfmts.h | 1 - include/linux/fs.h | 1 + include/linux/sched.h | 3 ++- init/main.c | 2 +- kernel/auditsc.c | 2 +- kernel/kmod.c | 2 +- 9 files changed, 58 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c index 88d0962de65..2bedafea3d9 100644 --- a/arch/parisc/hpux/fs.c +++ b/arch/parisc/hpux/fs.c @@ -33,22 +33,9 @@ int hpux_execve(struct pt_regs *regs) { - int error; - struct filename *filename; - - filename = getname((const char __user *) regs->gr[26]); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - - error = do_execve(filename->name, + return do_execve(getname((const char __user *) regs->gr[26]), (const char __user *const __user *) regs->gr[25], (const char __user *const __user *) regs->gr[24]); - - putname(filename); - -out: - return error; } struct hpux_dirent { diff --git a/fs/exec.c b/fs/exec.c index e1529b4c79b..3d78fccdd72 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -748,11 +748,10 @@ EXPORT_SYMBOL(setup_arg_pages); #endif /* CONFIG_MMU */ -struct file *open_exec(const char *name) +static struct file *do_open_exec(struct filename *name) { struct file *file; int err; - struct filename tmp = { .name = name }; static const struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, @@ -760,7 +759,7 @@ struct file *open_exec(const char *name) .lookup_flags = LOOKUP_FOLLOW, }; - file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags); + file = do_filp_open(AT_FDCWD, name, &open_exec_flags); if (IS_ERR(file)) goto out; @@ -784,6 +783,12 @@ exit: fput(file); return ERR_PTR(err); } + +struct file *open_exec(const char *name) +{ + struct filename tmp = { .name = name }; + return do_open_exec(&tmp); +} EXPORT_SYMBOL(open_exec); int kernel_read(struct file *file, loff_t offset, @@ -1162,7 +1167,7 @@ int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } -void free_bprm(struct linux_binprm *bprm) +static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { @@ -1432,7 +1437,7 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execve_common(const char *filename, +static int do_execve_common(struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp) { @@ -1441,6 +1446,9 @@ static int do_execve_common(const char *filename, struct files_struct *displaced; int retval; + if (IS_ERR(filename)) + return PTR_ERR(filename); + /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs @@ -1473,7 +1481,7 @@ static int do_execve_common(const char *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = open_exec(filename); + file = do_open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1481,8 +1489,7 @@ static int do_execve_common(const char *filename, sched_exec(); bprm->file = file; - bprm->filename = filename; - bprm->interp = filename; + bprm->filename = bprm->interp = filename->name; retval = bprm_mm_init(bprm); if (retval) @@ -1523,6 +1530,7 @@ static int do_execve_common(const char *filename, acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); + putname(filename); if (displaced) put_files_struct(displaced); return retval; @@ -1544,10 +1552,11 @@ out_files: if (displaced) reset_files_struct(displaced); out_ret: + putname(filename); return retval; } -int do_execve(const char *filename, +int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { @@ -1557,7 +1566,7 @@ int do_execve(const char *filename, } #ifdef CONFIG_COMPAT -static int compat_do_execve(const char *filename, +static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { @@ -1607,25 +1616,13 @@ SYSCALL_DEFINE3(execve, const char __user *const __user *, argv, const char __user *const __user *, envp) { - struct filename *path = getname(filename); - int error = PTR_ERR(path); - if (!IS_ERR(path)) { - error = do_execve(path->name, argv, envp); - putname(path); - } - return error; + return do_execve(getname(filename), argv, envp); } #ifdef CONFIG_COMPAT asmlinkage long compat_sys_execve(const char __user * filename, const compat_uptr_t __user * argv, const compat_uptr_t __user * envp) { - struct filename *path = getname(filename); - int error = PTR_ERR(path); - if (!IS_ERR(path)) { - error = compat_do_execve(path->name, argv, envp); - putname(path); - } - return error; + return compat_do_execve(getname(filename), argv, envp); } #endif diff --git a/fs/namei.c b/fs/namei.c index d580df2e680..385f7817bfc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -196,6 +196,7 @@ recopy: goto error; result->uptr = filename; + result->aname = NULL; audit_getname(result); return result; @@ -210,6 +211,35 @@ getname(const char __user * filename) return getname_flags(filename, 0, NULL); } +/* + * The "getname_kernel()" interface doesn't do pathnames longer + * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user. + */ +struct filename * +getname_kernel(const char * filename) +{ + struct filename *result; + char *kname; + int len; + + len = strlen(filename); + if (len >= EMBEDDED_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + result = __getname(); + if (unlikely(!result)) + return ERR_PTR(-ENOMEM); + + kname = (char *)result + sizeof(*result); + result->name = kname; + result->uptr = NULL; + result->aname = NULL; + result->separate = false; + + strlcpy(kname, filename, EMBEDDED_NAME_MAX); + return result; +} + #ifdef CONFIG_AUDITSYSCALL void putname(struct filename *name) { diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index fd8bf3219ef..b4a745d7d9a 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -115,7 +115,6 @@ extern int copy_strings_kernel(int argc, const char *const *argv, extern int prepare_bprm_creds(struct linux_binprm *bprm); extern void install_exec_creds(struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); -extern void free_bprm(struct linux_binprm *); extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t); #endif /* _LINUX_BINFMTS_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 09f553c5981..d79678c188a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2079,6 +2079,7 @@ extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname(const char __user *); +extern struct filename *getname_kernel(const char *); enum { FILE_CREATED = 1, diff --git a/include/linux/sched.h b/include/linux/sched.h index 68a0e84463a..a781dec1cd0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -128,6 +128,7 @@ struct bio_list; struct fs_struct; struct perf_event_context; struct blk_plug; +struct filename; /* * List of flags we want to share for kernel threads, @@ -2311,7 +2312,7 @@ extern void do_group_exit(int); extern int allow_signal(int); extern int disallow_signal(int); -extern int do_execve(const char *, +extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); diff --git a/init/main.c b/init/main.c index 2fd9cef70ee..eb03090cdce 100644 --- a/init/main.c +++ b/init/main.c @@ -812,7 +812,7 @@ void __init load_default_modules(void) static int run_init_process(const char *init_filename) { argv_init[0] = init_filename; - return do_execve(init_filename, + return do_execve(getname_kernel(init_filename), (const char __user *const __user *)argv_init, (const char __user *const __user *)envp_init); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 10176cd5956..7aef2f4b6c6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1719,7 +1719,7 @@ void audit_putname(struct filename *name) struct audit_context *context = current->audit_context; BUG_ON(!context); - if (!context->in_syscall) { + if (!name->aname || !context->in_syscall) { #if AUDIT_DEBUG == 2 printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); diff --git a/kernel/kmod.c b/kernel/kmod.c index b086006c59e..6b375af4958 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -239,7 +239,7 @@ static int ____call_usermodehelper(void *data) commit_creds(new); - retval = do_execve(sub_info->path, + retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); if (!retval) -- cgit v1.2.3-70-g09d2 From fb951eb5e167de9f07973ce0dfff674a2019bfab Mon Sep 17 00:00:00 2001 From: Zongxun Wang Date: Thu, 6 Feb 2014 12:04:20 -0800 Subject: ocfs2: free allocated clusters if error occurs after ocfs2_claim_clusters Even if using the same jbd2 handle, we cannot rollback a transaction. So once some error occurs after successfully allocating clusters, the allocated clusters will never be used and it means they are lost. For example, call ocfs2_claim_clusters successfully when expanding a file, but failed in ocfs2_insert_extent. So we need free the allocated clusters if they are not used indeed. Signed-off-by: Zongxun Wang Signed-off-by: Joseph Qi Acked-by: Joel Becker Cc: Mark Fasheh Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 38 +++++++++++++++++++++++++++++++++++--- fs/ocfs2/localalloc.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/localalloc.h | 6 ++++++ 3 files changed, 83 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 8750ae1b863..aada5801567 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, enum ocfs2_alloc_restarted *reason_ret) { int status = 0, err = 0; + int need_free = 0; int free_extents; enum ocfs2_alloc_restarted reason = RESTART_NONE; u32 bit_off, num_bits; @@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); - goto leave; + need_free = 1; + goto bail; } block = ocfs2_clusters_to_blocks(osb->sb, bit_off); @@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, num_bits, flags, meta_ac); if (status < 0) { mlog_errno(status); - goto leave; + need_free = 1; + goto bail; } ocfs2_journal_dirty(handle, et->et_root_bh); @@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, reason = RESTART_TRANS; } +bail: + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num_bits); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num_bits); + } + leave: if (reason_ret) *reason_ret = reason; @@ -6805,6 +6821,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { int ret, i, has_data, num_pages = 0; + int need_free = 0; + u32 bit_off, num; handle_t *handle; u64 uninitialized_var(block); struct ocfs2_inode_info *oi = OCFS2_I(inode); @@ -6850,7 +6868,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } if (has_data) { - u32 bit_off, num; unsigned int page_end; u64 phys; @@ -6886,6 +6903,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6896,6 +6914,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_read_inline_data(inode, pages[0], di_bh); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6927,6 +6946,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6938,6 +6958,18 @@ out_commit: dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num); + } + ocfs2_commit_trans(osb, handle); out_unlock: diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index cd5496b7a0a..04401345562 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -781,6 +781,48 @@ bail: return status; } +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits) +{ + int status, start; + u32 clear_bits; + struct inode *local_alloc_inode; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + bitmap = la->la_bitmap; + start = bit_off - le32_to_cpu(la->la_bm_off); + clear_bits = num_bits; + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while (clear_bits--) + ocfs2_clear_bit(start++, bitmap); + + le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: + return status; +} + static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) { u32 count; diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 1be9b586446..44a7d1fb2de 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, u32 *bit_off, u32 *num_bits); +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits); + void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, unsigned int num_clusters); void ocfs2_la_enable_worker(struct work_struct *work); -- cgit v1.2.3-70-g09d2 From 227d53b397a32a7614667b3ecaf1d89902fb6c12 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 6 Feb 2014 12:04:28 -0800 Subject: mm: __set_page_dirty uses spin_lock_irqsave instead of spin_lock_irq To use spin_{un}lock_irq is dangerous if caller disabled interrupt. During aio buffer migration, we have a possibility to see the following call stack. aio_migratepage [disable interrupt] migrate_page_copy clear_page_dirty_for_io set_page_dirty __set_page_dirty_buffers __set_page_dirty spin_lock_irq This mean, current aio migration is a deadlockable. spin_lock_irqsave is a safer alternative and we should use it. Signed-off-by: KOSAKI Motohiro Reported-by: David Rientjes rientjes@google.com> Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 651dba10b9c..27265a8b43c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -654,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { - spin_lock_irq(&mapping->tree_lock); + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } -- cgit v1.2.3-70-g09d2 From d979f3b0a1f0b5499ab85e68cdf02b56852918b6 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 1 Feb 2014 23:27:18 -0600 Subject: Add protocol specific operation for CIFS xattrs Changeset 666753c3ef8fc88b0ddd5be4865d0aa66428ac35 added protocol operations for get/setxattr to avoid calling cifs operations on smb2/smb3 mounts for xattr operations and this changeset adds the calls to cifs specific protocol operations for xattrs (in order to reenable cifs support for xattrs which was temporarily disabled by the previous changeset. We do not have SMB2/SMB3 worker function for setting xattrs yet so this only enables it for cifs. CCing stable since without these two small changsets (its small coreq 666753c3ef8fc88b0ddd5be4865d0aa66428ac35 is also needed) calling getfattr/setfattr on smb2/smb3 mounts causes problems. Signed-off-by: Steve French Reviewed-by: Shirish Pargaonkar CC: Stable --- fs/cifs/inode.c | 13 +++++++++---- fs/cifs/smb1ops.c | 4 ++++ 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9cb9679d735..be58b8fcdb3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -527,10 +527,15 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS", - ea_value, 4 /* size of buf */, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tcon->ses->server->ops->query_all_EAs == NULL) { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + + rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path, + "SETFILEBITS", ea_value, 4 /* size of buf */, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); cifs_put_tlink(tlink); if (rc < 0) return (int)rc; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 9ac5bfc9cc5..3e4ff79e303 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1067,6 +1067,10 @@ struct smb_version_operations smb1_operations = { .query_mf_symlink = cifs_query_mf_symlink, .create_mf_symlink = cifs_create_mf_symlink, .is_read_op = cifs_is_read_op, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = CIFSSMBQAllEAs, + .set_EA = CIFSSMBSetEA, +#endif /* CIFS_XATTR */ }; struct smb_version_values smb1_values = { -- cgit v1.2.3-70-g09d2 From 83e3bc23ef9ce7c03b7b4e5d3d790246ea59db3e Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 2 Feb 2014 23:31:47 -0600 Subject: retrieving CIFS ACLs when mounted with SMB2 fails dropping session The get/set ACL xattr support for CIFS ACLs attempts to send old cifs dialect protocol requests even when mounted with SMB2 or later dialects. Sending cifs requests on an smb2 session causes problems - the server drops the session due to the illegal request. This patch makes CIFS ACL operations protocol specific to fix that. Attempting to query/set CIFS ACLs for SMB2 will now return EOPNOTSUPP (until we add worker routines for sending query ACL requests via SMB2) instead of sending invalid (cifs) requests. A separate followon patch will be needed to fix cifs_acl_to_fattr (which takes a cifs specific u16 fid so can't be abstracted to work with SMB2 until that is changed) and will be needed to fix mount problems when "cifsacl" is specified on mount with e.g. vers=2.1 Signed-off-by: Steve French Reviewed-by: Shirish Pargaonkar CC: Stable --- fs/cifs/cifsacl.c | 28 ++++++++++++++++++++++++---- fs/cifs/cifsglob.h | 4 ++++ fs/cifs/smb1ops.c | 4 ++++ fs/cifs/xattr.c | 15 +++++++++++---- 4 files changed, 43 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 8f9b4f710d4..c819b0bd491 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1043,15 +1043,30 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, __u32 secdesclen = 0; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_tcon *tcon; + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); cifs_dbg(NOISY, "set ACL from mode for %s\n", path); /* Get the security descriptor */ - pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); + + if (tcon->ses->server->ops->get_acl == NULL) { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + + pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, + &secdesclen); if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); - goto out; + cifs_put_tlink(tlink); + return rc; } /* @@ -1064,6 +1079,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, pnntsd = kmalloc(secdesclen, GFP_KERNEL); if (!pnntsd) { kfree(pntsd); + cifs_put_tlink(tlink); return -ENOMEM; } @@ -1072,14 +1088,18 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); + if (tcon->ses->server->ops->set_acl == NULL) + rc = -EOPNOTSUPP; + if (!rc) { /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag); + rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode, + path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } + cifs_put_tlink(tlink); kfree(pnntsd); kfree(pntsd); -out: return rc; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index a245d1809ed..615e35ab7ef 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -395,6 +395,10 @@ struct smb_version_operations { int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, const char *, const void *, const __u16, const struct nls_table *, int); + struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *, + const char *, u32 *); + int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, + int); }; struct smb_version_values { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 3e4ff79e303..bfd66d84831 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1071,6 +1071,10 @@ struct smb_version_operations smb1_operations = { .query_all_EAs = CIFSSMBQAllEAs, .set_EA = CIFSSMBSetEA, #endif /* CIFS_XATTR */ +#ifdef CONFIG_CIFS_ACL + .get_acl = get_cifs_acl, + .set_acl = set_cifs_acl, +#endif /* CIFS_ACL */ }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 95c43bb2033..5ac836a86b1 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -176,8 +176,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, rc = -ENOMEM; } else { memcpy(pacl, ea_value, value_size); - rc = set_cifs_acl(pacl, value_size, - direntry->d_inode, full_path, CIFS_ACL_DACL); + if (pTcon->ses->server->ops->set_acl) + rc = pTcon->ses->server->ops->set_acl(pacl, + value_size, direntry->d_inode, + full_path, CIFS_ACL_DACL); + else + rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ CIFS_I(direntry->d_inode)->time = 0; kfree(pacl); @@ -323,8 +327,11 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, u32 acllen; struct cifs_ntsd *pacl; - pacl = get_cifs_acl(cifs_sb, direntry->d_inode, - full_path, &acllen); + if (pTcon->ses->server->ops->get_acl == NULL) + goto get_ea_exit; /* rc already EOPNOTSUPP */ + + pacl = pTcon->ses->server->ops->get_acl(cifs_sb, + direntry->d_inode, full_path, &acllen); if (IS_ERR(pacl)) { rc = PTR_ERR(pacl); cifs_dbg(VFS, "%s: error %zd getting sec desc\n", -- cgit v1.2.3-70-g09d2 From 087787959ce851d7bbb19f10f6e9241b7f85a3ca Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 6 Feb 2014 15:14:13 -0500 Subject: block: Fix nr_vecs for inline integrity vectors Commit 9f060e2231ca changed the way we handle allocations for the integrity vectors. When the vectors are inline there is no associated slab and consequently bvec_nr_vecs() returns 0. Ensure that we check against BIP_INLINE_VECS in that case. Reported-by: David Milburn Tested-by: David Milburn Cc: stable@vger.kernel.org # v3.10+ Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 0bad24ddc2e..bcb4a4d533a 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -114,6 +114,14 @@ void bio_integrity_free(struct bio *bio) } EXPORT_SYMBOL(bio_integrity_free); +static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) +{ + if (bip->bip_slab == BIO_POOL_NONE) + return BIP_INLINE_VECS; + + return bvec_nr_vecs(bip->bip_slab); +} + /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update @@ -129,7 +137,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_vec *iv; - if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) { + if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } -- cgit v1.2.3-70-g09d2 From 26c8f0d601f5d4c0d9f4bc8c5151539aae5dc26a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 7 Feb 2014 11:04:04 -0500 Subject: cifs: use a flexarray in cifs_writedata The cifs_writedata code uses a single element trailing array, which just adds unneeded complexity. Use a flexarray instead. Signed-off-by: Jeff Layton Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 +- fs/cifs/cifssmb.c | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 615e35ab7ef..d6a031ed391 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1068,7 +1068,7 @@ struct cifs_writedata { unsigned int pagesz; unsigned int tailsz; unsigned int nr_pages; - struct page *pages[1]; + struct page *pages[]; }; /* diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4d881c35eec..0cd742ccb01 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1962,15 +1962,9 @@ cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) { struct cifs_writedata *wdata; - /* this would overflow */ - if (nr_pages == 0) { - cifs_dbg(VFS, "%s: called with nr_pages == 0!\n", __func__); - return NULL; - } - /* writedata + number of page pointers */ wdata = kzalloc(sizeof(*wdata) + - sizeof(struct page *) * (nr_pages - 1), GFP_NOFS); + sizeof(struct page *) * nr_pages, GFP_NOFS); if (wdata != NULL) { kref_init(&wdata->refcount); INIT_LIST_HEAD(&wdata->list); -- cgit v1.2.3-70-g09d2 From 4a5c80d7b5615be8098f9d5da97d166afc318abc Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 7 Feb 2014 20:45:12 -0600 Subject: [CIFS] clean up page array when uncached write send fails In the event that a send fails in an uncached write, or we end up needing to reissue it (-EAGAIN case), we'll kfree the wdata but the pages currently leak. Fix this by adding a new kref release routine for uncached writedata that releases the pages, and have the uncached codepaths use that. [original patch by Jeff modified to fix minor formatting problems] Signed-off-by: Jeff Layton Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 3 ++- fs/cifs/cifsproto.h | 3 ++- fs/cifs/cifssmb.c | 7 ++++--- fs/cifs/file.c | 31 ++++++++++++++++++++----------- fs/cifs/smb2pdu.c | 5 +++-- fs/cifs/smb2proto.h | 3 ++- 6 files changed, 33 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d6a031ed391..86dc28c7aa5 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -323,7 +323,8 @@ struct smb_version_operations { /* async read from the server */ int (*async_readv)(struct cifs_readdata *); /* async write to the server */ - int (*async_writev)(struct cifs_writedata *); + int (*async_writev)(struct cifs_writedata *, + void (*release)(struct kref *)); /* sync read from the server */ int (*sync_read)(const unsigned int, struct cifsFileInfo *, struct cifs_io_parms *, unsigned int *, char **, diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 79e6e9a93a8..d00e09dfc45 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -488,7 +488,8 @@ void cifs_readdata_release(struct kref *refcount); int cifs_async_readv(struct cifs_readdata *rdata); int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid); -int cifs_async_writev(struct cifs_writedata *wdata); +int cifs_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)); void cifs_writev_complete(struct work_struct *work); struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 0cd742ccb01..f3264bd7a83 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1910,7 +1910,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) do { server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata); + rc = server->ops->async_writev(wdata, cifs_writedata_release); } while (rc == -EAGAIN); for (i = 0; i < wdata->nr_pages; i++) { @@ -2025,7 +2025,8 @@ cifs_writev_callback(struct mid_q_entry *mid) /* cifs_async_writev - send an async write, and set up mid to handle result */ int -cifs_async_writev(struct cifs_writedata *wdata) +cifs_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)) { int rc = -EACCES; WRITE_REQ *smb = NULL; @@ -2099,7 +2100,7 @@ cifs_async_writev(struct cifs_writedata *wdata) if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); else - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, release); async_writev_out: cifs_small_buf_release(smb); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 853d6d1cc82..a301edbdad4 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2043,7 +2043,8 @@ retry: } wdata->pid = wdata->cfile->pid; server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata); + rc = server->ops->async_writev(wdata, + cifs_writedata_release); } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); for (i = 0; i < nr_pages; ++i) @@ -2331,9 +2332,20 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) } static void -cifs_uncached_writev_complete(struct work_struct *work) +cifs_uncached_writedata_release(struct kref *refcount) { int i; + struct cifs_writedata *wdata = container_of(refcount, + struct cifs_writedata, refcount); + + for (i = 0; i < wdata->nr_pages; i++) + put_page(wdata->pages[i]); + cifs_writedata_release(refcount); +} + +static void +cifs_uncached_writev_complete(struct work_struct *work) +{ struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); struct inode *inode = wdata->cfile->dentry->d_inode; @@ -2347,12 +2359,7 @@ cifs_uncached_writev_complete(struct work_struct *work) complete(&wdata->done); - if (wdata->result != -EAGAIN) { - for (i = 0; i < wdata->nr_pages; i++) - put_page(wdata->pages[i]); - } - - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, cifs_uncached_writedata_release); } /* attempt to send write to server, retry on any -EAGAIN errors */ @@ -2370,7 +2377,8 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata) if (rc != 0) continue; } - rc = server->ops->async_writev(wdata); + rc = server->ops->async_writev(wdata, + cifs_uncached_writedata_release); } while (rc == -EAGAIN); return rc; @@ -2454,7 +2462,8 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); rc = cifs_uncached_retry_writev(wdata); if (rc) { - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); break; } @@ -2496,7 +2505,7 @@ restart_loop: } } list_del_init(&wdata->list); - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, cifs_uncached_writedata_release); } if (total_written > 0) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2013234b73a..a3f7a9c3cc6 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1890,7 +1890,8 @@ smb2_writev_callback(struct mid_q_entry *mid) /* smb2_async_writev - send an async write, and set up mid to handle result */ int -smb2_async_writev(struct cifs_writedata *wdata) +smb2_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)) { int rc = -EACCES; struct smb2_write_req *req = NULL; @@ -1938,7 +1939,7 @@ smb2_async_writev(struct cifs_writedata *wdata) smb2_writev_callback, wdata, 0); if (rc) { - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); } diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 93adc64666f..0ce48db20a6 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -123,7 +123,8 @@ extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, extern int smb2_async_readv(struct cifs_readdata *rdata); extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *buf_type); -extern int smb2_async_writev(struct cifs_writedata *wdata); +extern int smb2_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)); extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, struct kvec *iov, int n_vec); extern int SMB2_echo(struct TCP_Server_Info *server); -- cgit v1.2.3-70-g09d2 From c18f7b51200c3c8b76c63e391f9995b65ace9c83 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Fri, 7 Feb 2014 14:36:10 -0600 Subject: jfs: fix generic posix ACL regression I missed a couple errors in reviewing the patches converting jfs to use the generic posix ACL function. Setting ACL's currently fails with -EOPNOTSUPP. Signed-off-by: Dave Kleikamp Reported-by: Michael L. Semon Reviewed-by: Christoph Hellwig --- fs/jfs/xattr.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 3bd5ee45f7b..46325d5c34f 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -854,9 +854,6 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, int rc; tid_t tid; - if ((rc = can_set_xattr(inode, name, value, value_len))) - return rc; - /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -865,6 +862,9 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_setxattr(dentry, name, value, value_len, flags); + if ((rc = can_set_xattr(inode, name, value, value_len))) + return rc; + if (value == NULL) { /* empty EA, do not remove */ value = ""; value_len = 0; @@ -1034,9 +1034,6 @@ int jfs_removexattr(struct dentry *dentry, const char *name) int rc; tid_t tid; - if ((rc = can_set_xattr(inode, name, NULL, 0))) - return rc; - /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -1045,6 +1042,9 @@ int jfs_removexattr(struct dentry *dentry, const char *name) if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_removexattr(dentry, name); + if ((rc = can_set_xattr(inode, name, NULL, 0))) + return rc; + tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); @@ -1061,7 +1061,7 @@ int jfs_removexattr(struct dentry *dentry, const char *name) * attributes are handled directly. */ const struct xattr_handler *jfs_xattr_handlers[] = { -#ifdef JFS_POSIX_ACL +#ifdef CONFIG_JFS_POSIX_ACL &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, #endif -- cgit v1.2.3-70-g09d2 From 6cc98d90f8d14f8ebce2391323929024d7eef39f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Feb 2014 16:19:21 -0500 Subject: Btrfs: fix assert screwup for the pending move stuff Wang noticed that he was failing btrfs/030 even though me and Filipe couldn't reproduce. Turns out this is because Wang didn't have CONFIG_BTRFS_ASSERT set, which meant that a key part of Filipe's original patch was not being built in. This appears to be a mess up with merging Filipe's patch as it does not exist in his original patch. Fix this by changing how we make sure del_waiting_dir_move asserts that it did not error and take the function out of the ifdef check. This makes btrfs/030 pass with the assert on or off. Thanks, Signed-off-by: Josef Bacik Reviewed-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/send.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index cf9107a6420..9c8d1a3fdc3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2774,8 +2774,6 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) return 0; } -#ifdef CONFIG_BTRFS_ASSERT - static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) { struct rb_node *n = sctx->waiting_dir_moves.rb_node; @@ -2796,8 +2794,6 @@ static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) return -ENOENT; } -#endif - static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; @@ -2902,7 +2898,9 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } sctx->send_progress = sctx->cur_ino + 1; - ASSERT(del_waiting_dir_move(sctx, pm->ino) == 0); + ret = del_waiting_dir_move(sctx, pm->ino); + ASSERT(ret == 0); + ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); if (ret < 0) goto out; -- cgit v1.2.3-70-g09d2 From d0270aca88966641eb15306e9bd0c7ad15321440 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 7 Feb 2014 14:33:57 +0100 Subject: btrfs: commit transaction after setting label and features The set_fslabel ioctl uses btrfs_end_transaction, which means it's possible that the change will be lost if the system crashes, same for the newly set features. Let's use btrfs_commit_transaction instead. Signed-off-by: Jeff Mahoney Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 34772cbcc7a..5bbf6b7216c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4547,7 +4547,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) spin_lock(&root->fs_info->super_lock); strcpy(super_block->label, label); spin_unlock(&root->fs_info->super_lock); - ret = btrfs_end_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); out_unlock: mnt_drop_write_file(file); @@ -4711,7 +4711,7 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) btrfs_set_super_incompat_flags(super_block, newflags); spin_unlock(&root->fs_info->super_lock); - return btrfs_end_transaction(trans, root); + return btrfs_commit_transaction(trans, root); } long btrfs_ioctl(struct file *file, unsigned int -- cgit v1.2.3-70-g09d2 From 8051aa1a3d5aaa7bd4c062cad94d09c3d567ef2e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 7 Feb 2014 14:34:04 +0100 Subject: btrfs: reserve no transaction units in btrfs_ioctl_set_features Added in patch "btrfs: add ioctls to query/change feature bits online" modifications to superblock don't need to reserve metadata blocks when starting a transaction. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5bbf6b7216c..ebdd866d4cf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4690,7 +4690,7 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) if (ret) return ret; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); -- cgit v1.2.3-70-g09d2 From 27a377db745ed4d11b3b9b340756857cb8dde07f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 7 Feb 2014 13:57:59 -0500 Subject: Btrfs: don't loop forever if we can't run because of the tree mod log A user reported a 100% cpu hang with my new delayed ref code. Turns out I forgot to increase the count check when we can't run a delayed ref because of the tree mod log. If we can't run any delayed refs during this there is no point in continuing to look, and we need to break out. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9c9ecc93ae2..32312e09f0f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2385,6 +2385,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); locked_ref = NULL; cond_resched(); + count++; continue; } -- cgit v1.2.3-70-g09d2 From a2aa75e18a21b21952dc6daa9bac7c9f4426f81f Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sat, 8 Feb 2014 15:47:46 +0000 Subject: Btrfs: fix data corruption when reading/updating compressed extents When using a mix of compressed file extents and prealloc extents, it is possible to fill a page of a file with random, garbage data from some unrelated previous use of the page, instead of a sequence of zeroes. A simple sequence of steps to get into such case, taken from the test case I made for xfstests, is: _scratch_mkfs _scratch_mount "-o compress-force=lzo" $XFS_IO_PROG -f -c "pwrite -S 0x06 -b 18670 266978 18670" $SCRATCH_MNT/foobar $XFS_IO_PROG -c "falloc 26450 665194" $SCRATCH_MNT/foobar $XFS_IO_PROG -c "truncate 542872" $SCRATCH_MNT/foobar $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foobar This results in the following file items in the fs tree: item 4 key (257 INODE_ITEM 0) itemoff 15879 itemsize 160 inode generation 6 transid 6 size 542872 block group 0 mode 100600 item 5 key (257 INODE_REF 256) itemoff 15863 itemsize 16 inode ref index 2 namelen 6 name: foobar item 6 key (257 EXTENT_DATA 0) itemoff 15810 itemsize 53 extent data disk byte 0 nr 0 gen 6 extent data offset 0 nr 24576 ram 266240 extent compression 0 item 7 key (257 EXTENT_DATA 24576) itemoff 15757 itemsize 53 prealloc data disk byte 12849152 nr 241664 gen 6 prealloc data offset 0 nr 241664 item 8 key (257 EXTENT_DATA 266240) itemoff 15704 itemsize 53 extent data disk byte 12845056 nr 4096 gen 6 extent data offset 0 nr 20480 ram 20480 extent compression 2 item 9 key (257 EXTENT_DATA 286720) itemoff 15651 itemsize 53 prealloc data disk byte 13090816 nr 405504 gen 6 prealloc data offset 0 nr 258048 The on disk extent at offset 266240 (which corresponds to 1 single disk block), contains 5 compressed chunks of file data. Each of the first 4 compress 4096 bytes of file data, while the last one only compresses 3024 bytes of file data. Therefore a read into the file region [285648 ; 286720[ (length = 4096 - 3024 = 1072 bytes) should always return zeroes (our next extent is a prealloc one). The solution here is the compression code path to zero the remaining (untouched) bytes of the last page it uncompressed data into, as the information about how much space the file data consumes in the last page is not known in the upper layer fs/btrfs/extent_io.c:__do_readpage(). In __do_readpage we were correctly zeroing the remainder of the page but only if it corresponds to the last page of the inode and if the inode's size is not a multiple of the page size. This would cause not only returning random data on reads, but also permanently storing random data when updating parts of the region that should be zeroed. For the example above, it means updating a single byte in the region [285648 ; 286720[ would store that byte correctly but also store random data on disk. A test case for xfstests follows soon. Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index af815eb8f97..ed1ff1cb101 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1011,6 +1011,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, bytes = min(bytes, working_bytes); kaddr = kmap_atomic(page_out); memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + if (*pg_index == (vcnt - 1) && *pg_offset == 0) + memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); kunmap_atomic(kaddr); flush_dcache_page(page_out); -- cgit v1.2.3-70-g09d2 From d311d79de305f1ada47cadd672e6ed1b28a949eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Feb 2014 15:18:09 -0500 Subject: fix O_SYNC|O_APPEND syncing the wrong range on write() It actually goes back to 2004 ([PATCH] Concurrent O_SYNC write support) when sync_page_range() had been introduced; generic_file_write{,v}() correctly synced pos_after_write - written .. pos_after_write - 1 but generic_file_aio_write() synced pos_before_write .. pos_before_write + written - 1 instead. Which is not the same thing with O_APPEND, obviously. A couple of years later correct variant had been killed off when everything switched to use of generic_file_aio_write(). All users of generic_file_aio_write() are affected, and the same bug has been copied into other instances of ->aio_write(). The fix is trivial; the only subtle point is that generic_write_sync() ought to be inlined to avoid calculations useless for the majority of calls. Signed-off-by: Al Viro --- fs/cifs/file.c | 4 ++-- fs/ext4/file.c | 2 +- fs/ntfs/file.c | 2 +- fs/sync.c | 17 ----------------- fs/xfs/xfs_file.c | 2 +- include/linux/fs.h | 8 +++++++- mm/filemap.c | 4 ++-- 7 files changed, 14 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 853d6d1cc82..a7eda8ebfac 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2559,8 +2559,8 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, if (rc > 0) { ssize_t err; - err = generic_write_sync(file, pos, rc); - if (err < 0 && rc > 0) + err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (err < 0) rc = err; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 43e64f6022e..1a5073959f3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -152,7 +152,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, if (ret > 0) { ssize_t err; - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0 && ret > 0) ret = err; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index ea4ba9daeb4..db9bd8a3172 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2134,7 +2134,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); if (ret > 0) { - int err = generic_write_sync(file, pos, ret); + int err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/sync.c b/fs/sync.c index f1553745223..e8ba024a055 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -222,23 +222,6 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } -/** - * generic_write_sync - perform syncing after a write if file / inode is sync - * @file: file to which the write happened - * @pos: offset where the write started - * @count: length of the write - * - * This is just a simple wrapper about our general syncing function. - */ -int generic_write_sync(struct file *file, loff_t pos, loff_t count) -{ - if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) - return 0; - return vfs_fsync_range(file, pos, pos + count - 1, - (file->f_flags & __O_SYNC) ? 0 : 1); -} -EXPORT_SYMBOL(generic_write_sync); - /* * sys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2e7989e3a2d..64b48eade91 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -799,7 +799,7 @@ xfs_file_aio_write( XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 09f553c5981..75ff961be05 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2273,7 +2273,13 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); -extern int generic_write_sync(struct file *file, loff_t pos, loff_t count); +static inline int generic_write_sync(struct file *file, loff_t pos, loff_t count) +{ + if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) + return 0; + return vfs_fsync_range(file, pos, pos + count - 1, + (file->f_flags & __O_SYNC) ? 0 : 1); +} extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK diff --git a/mm/filemap.c b/mm/filemap.c index d56d3c145b9..7a13f6ac542 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2553,8 +2553,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ret > 0) { ssize_t err; - err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) ret = err; } return ret; -- cgit v1.2.3-70-g09d2 From 0b4ef8de090a60e04560ed69aa35f439b8d5145f Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Sun, 9 Feb 2014 18:22:53 +0530 Subject: fs: Mark function as static in fs/bio-integrity.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark functions as static in bio-integrity.c because it is not used outside this file. This eliminates the following warnings in bio-integrity.c: fs/bio-integrity.c:224:5: warning: no previous prototype for ‘bio_integrity_tag’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index bcb4a4d533a..0129b78a690 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -234,7 +234,8 @@ unsigned int bio_integrity_tag_size(struct bio *bio) } EXPORT_SYMBOL(bio_integrity_tag_size); -int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, + int set) { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); -- cgit v1.2.3-70-g09d2 From 38dfac843cb6d7be1874888839817404a15a6b3c Mon Sep 17 00:00:00 2001 From: Greg Pearson Date: Mon, 10 Feb 2014 14:25:36 -0800 Subject: vmcore: prevent PT_NOTE p_memsz overflow during header update Currently, update_note_header_size_elf64() and update_note_header_size_elf32() will add the size of a PT_NOTE entry to real_sz even if that causes real_sz to exceeds max_sz. This patch corrects the while loop logic in those routines to ensure that does not happen and prints a warning if a PT_NOTE entry is dropped. If zero PT_NOTE entries are found or this condition is encountered because the only entry was dropped, a warning is printed and an error is returned. One possible negative side effect of exceeding the max_sz limit is an allocation failure in merge_note_headers_elf64() or merge_note_headers_elf32() which would produce console output such as the following while booting the crash kernel. vmalloc: allocation failure: 14076997632 bytes swapper/0: page allocation failure: order:0, mode:0x80d2 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.10.0-gbp1 #7 Call Trace: dump_stack+0x19/0x1b warn_alloc_failed+0xf0/0x160 __vmalloc_node_range+0x19e/0x250 vmalloc_user+0x4c/0x70 merge_note_headers_elf64.constprop.9+0x116/0x24a vmcore_init+0x2d4/0x76c do_one_initcall+0xe2/0x190 kernel_init_freeable+0x17c/0x207 kernel_init+0xe/0x180 ret_from_fork+0x7c/0xb0 Kdump: vmcore not initialized kdump: dump target is /dev/sda4 kdump: saving to /sysroot//var/crash/127.0.0.1-2014.01.28-13:58:52/ kdump: saving vmcore-dmesg.txt Cannot open /proc/vmcore: No such file or directory kdump: saving vmcore-dmesg.txt failed kdump: saving vmcore kdump: saving vmcore failed This type of failure has been seen on a four socket prototype system with certain memory configurations. Most PT_NOTE sections have a single entry similar to: n_namesz = 0x5 n_descsz = 0x150 n_type = 0x1 Occasionally, a second entry is encountered with very large n_namesz and n_descsz sizes: n_namesz = 0x80000008 n_descsz = 0x510ae163 n_type = 0x80000008 Not yet sure of the source of these extra entries, they seem bogus, but they shouldn't cause crash dump to fail. Signed-off-by: Greg Pearson Acked-by: Vivek Goyal Cc: HATAYAMA Daisuke Cc: Michael Holzheu Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 2ca7ba047f0..88d4585b30f 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -468,17 +468,24 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) return rc; } nhdr_ptr = notes_section; - while (real_sz < max_sz) { - if (nhdr_ptr->n_namesz == 0) - break; + while (nhdr_ptr->n_namesz != 0) { sz = sizeof(Elf64_Nhdr) + ((nhdr_ptr->n_namesz + 3) & ~3) + ((nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } real_sz += sz; nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); } kfree(notes_section); phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + return -EINVAL; + } } return 0; @@ -648,17 +655,24 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) return rc; } nhdr_ptr = notes_section; - while (real_sz < max_sz) { - if (nhdr_ptr->n_namesz == 0) - break; + while (nhdr_ptr->n_namesz != 0) { sz = sizeof(Elf32_Nhdr) + ((nhdr_ptr->n_namesz + 3) & ~3) + ((nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } real_sz += sz; nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); } kfree(notes_section); phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + return -EINVAL; + } } return 0; -- cgit v1.2.3-70-g09d2 From 96c7a2ff21501691587e1ae969b83cbec8b78e08 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Feb 2014 14:25:41 -0800 Subject: fs/file.c:fdtable: avoid triggering OOMs from alloc_fdmem Recently due to a spike in connections per second memcached on 3 separate boxes triggered the OOM killer from accept. At the time the OOM killer was triggered there was 4GB out of 36GB free in zone 1. The problem was that alloc_fdtable was allocating an order 3 page (32KiB) to hold a bitmap, and there was sufficient fragmentation that the largest page available was 8KiB. I find the logic that PAGE_ALLOC_COSTLY_ORDER can't fail pretty dubious but I do agree that order 3 allocations are very likely to succeed. There are always pathologies where order > 0 allocations can fail when there are copious amounts of free memory available. Using the pigeon hole principle it is easy to show that it requires 1 page more than 50% of the pages being free to guarantee an order 1 (8KiB) allocation will succeed, 1 page more than 75% of the pages being free to guarantee an order 2 (16KiB) allocation will succeed and 1 page more than 87.5% of the pages being free to guarantee an order 3 allocate will succeed. A server churning memory with a lot of small requests and replies like memcached is a common case that if anything can will skew the odds against large pages being available. Therefore let's not give external applications a practical way to kill linux server applications, and specify __GFP_NORETRY to the kmalloc in alloc_fdmem. Unless I am misreading the code and by the time the code reaches should_alloc_retry in __alloc_pages_slowpath (where __GFP_NORETRY becomes signification). We have already tried everything reasonable to allocate a page and the only thing left to do is wait. So not waiting and falling back to vmalloc immediately seems like the reasonable thing to do even if there wasn't a chance of triggering the OOM killer. Signed-off-by: "Eric W. Biederman" Cc: Eric Dumazet Acked-by: David Rientjes Cc: Cong Wang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 771578b33fb..db25c2bdfe4 100644 --- a/fs/file.c +++ b/fs/file.c @@ -34,7 +34,7 @@ static void *alloc_fdmem(size_t size) * vmalloc() if the allocation size will be considered "large" by the VM. */ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); + void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); if (data != NULL) return data; } -- cgit v1.2.3-70-g09d2 From a987c7ca7fc9225a587b1dc59d7d4ad2d9e2e08e Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Mon, 10 Feb 2014 14:25:44 -0800 Subject: ocfs2: fix ocfs2_sync_file() if filesystem is readonly If filesystem is readonly, there is no need to flush drive's caches or force any uncommitted transactions. [akpm@linux-foundation.org: return -EROFS, not 0] Signed-off-by: Younger Liu Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d77d71ead8d..fcd970636bb 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -185,6 +185,9 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, file->f_path.dentry->d_name.name, (unsigned long long)datasync); + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) return err; -- cgit v1.2.3-70-g09d2 From a0b54adda3fe4b4cc6d28f2a9217cd35d1aa888c Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Mon, 10 Feb 2014 14:25:48 -0800 Subject: mm: fix page leak at nfs_symlink() Changes in commit a0b8cab3b9b2 ("mm: remove lru parameter from __pagevec_lru_add and remove parts of pagevec API") have introduced a call to add_to_page_cache_lru() which causes a leak in nfs_symlink() as now the page gets an extra refcount that is not dropped. Jan Stancek observed and reported the leak effect while running test8 from Connectathon Testsuite. After several iterations over the test case, which creates several symlinks on a NFS mountpoint, the test system was quickly getting into an out-of-memory scenario. This patch fixes the page leak by dropping that extra refcount add_to_page_cache_lru() is grabbing. Signed-off-by: Jan Stancek Signed-off-by: Rafael Aquini Acked-by: Mel Gorman Acked-by: Rik van Riel Cc: Jeff Layton Cc: Trond Myklebust Cc: [3.11.x+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/dir.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index be38b573495..4a48fe4b84b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1846,6 +1846,11 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) GFP_KERNEL)) { SetPageUptodate(page); unlock_page(page); + /* + * add_to_page_cache_lru() grabs an extra page refcount. + * Drop it here to avoid leaking this page later. + */ + page_cache_release(page); } else __free_page(page); -- cgit v1.2.3-70-g09d2 From d62e74be1270c89fbaf7aada8218bfdf62d00a58 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Mon, 10 Feb 2014 14:25:51 -0800 Subject: ocfs2: fix issue that ocfs2_setattr() does not deal with new_i_size==i_size The issue scenario is as following: - Create a small file and fallocate a large disk space for a file with FALLOC_FL_KEEP_SIZE option. - ftruncate the file back to the original size again. but the disk free space is not changed back. This is a real bug that be fixed in this patch. In order to solve the issue above, we modified ocfs2_setattr(), if attr->ia_size != i_size_read(inode), It calls ocfs2_truncate_file(), and truncate disk space to attr->ia_size. Signed-off-by: Younger Liu Reviewed-by: Jie Liu Tested-by: Jie Liu Cc: Joel Becker Reviewed-by: Mark Fasheh Cc: Sunil Mushran Reviewed-by: Jensen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/file.c | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index aada5801567..e2edff38be5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7158,7 +7158,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, if (end > i_size_read(inode)) end = i_size_read(inode); - BUG_ON(start >= end); + BUG_ON(start > end); if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index fcd970636bb..9148353c5cf 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -477,11 +477,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* lets handle the simple truncate cases before doing any more - * cluster locking. */ - if (new_i_size == le64_to_cpu(fe->i_size)) - goto bail; - down_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_resv_discard(&osb->osb_la_resmap, @@ -1148,14 +1143,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) goto bail_unlock_rw; } - if (size_change && attr->ia_size != i_size_read(inode)) { + if (size_change) { status = inode_newsize_ok(inode, attr->ia_size); if (status) goto bail_unlock; inode_dio_wait(inode); - if (i_size_read(inode) > attr->ia_size) { + if (i_size_read(inode) >= attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, attr->ia_size); -- cgit v1.2.3-70-g09d2 From c7d2cbc364b2a237b0ed1bdd7cbf8a24c8a89dfd Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Mon, 10 Feb 2014 14:25:53 -0800 Subject: ocfs2: update inode size after zeroing the hole fs-writeback will release the dirty pages without page lock whose offset are over inode size, the release happens at block_write_full_page_endio(). If not update, dirty pages in file holes may be released before flushed to the disk, then file holes will contain some non-zero data, this will cause sparse file md5sum error. To reproduce the bug, find a big sparse file with many holes, like vm image file, its actual size should be bigger than available mem size to make writeback work more frequently, tar it with -S option, then keep untar it and check its md5sum again and again until you get a wrong md5sum. Signed-off-by: Junxiao Bi Cc: Younger Liu Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9148353c5cf..8450262bcf2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -716,7 +716,8 @@ leave: * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. */ -static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, + struct buffer_head *di_bh) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -733,7 +734,14 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) } ret = ocfs2_jbd2_file_inode(handle, inode); - if (ret < 0) + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) mlog_errno(ret); out: @@ -749,7 +757,7 @@ out: * to be too fragile to do exactly what we need without us having to * worry about recursive locking in ->write_begin() and ->write_end(). */ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, - u64 abs_to) + u64 abs_to, struct buffer_head *di_bh) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -757,6 +765,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, handle_t *handle = NULL; int ret = 0; unsigned zero_from, zero_to, block_start, block_end; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; BUG_ON(abs_from >= abs_to); BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); @@ -799,7 +808,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } if (!handle) { - handle = ocfs2_zero_start_ordered_transaction(inode); + handle = ocfs2_zero_start_ordered_transaction(inode, + di_bh); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; @@ -816,8 +826,22 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, ret = 0; } - if (handle) + if (handle) { + /* + * fs-writeback will release the dirty pages without page lock + * whose offset are over inode size, the release happens at + * block_write_full_page_endio(). + */ + i_size_write(inode, abs_to); + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + di->i_mtime_nsec = di->i_ctime_nsec; + ocfs2_journal_dirty(handle, di_bh); ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + } out_unlock: unlock_page(page); @@ -913,7 +937,7 @@ out: * has made sure that the entire range needs zeroing. */ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, - u64 range_end) + u64 range_end, struct buffer_head *di_bh) { int rc = 0; u64 next_pos; @@ -929,7 +953,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; if (next_pos > range_end) next_pos = range_end; - rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh); if (rc < 0) { mlog_errno(rc); break; @@ -975,7 +999,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, range_end = zero_to_size; ret = ocfs2_zero_extend_range(inode, range_start, - range_end); + range_end, di_bh); if (ret) { mlog_errno(ret); break; -- cgit v1.2.3-70-g09d2 From 0e048316ff577e12c748e2d0a2e4f0f7b006654d Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Mon, 10 Feb 2014 14:25:54 -0800 Subject: ocfs2: check existence of old dentry in ocfs2_link() System call linkat first calls user_path_at(), check the existence of old dentry, and then calls vfs_link()->ocfs2_link() to do the actual work. There may exist a race when Node A create a hard link for file while node B rm it. Node A Node B user_path_at() ->ocfs2_lookup(), find old dentry exist rm file, add inode say inodeA to orphan_dir call ocfs2_link(),create a hard link for inodeA. rm the link, add inodeA to orphan_dir again When orphan_scan work start, it calls ocfs2_queue_orphans() to do the main work. It first tranverses entrys in orphan_dir, linking all inodes in this orphan_dir to a list look like this: inodeA->inodeB->...->inodeA When tranvering this list, it will fall into loop, calling iput() again and again. And finally trigger BUG_ON(inode->i_state & I_CLEAR). Signed-off-by: joyce Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f4d609be940..3683643f3f0 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -664,6 +664,7 @@ static int ocfs2_link(struct dentry *old_dentry, struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; + u64 old_de_ino; trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno, old_dentry->d_name.len, old_dentry->d_name.name, @@ -686,6 +687,22 @@ static int ocfs2_link(struct dentry *old_dentry, goto out; } + err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_de_ino); + if (err) { + err = -ENOENT; + goto out; + } + + /* + * Check whether another node removed the source inode while we + * were in the vfs. + */ + if (old_de_ino != OCFS2_I(inode)->ip_blkno) { + err = -ENOENT; + goto out; + } + err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, dentry->d_name.len); if (err) -- cgit v1.2.3-70-g09d2 From 8423ae3d7a3cfe084865262cfaeba1359d405182 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2014 17:45:50 -0800 Subject: block: Fix cloning of discard/write same bios Immutable biovecs changed the way bio segments are treated in such a way that bio_for_each_segment() cannot now do what we want for discard/write same bios, since bi_size means something completely different for them. Fortunately discard and write same bios never have more than a single biovec, so bio_for_each_segment() is unnecessary and not terribly meaningful for them, but we still have to special case them in a few places. Signed-off-by: Kent Overstreet Tested-by: Richard W.M. Jones Signed-off-by: Jens Axboe --- fs/bio.c | 15 ++++++++++----- include/linux/bio.h | 11 +++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 75c49a38223..8754e7b6eb4 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -611,7 +611,6 @@ EXPORT_SYMBOL(bio_clone_fast); struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, struct bio_set *bs) { - unsigned nr_iovecs = 0; struct bvec_iter iter; struct bio_vec bv; struct bio *bio; @@ -638,10 +637,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, * __bio_clone_fast() anyways. */ - bio_for_each_segment(bv, bio_src, iter) - nr_iovecs++; - - bio = bio_alloc_bioset(gfp_mask, nr_iovecs, bs); + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; @@ -650,9 +646,18 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + if (bio->bi_rw & REQ_DISCARD) + goto integrity_clone; + + if (bio->bi_rw & REQ_WRITE_SAME) { + bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; + goto integrity_clone; + } + bio_for_each_segment(bv, bio_src, iter) bio->bi_io_vec[bio->bi_vcnt++] = bv; +integrity_clone: if (bio_integrity(bio_src)) { int ret; diff --git a/include/linux/bio.h b/include/linux/bio.h index d6791bba826..5a4d39b4686 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -250,6 +250,17 @@ static inline unsigned bio_segments(struct bio *bio) struct bio_vec bv; struct bvec_iter iter; + /* + * We special case discard/write same, because they interpret bi_size + * differently: + */ + + if (bio->bi_rw & REQ_DISCARD) + return 1; + + if (bio->bi_rw & REQ_WRITE_SAME) + return 1; + bio_for_each_segment(bv, bio, iter) segs++; -- cgit v1.2.3-70-g09d2 From 09bdc2d70dedd0fc0358da93bca664c7b11ff907 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 11 Feb 2014 11:29:05 -0500 Subject: nfsd4: fix acl buffer overrun 4ac7249ea5a0ceef9f8269f63f33cc873c3fac61 "nfsd: use get_acl and ->set_acl" forgets to set the size in the case get_acl() succeeds, so _posix_to_nfsv4_one() can then write past the end of its allocation. Symptoms were slab corruption warnings. Also, some minor cleanup while we're here. (Among other things, note that the first few lines guarantee that pacl is non-NULL.) Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4acl.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index d3a58714422..d190e33d0ec 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -151,17 +151,15 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); if (IS_ERR(pacl)) return PTR_ERR(pacl); - /* allocate for worst case: one (deny, allow) pair each: */ - size += 2 * pacl->a_count; } + /* allocate for worst case: one (deny, allow) pair each: */ + size += 2 * pacl->a_count; if (S_ISDIR(inode->i_mode)) { flags = NFS4_ACL_DIR; dpacl = get_acl(inode, ACL_TYPE_DEFAULT); if (dpacl) size += 2 * dpacl->a_count; - } else { - dpacl = NULL; } *acl = nfs4_acl_new(size); @@ -170,8 +168,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, goto out; } - if (pacl) - _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); + _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); if (dpacl) _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); -- cgit v1.2.3-70-g09d2 From 2ec197db1a56c9269d75e965f14c344b58b2a4f6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 Feb 2014 17:10:26 +1100 Subject: lockd: send correct lock when granting a delayed lock. If an NFS client attempts to get a lock (using NLM) and the lock is not available, the server will remember the request and when the lock becomes available it will send a GRANT request to the client to provide the lock. If the client already held an adjacent lock, the GRANT callback will report the union of the existing and new locks, which can confuse the client. This happens because __posix_lock_file (called by vfs_lock_file) updates the passed-in file_lock structure when adjacent or over-lapping locks are found. To avoid this problem we take a copy of the two fields that can be changed (fl_start and fl_end) before the call and restore them afterwards. An alternate would be to allocate a 'struct file_lock', initialise it, use locks_copy_lock() to take a copy, then locks_release_private() after the vfs_lock_file() call. But that is a lot more work. Reported-by: Olaf Kirch Signed-off-by: NeilBrown Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields -- v1 had a couple of issues (large on-stack struct and didn't really work properly). This version is much better tested. Signed-off-by: J. Bruce Fields --- fs/lockd/svclock.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e066a390297..ab798a88ec1 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -779,6 +779,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) struct nlm_file *file = block->b_file; struct nlm_lock *lock = &block->b_call->a_args.lock; int error; + loff_t fl_start, fl_end; dprintk("lockd: grant blocked lock %p\n", block); @@ -796,9 +797,16 @@ nlmsvc_grant_blocked(struct nlm_block *block) } /* Try the lock operation again */ + /* vfs_lock_file() can mangle fl_start and fl_end, but we need + * them unchanged for the GRANT_MSG + */ lock->fl.fl_flags |= FL_SLEEP; + fl_start = lock->fl.fl_start; + fl_end = lock->fl.fl_end; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.fl_start = fl_start; + lock->fl.fl_end = fl_end; switch (error) { case 0: -- cgit v1.2.3-70-g09d2 From 11bcac89c0d73dea42f1cb8646b532035796a5d6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 14 Feb 2014 13:42:13 -0800 Subject: Revert "btrfs: add ioctl to export size of global metadata reservation" This reverts commit 01e219e8069516cdb98594d417b8bb8d906ed30d. David Sterba found a different way to provide these features without adding a new ioctl. We haven't released any progs with this ioctl yet, so I'm taking this out for now until we finalize things. Signed-off-by: Chris Mason Signed-off-by: David Sterba CC: Jeff Mahoney --- fs/btrfs/ioctl.c | 16 ---------------- include/uapi/linux/btrfs.h | 1 - 2 files changed, 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ebdd866d4cf..9a9044585da 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3559,20 +3559,6 @@ out: return ret; } -static long btrfs_ioctl_global_rsv(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; - u64 reserved; - - spin_lock(&block_rsv->lock); - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); - - if (arg && copy_to_user(arg, &reserved, sizeof(reserved))) - return -EFAULT; - return 0; -} - /* * there are many ways the trans_start and trans_end ioctls can lead * to deadlocks. They should only be used by applications that @@ -4779,8 +4765,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); - case BTRFS_IOC_GLOBAL_RSV: - return btrfs_ioctl_global_rsv(root, argp); case BTRFS_IOC_SYNC: { int ret; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 1b8a0f4c959..b4d69092fbd 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -558,7 +558,6 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, __u64) #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ struct btrfs_ioctl_space_args) -#define BTRFS_IOC_GLOBAL_RSV _IOR(BTRFS_IOCTL_MAGIC, 20, __u64) #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ -- cgit v1.2.3-70-g09d2 From a9d2d4adb6a87e515912a7033b227acbd9c8835e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sat, 8 Feb 2014 15:33:08 +0800 Subject: Btrfs: fix a lockdep warning when cleaning up aborted transaction Given now we have 2 spinlock for management of delayed refs, CONFIG_DEBUG_SPINLOCK=y helped me find this, [ 4723.413809] BUG: spinlock wrong CPU on CPU#1, btrfs-transacti/2258 [ 4723.414882] lock: 0xffff880048377670, .magic: dead4ead, .owner: btrfs-transacti/2258, .owner_cpu: 2 [ 4723.417146] CPU: 1 PID: 2258 Comm: btrfs-transacti Tainted: G W O 3.12.0+ #4 [ 4723.421321] Call Trace: [ 4723.421872] [] dump_stack+0x54/0x74 [ 4723.422753] [] spin_dump+0x8c/0x91 [ 4723.424979] [] spin_bug+0x21/0x26 [ 4723.425846] [] do_raw_spin_unlock+0x66/0x90 [ 4723.434424] [] _raw_spin_unlock+0x27/0x40 [ 4723.438747] [] btrfs_cleanup_one_transaction+0x35e/0x710 [btrfs] [ 4723.443321] [] btrfs_cleanup_transaction+0x104/0x570 [btrfs] [ 4723.444692] [] ? trace_hardirqs_on_caller+0xfd/0x1c0 [ 4723.450336] [] ? trace_hardirqs_on+0xd/0x10 [ 4723.451332] [] transaction_kthread+0x22e/0x270 [btrfs] [ 4723.452543] [] ? btrfs_cleanup_transaction+0x570/0x570 [btrfs] [ 4723.457833] [] kthread+0xea/0xf0 [ 4723.458990] [] ? kthread_create_on_node+0x140/0x140 [ 4723.460133] [] ret_from_fork+0x7c/0xb0 [ 4723.460865] [] ? kthread_create_on_node+0x140/0x140 [ 4723.496521] ------------[ cut here ]------------ ---------------------------------------------------------------------- The reason is that we get to call cond_resched_lock(&head_ref->lock) while still holding @delayed_refs->lock. So it's different with __btrfs_run_delayed_refs(), where we do drop-acquire dance before and after actually processing delayed refs. Here we don't drop the lock, others are not able to add new delayed refs to head_ref, so cond_resched_lock(&head_ref->lock) is not necessary here. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3903bd3f8d2..fcf36758107 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3842,7 +3842,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, rb_erase(&ref->rb_node, &head->ref_root); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); - cond_resched_lock(&head->lock); } if (head->must_insert_reserved) pin_bytes = true; -- cgit v1.2.3-70-g09d2 From feb5f96589302b39a2b10fc210db7c47a73e4168 Mon Sep 17 00:00:00 2001 From: Mitch Harder Date: Thu, 13 Feb 2014 09:13:16 -0600 Subject: Btrfs: fix max_inline mount option Currently, the only mount option for max_inline that has any effect is max_inline=0. Any other value that is supplied to max_inline will be adjusted to a minimum of 4k. Since max_inline has an effective maximum of ~3900 bytes due to page size limitations, the current behaviour only has meaning for max_inline=0. This patch will allow the the max_inline mount option to accept non-zero values as indicated in the documentation. Signed-off-by: Mitch Harder Signed-off-by: Chris Mason --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 97cc2419855..e73c80eec11 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -566,7 +566,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); if (info->max_inline) { - info->max_inline = max_t(u64, + info->max_inline = min_t(u64, info->max_inline, root->sectorsize); } -- cgit v1.2.3-70-g09d2 From 3a0dfa6a12e4bb64a434426ecb17d4842092db5e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2014 13:43:48 -0500 Subject: Btrfs: unset DCACHE_DISCONNECTED when mounting default subvol A user was running into errors from an NFS export of a subvolume that had a default subvol set. When we mount a default subvol we will use d_obtain_alias() to find an existing dentry for the subvolume in the case that the root subvol has already been mounted, or a dummy one is allocated in the case that the root subvol has not already been mounted. This allows us to connect the dentry later on if we wander into the path. However if we don't ever wander into the path we will keep DCACHE_DISCONNECTED set for a long time, which angers NFS. It doesn't appear to cause any problems but it is annoying nonetheless, so simply unset DCACHE_DISCONNECTED in the get_default_root case and switch btrfs_lookup() to use d_materialise_unique() instead which will make everything play nicely together and reconnect stuff if we wander into the defaul subvol path from a different way. With this patch I'm no longer getting the NFS errors when exporting a volume that has been mounted with a default subvol set. Thanks, cc: bfields@fieldses.org cc: ebiederm@xmission.com Signed-off-by: Josef Bacik Acked-by: "Eric W. Biederman" Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- fs/btrfs/super.c | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1af34d0c744..4ffb6d79f9f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5150,7 +5150,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_CAST(inode); } - return d_splice_alias(inode, dentry); + return d_materialise_unique(dentry, inode); } unsigned char btrfs_filetype_table[] = { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e73c80eec11..d04db817be5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -855,6 +855,7 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; + struct dentry *dentry; u64 dir_id; int new = 0; @@ -925,7 +926,13 @@ setup_root: return dget(sb->s_root); } - return d_obtain_alias(inode); + dentry = d_obtain_alias(inode); + if (!IS_ERR(dentry)) { + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_DISCONNECTED; + spin_unlock(&dentry->d_lock); + } + return dentry; } static int btrfs_fill_super(struct super_block *sb, -- cgit v1.2.3-70-g09d2 From f085381e6d08f4c8d6882825f31accd455c54d70 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 15 Jan 2014 17:22:28 +0800 Subject: btrfs: fix null pointer deference at btrfs_sysfs_add_one+0x105 bdev is null when disk has disappeared and mounted with the degrade option stack trace --------- btrfs_sysfs_add_one+0x105/0x1c0 [btrfs] open_ctree+0x15f3/0x1fe0 [btrfs] btrfs_mount+0x5db/0x790 [btrfs] ? alloc_pages_current+0xa4/0x160 mount_fs+0x34/0x1b0 vfs_kern_mount+0x62/0xf0 do_mount+0x22e/0xa80 ? __get_free_pages+0x9/0x40 ? copy_mount_options+0x31/0x170 SyS_mount+0x7e/0xc0 system_call_fastpath+0x16/0x1b --------- reproducer: ------- mkfs.btrfs -draid1 -mraid1 /dev/sdc /dev/sdd (detach a disk) devmgt detach /dev/sdc [1] mount -o degrade /dev/sdd /btrfs ------- [1] github.com/anajain/devmgt.git Signed-off-by: Anand Jain Tested-by: Hidetoshi Seto Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 782374d8fd1..865f4cf9a76 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -578,8 +578,14 @@ static int add_device_membership(struct btrfs_fs_info *fs_info) return -ENOMEM; list_for_each_entry(dev, &fs_devices->devices, dev_list) { - struct hd_struct *disk = dev->bdev->bd_part; - struct kobject *disk_kobj = &part_to_dev(disk)->kobj; + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!dev->bdev) + continue; + + disk = dev->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; error = sysfs_create_link(fs_info->device_dir_kobj, disk_kobj, disk_kobj->name); -- cgit v1.2.3-70-g09d2 From 93de4ba86480a9e0d1062cb1d535fa97fb81af48 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sat, 15 Feb 2014 15:53:16 +0000 Subject: Btrfs: use right clone root offset for compressed extents For non compressed extents, iterate_extent_inodes() gives us offsets that take into account the data offset from the file extent items, while for compressed extents it doesn't. Therefore we have to adjust them before placing them in a send clone instruction. Not doing this adjustment leads to the receiving end requesting for a wrong a file range to the clone ioctl, which results in different file content from the one in the original send root. Issue reproducible with the following excerpt from the test I made for xfstests: _scratch_mkfs _scratch_mount "-o compress-force=lzo" $XFS_IO_PROG -f -c "truncate 118811" $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0x0d -b 39987 92267 39987" $SCRATCH_MNT/foo $BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 $XFS_IO_PROG -c "pwrite -S 0x3e -b 80000 200000 80000" $SCRATCH_MNT/foo $BTRFS_UTIL_PROG filesystem sync $SCRATCH_MNT $XFS_IO_PROG -c "pwrite -S 0xdc -b 10000 250000 10000" $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xff -b 10000 300000 10000" $SCRATCH_MNT/foo # will be used for incremental send to be able to issue clone operations $BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/clones_snap $BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap2 $FSSUM_PROG -A -f -w $tmp/1.fssum $SCRATCH_MNT/mysnap1 $FSSUM_PROG -A -f -w $tmp/2.fssum -x $SCRATCH_MNT/mysnap2/mysnap1 \ -x $SCRATCH_MNT/mysnap2/clones_snap $SCRATCH_MNT/mysnap2 $FSSUM_PROG -A -f -w $tmp/clones.fssum $SCRATCH_MNT/clones_snap \ -x $SCRATCH_MNT/clones_snap/mysnap1 -x $SCRATCH_MNT/clones_snap/mysnap2 $BTRFS_UTIL_PROG send $SCRATCH_MNT/mysnap1 -f $tmp/1.snap $BTRFS_UTIL_PROG send $SCRATCH_MNT/clones_snap -f $tmp/clones.snap $BTRFS_UTIL_PROG send -p $SCRATCH_MNT/mysnap1 \ -c $SCRATCH_MNT/clones_snap $SCRATCH_MNT/mysnap2 -f $tmp/2.snap _scratch_unmount _scratch_mkfs _scratch_mount $BTRFS_UTIL_PROG receive $SCRATCH_MNT -f $tmp/1.snap $FSSUM_PROG -r $tmp/1.fssum $SCRATCH_MNT/mysnap1 2>> $seqres.full $BTRFS_UTIL_PROG receive $SCRATCH_MNT -f $tmp/clones.snap $FSSUM_PROG -r $tmp/clones.fssum $SCRATCH_MNT/clones_snap 2>> $seqres.full $BTRFS_UTIL_PROG receive $SCRATCH_MNT -f $tmp/2.snap $FSSUM_PROG -r $tmp/2.fssum $SCRATCH_MNT/mysnap2 2>> $seqres.full Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/send.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9c8d1a3fdc3..9dde9717c1b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1332,6 +1332,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " } if (cur_clone_root) { + if (compressed != BTRFS_COMPRESS_NONE) { + /* + * Offsets given by iterate_extent_inodes() are relative + * to the start of the extent, we need to add logical + * offset from the file extent item. + * (See why at backref.c:check_extent_in_eb()) + */ + cur_clone_root->offset += btrfs_file_extent_offset(eb, + fi); + } *found = cur_clone_root; ret = 0; } else { -- cgit v1.2.3-70-g09d2