From fa5199648e273a5e3e80aca41c1eb53700438dc1 Mon Sep 17 00:00:00 2001 From: Simon Que Date: Thu, 17 Jan 2013 11:18:20 -0800 Subject: eCryptfs: initialize payload_len in keystore.c This is meant to remove a compiler warning. It should not make any functional change. payload_len should be initialized when it is passed to write_tag_64_packet() as a pointer. If that call fails, this function should return early, and payload_len won't be used. Signed-off-by: Simon Que Signed-off-by: Tyler Hicks --- fs/ecryptfs/keystore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 2333203a120..6154cde3a05 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1150,7 +1150,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_message *msg = NULL; char *auth_tok_sig; char *payload; - size_t payload_len; + size_t payload_len = 0; int rc; rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); -- cgit v1.2.3-70-g09d2 From bbcb03e3ce8c76a4ecf0ea7d365f9f0e913fc329 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 17 Jan 2013 11:36:10 -0800 Subject: eCryptfs: Fix -Wunused-but-set-variable warnings These two variables are no longer used and can be removed. Fixes warnings when building with W=1. Signed-off-by: Tyler Hicks --- fs/ecryptfs/dentry.c | 2 -- fs/ecryptfs/file.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 1b5d9af937d..bf12ba5dd22 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -45,14 +45,12 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *lower_dentry; - struct vfsmount *lower_mnt; int rc = 1; if (flags & LOOKUP_RCU) return -ECHILD; lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) goto out; rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index d45ba456812..bfa52d2ef46 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -199,7 +199,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) struct dentry *ecryptfs_dentry = file->f_path.dentry; /* Private value of ecryptfs_dentry allocated in * ecryptfs_lookup() */ - struct dentry *lower_dentry; struct ecryptfs_file_info *file_info; mount_crypt_stat = &ecryptfs_superblock_to_private( @@ -222,7 +221,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) rc = -ENOMEM; goto out; } - lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; mutex_lock(&crypt_stat->cs_mutex); if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) { -- cgit v1.2.3-70-g09d2 From 111d61a25ec11c1837ac0fd81bf4b845d3327fb7 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 17 Jan 2013 12:19:35 -0800 Subject: eCryptfs: Fix -Wmissing-prototypes warnings Mark two inode operation fuctions as static. Fixes warnings when building with W=1. Signed-off-by: Tyler Hicks --- fs/ecryptfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index cc7709e7c50..ddd961ba2cf 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -999,8 +999,8 @@ out: return rc; } -int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) { struct ecryptfs_mount_crypt_stat *mount_crypt_stat; int rc = 0; @@ -1021,8 +1021,8 @@ int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, return rc; } -int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) { struct kstat lower_stat; int rc; -- cgit v1.2.3-70-g09d2 From a07c48ad5be5cbf32a2741f10e6fbf4bbfcaa362 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 26 Jan 2013 10:48:42 +0300 Subject: eCryptfs: remove unneeded checks in virt_to_scatterlist() This is always called with a valid "sg" pointer. My static checker complains because the call to sg_init_table() dereferences "sg" before we reach the checks. Signed-off-by: Dan Carpenter Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index a7b0c2dfb3d..d5c25db4398 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -301,17 +301,14 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, while (size > 0 && i < sg_size) { pg = virt_to_page(addr); offset = offset_in_page(addr); - if (sg) - sg_set_page(&sg[i], pg, 0, offset); + sg_set_page(&sg[i], pg, 0, offset); remainder_of_page = PAGE_CACHE_SIZE - offset; if (size >= remainder_of_page) { - if (sg) - sg[i].length = remainder_of_page; + sg[i].length = remainder_of_page; addr += remainder_of_page; size -= remainder_of_page; } else { - if (sg) - sg[i].length = size; + sg[i].length = size; addr += size; size = 0; } -- cgit v1.2.3-70-g09d2 From 3a46741804a4226cd837d1246eed95d4161f7159 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Tue, 12 Feb 2013 10:56:54 -0700 Subject: eCryptfs: decrypt_pki_encrypted_session_key(): remove kfree() redundant null check smatch analysis: fs/ecryptfs/keystore.c:1206 decrypt_pki_encrypted_session_key() info: redundant null check on msg calling kfree() Cc: Dustin Kirkland Cc: ecryptfs@vger.kernel.org Signed-off-by: Tim Gardner Signed-off-by: Tyler Hicks --- fs/ecryptfs/keystore.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 6154cde3a05..5aceff202dc 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1202,8 +1202,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, crypt_stat->key_size); } out: - if (msg) - kfree(msg); + kfree(msg); return rc; } -- cgit v1.2.3-70-g09d2 From 1101d58669a92ed9c9f4c7281404fb1e067a1e28 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Tue, 12 Feb 2013 11:03:49 -0700 Subject: ecryptfs: ecryptfs_msg_ctx_alloc_to_free(): remove kfree() redundant null check smatch analysis: fs/ecryptfs/messaging.c:101 ecryptfs_msg_ctx_alloc_to_free() info: redundant null check on msg_ctx->msg calling kfree() Cc: Dustin Kirkland Cc: ecryptfs@vger.kernel.org Signed-off-by: Tim Gardner Signed-off-by: Tyler Hicks --- fs/ecryptfs/messaging.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 5fa2471796c..d5c7297c581 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -97,8 +97,7 @@ static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) { list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list); - if (msg_ctx->msg) - kfree(msg_ctx->msg); + kfree(msg_ctx->msg); msg_ctx->msg = NULL; msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE; } -- cgit v1.2.3-70-g09d2 From 1111eae90fb64a9d9ed133e410712f1e34fdce4a Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 27 Feb 2013 11:37:48 -0800 Subject: eCryptfs: Fix redundant error check on ecryptfs_find_daemon_by_euid() It is sufficient to check the return code of ecryptfs_find_daemon_by_euid(). If it returns 0, it always sets the daemon pointer to point to a valid ecryptfs_daemon. Signed-off-by: Tyler Hicks Reported-by: Kees Cook --- fs/ecryptfs/messaging.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index d5c7297c581..474051bd8d9 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -283,7 +283,7 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, int rc; rc = ecryptfs_find_daemon_by_euid(&daemon); - if (rc || !daemon) { + if (rc) { rc = -ENOTCONN; goto out; } -- cgit v1.2.3-70-g09d2 From 7f78e0351394052e1a6293e175825eb5c7869507 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 2 Mar 2013 19:39:14 -0800 Subject: fs: Limit sys_mount to only request filesystem modules. Modify the request_module to prefix the file system type with "fs-" and add aliases to all of the filesystems that can be built as modules to match. A common practice is to build all of the kernel code and leave code that is not commonly needed as modules, with the result that many users are exposed to any bug anywhere in the kernel. Looking for filesystems with a fs- prefix limits the pool of possible modules that can be loaded by mount to just filesystems trivially making things safer with no real cost. Using aliases means user space can control the policy of which filesystem modules are auto-loaded by editing /etc/modprobe.d/*.conf with blacklist and alias directives. Allowing simple, safe, well understood work-arounds to known problematic software. This also addresses a rare but unfortunate problem where the filesystem name is not the same as it's module name and module auto-loading would not work. While writing this patch I saw a handful of such cases. The most significant being autofs that lives in the module autofs4. This is relevant to user namespaces because we can reach the request module in get_fs_type() without having any special permissions, and people get uncomfortable when a user specified string (in this case the filesystem type) goes all of the way to request_module. After having looked at this issue I don't think there is any particular reason to perform any filtering or permission checks beyond making it clear in the module request that we want a filesystem module. The common pattern in the kernel is to call request_module() without regards to the users permissions. In general all a filesystem module does once loaded is call register_filesystem() and go to sleep. Which means there is not much attack surface exposed by loading a filesytem module unless the filesystem is mounted. In a user namespace filesystems are not mounted unless .fs_flags = FS_USERNS_MOUNT, which most filesystems do not set today. Acked-by: Serge Hallyn Acked-by: Kees Cook Reported-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- arch/ia64/kernel/perfmon.c | 1 + arch/powerpc/platforms/cell/spufs/inode.c | 1 + arch/s390/hypfs/inode.c | 1 + drivers/firmware/efivars.c | 1 + drivers/infiniband/hw/ipath/ipath_fs.c | 1 + drivers/infiniband/hw/qib/qib_fs.c | 1 + drivers/misc/ibmasm/ibmasmfs.c | 1 + drivers/mtd/mtdchar.c | 1 + drivers/oprofile/oprofilefs.c | 1 + drivers/staging/ccg/f_fs.c | 1 + drivers/usb/gadget/f_fs.c | 1 + drivers/usb/gadget/inode.c | 1 + drivers/xen/xenfs/super.c | 1 + fs/9p/vfs_super.c | 1 + fs/adfs/super.c | 1 + fs/affs/super.c | 1 + fs/afs/super.c | 1 + fs/autofs4/init.c | 1 + fs/befs/linuxvfs.c | 1 + fs/bfs/inode.c | 1 + fs/binfmt_misc.c | 1 + fs/btrfs/super.c | 1 + fs/ceph/super.c | 1 + fs/coda/inode.c | 1 + fs/configfs/mount.c | 1 + fs/cramfs/inode.c | 1 + fs/debugfs/inode.c | 1 + fs/devpts/inode.c | 1 + fs/ecryptfs/main.c | 1 + fs/efs/super.c | 1 + fs/exofs/super.c | 1 + fs/ext2/super.c | 1 + fs/ext3/super.c | 1 + fs/ext4/super.c | 5 +++-- fs/f2fs/super.c | 1 + fs/fat/namei_msdos.c | 1 + fs/fat/namei_vfat.c | 1 + fs/filesystems.c | 2 +- fs/freevxfs/vxfs_super.c | 2 +- fs/fuse/control.c | 1 + fs/fuse/inode.c | 2 ++ fs/gfs2/ops_fstype.c | 4 +++- fs/hfs/super.c | 1 + fs/hfsplus/super.c | 1 + fs/hppfs/hppfs.c | 1 + fs/hugetlbfs/inode.c | 1 + fs/isofs/inode.c | 3 +-- fs/jffs2/super.c | 1 + fs/jfs/super.c | 1 + fs/logfs/super.c | 1 + fs/minix/inode.c | 1 + fs/ncpfs/inode.c | 1 + fs/nfs/super.c | 3 ++- fs/nfsd/nfsctl.c | 1 + fs/nilfs2/super.c | 1 + fs/ntfs/super.c | 1 + fs/ocfs2/dlmfs/dlmfs.c | 1 + fs/omfs/inode.c | 1 + fs/openpromfs/inode.c | 1 + fs/qnx4/inode.c | 1 + fs/qnx6/inode.c | 1 + fs/reiserfs/super.c | 1 + fs/romfs/super.c | 1 + fs/sysv/super.c | 3 ++- fs/ubifs/super.c | 1 + fs/ufs/super.c | 1 + fs/xfs/xfs_super.c | 1 + include/linux/fs.h | 2 ++ net/sunrpc/rpc_pipe.c | 4 +--- 69 files changed, 77 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 433f5e8a2cd..2eda28414ab 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -619,6 +619,7 @@ static struct file_system_type pfm_fs_type = { .mount = pfmfs_mount, .kill_sb = kill_anon_super, }; +MODULE_ALIAS_FS("pfmfs"); DEFINE_PER_CPU(unsigned long, pfm_syst_info); DEFINE_PER_CPU(struct task_struct *, pmu_owner); diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 863184b182f..3f3bb4cdbbe 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -749,6 +749,7 @@ static struct file_system_type spufs_type = { .mount = spufs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("spufs"); static int __init spufs_init(void) { diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 8538015ed4a..5f7d7ba2874 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -456,6 +456,7 @@ static struct file_system_type hypfs_type = { .mount = hypfs_mount, .kill_sb = hypfs_kill_super }; +MODULE_ALIAS_FS("s390_hypfs"); static const struct super_operations hypfs_s_ops = { .statfs = simple_statfs, diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index 7320bf89170..3edade07b24 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -1234,6 +1234,7 @@ static struct file_system_type efivarfs_type = { .mount = efivarfs_mount, .kill_sb = efivarfs_kill_sb, }; +MODULE_ALIAS_FS("efivarfs"); /* * Handle negative dentry. diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index a479375a8fd..e0c404bdc4a 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -410,6 +410,7 @@ static struct file_system_type ipathfs_fs_type = { .mount = ipathfs_mount, .kill_sb = ipathfs_kill_super, }; +MODULE_ALIAS_FS("ipathfs"); int __init ipath_init_ipathfs(void) { diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index 644bd6f6467..f247fc6e618 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -604,6 +604,7 @@ static struct file_system_type qibfs_fs_type = { .mount = qibfs_mount, .kill_sb = qibfs_kill_super, }; +MODULE_ALIAS_FS("ipathfs"); int __init qib_init_qibfs(void) { diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index 6673e578b3e..ce5b75616b4 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -110,6 +110,7 @@ static struct file_system_type ibmasmfs_type = { .mount = ibmasmfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("ibmasmfs"); static int ibmasmfs_fill_super (struct super_block *sb, void *data, int silent) { diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 82c06165d3d..92ab30ab00d 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1238,6 +1238,7 @@ static struct file_system_type mtd_inodefs_type = { .mount = mtd_inodefs_mount, .kill_sb = kill_anon_super, }; +MODULE_ALIAS_FS("mtd_inodefs"); static int __init init_mtdchar(void) { diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index 445ffda715a..7c12d9c2b23 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -276,6 +276,7 @@ static struct file_system_type oprofilefs_type = { .mount = oprofilefs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("oprofilefs"); int __init oprofilefs_register(void) diff --git a/drivers/staging/ccg/f_fs.c b/drivers/staging/ccg/f_fs.c index 8adc79d1b40..f6373dade7f 100644 --- a/drivers/staging/ccg/f_fs.c +++ b/drivers/staging/ccg/f_fs.c @@ -1223,6 +1223,7 @@ static struct file_system_type ffs_fs_type = { .mount = ffs_fs_mount, .kill_sb = ffs_fs_kill_sb, }; +MODULE_ALIAS_FS("functionfs"); /* Driver's main init/cleanup functions *************************************/ diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c index 38388d7844f..c377ff84bf2 100644 --- a/drivers/usb/gadget/f_fs.c +++ b/drivers/usb/gadget/f_fs.c @@ -1235,6 +1235,7 @@ static struct file_system_type ffs_fs_type = { .mount = ffs_fs_mount, .kill_sb = ffs_fs_kill_sb, }; +MODULE_ALIAS_FS("functionfs"); /* Driver's main init/cleanup functions *************************************/ diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index 8ac840f25ba..e2b2e9cf254 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -2105,6 +2105,7 @@ static struct file_system_type gadgetfs_type = { .mount = gadgetfs_mount, .kill_sb = gadgetfs_kill_sb, }; +MODULE_ALIAS_FS("gadgetfs"); /*----------------------------------------------------------------------*/ diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index ec0abb6df3c..71679875f05 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -75,6 +75,7 @@ static struct file_system_type xenfs_type = { .mount = xenfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("xenfs"); static int __init xenfs_init(void) { diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 91dad63e5a2..2756dcd5de6 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -365,3 +365,4 @@ struct file_system_type v9fs_fs_type = { .owner = THIS_MODULE, .fs_flags = FS_RENAME_DOES_D_MOVE, }; +MODULE_ALIAS_FS("9p"); diff --git a/fs/adfs/super.c b/fs/adfs/super.c index d5712293579..0ff4bae2c2a 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -524,6 +524,7 @@ static struct file_system_type adfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("adfs"); static int __init init_adfs_fs(void) { diff --git a/fs/affs/super.c b/fs/affs/super.c index b84dc735250..45161a832bb 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -622,6 +622,7 @@ static struct file_system_type affs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("affs"); static int __init init_affs_fs(void) { diff --git a/fs/afs/super.c b/fs/afs/super.c index 7c31ec39957..c4861557e38 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -45,6 +45,7 @@ struct file_system_type afs_fs_type = { .kill_sb = afs_kill_super, .fs_flags = 0, }; +MODULE_ALIAS_FS("afs"); static const struct super_operations afs_super_ops = { .statfs = afs_statfs, diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index cddc74b9cdb..b3db517e89e 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -26,6 +26,7 @@ static struct file_system_type autofs_fs_type = { .mount = autofs_mount, .kill_sb = autofs4_kill_sb, }; +MODULE_ALIAS_FS("autofs"); static int __init init_autofs4_fs(void) { diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index c8f4e25eb9e..8615ee89ab5 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -951,6 +951,7 @@ static struct file_system_type befs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("befs"); static int __init init_befs_fs(void) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 737aaa3f709..5e376bb9341 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -473,6 +473,7 @@ static struct file_system_type bfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("bfs"); static int __init init_bfs_fs(void) { diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index fecbbf3f8ff..751df5e4f61 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -720,6 +720,7 @@ static struct file_system_type bm_fs_type = { .mount = bm_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("binfmt_misc"); static int __init init_misc_binfmt(void) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 68a29a1ea06..f6b88595f85 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1558,6 +1558,7 @@ static struct file_system_type btrfs_fs_type = { .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("btrfs"); /* * used by btrfsctl to scan devices when no FS is mounted diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9fe17c6c287..6ddc0bca56b 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -952,6 +952,7 @@ static struct file_system_type ceph_fs_type = { .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE, }; +MODULE_ALIAS_FS("ceph"); #define _STRINGIFY(x) #x #define STRINGIFY(x) _STRINGIFY(x) diff --git a/fs/coda/inode.c b/fs/coda/inode.c index dada9d0abed..4dcc0d81a7a 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -329,4 +329,5 @@ struct file_system_type coda_fs_type = { .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("coda"); diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index aee0a7ebbd8..7f26c3cf75a 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -114,6 +114,7 @@ static struct file_system_type configfs_fs_type = { .mount = configfs_do_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("configfs"); struct dentry *configfs_pin_fs(void) { diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 3ceb9ec976e..35b1c7bd18b 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -573,6 +573,7 @@ static struct file_system_type cramfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("cramfs"); static int __init init_cramfs_fs(void) { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 0c4f80b447f..4888cb3fdef 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -299,6 +299,7 @@ static struct file_system_type debug_fs_type = { .mount = debug_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("debugfs"); static struct dentry *__create_file(const char *name, umode_t mode, struct dentry *parent, void *data, diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 073d30b9d1a..79b662985ef 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -510,6 +510,7 @@ static struct file_system_type devpts_fs_type = { .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, #endif }; +MODULE_ALIAS_FS("devpts"); /* * The normal naming convention is simply /dev/pts/; this conforms diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 4e0886c9e5c..e924cf45aad 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -629,6 +629,7 @@ static struct file_system_type ecryptfs_fs_type = { .kill_sb = ecryptfs_kill_block_super, .fs_flags = 0 }; +MODULE_ALIAS_FS("ecryptfs"); /** * inode_info_init_once diff --git a/fs/efs/super.c b/fs/efs/super.c index 2002431ef9a..c6f57a74a55 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -33,6 +33,7 @@ static struct file_system_type efs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("efs"); static struct pt_types sgi_pt_types[] = { {0x00, "SGI vh"}, diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 5e59280d42d..9d976332873 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -1010,6 +1010,7 @@ static struct file_system_type exofs_type = { .mount = exofs_mount, .kill_sb = generic_shutdown_super, }; +MODULE_ALIAS_FS("exofs"); static int __init init_exofs(void) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7f68c811402..288534920fe 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1536,6 +1536,7 @@ static struct file_system_type ext2_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext2"); static int __init init_ext2_fs(void) { diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5546ca225ff..1d6e2ed8532 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -3068,6 +3068,7 @@ static struct file_system_type ext3_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext3"); static int __init init_ext3_fs(void) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5e6c8783619..34e85521923 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -90,6 +90,7 @@ static struct file_system_type ext2_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) @@ -104,6 +105,7 @@ static struct file_system_type ext3_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) #else #define IS_EXT3_SB(sb) (0) @@ -5152,7 +5154,6 @@ static inline int ext2_feature_set_ok(struct super_block *sb) return 0; return 1; } -MODULE_ALIAS("ext2"); #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } @@ -5185,7 +5186,6 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 0; return 1; } -MODULE_ALIAS("ext3"); #else static inline void register_as_ext3(void) { } static inline void unregister_as_ext3(void) { } @@ -5199,6 +5199,7 @@ static struct file_system_type ext4_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext4"); static int __init ext4_init_feat_adverts(void) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8c117649a03..fea6e582a2e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -687,6 +687,7 @@ static struct file_system_type f2fs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index e2cfda94a28..081b759cff8 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -668,6 +668,7 @@ static struct file_system_type msdos_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("msdos"); static int __init init_msdos_fs(void) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index ac959d655e7..2da952036a3 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1073,6 +1073,7 @@ static struct file_system_type vfat_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("vfat"); static int __init init_vfat_fs(void) { diff --git a/fs/filesystems.c b/fs/filesystems.c index da165f6adcb..92567d95ba6 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -273,7 +273,7 @@ struct file_system_type *get_fs_type(const char *name) int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); - if (!fs && (request_module("%.*s", len, name) == 0)) + if (!fs && (request_module("fs-%.*s", len, name) == 0)) fs = __get_fs_type(name, len); if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index fed2c8afb3a..45507430806 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -52,7 +52,6 @@ MODULE_AUTHOR("Christoph Hellwig"); MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */ static void vxfs_put_super(struct super_block *); @@ -258,6 +257,7 @@ static struct file_system_type vxfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */ static int __init vxfs_init(void) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index b7978b9f75e..a0b0855d00a 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -341,6 +341,7 @@ static struct file_system_type fuse_ctl_fs_type = { .mount = fuse_ctl_mount, .kill_sb = fuse_ctl_kill_sb, }; +MODULE_ALIAS_FS("fusectl"); int __init fuse_ctl_init(void) { diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index df00993ed10..137185c3884 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1117,6 +1117,7 @@ static struct file_system_type fuse_fs_type = { .mount = fuse_mount, .kill_sb = fuse_kill_sb_anon, }; +MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, @@ -1146,6 +1147,7 @@ static struct file_system_type fuseblk_fs_type = { .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, }; +MODULE_ALIAS_FS("fuseblk"); static inline int register_fuseblk(void) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1b612be4b87..60ede2a0f43 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -1425,6 +1426,7 @@ struct file_system_type gfs2_fs_type = { .kill_sb = gfs2_kill_sb, .owner = THIS_MODULE, }; +MODULE_ALIAS_FS("gfs2"); struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", @@ -1432,4 +1434,4 @@ struct file_system_type gfs2meta_fs_type = { .mount = gfs2_mount_meta, .owner = THIS_MODULE, }; - +MODULE_ALIAS_FS("gfs2meta"); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index e93ddaadfd1..bbaaa8a4ee6 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -466,6 +466,7 @@ static struct file_system_type hfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("hfs"); static void hfs_init_once(void *p) { diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 974c26f96fa..7b87284e46d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -654,6 +654,7 @@ static struct file_system_type hfsplus_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("hfsplus"); static void hfsplus_init_once(void *p) { diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 74f55703be4..126d3c2e2de 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -748,6 +748,7 @@ static struct file_system_type hppfs_type = { .kill_sb = kill_anon_super, .fs_flags = 0, }; +MODULE_ALIAS_FS("hppfs"); static int __init init_hppfs(void) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7f94e0cbc69..84e3d856e91 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -896,6 +896,7 @@ static struct file_system_type hugetlbfs_fs_type = { .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("hugetlbfs"); static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 67ce52507d7..a67f16e846a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1556,6 +1556,7 @@ static struct file_system_type iso9660_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("iso9660"); static int __init init_iso9660_fs(void) { @@ -1593,5 +1594,3 @@ static void __exit exit_iso9660_fs(void) module_init(init_iso9660_fs) module_exit(exit_iso9660_fs) MODULE_LICENSE("GPL"); -/* Actual filesystem name is iso9660, as requested in filesystems.c */ -MODULE_ALIAS("iso9660"); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d3d8799e218..0defb1cc2a3 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -356,6 +356,7 @@ static struct file_system_type jffs2_fs_type = { .mount = jffs2_mount, .kill_sb = jffs2_kill_sb, }; +MODULE_ALIAS_FS("jffs2"); static int __init init_jffs2_fs(void) { diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 060ba638bec..2003e830ed1 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -833,6 +833,7 @@ static struct file_system_type jfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("jfs"); static void init_once(void *foo) { diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 345c24b8a6f..54360293bcb 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -608,6 +608,7 @@ static struct file_system_type logfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("logfs"); static int __init logfs_init(void) { diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 99541cceb58..df122496f32 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -660,6 +660,7 @@ static struct file_system_type minix_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("minix"); static int __init init_minix_fs(void) { diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 7dafd6899a6..26910c8154d 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -1051,6 +1051,7 @@ static struct file_system_type ncp_fs_type = { .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("ncpfs"); static int __init init_ncp_fs(void) { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 17b32b72245..95cdcb208df 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -294,6 +294,7 @@ struct file_system_type nfs_fs_type = { .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("nfs"); EXPORT_SYMBOL_GPL(nfs_fs_type); struct file_system_type nfs_xdev_fs_type = { @@ -333,6 +334,7 @@ struct file_system_type nfs4_fs_type = { .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("nfs4"); EXPORT_SYMBOL_GPL(nfs4_fs_type); static int __init register_nfs4_fs(void) @@ -2717,6 +2719,5 @@ module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, "Send implementation ID with NFSv4.1 exchange_id"); MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); -MODULE_ALIAS("nfs4"); #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 13a21c8fca4..f33455b4d95 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1090,6 +1090,7 @@ static struct file_system_type nfsd_fs_type = { .mount = nfsd_mount, .kill_sb = nfsd_umount, }; +MODULE_ALIAS_FS("nfsd"); #ifdef CONFIG_PROC_FS static int create_proc_exports_entry(void) diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3c991dc84f2..c7d1f9f18b0 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1361,6 +1361,7 @@ struct file_system_type nilfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("nilfs2"); static void nilfs_inode_init_once(void *obj) { diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 4a8289f8b16..82650d52d91 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3079,6 +3079,7 @@ static struct file_system_type ntfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ntfs"); /* Stable names for the slab caches. */ static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 4c5fc8d77dc..12bafb7265c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -640,6 +640,7 @@ static struct file_system_type dlmfs_fs_type = { .mount = dlmfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("ocfs2_dlmfs"); static int __init init_dlmfs_fs(void) { diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 25d715c7c87..d8b0afde217 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -572,6 +572,7 @@ static struct file_system_type omfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("omfs"); static int __init init_omfs_fs(void) { diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index ae47fa7efb9..75885ffde44 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -432,6 +432,7 @@ static struct file_system_type openprom_fs_type = { .mount = openprom_mount, .kill_sb = kill_anon_super, }; +MODULE_ALIAS_FS("openpromfs"); static void op_inode_init_once(void *data) { diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 43098bb5723..2e8caa62da7 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -412,6 +412,7 @@ static struct file_system_type qnx4_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("qnx4"); static int __init init_qnx4_fs(void) { diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 57199a52a35..8d941edfefa 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -672,6 +672,7 @@ static struct file_system_type qnx6_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("qnx6"); static int __init init_qnx6_fs(void) { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 418bdc3a57d..194113b1b11 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2434,6 +2434,7 @@ struct file_system_type reiserfs_fs_type = { .kill_sb = reiserfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("reiserfs"); MODULE_DESCRIPTION("ReiserFS journaled filesystem"); MODULE_AUTHOR("Hans Reiser "); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 7e8d3a80bda..15cbc41ee36 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -599,6 +599,7 @@ static struct file_system_type romfs_fs_type = { .kill_sb = romfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("romfs"); /* * inode storage initialiser diff --git a/fs/sysv/super.c b/fs/sysv/super.c index a38e87bdd78..a39938b1fee 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -545,6 +545,7 @@ static struct file_system_type sysv_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("sysv"); static struct file_system_type v7_fs_type = { .owner = THIS_MODULE, @@ -553,6 +554,7 @@ static struct file_system_type v7_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("v7"); static int __init init_sysv_fs(void) { @@ -586,5 +588,4 @@ static void __exit exit_sysv_fs(void) module_init(init_sysv_fs) module_exit(exit_sysv_fs) -MODULE_ALIAS("v7"); MODULE_LICENSE("GPL"); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ddc0f6ae65e..ac838b84493 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2174,6 +2174,7 @@ static struct file_system_type ubifs_fs_type = { .mount = ubifs_mount, .kill_sb = kill_ubifs_super, }; +MODULE_ALIAS_FS("ubifs"); /* * Inode slab cache constructor. diff --git a/fs/ufs/super.c b/fs/ufs/super.c index dc8e3a861d0..329f2f53b7e 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1500,6 +1500,7 @@ static struct file_system_type ufs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ufs"); static int __init init_ufs_fs(void) { diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c407121873b..ea341cea68c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1561,6 +1561,7 @@ static struct file_system_type xfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) diff --git a/include/linux/fs.h b/include/linux/fs.h index 74a907b8b95..2c28271ab9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1825,6 +1825,8 @@ struct file_system_type { struct lock_class_key i_mutex_dir_key; }; +#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) + extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_bdev(struct file_system_type *fs_type, diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 7b9b40224a2..a0f48a51e14 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1174,6 +1174,7 @@ static struct file_system_type rpc_pipe_fs_type = { .mount = rpc_mount, .kill_sb = rpc_kill_sb, }; +MODULE_ALIAS_FS("rpc_pipefs"); static void init_once(void *foo) @@ -1218,6 +1219,3 @@ void unregister_rpc_pipefs(void) kmem_cache_destroy(rpc_inode_cachep); unregister_filesystem(&rpc_pipe_fs_type); } - -/* Make 'mount -t rpc_pipefs ...' autoload this module. */ -MODULE_ALIAS("rpc_pipefs"); -- cgit v1.2.3-70-g09d2 From 357b66fdc8ad4cea6e6336956a70742f961f0a4d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:34:34 -0500 Subject: ext4: ext4_split_extent should take care of extent zeroout When ext4_split_extent_at() ends up doing zeroout & conversion to initialized instead of split & conversion, ext4_split_extent() gets confused and can wrongly mark the extent back as uninitialized resulting in end IO code getting confused from large unwritten extents and may result in data loss. The example of problematic behavior is: lblk len lblk len ext4_split_extent() (ex=[1000,30,uninit], map=[1010,10]) ext4_split_extent_at() (split [1000,30,uninit] at 1020) ext4_ext_insert_extent() -> ENOSPC ext4_ext_zeroout() -> extent [1000,30] is now initialized ext4_split_extent_at() (split [1000,30,init] at 1010, MARK_UNINIT1 | MARK_UNINIT2) -> extent is split and parts marked as uninitialized Fix the problem by rechecking extent type after the first ext4_split_extent_at() returns. None of split_flags can not be applied to initialized extent so this patch also add BUG_ON to prevent similar issues in future. TESTCASE: https://github.com/dmonakhov/xfstests/commit/b8a55eb5ce28c6ff29e620ab090902fcd5833597 Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 372b2cbee07..bef194a1443 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2943,6 +2943,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_uninitialized(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3061,19 +3065,26 @@ static int ext4_split_extent(handle_t *handle, if (err) goto out; } - + /* + * Update path is required because previous ext4_split_extent_at() may + * result in split of original leaf or extent zeroout. + */ ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + uninitialized = ext4_ext_is_uninitialized(ex); + split_flag1 = 0; if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_DATA_VALID2); - if (uninitialized) + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (uninitialized) { split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT2); + } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); if (err) -- cgit v1.2.3-70-g09d2 From ec22ba8edb507395c95fbc617eea26a6b2d98797 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:36:06 -0500 Subject: ext4: disable merging of uninitialized extents Derived from Jan's patch:http://permalink.gmane.org/gmane.comp.file-systems.ext4/36470 Merging of uninitialized extents creates all sorts of interesting race possibilities when writeback / DIO races with fallocate. Thus ext4_convert_unwritten_extents_endio() has to deal with a case where extent to be converted needs to be split out first. That isn't nice for two reasons: 1) It may need allocation of extent tree block so ENOSPC is possible. 2) It complicates end_io handling code So we disable merging of uninitialized extents which allows us to simplify the code. Extents will get merged after they are converted to initialized ones. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bef194a1443..60818ed1f6a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1584,10 +1584,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, unsigned short ext1_ee_len, ext2_ee_len, max_len; /* - * Make sure that either both extents are uninitialized, or - * both are _not_. + * Make sure that both extents are initialized. We don't merge + * uninitialized extents so that we can be sure that end_io code has + * the extent that was written properly split out and conversion to + * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) return 0; if (ext4_ext_is_uninitialized(ex1)) -- cgit v1.2.3-70-g09d2 From ff95ec22cd7faa0d8b58dcc4207f21502df7b00b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:41:05 -0500 Subject: ext4: add warning to ext4_convert_unwritten_extents_endio Splitting extents inside endio is a bad thing, but unfortunately it is still possible. In fact we are pretty close to the moment when all related issues will be fixed. Let's warn developer if it still the case. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 60818ed1f6a..265cb0e50c5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3387,8 +3387,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested then split is required */ + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif err = ext4_split_unwritten_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT); if (err < 0) -- cgit v1.2.3-70-g09d2 From de99fcce1da7933a90198b80a2e896754ea3bdc8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Mar 2013 00:43:32 -0500 Subject: ext4: remove unnecessary wait for extent conversion in ext4_fallocate() Now that we don't merge uninitialized extents anymore, ext4_fallocate() is free to operate on the inode while there are still some extent conversions pending - it won't disturb them in any way. Reviewed-by: Zheng Liu Reviewed-by: Dmitry Monakhov Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 265cb0e50c5..25c86aaa38d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4392,8 +4392,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (len <= EXT_UNINIT_MAX_LEN << blkbits) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Prevent race condition between unwritten */ - ext4_flush_unwritten_io(inode); retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; -- cgit v1.2.3-70-g09d2 From 6ca470d7b5e7639b7925b3202e796282703b6d5d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:50:47 -0500 Subject: ext4: invalidate extent status tree during extent migration mext_replace_branches() will change inode's extents layout so we have to drop corresponding cache. TESTCASE: 301'th xfstest was not yet accepted to official xfstest's branch and can be found here: https://github.com/dmonakhov/xfstests/commit/7b7efeee30a41109201e2040034e71db9b66ddc0 Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/move_extent.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index d78c33eed7e..c1f15b203e9 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -666,6 +666,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; + *err = ext4_es_remove_extent(orig_inode, from, count); + if (*err) + goto out; + + *err = ext4_es_remove_extent(donor_inode, from, count); + if (*err) + goto out; + /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) -- cgit v1.2.3-70-g09d2 From 290502bee239062499297916bb7d21d205e99d62 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 28 Feb 2013 00:39:37 -0800 Subject: eCryptfs: allow userspace messaging to be disabled When the userspace messaging (for the less common case of userspace key wrap/unwrap via ecryptfsd) is not needed, allow eCryptfs to build with it removed. This saves on kernel code size and reduces potential attack surface by removing the /dev/ecryptfs node. Signed-off-by: Kees Cook Signed-off-by: Tyler Hicks --- fs/ecryptfs/Kconfig | 8 ++++++++ fs/ecryptfs/Makefile | 7 +++++-- fs/ecryptfs/ecryptfs_kernel.h | 40 ++++++++++++++++++++++++++++++++++++++-- fs/ecryptfs/keystore.c | 4 ++-- include/linux/ecryptfs.h | 12 ++---------- 5 files changed, 55 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index cc16562654d..1f63120b669 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -12,3 +12,11 @@ config ECRYPT_FS To compile this file system support as a module, choose M here: the module will be called ecryptfs. + +config ECRYPT_FS_MESSAGING + bool "Enable notifications for userspace key wrap/unwrap" + depends on ECRYPT_FS + help + Enables the /dev/ecryptfs entry for use by ecryptfsd. This allows + for userspace to wrap/unwrap file encryption keys by other + backends, like OpenSSL. diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index 2cc9ee4ad2e..49678a69947 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -1,7 +1,10 @@ # -# Makefile for the Linux 2.6 eCryptfs +# Makefile for the Linux eCryptfs # obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o +ecryptfs-y := dentry.o file.o inode.o main.o super.o mmap.o read_write.o \ + crypto.o keystore.o kthread.o debug.o + +ecryptfs-$(CONFIG_ECRYPT_FS_MESSAGING) += messaging.o miscdev.o diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index cfb4b9fed52..a9df69efadc 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -172,6 +172,19 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 #define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) +#ifdef CONFIG_ECRYPT_FS_MESSAGING +# define ECRYPTFS_VERSIONING_MASK_MESSAGING (ECRYPTFS_VERSIONING_DEVMISC \ + | ECRYPTFS_VERSIONING_PUBKEY) +#else +# define ECRYPTFS_VERSIONING_MASK_MESSAGING 0 +#endif + +#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ + | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ + | ECRYPTFS_VERSIONING_XATTR \ + | ECRYPTFS_VERSIONING_MULTKEY \ + | ECRYPTFS_VERSIONING_MASK_MESSAGING \ + | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) struct ecryptfs_key_sig { struct list_head crypt_stat_list; char keysig[ECRYPTFS_SIG_SIZE_HEX + 1]; @@ -399,7 +412,9 @@ struct ecryptfs_daemon { struct hlist_node euid_chain; }; +#ifdef CONFIG_ECRYPT_FS_MESSAGING extern struct mutex ecryptfs_daemon_hash_mux; +#endif static inline size_t ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) @@ -604,6 +619,7 @@ int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); +#ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_process_response(struct ecryptfs_daemon *daemon, struct ecryptfs_message *msg, u32 seq); int ecryptfs_send_message(char *data, int data_len, @@ -612,6 +628,24 @@ int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, struct ecryptfs_message **emsg); int ecryptfs_init_messaging(void); void ecryptfs_release_messaging(void); +#else +static inline int ecryptfs_init_messaging(void) +{ + return 0; +} +static inline void ecryptfs_release_messaging(void) +{ } +static inline int ecryptfs_send_message(char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx) +{ + return -ENOTCONN; +} +static inline int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **emsg) +{ + return -ENOMSG; +} +#endif void ecryptfs_write_header_metadata(char *virt, @@ -649,12 +683,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); -int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); -int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon); int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, size_t *length_size); int ecryptfs_write_packet_length(char *dest, size_t size, size_t *packet_size_length); +#ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_init_ecryptfs_miscdev(void); void ecryptfs_destroy_ecryptfs_miscdev(void); int ecryptfs_send_miscdev(char *data, size_t data_size, @@ -663,6 +696,9 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); int ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file); +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon); +#endif int ecryptfs_init_kthread(void); void ecryptfs_destroy_kthread(void); int ecryptfs_privileged_open(struct file **lower_file, diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 5aceff202dc..7d52806c211 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1168,7 +1168,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { ecryptfs_printk(KERN_ERR, "Error sending message to " - "ecryptfsd\n"); + "ecryptfsd: %d\n", rc); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); @@ -1988,7 +1988,7 @@ pki_encrypt_session_key(struct key *auth_tok_key, rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { ecryptfs_printk(KERN_ERR, "Error sending message to " - "ecryptfsd\n"); + "ecryptfsd: %d\n", rc); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); diff --git a/include/linux/ecryptfs.h b/include/linux/ecryptfs.h index 2224a8c0cb6..8d5ab998a22 100644 --- a/include/linux/ecryptfs.h +++ b/include/linux/ecryptfs.h @@ -6,9 +6,8 @@ #define ECRYPTFS_VERSION_MINOR 0x04 #define ECRYPTFS_SUPPORTED_FILE_VERSION 0x03 /* These flags indicate which features are supported by the kernel - * module; userspace tools such as the mount helper read - * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine - * how to behave. */ + * module; userspace tools such as the mount helper read the feature + * bits from a sysfs handle in order to determine how to behave. */ #define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 #define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 #define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004 @@ -19,13 +18,6 @@ #define ECRYPTFS_VERSIONING_HMAC 0x00000080 #define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 #define ECRYPTFS_VERSIONING_GCM 0x00000200 -#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ - | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ - | ECRYPTFS_VERSIONING_PUBKEY \ - | ECRYPTFS_VERSIONING_XATTR \ - | ECRYPTFS_VERSIONING_MULTKEY \ - | ECRYPTFS_VERSIONING_DEVMISC \ - | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH #define ECRYPTFS_SALT_SIZE 8 -- cgit v1.2.3-70-g09d2 From 9bf7a4890518186238d2579be16ecc5190a707c0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 1 Mar 2013 13:35:47 -0500 Subject: Btrfs: use set_nlink if our i_nlink is 0 We need to inc the nlink of deleted entries when running replay so we can do the unlink on the fs_root and get everything cleaned up and then have the orphan cleanup do the right thing. The problem is inc_nlink complains about this, even thought it still does the right thing. So use set_nlink() if our i_nlink is 0 to keep users from seeing the warnings during log replay. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c7ef569eb22..451fad96ecd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1382,7 +1382,10 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - btrfs_inc_nlink(inode); + if (!inode->i_nlink) + set_nlink(inode, 1); + else + btrfs_inc_nlink(inode); ret = btrfs_update_inode(trans, root, inode); } else if (ret == -EEXIST) { ret = 0; -- cgit v1.2.3-70-g09d2 From aec8030a8745221c8658f2033b22c98528897b13 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 4 Mar 2013 09:44:29 +0000 Subject: Btrfs: fix wrong handle at error path of create_snapshot() when the commit fails There are several bugs at error path of create_snapshot() when the transaction commitment failed. - access the freed transaction handler. At the end of the transaction commitment, the transaction handler was freed, so we should not access it after the transaction commitment. - we were not aware of the error which happened during the snapshot creation if we submitted a async transaction commitment. - pending snapshot access vs pending snapshot free. when something wrong happened after we submitted a async transaction commitment, the transaction committer would cleanup the pending snapshots and free them. But the snapshot creators were not aware of it, they would access the freed pending snapshots. This patch fixes the above problems by: - remove the dangerous code that accessed the freed handler - assign ->error if the error happens during the snapshot creation - the transaction committer doesn't free the pending snapshots, just assigns the error number and evicts them before we unblock the transaction. Reported-by: Dan Carpenter Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 16 ++++++-------- fs/btrfs/ioctl.c | 6 +----- fs/btrfs/transaction.c | 58 ++++++++++++++++++++++++++++---------------------- 3 files changed, 41 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02369a3c162..7d84651e850 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -62,7 +62,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, @@ -3687,7 +3687,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3700,10 +3700,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) snapshot = list_entry(splice.next, struct btrfs_pending_snapshot, list); - + snapshot->error = -ECANCELED; list_del_init(&snapshot->list); - - kfree(snapshot); } } @@ -3840,6 +3838,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->blocked = 1; wake_up(&root->fs_info->transaction_blocked_wait); + btrfs_evict_pending_snapshots(cur_trans); + cur_trans->blocked = 0; wake_up(&root->fs_info->transaction_wait); @@ -3849,8 +3849,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_pending_snapshots(cur_trans); - btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(root, @@ -3894,6 +3892,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); + btrfs_evict_pending_snapshots(t); + t->blocked = 0; smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) @@ -3907,8 +3907,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_pending_snapshots(t); - btrfs_destroy_delalloc_inodes(root); spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b908960c974..94c0e42dfa1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -596,12 +596,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); } - if (ret) { - /* cleanup_transaction has freed this for us */ - if (trans->aborted) - pending_snapshot = NULL; + if (ret) goto fail; - } ret = pending_snapshot->error; if (ret) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f11c2e0a374..d8fce6fe9cf 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1053,7 +1053,12 @@ int btrfs_defrag_root(struct btrfs_root *root) /* * new snapshots need to be created at a very specific time in the - * transaction commit. This does the actual creation + * transaction commit. This does the actual creation. + * + * Note: + * If the error which may affect the commitment of the current transaction + * happens, we should return the error number. If the error which just affect + * the creation of the pending snapshots, just return 0. */ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -1072,7 +1077,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct extent_buffer *tmp; struct extent_buffer *old; struct timespec cur_time = CURRENT_TIME; - int ret; + int ret = 0; u64 to_reserve = 0; u64 index = 0; u64 objectid; @@ -1081,40 +1086,36 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { - ret = pending->error = -ENOMEM; - return ret; + pending->error = -ENOMEM; + return 0; } new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { - ret = pending->error = -ENOMEM; + pending->error = -ENOMEM; goto root_item_alloc_fail; } - ret = btrfs_find_free_objectid(tree_root, &objectid); - if (ret) { - pending->error = ret; + pending->error = btrfs_find_free_objectid(tree_root, &objectid); + if (pending->error) goto no_free_objectid; - } btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(root, &pending->block_rsv, - to_reserve, - BTRFS_RESERVE_NO_FLUSH); - if (ret) { - pending->error = ret; + pending->error = btrfs_block_rsv_add(root, + &pending->block_rsv, + to_reserve, + BTRFS_RESERVE_NO_FLUSH); + if (pending->error) goto no_free_objectid; - } } - ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - pending->error = ret; + pending->error = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (pending->error) goto no_free_objectid; - } key.objectid = objectid; key.offset = (u64)-1; @@ -1142,7 +1143,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.len, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; - goto fail; + goto dir_item_existed; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); btrfs_abort_transaction(trans, root, ret); @@ -1273,6 +1274,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, root, ret); fail: + pending->error = ret; +dir_item_existed: trans->block_rsv = rsv; trans->bytes_reserved = 0; no_free_objectid: @@ -1288,12 +1291,17 @@ root_item_alloc_fail: static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - struct btrfs_pending_snapshot *pending; + struct btrfs_pending_snapshot *pending, *next; struct list_head *head = &trans->transaction->pending_snapshots; + int ret = 0; - list_for_each_entry(pending, head, list) - create_pending_snapshot(trans, fs_info, pending); - return 0; + list_for_each_entry_safe(pending, next, head, list) { + list_del(&pending->list); + ret = create_pending_snapshot(trans, fs_info, pending); + if (ret) + break; + } + return ret; } static void update_super_roots(struct btrfs_root *root) -- cgit v1.2.3-70-g09d2 From 00d71c9c17b1fd28fa54f323a29a0e23c6d3de40 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 4 Mar 2013 09:45:06 +0000 Subject: Btrfs: fix unclosed transaction handler when the async transaction commitment fails If the async transaction commitment failed, we need close the current transaction handler, or the current transaction will be blocked to commit because of this orphan handler. We fix the problem by doing sync transaction commitment, that is to invoke btrfs_commit_transaction(). Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 94c0e42dfa1..3fdfabcc1aa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -527,6 +527,8 @@ fail: if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); + if (err) + err = btrfs_commit_transaction(trans, root); } else { err = btrfs_commit_transaction(trans, root); } @@ -592,6 +594,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, *async_transid = trans->transid; ret = btrfs_commit_transaction_async(trans, root->fs_info->extent_root, 1); + if (ret) + ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); -- cgit v1.2.3-70-g09d2 From 8f71f3e0e4fb5a2445fb93d3057a33aefc4aa30d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 4 Mar 2013 16:25:36 +0000 Subject: Btrfs: check for NULL pointer in updating reloc roots Add a check for NULL pointer to avoid invalid reference. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ba5a3210da9..16e0c6fbdbe 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1269,6 +1269,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del) } spin_unlock(&rc->reloc_root_tree.lock); + if (!node) + return 0; BUG_ON((struct btrfs_root *)node->data != root); if (!del) { -- cgit v1.2.3-70-g09d2 From aca1bba6f9bc550a33312b28e98b3de5ddb3bb15 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 4 Mar 2013 16:25:37 +0000 Subject: Btrfs: build up error handling for merge_reloc_roots We first use btrfs_std_error hook to replace with BUG_ON, and we also need to cleanup what is left, including reloc roots rbtree and reloc roots list. Here we use a helper function to cleanup both rbtree and list, and since this function can also be used in the balance recover path, we also make the change as well to keep code simple. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 47 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 16e0c6fbdbe..0f001c14eaf 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2239,6 +2239,21 @@ again: return err; } +static noinline_for_stack +void free_reloc_roots(struct list_head *list) +{ + struct btrfs_root *reloc_root; + + while (!list_empty(list)) { + reloc_root = list_entry(list->next, struct btrfs_root, + root_list); + __update_reloc_root(reloc_root, 1); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); + } +} + static noinline_for_stack int merge_reloc_roots(struct reloc_control *rc) { @@ -2246,7 +2261,7 @@ int merge_reloc_roots(struct reloc_control *rc) struct btrfs_root *reloc_root; LIST_HEAD(reloc_roots); int found = 0; - int ret; + int ret = 0; again: root = rc->extent_root; @@ -2272,20 +2287,33 @@ again: BUG_ON(root->reloc_root != reloc_root); ret = merge_reloc_root(rc, root); - BUG_ON(ret); + if (ret) + goto out; } else { list_del_init(&reloc_root->root_list); } ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); - BUG_ON(ret < 0); + if (ret < 0) { + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); + goto out; + } } if (found) { found = 0; goto again; } +out: + if (ret) { + btrfs_std_error(root->fs_info, ret); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + } + BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); - return 0; + return ret; } static void free_block_list(struct rb_root *blocks) @@ -4266,14 +4294,9 @@ int btrfs_recover_relocation(struct btrfs_root *root) out_free: kfree(rc); out: - while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - list_del(&reloc_root->root_list); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); - } + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + btrfs_free_path(path); if (err == 0) { -- cgit v1.2.3-70-g09d2 From e1a1267054ea9dcafd01636e52d441f809efad5e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 4 Mar 2013 16:25:38 +0000 Subject: Btrfs: free all recorded tree blocks on error We've missed the 'free blocks' part on ENOMEM error. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0f001c14eaf..7d1654cf9d4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2848,8 +2848,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, int err = 0; path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + err = -ENOMEM; + goto out_path; + } rb_node = rb_first(blocks); while (rb_node) { @@ -2888,10 +2890,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, rb_node = rb_next(rb_node); } out: - free_block_list(blocks); err = finish_pending_nodes(trans, rc, path, err); btrfs_free_path(path); +out_path: + free_block_list(blocks); return err; } -- cgit v1.2.3-70-g09d2 From 288189471d7e12fb22c37870e151833f438deea8 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 4 Mar 2013 16:25:39 +0000 Subject: Btrfs: do not BUG_ON in prepare_to_reloc We can bail out from here gracefully instead of a cold BUG_ON. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7d1654cf9d4..9d13786eec7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3731,7 +3731,15 @@ int prepare_to_relocate(struct reloc_control *rc) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + /* + * extent tree is not a ref_cow tree and has no reloc_root to + * cleanup. And callers are responsible to free the above + * block rsv. + */ + return PTR_ERR(trans); + } btrfs_commit_transaction(trans, rc->extent_root); return 0; } -- cgit v1.2.3-70-g09d2 From 0f788c58194e4ccc5b3ab23f872c5e18542335e4 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 4 Mar 2013 16:25:40 +0000 Subject: Btrfs: do not BUG_ON on aborted situation Btrfs balance can easily hit BUG_ON in these places, but we want to it bail out gracefully after we force the whole filesystem to readonly. So we use btrfs_std_error hook in place of BUG_ON. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 6 +++++- fs/btrfs/volumes.c | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9d13786eec7..3ebe87977aa 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3771,7 +3771,11 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) while (1) { progress++; trans = btrfs_start_transaction(rc->extent_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + break; + } restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans, rc->extent_root); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 35bb2d4ed29..9ff454df675 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2379,7 +2379,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, return ret; trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_std_error(root->fs_info, ret); + return ret; + } lock_chunks(root); @@ -3050,7 +3054,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); - BUG_ON(ret); + if (ret) + btrfs_std_error(fs_info, ret); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } -- cgit v1.2.3-70-g09d2 From 66b6135b7cf741f6f44ba938b27583ea3b83bd12 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 4 Mar 2013 16:25:41 +0000 Subject: Btrfs: avoid deadlock on transaction waiting list Only let one trans handle to wait for other handles, otherwise we will get ABBA issues. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d8fce6fe9cf..fedede1fe17 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1457,6 +1457,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, err); spin_lock(&root->fs_info->trans_lock); + + if (list_empty(&cur_trans->list)) { + spin_unlock(&root->fs_info->trans_lock); + btrfs_end_transaction(trans, root); + return; + } + list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { root->fs_info->trans_no_join = 1; -- cgit v1.2.3-70-g09d2 From 9b53157aac7366cea413ee29b629f83225829e87 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 4 Mar 2013 17:28:38 +0000 Subject: Btrfs: allow running defrag in parallel to administrative tasks Commit 5ac00add added a testnset mutex and code that disallows running administrative tasks in parallel. It is prevented that the device add/delete/balance/replace/resize operations are started in parallel. By mistake, the defragmentation operation was included in the check for mutually exclusiveness as well. This is fixed with this commit. Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3fdfabcc1aa..898c5729e7e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2245,13 +2245,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (ret) return ret; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - mnt_drop_write_file(file); - return -EINVAL; - } - if (btrfs_root_readonly(root)) { ret = -EROFS; goto out; @@ -2306,7 +2299,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); mnt_drop_write_file(file); return ret; } -- cgit v1.2.3-70-g09d2 From 154ea2893002618bc3f9a1e2d8186c65490968b1 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 5 Mar 2013 11:11:26 -0500 Subject: Btrfs: enforce min_bytes parameter during extent allocation Commit 24542bf7ea5e4fdfdb5157ff544c093fa4dcb536 changed preallocation of extents to cap the max size we try to allocate. It's a valid change, but the extent reservation code is also used by balance, and that can't tolerate a smaller extent being allocated. __btrfs_prealloc_file_range already has a min_size parameter, which is used by relocation to request a specific extent size. This commit adds an extra check to enforce that minimum extent size. Signed-off-by: Chris Mason Reported-by: Stefan Behrens --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ecd9c4cdb0d..13ab4de0a40 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8502,6 +8502,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_key ins; u64 cur_offset = start; u64 i_size; + u64 cur_bytes; int ret = 0; bool own_trans = true; @@ -8516,8 +8517,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - ret = btrfs_reserve_extent(trans, root, - min(num_bytes, 256ULL * 1024 * 1024), + cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = max(cur_bytes, min_size); + ret = btrfs_reserve_extent(trans, root, cur_bytes, min_size, 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) -- cgit v1.2.3-70-g09d2 From 72d282dc5109e5dc0d963be020604e0cc82f7ed7 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Tue, 5 Mar 2013 19:25:55 +0000 Subject: cifs: Fix bug when checking error condition in cifs_rename_pending_delete() Fix check for error condition after setting attributes with CIFSSMBSetFileInfo(). Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 83f2606c76d..e7ae45cd94e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1023,7 +1023,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, current->tgid); /* although we would like to mark the file hidden if that fails we will still try to rename it */ - if (rc != 0) + if (!rc) cifsInode->cifsAttrs = dosattr; else dosattr = origattr; /* since not able to change them */ -- cgit v1.2.3-70-g09d2 From c483a9841df1de327e01af7deb6ba349210e5f82 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Tue, 5 Mar 2013 19:25:56 +0000 Subject: cifs: Check server capability before attempting silly rename cifs_rename_pending_delete() attempts to silly rename file using CIFSSMBRenameOpenFile(). This uses the SET_FILE_INFORMATION TRANS2 command with information level set to the passthru info-level SMB_SET_FILE_RENAME_INFORMATION. We need to check to make sure that the server support passthru info-levels before attempting the silly rename or else we will fail to rename the file. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index e7ae45cd94e..0079696305c 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -995,6 +995,15 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + /* + * We cannot rename the file if the server doesn't support + * CAP_INFOLEVEL_PASSTHRU + */ + if (!(tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)) { + rc = -EBUSY; + goto out; + } + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, &netfid, &oplock, NULL, cifs_sb->local_nls, -- cgit v1.2.3-70-g09d2 From 25189643a165a5ccad48ee3c116b55905299fd3d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 21 Feb 2013 06:32:59 -0500 Subject: cifs: remove the sockopt= mount option ...as promised for 3.9. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 54125e04fd0..991c63c6bdd 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -97,7 +97,7 @@ enum { Opt_user, Opt_pass, Opt_ip, Opt_unc, Opt_domain, Opt_srcaddr, Opt_prefixpath, - Opt_iocharset, Opt_sockopt, + Opt_iocharset, Opt_netbiosname, Opt_servern, Opt_ver, Opt_vers, Opt_sec, Opt_cache, @@ -202,7 +202,6 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_srcaddr, "srcaddr=%s" }, { Opt_prefixpath, "prefixpath=%s" }, { Opt_iocharset, "iocharset=%s" }, - { Opt_sockopt, "sockopt=%s" }, { Opt_netbiosname, "netbiosname=%s" }, { Opt_servern, "servern=%s" }, { Opt_ver, "ver=%s" }, @@ -1752,19 +1751,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, */ cFYI(1, "iocharset set to %s", string); break; - case Opt_sockopt: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - - if (strnicmp(string, "TCP_NODELAY", 11) == 0) { - printk(KERN_WARNING "CIFS: the " - "sockopt=TCP_NODELAY option has been " - "deprecated and will be removed " - "in 3.9\n"); - vol->sockopt_tcp_nodelay = 1; - } - break; case Opt_netbiosname: string = match_strdup(args); if (string == NULL) -- cgit v1.2.3-70-g09d2 From 94e18007688a13e5da1a2f1b7f52f15cc56c9f5e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 4 Mar 2013 15:18:25 -0500 Subject: cifs: don't try to unlock pagecache page after releasing it We had a recent fix to fix the release of pagecache pages when cifs_writev_requeue writes fail. Unfortunately, it releases the page before trying to unlock it. At that point, the page might be gone by the time the unlock comes in. Unlock the page first before checking the value of "rc", and only then end writeback and release the pages. The page lock isn't required for any of those operations so this should be safe. Reported-by: Anton Altaparmakov Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7353bc5d73d..8e2e799e7a2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1909,12 +1909,12 @@ cifs_writev_requeue(struct cifs_writedata *wdata) } while (rc == -EAGAIN); for (i = 0; i < wdata->nr_pages; i++) { + unlock_page(wdata->pages[i]); if (rc != 0) { SetPageError(wdata->pages[i]); end_page_writeback(wdata->pages[i]); page_cache_release(wdata->pages[i]); } - unlock_page(wdata->pages[i]); } mapping_set_error(inode->i_mapping, rc); -- cgit v1.2.3-70-g09d2 From 067785c40e52089993757afa28988c05f3cb2694 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 6 Mar 2013 19:38:36 +0400 Subject: CIFS: Fix missing of oplock_read value in smb30_values structure Cc: stable@vger.kernel.org Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c9c7aa7ed96..bceffe7b8f8 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -744,4 +744,5 @@ struct smb_version_values smb30_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, }; -- cgit v1.2.3-70-g09d2 From 3a01aa7a25274308fe813a6237f678aed901cea3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 6 Mar 2013 01:57:55 -0700 Subject: Btrfs: fix a mismerge in btrfs_balance() Raid56 merge (merge commit e942f88) had mistakenly removed a call to __cancel_balance(), which resulted in balance not cleaning up after itself after a successful finish. (Cleanup includes switching the state, removing the balance item and releasing mut_ex_op testnset lock.) Bring it back. Reported-by: David Sterba Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9ff454df675..6b9cff42265 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3235,6 +3235,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl, update_ioctl_balance_args(fs_info, 0, bargs); } + if ((ret && ret != -ECANCELED && ret != -ENOSPC) || + balance_need_close(fs_info)) { + __cancel_balance(fs_info); + } + wake_up(&fs_info->balance_wait_q); return ret; -- cgit v1.2.3-70-g09d2 From 9141770548d529b9d32d5b08d59b65ee65afe0d4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 7 Mar 2013 01:08:55 -0800 Subject: fs: Limit sys_mount to only request filesystem modules (Part 2). Add missing MODULE_ALIAS_FS("ocfs2") how did I miss that? Remove unnecessary MODULE_ALIAS_FS("devpts") devpts can not be modular. Signed-off-by: "Eric W. Biederman" --- fs/devpts/inode.c | 1 - fs/ocfs2/super.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 79b662985ef..073d30b9d1a 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -510,7 +510,6 @@ static struct file_system_type devpts_fs_type = { .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, #endif }; -MODULE_ALIAS_FS("devpts"); /* * The normal naming convention is simply /dev/pts/; this conforms diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 9b6910dec4b..01b85165552 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1266,6 +1266,7 @@ static struct file_system_type ocfs2_fs_type = { .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; +MODULE_ALIAS_FS("ocfs2"); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options) -- cgit v1.2.3-70-g09d2 From de3cb945db4d8eb3b046dc7a5ea89a893372750c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 4 Mar 2013 17:13:31 -0500 Subject: Btrfs: improve the delayed inode throttling The delayed inode code batches up changes to the btree in hopes of doing them in bulk. As the changes build up, processes kick off worker threads and wait for them to make progress. The current code kicks off an async work queue item for each delayed node, which creates a lot of churn. It also uses a fixed 1 HZ waiting period for the throttle, which allows us to build a lot of pending work and can slow down the commit. This changes us to watch a sequence counter as it is bumped during the operations. We kick off fewer work items and have each work item do more work. Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 151 ++++++++++++++++++++++++++++------------------- fs/btrfs/delayed-inode.h | 2 + 2 files changed, 92 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b278b117cb..14fce27b478 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -22,8 +22,9 @@ #include "disk-io.h" #include "transaction.h" -#define BTRFS_DELAYED_WRITEBACK 400 -#define BTRFS_DELAYED_BACKGROUND 100 +#define BTRFS_DELAYED_WRITEBACK 512 +#define BTRFS_DELAYED_BACKGROUND 128 +#define BTRFS_DELAYED_BATCH 16 static struct kmem_cache *delayed_node_cache; @@ -494,6 +495,15 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, BTRFS_DELAYED_DELETION_ITEM); } +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ + int seq = atomic_inc_return(&delayed_root->items_seq); + if ((atomic_dec_return(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && + waitqueue_active(&delayed_root->wait)) + wake_up(&delayed_root->wait); +} + static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { struct rb_root *root; @@ -512,10 +522,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase(&delayed_item->rb_node, root); delayed_item->delayed_node->count--; - if (atomic_dec_return(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND && - waitqueue_active(&delayed_root->wait)) - wake_up(&delayed_root->wait); + + finish_one_item(delayed_root); } static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) @@ -1056,10 +1064,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) delayed_node->count--; delayed_root = delayed_node->root->fs_info->delayed_root; - if (atomic_dec_return(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND && - waitqueue_active(&delayed_root->wait)) - wake_up(&delayed_root->wait); + finish_one_item(delayed_root); } } @@ -1304,35 +1309,44 @@ void btrfs_remove_delayed_node(struct inode *inode) btrfs_release_delayed_node(delayed_node); } -struct btrfs_async_delayed_node { - struct btrfs_root *root; - struct btrfs_delayed_node *delayed_node; +struct btrfs_async_delayed_work { + struct btrfs_delayed_root *delayed_root; + int nr; struct btrfs_work work; }; -static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) +static void btrfs_async_run_delayed_root(struct btrfs_work *work) { - struct btrfs_async_delayed_node *async_node; + struct btrfs_async_delayed_work *async_work; + struct btrfs_delayed_root *delayed_root; struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; - int need_requeue = 0; + int total_done = 0; - async_node = container_of(work, struct btrfs_async_delayed_node, work); + async_work = container_of(work, struct btrfs_async_delayed_work, work); + delayed_root = async_work->delayed_root; path = btrfs_alloc_path(); if (!path) goto out; - path->leave_spinning = 1; - delayed_node = async_node->delayed_node; +again: + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2) + goto free_path; + + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + goto free_path; + + path->leave_spinning = 1; root = delayed_node->root; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) - goto free_path; + goto release_path; block_rsv = trans->block_rsv; trans->block_rsv = &root->fs_info->delayed_block_rsv; @@ -1363,57 +1377,47 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) * Task1 will sleep until the transaction is commited. */ mutex_lock(&delayed_node->mutex); - if (delayed_node->count) - need_requeue = 1; - else - btrfs_dequeue_delayed_node(root->fs_info->delayed_root, - delayed_node); + btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); btrfs_btree_balance_dirty_nodelay(root); + +release_path: + btrfs_release_path(path); + total_done++; + + btrfs_release_prepared_delayed_node(delayed_node); + if (async_work->nr == 0 || total_done < async_work->nr) + goto again; + free_path: btrfs_free_path(path); out: - if (need_requeue) - btrfs_requeue_work(&async_node->work); - else { - btrfs_release_prepared_delayed_node(delayed_node); - kfree(async_node); - } + wake_up(&delayed_root->wait); + kfree(async_work); } + static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, - struct btrfs_root *root, int all) + struct btrfs_root *root, int nr) { - struct btrfs_async_delayed_node *async_node; - struct btrfs_delayed_node *curr; - int count = 0; + struct btrfs_async_delayed_work *async_work; -again: - curr = btrfs_first_prepared_delayed_node(delayed_root); - if (!curr) + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return 0; - async_node = kmalloc(sizeof(*async_node), GFP_NOFS); - if (!async_node) { - btrfs_release_prepared_delayed_node(curr); + async_work = kmalloc(sizeof(*async_work), GFP_NOFS); + if (!async_work) return -ENOMEM; - } - - async_node->root = root; - async_node->delayed_node = curr; - - async_node->work.func = btrfs_async_run_delayed_node_done; - async_node->work.flags = 0; - btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work); - count++; - - if (all || count < 4) - goto again; + async_work->delayed_root = delayed_root; + async_work->work.func = btrfs_async_run_delayed_root; + async_work->work.flags = 0; + async_work->nr = nr; + btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); return 0; } @@ -1424,30 +1428,55 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root) WARN_ON(btrfs_first_delayed_node(delayed_root)); } +static int refs_newer(struct btrfs_delayed_root *delayed_root, + int seq, int count) +{ + int val = atomic_read(&delayed_root->items_seq); + + if (val < seq || val >= seq + count) + return 1; + return 0; +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; + int seq; delayed_root = btrfs_get_delayed_root(root); if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return; + seq = atomic_read(&delayed_root->items_seq); + if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { int ret; - ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); + DEFINE_WAIT(__wait); + + ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); if (ret) return; - wait_event_interruptible_timeout( - delayed_root->wait, - (atomic_read(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND), - HZ); - return; + while (1) { + prepare_to_wait(&delayed_root->wait, &__wait, + TASK_INTERRUPTIBLE); + + if (refs_newer(delayed_root, seq, + BTRFS_DELAYED_BATCH) || + atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND) { + break; + } + if (!signal_pending(current)) + schedule(); + else + break; + } + finish_wait(&delayed_root->wait, &__wait); } - btrfs_wq_run_delayed_node(delayed_root, root, 0); + btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); } /* Will return 0 or -ENOMEM */ diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 78b6ad0fc66..1d5c5f7abe3 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -43,6 +43,7 @@ struct btrfs_delayed_root { */ struct list_head prepare_list; atomic_t items; /* for delayed items */ + atomic_t items_seq; /* for delayed items */ int nodes; /* for delayed nodes */ wait_queue_head_t wait; }; @@ -86,6 +87,7 @@ static inline void btrfs_init_delayed_root( struct btrfs_delayed_root *delayed_root) { atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); delayed_root->nodes = 0; spin_lock_init(&delayed_root->lock); init_waitqueue_head(&delayed_root->wait); -- cgit v1.2.3-70-g09d2 From 7b54c165a0c012edbaeaa73c5c87cb73721eb580 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 8 Mar 2013 09:03:07 -0800 Subject: vfs: don't BUG_ON() if following a /proc fd pseudo-symlink results in a symlink It's "normal" - it can happen if the file descriptor you followed was opened with O_NOFOLLOW. Reported-by: Dave Jones Cc: Al Viro Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/namei.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 961bc126836..57ae9c8c66b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -689,8 +689,6 @@ void nd_jump_link(struct nameidata *nd, struct path *path) nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->flags |= LOOKUP_JUMPED; - - BUG_ON(nd->inode->i_op->follow_link); } static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) -- cgit v1.2.3-70-g09d2 From db04dc679bcc780ad6907943afe24a30de974a1b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 9 Mar 2013 00:14:45 -0800 Subject: proc: Use nd_jump_link in proc_ns_follow_link Update proc_ns_follow_link to use nd_jump_link instead of just manually updating nd.path.dentry. This fixes the BUG_ON(nd->inode != parent->d_inode) reported by Dave Jones and reproduced trivially with mkdir /proc/self/ns/uts/a. Sigh it looks like the VFS change to require use of nd_jump_link happend while proc_ns_follow_link was baking and since the common case of proc_ns_follow_link continued to work without problems the need for making this change was overlooked. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- fs/proc/namespaces.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index b7a47196c8c..66b51c0383d 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -118,7 +118,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) struct super_block *sb = inode->i_sb; struct proc_inode *ei = PROC_I(inode); struct task_struct *task; - struct dentry *ns_dentry; + struct path ns_path; void *error = ERR_PTR(-EACCES); task = get_proc_task(inode); @@ -128,14 +128,14 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); - if (IS_ERR(ns_dentry)) { - error = ERR_CAST(ns_dentry); + ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); + if (IS_ERR(ns_path.dentry)) { + error = ERR_CAST(ns_path.dentry); goto out_put_task; } - dput(nd->path.dentry); - nd->path.dentry = ns_dentry; + ns_path.mnt = mntget(nd->path.mnt); + nd_jump_link(nd, &ns_path); error = NULL; out_put_task: -- cgit v1.2.3-70-g09d2 From bd384364c1185ecb01f3b8242c915ccb5921c60d Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 20:48:59 -0400 Subject: ext4: avoid a potential overflow in ext4_es_can_be_merged() Check the length of an extent to avoid a potential overflow in ext4_es_can_be_merged(). Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents_status.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 95796a1b752..37f9a2d8fd0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -333,17 +333,27 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { - if (es1->es_lblk + es1->es_len != es2->es_lblk) + if (ext4_es_status(es1) != ext4_es_status(es2)) return 0; - if (ext4_es_status(es1) != ext4_es_status(es2)) + if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) return 0; - if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && - (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) return 0; - return 1; + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent is without unwritten status */ + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + return 1; + + return 0; } static struct extent_status * -- cgit v1.2.3-70-g09d2 From 921f266bc6bfe6ebb599c559f10443af314c19ec Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 10 Mar 2013 21:01:03 -0400 Subject: ext4: add self-testing infrastructure to do a sanity check This commit adds a self-testing infrastructure like extent tree does to do a sanity check for extent status tree. After status tree is as a extent cache, we'd better to make sure that it caches right result. After applied this commit, we will get a lot of messages when we run xfstests as below. ... kernel: ES len assertation failed for inode: 230 retval 1 != map->m_len 3 in ext4_map_blocks (allocation) ... kernel: ES cache assertation failed for inode: 230 es_cached ex [974/2/4781/20] != found ex [974/1/4781/1000] ... kernel: ES insert assertation failed for inode: 635 ex_status [0/45/21388/w] != es_status [44/1/21432/u] ... Signed-off-by: Dmitry Monakhov Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 6 ++ fs/ext4/inode.c | 96 ++++++++++++++++++++++++++ 3 files changed, 277 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 37f9a2d8fd0..d2a8cb74676 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -399,6 +399,179 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) return es; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_ext_find_extent(inode, es->es_lblk, NULL); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ? 1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. + */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertation failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add an delayed/hole extent " + "[%d/%d/%llu/%llx]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. + */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add an written/unwritten extent " + "[%d/%d/%llu/%llx]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertation failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertation failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertation failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. + */ + BUG_ON(1); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "We can't find the block but we want to add " + "an written extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. + */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ +} +#endif + static int __es_insert_extent(struct inode *inode, struct extent_status *newes) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; @@ -481,6 +654,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_store_status(&newes, status); trace_ext4_es_insert_extent(inode, &newes); + ext4_es_insert_extent_check(inode, &newes); + write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end); if (err != 0) diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f190dfe969d..56140ad4150 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -20,6 +20,12 @@ #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + /* * These flags live in the high bits of extent_status.es_pblk */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 95a0c62c568..3186a43fa4b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -482,6 +482,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) +{ + int retval; + + map->m_flags = 0; + /* + * There is a race window that the result is not the same. + * e.g. xfstests #223 when dioread_nolock enables. The reason + * is that we lookup a block mapping in extent status tree with + * out taking i_data_sem. So at the time the unwritten extent + * could be converted. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + /* + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag + * because it shouldn't be marked in es_map->m_flags. + */ + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); + + /* + * We don't check m_len because extent will be collpased in status + * tree. So the m_len might not equal. + */ + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertation failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); + } +} +#endif /* ES_AGGRESSIVE_TEST */ + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -509,6 +561,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif map->m_flags = 0; ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," @@ -531,6 +588,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } else { BUG_ON(1); } +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif goto found; } @@ -551,6 +612,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -643,6 +713,15 @@ found: int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (allocation)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -1768,6 +1847,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; @@ -1809,6 +1893,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, else BUG_ON(1); +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif return retval; } @@ -1873,6 +1960,15 @@ add_delayed: int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, -- cgit v1.2.3-70-g09d2 From cdee78433c138c2f2018a6884673739af2634787 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:08:52 -0400 Subject: ext4: fix wrong m_len value after unwritten extent conversion The ext4_ext_handle_uninitialized_extents() function was assuming the return value of ext4_ext_map_blocks() is equal to map->m_len. This incorrect assumption was harmless until we started use status tree as a extent cache because we need to update status tree according to 'm_len' value. Meanwhile this commit marks EXT4_MAP_MAPPED flag after unwritten extent conversion. It shouldn't cause a bug because we update status tree according to checking EXT4_MAP_UNWRITTEN flag. But it should be fixed. After applied this commit, the following error message from self-testing infrastructure disappears. ... kernel: ES len assertation failed for inode: 230 retval 1 != map->m_len 3 in ext4_map_blocks (allocation) ... Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 25c86aaa38d..110e85a1f82 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3650,6 +3650,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path, map->m_len); } else err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; goto out2; } /* buffered IO case */ -- cgit v1.2.3-70-g09d2 From adb2355104b2109e06ba5276485d187d023b2fd2 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:13:05 -0400 Subject: ext4: update extent status tree after an extent is zeroed out When we try to split an extent, this extent could be zeroed out and mark as initialized. But we don't know this in ext4_map_blocks because it only returns a length of allocated extent. Meanwhile we will mark this extent as uninitialized because we only check m_flags. This commit update extent status tree when we try to split an unwritten extent. We don't need to worry about the status of this extent because we always mark it as initialized. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents.c | 35 +++++++++++++++++++++++++++++++---- fs/ext4/extents_status.c | 17 +++++++++++++++++ fs/ext4/extents_status.h | 3 +++ fs/ext4/inode.c | 10 ++++++++++ 4 files changed, 61 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 110e85a1f82..7e37018d175 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2925,7 +2925,7 @@ static int ext4_split_extent_at(handle_t *handle, { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; @@ -2996,12 +2996,26 @@ static int ext4_split_extent_at(handle_t *handle, err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { - if (split_flag & EXT4_EXT_DATA_VALID1) + if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); - else + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex2); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { err = ext4_ext_zeroout(inode, ex); - } else + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } if (err) goto fix_extent_len; @@ -3009,6 +3023,12 @@ static int ext4_split_extent_at(handle_t *handle, ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_es_zeroout(inode, &zero_ex); + goto out; } else if (err) goto fix_extent_len; @@ -3150,6 +3170,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3247,6 +3268,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = ext4_ext_zeroout(inode, ex); if (err) goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3305,6 +3329,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = allocated; out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_es_zeroout(inode, &zero_ex); return err ? err : allocated; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d2a8cb74676..fe3337a85ed 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -854,6 +854,23 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } +int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 56140ad4150..d8e2d4dc311 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -39,6 +39,8 @@ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +struct ext4_extent; + struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ @@ -64,6 +66,7 @@ extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); static inline int ext4_es_is_written(struct extent_status *es) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3186a43fa4b..4f1d54a88d8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -722,6 +722,15 @@ found: } #endif + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es)) + goto has_zeroout; + } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -734,6 +743,7 @@ found: retval = ret; } +has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); -- cgit v1.2.3-70-g09d2 From 3a2256702e47f68f921dfad41b1764d05c572329 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:20:23 -0400 Subject: ext4: fix the wrong number of the allocated blocks in ext4_split_extent() This commit fixes a wrong return value of the number of the allocated blocks in ext4_split_extent. When the length of blocks we want to allocate is greater than the length of the current extent, we return a wrong number. Let's see what happens in the following case when we call ext4_split_extent(). map: [48, 72] ex: [32, 64, u] 'ex' will be split into two parts: ex1: [32, 47, u] ex2: [48, 64, w] 'map->m_len' is returned from this function, and the value is 24. But the real length is 16. So it should be fixed. Meanwhile in this commit we use right length of the allocated blocks when get_reserved_cluster_alloc in ext4_ext_handle_uninitialized_extents is called. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7e37018d175..69df02ff96a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3067,6 +3067,7 @@ static int ext4_split_extent(handle_t *handle, int err = 0; int uninitialized; int split_flag1, flags1; + int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3086,6 +3087,8 @@ static int ext4_split_extent(handle_t *handle, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; + } else { + allocated = ee_len - (map->m_lblk - ee_block); } /* * Update path is required because previous ext4_split_extent_at() may @@ -3115,7 +3118,7 @@ static int ext4_split_extent(handle_t *handle, ext4_ext_show_leaf(inode, path); out: - return err ? err : map->m_len; + return err ? err : allocated; } /* @@ -3730,6 +3733,7 @@ out: allocated - map->m_len); allocated = map->m_len; } + map->m_len = allocated; /* * If we have done fallocate with the offset that is already -- cgit v1.2.3-70-g09d2 From e1c36595bedc2e1b4112f01256cb30f4d9f9ae46 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 10 Mar 2013 22:19:00 -0400 Subject: ext4: fix WARN_ON from ext4_releasepage() ext4_releasepage() warns when it is passed a page with PageChecked set. However this can correctly happen when invalidate_inode_pages2_range() invalidates pages - and we should fail the release in that case. Since the page was dirty anyway, it won't be discarded and no harm has happened but it's good to be safe. Also remove bogus page_has_buffers() check - we are guaranteed page has buffers in this function. Reported-by: Zheng Liu Tested-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Signed-off-by: Jan Kara --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4f1d54a88d8..117a9e7aa4a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3018,8 +3018,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) trace_ext4_releasepage(page); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) + /* Page has dirty journalled data -> cannot release */ + if (PageChecked(page)) return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, wait); -- cgit v1.2.3-70-g09d2 From e3d85c366089015805f175324bb1780249f44669 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:21:49 -0400 Subject: ext4: remove unused variable in ext4_free_blocks() Remove unused variable 'freed' in ext4_free_blocks(). Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7bb713a46fe..75e05f3a730 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4464,7 +4464,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; - unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -4672,8 +4671,6 @@ do_more: ext4_mb_unload_buddy(&e4b); - freed += count; - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); -- cgit v1.2.3-70-g09d2 From bb8b20ed94bc69120e31399c43cb336300dea109 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:28:09 -0400 Subject: ext4: do not use yield() Using yield() is strongly discouraged (see sched/core.c) especially since we can just use cond_resched(). Replace all use of yield() with cond_resched(). Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.c | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 117a9e7aa4a..48fc023ab0a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1352,7 +1352,7 @@ repeat: ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - yield(); + cond_resched(); goto repeat; } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 75e05f3a730..8b2ea9f7500 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3692,11 +3692,7 @@ repeat: if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); + cond_resched(); goto repeat; } @@ -4246,7 +4242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ - yield(); + cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { -- cgit v1.2.3-70-g09d2 From 232ec8720d4e45405e37144c67053042c6b886d3 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:46:30 -0400 Subject: ext4: update reserved space after the 'correction' Currently in ext4_ext_map_blocks() in delayed allocation writeback we would update the reservation and after that check whether we claimed cluster outside of the range of the allocation and if so, we'll give the block back to the reservation pool. However this also means that if the number of reserved data block dropped to zero before the correction, we would release all the metadata reservation as well, however we might still need it because the we're not done with the delayed allocation and there might be more blocks to come. This will result in error messages such as: EXT4-fs warning (device sdb): ext4_da_update_reserve_space:361: ino 12, allocated 1 with only 0 reserved metadata blocks (releasing 1 blocks with reserved 1 data blocks) This will only happen on bigalloc file system and it can be easily reproduced using fiemap-tester from xfstests like this: ./src/fiemap-tester -m DHDHDHDHD -S -p0 /mnt/test/file Or using xfstests such as 225. Fix this by doing the correction first and updating the reservation after that so that we do not accidentally decrease i_reserved_data_blocks to zero. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 69df02ff96a..bd69e906bd9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4165,9 +4165,6 @@ got_allocated_blocks: } } else { BUG_ON(allocated_clusters < reserved_clusters); - /* We will claim quota for all newly allocated blocks.*/ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - @@ -4218,6 +4215,15 @@ got_allocated_blocks: ei->i_reserved_data_blocks += reservation; spin_unlock(&ei->i_block_reservation_lock); } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); } } -- cgit v1.2.3-70-g09d2 From 386ad67c9ac043890121c066186883d1640348a4 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:50:00 -0400 Subject: ext4: reserve metadata block for every delayed write Currently we only reserve space (data+metadata) in delayed allocation if we're allocating from new cluster (which is always in non-bigalloc file system) which is ok for data blocks, because we reserve the whole cluster. However we have to reserve metadata for every delayed block we're going to write because every block could potentially require metedata block when we need to grow the extent tree. Signed-off-by: Lukas Czerner --- fs/ext4/inode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 48fc023ab0a..65bbc9339ac 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1304,6 +1304,55 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } +/* + * Reserve a metadata for a single block located at lblock + */ +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + ext4_lblk_t save_last_lblock; + int save_len; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + cond_resched(); + goto repeat; + } + return -ENOSPC; + } + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + /* * Reserve a single cluster located at lblock */ @@ -1940,8 +1989,11 @@ add_delayed: * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ + /* + * If the block was allocated from previously allocated cluster, + * then we don't need to reserve it again. However we still need + * to reserve metadata for every block we're going to write. + */ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { ret = ext4_da_reserve_space(inode, iblock); if (ret) { @@ -1949,6 +2001,13 @@ add_delayed: retval = ret; goto out_unlock; } + } else { + ret = ext4_da_reserve_metadata(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } } ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, -- cgit v1.2.3-70-g09d2 From bc077320f8f6449197f4010d6774afab7bb998b2 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 20 Oct 2012 12:02:59 +0200 Subject: hostfs: fix a not needed double check With the commit 3be2be0a32c18b0fd6d623cda63174a332ca0de1 we removed vmtruncate, but actaully there is no need to call inode_newsize_ok() because the checks are already done in inode_change_ok() at the begin of the function. Signed-off-by: Marco Stornelli Signed-off-by: Richard Weinberger --- fs/hostfs/hostfs_kern.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index fbabb906066..178b90c229b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -845,15 +845,8 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = inode_newsize_ok(inode, attr->ia_size); - if (error) - return error; - + attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - } setattr_copy(inode, attr); mark_inode_dirty(inode); -- cgit v1.2.3-70-g09d2 From 3e64fe5b21852375f2f53e7244ba697f1fee2fcf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 11 Mar 2013 07:05:42 -0700 Subject: fs: Limit sys_mount to only request filesystem modules. (Part 3) Somehow I failed to add the MODULE_ALIAS_FS for cifs, hostfs, hpfs, squashfs, and udf despite what I thought were my careful checks :( Add them now. Signed-off-by: "Eric W. Biederman" --- fs/cifs/cifsfs.c | 1 + fs/hostfs/hostfs_kern.c | 1 + fs/hpfs/super.c | 1 + fs/squashfs/super.c | 1 + fs/udf/super.c | 1 + 5 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1a052c0eee8..3cf8a15af91 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -777,6 +777,7 @@ struct file_system_type cifs_fs_type = { .kill_sb = cifs_kill_sb, /* .fs_flags */ }; +MODULE_ALIAS_FS("cifs"); const struct inode_operations cifs_dir_inode_ops = { .create = cifs_create, .atomic_open = cifs_atomic_open, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index fbabb906066..e3c6d504a2b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -993,6 +993,7 @@ static struct file_system_type hostfs_type = { .kill_sb = hostfs_kill_sb, .fs_flags = 0, }; +MODULE_ALIAS_FS("hostfs"); static int __init init_hostfs(void) { diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a3076228523..a0617e70695 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -688,6 +688,7 @@ static struct file_system_type hpfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("hpfs"); static int __init init_hpfs_fs(void) { diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 260e3928d4f..60553a9053c 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -489,6 +489,7 @@ static struct file_system_type squashfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV }; +MODULE_ALIAS_FS("squashfs"); static const struct super_operations squashfs_super_ops = { .alloc_inode = squashfs_alloc_inode, diff --git a/fs/udf/super.c b/fs/udf/super.c index bc5b30a819e..9ac4057a86c 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -118,6 +118,7 @@ static struct file_system_type udf_fstype = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("udf"); static struct kmem_cache *udf_inode_cachep; -- cgit v1.2.3-70-g09d2 From ad56edad089b56300fd13bb9eeb7d0424d978239 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Mar 2013 13:24:56 -0400 Subject: jbd2: fix use after free in jbd2_journal_dirty_metadata() jbd2_journal_dirty_metadata() didn't get a reference to journal_head it was working with. This is OK in most of the cases since the journal head should be attached to a transaction but in rare occasions when we are journalling data, __ext4_journalled_writepage() can race with jbd2_journal_invalidatepage() stripping buffers from a page and thus journal head can be freed under hands of jbd2_journal_dirty_metadata(). Fix the problem by getting own journal head reference in jbd2_journal_dirty_metadata() (and also in jbd2_journal_set_triggers() which can possibly have the same issue). Reported-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/jbd2/transaction.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6ee5aed56b..325bc019ed8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1065,9 +1065,12 @@ out: void jbd2_journal_set_triggers(struct buffer_head *bh, struct jbd2_buffer_trigger_type *type) { - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); + if (WARN_ON(!jh)) + return; jh->b_triggers = type; + jbd2_journal_put_journal_head(jh); } void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, @@ -1119,17 +1122,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh; int ret = 0; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; - if (!buffer_jbd(bh)) { + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { ret = -EUCLEAN; goto out; } + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); jbd_lock_bh_state(bh); @@ -1220,6 +1224,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); + jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); WARN_ON(ret); /* All errors are bugs, so dump the stack */ -- cgit v1.2.3-70-g09d2 From 68ac8bfb6aa4ce1a146b5a291f03fc2dd6d17cf2 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 27 Feb 2013 14:43:09 -0500 Subject: quota: add missing use of dq_data_lock in __dquot_initialize The bulk of __dquot_initialize runs under the dqptr_sem which protects the inode->i_dquot pointers. It doesn't protect the dereferenced contents, though. Those are protected by the dq_data_lock, which is missing around the dquot_resv_space call. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara --- fs/quota/dquot.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 05ae3c97f7a..3e64169ef52 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1439,8 +1439,11 @@ static void __dquot_initialize(struct inode *inode, int type) * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); - if (unlikely(rsv)) + if (unlikely(rsv)) { + spin_lock(&dq_data_lock); dquot_resv_space(inode->i_dquot[cnt], rsv); + spin_unlock(&dq_data_lock); + } } } out_err: -- cgit v1.2.3-70-g09d2 From 8d0c2d10dd72c5292eda7a06231056a4c972e4cc Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 9 Mar 2013 15:28:44 +0100 Subject: ext3: Fix format string issues ext3_msg() takes the printk prefix as the second parameter and the format string as the third parameter. Two callers of ext3_msg omit the prefix and pass the format string as the second parameter and the first parameter to the format string as the third parameter. In both cases this string comes from an arbitrary source. Which means the string may contain format string characters, which will lead to undefined and potentially harmful behavior. The issue was introduced in commit 4cf46b67eb("ext3: Unify log messages in ext3") and is fixed by this patch. CC: stable@vger.kernel.org Signed-off-by: Lars-Peter Clausen Signed-off-by: Jan Kara --- fs/ext3/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 1d6e2ed8532..fb5120a5505 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -353,7 +353,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) return bdev; fail: - ext3_msg(sb, "error: failed to open journal device %s: %ld", + ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld", __bdevname(dev, b), PTR_ERR(bdev)); return NULL; @@ -887,7 +887,7 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) /*todo: use simple_strtoll with >32bit ext3 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { - ext3_msg(sb, "error: invalid sb specification: %s", + ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s", (char *) *data); return 1; } -- cgit v1.2.3-70-g09d2 From af591ad896ef75585752ac2eab4fba9437f23322 Mon Sep 17 00:00:00 2001 From: Ionut-Gabriel Radu Date: Sun, 10 Mar 2013 15:06:23 +0200 Subject: reiserfs: Use kstrdup instead of kmalloc/strcpy Signed-off-by: Ionut-Gabriel Radu Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 194113b1b11..f8a23c3078f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1147,8 +1147,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "on filesystem root."); return 0; } - qf_names[qtype] = - kmalloc(strlen(arg) + 1, GFP_KERNEL); + qf_names[qtype] = kstrdup(arg, GFP_KERNEL); if (!qf_names[qtype]) { reiserfs_warning(s, "reiserfs-2502", "not enough memory " @@ -1156,7 +1155,6 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "quotafile name."); return 0; } - strcpy(qf_names[qtype], arg); if (qtype == USRQUOTA) *mount_options |= 1 << REISERFS_USRQUOTA; else -- cgit v1.2.3-70-g09d2 From 90ba983f6889e65a3b506b30dc606aa9d1d46cd2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 11 Mar 2013 23:39:59 -0400 Subject: ext4: use atomic64_t for the per-flexbg free_clusters count A user who was using a 8TB+ file system and with a very large flexbg size (> 65536) could cause the atomic_t used in the struct flex_groups to overflow. This was detected by PaX security patchset: http://forums.grsecurity.net/viewtopic.php?f=3&t=3289&p=12551#p12551 This bug was introduced in commit 9f24e4208f7e, so it's been around since 2.6.30. :-( Fix this by using an atomic64_t for struct orlav_stats's free_clusters. Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 6 +++--- fs/ext4/ialloc.c | 4 ++-- fs/ext4/mballoc.c | 12 ++++++------ fs/ext4/resize.c | 4 ++-- fs/ext4/super.c | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4a01ba31526..167ff564bbf 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -335,9 +335,9 @@ struct ext4_group_desc */ struct flex_groups { - atomic_t free_inodes; - atomic_t free_clusters; - atomic_t used_dirs; + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 32fd2b9075d..6c5bb8d993f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -324,8 +324,8 @@ error_return: } struct orlov_stats { + __u64 free_clusters; __u32 free_inodes; - __u32 free_clusters; __u32 used_dirs; }; @@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8b2ea9f7500..ee6614bdb63 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2804,8 +2804,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4661,8 +4661,8 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count_clusters, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); @@ -4804,8 +4804,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(EXT4_NUM_B2C(sbi, blocks_freed), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b2c8ee56eb9..c169477a62c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1360,8 +1360,8 @@ static void ext4_update_super(struct super_block *sb, sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); - atomic_add(EXT4_NUM_B2C(sbi, free_blocks), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &sbi->s_flex_groups[flex_group].free_inodes); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9379b7fbfd9..d1ee6a84338 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1923,8 +1923,8 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_group_clusters(sb, gdp), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } -- cgit v1.2.3-70-g09d2 From a930d8790552658140d7d0d2e316af4f0d76a512 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 12 Mar 2013 02:59:49 +0000 Subject: vfs: fix pipe counter breakage If you open a pipe for neither read nor write, the pipe code will not add any usage counters to the pipe, causing the 'struct pipe_inode_info" to be potentially released early. That doesn't normally matter, since you cannot actually use the pipe, but the pipe release code - particularly fasync handling - still expects the actual pipe infrastructure to all be there. And rather than adding NULL pointer checks, let's just disallow this case, the same way we already do for the named pipe ("fifo") case. This is ancient going back to pre-2.4 days, and until trinity, nobody naver noticed. Reported-by: Dave Jones Signed-off-by: Linus Torvalds --- fs/pipe.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 64a494cef0a..2234f3f61f8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -863,6 +863,9 @@ pipe_rdwr_open(struct inode *inode, struct file *filp) { int ret = -ENOENT; + if (!(filp->f_mode & (FMODE_READ|FMODE_WRITE))) + return -EINVAL; + mutex_lock(&inode->i_mutex); if (inode->i_pipe) { -- cgit v1.2.3-70-g09d2 From 4f42f80a8f08d4c3f52c4267361241885d5dee3a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 12 Mar 2013 12:40:04 -0400 Subject: ext4: use s_extent_max_zeroout_kb value as number of kb Currently when converting extent to initialized, we have to decide whether to zeroout part/all of the uninitialized extent in order to avoid extent tree growing rapidly. The decision is made by comparing the size of the extent with the configurable value s_extent_max_zeroout_kb which is in kibibytes units. However when converting it to number of blocks we currently use it as it was in bytes. This is obviously bug and it will result in ext4 _never_ zeroout extents, but rather always split and convert parts to initialized while leaving the rest uninitialized in default setting. Fix this by using s_extent_max_zeroout_kb as kibibytes. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bd69e906bd9..e2bb929bea9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3264,7 +3264,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> - inode->i_sb->s_blocksize_bits; + (inode->i_sb->s_blocksize_bits - 10); /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { -- cgit v1.2.3-70-g09d2 From 8aec0f5d4137532de14e6554fd5dd201ff3a3c49 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 25 Feb 2013 10:20:36 -0500 Subject: Fix: compat_rw_copy_check_uvector() misuse in aio, readv, writev, and security keys Looking at mm/process_vm_access.c:process_vm_rw() and comparing it to compat_process_vm_rw() shows that the compatibility code requires an explicit "access_ok()" check before calling compat_rw_copy_check_uvector(). The same difference seems to appear when we compare fs/read_write.c:do_readv_writev() to fs/compat.c:compat_do_readv_writev(). This subtle difference between the compat and non-compat requirements should probably be debated, as it seems to be error-prone. In fact, there are two others sites that use this function in the Linux kernel, and they both seem to get it wrong: Now shifting our attention to fs/aio.c, we see that aio_setup_iocb() also ends up calling compat_rw_copy_check_uvector() through aio_setup_vectored_rw(). Unfortunately, the access_ok() check appears to be missing. Same situation for security/keys/compat.c:compat_keyctl_instantiate_key_iov(). I propose that we add the access_ok() check directly into compat_rw_copy_check_uvector(), so callers don't have to worry about it, and it therefore makes the compat call code similar to its non-compat counterpart. Place the access_ok() check in the same location where copy_from_user() can trigger a -EFAULT error in the non-compat code, so the ABI behaviors are alike on both compat and non-compat. While we are here, fix compat_do_readv_writev() so it checks for compat_rw_copy_check_uvector() negative return values. And also, fix a memory leak in compat_keyctl_instantiate_key_iov() error handling. Acked-by: Linus Torvalds Acked-by: Al Viro Signed-off-by: Mathieu Desnoyers Signed-off-by: Linus Torvalds --- fs/compat.c | 15 +++++++-------- mm/process_vm_access.c | 8 -------- security/keys/compat.c | 4 ++-- 3 files changed, 9 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index fe40fde2911..d487985dd0e 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -558,6 +558,10 @@ ssize_t compat_rw_copy_check_uvector(int type, } *ret_pointer = iov; + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + /* * Single unix specification: * We should -EINVAL if an element length is not >= 0 and fitting an @@ -1080,17 +1084,12 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (!file->f_op) goto out; - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; - - tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, + ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, UIO_FASTIOV, iovstack, &iov); - if (tot_len == 0) { - ret = 0; + if (ret <= 0) goto out; - } + tot_len = ret; ret = rw_verify_area(type, file, pos, tot_len); if (ret < 0) goto out; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 926b4664974..fd26d043350 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -429,12 +429,6 @@ compat_process_vm_rw(compat_pid_t pid, if (flags != 0) return -EINVAL; - if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) - goto out; - - if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) - goto out; - if (vm_write) rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, iovstack_l, @@ -459,8 +453,6 @@ free_iovecs: kfree(iov_r); if (iov_l != iovstack_l) kfree(iov_l); - -out: return rc; } diff --git a/security/keys/compat.c b/security/keys/compat.c index 1c261763f47..d65fa7fa29b 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -40,12 +40,12 @@ static long compat_keyctl_instantiate_key_iov( ARRAY_SIZE(iovstack), iovstack, &iov); if (ret < 0) - return ret; + goto err; if (ret == 0) goto no_payload_free; ret = keyctl_instantiate_key_common(id, iov, ioc, ret, ringid); - +err: if (iov != iovstack) kfree(iov); return ret; -- cgit v1.2.3-70-g09d2 From fa7614ddd6c2368b8cd54cc67ab4b767af0a2a50 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 12 Mar 2013 18:27:41 -0700 Subject: fs: Readd the fs module aliases. I had assumed that the only use of module aliases for filesystems prior to "fs: Limit sys_mount to only request filesystem modules." was in request_module. It turns out I was wrong. At least mkinitcpio in Arch linux uses these aliases. So readd the preexising aliases, to keep from breaking userspace. Userspace eventually will have to follow and use the same aliases the kernel does. So at some point we may be delete these aliases without problems. However that day is not today. Signed-off-by: "Eric W. Biederman" --- fs/ext4/super.c | 2 ++ fs/freevxfs/vxfs_super.c | 1 + fs/isofs/inode.c | 1 + fs/nfs/super.c | 1 + fs/sysv/super.c | 1 + net/sunrpc/rpc_pipe.c | 1 + 6 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 34e85521923..b3818b48f41 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -91,6 +91,7 @@ static struct file_system_type ext2_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); +MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) @@ -106,6 +107,7 @@ static struct file_system_type ext3_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); +MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) #else #define IS_EXT3_SB(sb) (0) diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 45507430806..e37eb274e49 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -258,6 +258,7 @@ static struct file_system_type vxfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */ +MODULE_ALIAS("vxfs"); static int __init vxfs_init(void) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index a67f16e846a..d9b8aebdeb2 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1557,6 +1557,7 @@ static struct file_system_type iso9660_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("iso9660"); +MODULE_ALIAS("iso9660"); static int __init init_iso9660_fs(void) { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 95cdcb208df..2f8a29db0f1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -335,6 +335,7 @@ struct file_system_type nfs4_fs_type = { .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; MODULE_ALIAS_FS("nfs4"); +MODULE_ALIAS("nfs4"); EXPORT_SYMBOL_GPL(nfs4_fs_type); static int __init register_nfs4_fs(void) diff --git a/fs/sysv/super.c b/fs/sysv/super.c index a39938b1fee..d0c6a007ce8 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -555,6 +555,7 @@ static struct file_system_type v7_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("v7"); +MODULE_ALIAS("v7"); static int __init init_sysv_fs(void) { diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index a0f48a51e14..a9129f8d707 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1175,6 +1175,7 @@ static struct file_system_type rpc_pipe_fs_type = { .kill_sb = rpc_kill_sb, }; MODULE_ALIAS_FS("rpc_pipefs"); +MODULE_ALIAS("rpc_pipefs"); static void init_once(void *foo) -- cgit v1.2.3-70-g09d2 From c288d2969627be7ffc90904ac8c6aae0295fbf9f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 13 Mar 2013 12:57:08 +0100 Subject: ext2: Fix BUG_ON in evict() on inode deletion Commit 8e3dffc6 introduced a regression where deleting inode with large extended attributes leads to triggering BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)) in fs/inode.c:evict(). That happens because freeing of xattr block dirtied the inode and it happened after clear_inode() has been called. Fix the issue by moving removal of xattr block into ext2_evict_inode() before clear_inode() call close to a place where data blocks are truncated. That is also more logical place and removes surprising requirement that ext2_free_blocks() mustn't dirty the inode. Reported-by: Tyler Hicks Signed-off-by: Jan Kara --- fs/ext2/ialloc.c | 1 - fs/ext2/inode.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 8f370e012e6..7cadd823bb3 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -118,7 +118,6 @@ void ext2_free_inode (struct inode * inode) * as writing the quota to disk may need the lock as well. */ /* Quota is already initialized in iput() */ - ext2_xattr_delete_inode(inode); dquot_free_inode(inode); dquot_drop(inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c3881e56662..fe60cc1117d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -34,6 +34,7 @@ #include "ext2.h" #include "acl.h" #include "xip.h" +#include "xattr.h" static int __ext2_write_inode(struct inode *inode, int do_sync); @@ -88,6 +89,7 @@ void ext2_evict_inode(struct inode * inode) inode->i_size = 0; if (inode->i_blocks) ext2_truncate_blocks(inode, 0); + ext2_xattr_delete_inode(inode); } invalidate_inode_buffers(inode); -- cgit v1.2.3-70-g09d2 From 47c78f4a70d791ff44cab3254b489605a52e3181 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 11 Mar 2013 13:08:49 +0000 Subject: cifs: map NT_STATUS_SHARING_VIOLATION to EBUSY instead of ETXTBSY NT_SHARING_VIOLATION errors are mapped to ETXTBSY which is unexpected for operations such as unlink where we can hit these errors. The patch maps the error NT_SHARING_VIOLATION to EBUSY instead. The patch also replaces all instances of ETXTBSY in cifs_rename_pending_delete() with EBUSY. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 10 ++++------ fs/cifs/netmisc.c | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 0079696305c..20887bf6312 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1043,7 +1043,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_setattr; } @@ -1062,7 +1062,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, if (rc == -ENOENT) rc = 0; else if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_rename; } cifsInode->delete_pending = true; @@ -1169,15 +1169,13 @@ psx_del_no_retry: cifs_drop_nlink(inode); } else if (rc == -ENOENT) { d_drop(dentry); - } else if (rc == -ETXTBSY) { + } else if (rc == -EBUSY) { if (server->ops->rename_pending_delete) { rc = server->ops->rename_pending_delete(full_path, dentry, xid); if (rc == 0) cifs_drop_nlink(inode); } - if (rc == -ETXTBSY) - rc = -EBUSY; } else if ((rc == -EACCES) && (dosattr == 0) && inode) { attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (attrs == NULL) { @@ -1518,7 +1516,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, * source. Note that cross directory moves do not work with * rename by filehandle to various Windows servers. */ - if (rc == 0 || rc != -ETXTBSY) + if (rc == 0 || rc != -EBUSY) goto do_rename_exit; /* open-file renames don't work across directories */ diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index a82bc51fdc8..c0b25b28be6 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -62,7 +62,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, {ERRwriteprot, -EROFS}, - {ERRbadshare, -ETXTBSY}, + {ERRbadshare, -EBUSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, {ERRnosuchshare, -ENXIO}, -- cgit v1.2.3-70-g09d2 From 24261fc23db950951760d00c188ba63cc756b932 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 8 Mar 2013 16:30:03 +0100 Subject: cifs: delay super block destruction until all cifsFileInfo objects are gone cifsFileInfo objects hold references to dentries and it is possible that these will still be around in workqueues when VFS decides to kill super block during unmount. This results in panics like this one: BUG: Dentry ffff88001f5e76c0{i=66b4a,n=1M-2} still in use (1) [unmount of cifs cifs] ------------[ cut here ]------------ kernel BUG at fs/dcache.c:943! [..] Process umount (pid: 1781, threadinfo ffff88003d6e8000, task ffff880035eeaec0) [..] Call Trace: [] shrink_dcache_for_umount+0x33/0x60 [] generic_shutdown_super+0x2c/0xe0 [] kill_anon_super+0x16/0x30 [] cifs_kill_sb+0x1a/0x30 [cifs] [] deactivate_locked_super+0x57/0x80 [] deactivate_super+0x4e/0x70 [] mntput_no_expire+0xd7/0x130 [] sys_umount+0x9c/0x3c0 [] system_call_fastpath+0x16/0x1b Fix this by making each cifsFileInfo object hold a reference to cifs super block, which implicitly keeps VFS super block around as well. Signed-off-by: Mateusz Guzik Reviewed-by: Jeff Layton Cc: Reported-and-Tested-by: Ben Greear Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 24 ++++++++++++++++++++++++ fs/cifs/cifsfs.h | 4 ++++ fs/cifs/file.c | 6 +++++- 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1a052c0eee8..054b90b682a 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -91,6 +91,30 @@ struct workqueue_struct *cifsiod_wq; __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; #endif +/* + * Bumps refcount for cifs super block. + * Note that it should be only called if a referece to VFS super block is + * already held, e.g. in open-type syscalls context. Otherwise it can race with + * atomic_dec_and_test in deactivate_locked_super. + */ +void +cifs_sb_active(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void +cifs_sb_deactive(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + static int cifs_read_super(struct super_block *sb) { diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7163419cecd..0e32c3446ce 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,10 @@ extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; +/* Functions related to super block operations */ +extern void cifs_sb_active(struct super_block *sb); +extern void cifs_sb_deactive(struct super_block *sb); + /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c0d8557731..7a0dd99e450 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -300,6 +300,8 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + cifs_sb_active(inode->i_sb); + /* * If the server returned a read oplock and we have mandatory brlocks, * set oplock level to None. @@ -349,7 +351,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifsLockInfo *li, *tmp; struct cifs_fid fid; struct cifs_pending_open open; @@ -414,6 +417,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); + cifs_sb_deactive(sb); kfree(cifs_file); } -- cgit v1.2.3-70-g09d2 From 801cb2d62d5f673ac671c01397956d8b77a5215b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 14:59:36 -0700 Subject: nfsd: remove unused get_new_stid() get_new_stid() is no longer used since commit 3abdb607125 ("nfsd4: simplify idr allocation"). Remove it. Signed-off-by: Tejun Heo Acked-by: J. Bruce Fields Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 16d39c6c4fb..d91d6dbf698 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -230,37 +230,6 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) __nfs4_file_put_access(fp, oflag); } -static inline int get_new_stid(struct nfs4_stid *stid) -{ - static int min_stateid = 0; - struct idr *stateids = &stid->sc_client->cl_stateids; - int new_stid; - int error; - - error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); - /* - * Note: the necessary preallocation was done in - * nfs4_alloc_stateid(). The idr code caps the number of - * preallocations that can exist at a time, but the state lock - * prevents anyone from using ours before we get here: - */ - WARN_ON_ONCE(error); - /* - * It shouldn't be a problem to reuse an opaque stateid value. - * I don't think it is for 4.1. But with 4.0 I worry that, for - * example, a stray write retransmission could be accepted by - * the server when it should have been rejected. Therefore, - * adopt a trick from the sctp code to attempt to maximize the - * amount of time until an id is reused, by ensuring they always - * "increase" (mod INT_MAX): - */ - - min_stateid = new_stid+1; - if (min_stateid == INT_MAX) - min_stateid = 0; - return new_stid; -} - static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) { -- cgit v1.2.3-70-g09d2 From ebd6c70714f5eda9cd1b60d23754ffd1d62481f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 14:59:37 -0700 Subject: nfsd: convert to idr_alloc() idr_get_new*() and friends are about to be deprecated. Convert to the new idr_alloc() interface. Only compile-tested. Signed-off-by: Tejun Heo Acked-by: J. Bruce Fields Tested-by: J. Bruce Fields Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d91d6dbf698..2e27430b907 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -242,9 +242,8 @@ kmem_cache *slab) if (!stid) return NULL; - if (!idr_pre_get(stateids, GFP_KERNEL)) - goto out_free; - if (idr_get_new_above(stateids, stid, min_stateid, &new_id)) + new_id = idr_alloc(stateids, stid, min_stateid, 0, GFP_KERNEL); + if (new_id < 0) goto out_free; stid->sc_client = cl; stid->sc_type = 0; -- cgit v1.2.3-70-g09d2 From a09a0a705dd6c80bc96b5e6f18dc103d4e1a7d63 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 11 Mar 2013 09:20:58 +0000 Subject: Btrfs: get better concurrency for snapshot-aware defrag work Using spinning case instead of blocking will result in better concurrency overall. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 13ab4de0a40..1f26888825e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2312,6 +2312,7 @@ again: key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; + path->leave_spinning = 1; if (merge) { struct btrfs_file_extent_item *fi; u64 extent_len; @@ -2368,6 +2369,7 @@ again: btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, len); + btrfs_release_path(path); ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, @@ -2381,6 +2383,7 @@ again: ret = 1; out_free_path: btrfs_release_path(path); + path->leave_spinning = 0; btrfs_end_transaction(trans, root); out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, -- cgit v1.2.3-70-g09d2 From d340d2475c6e394013325f83f499594628a9e558 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 11 Mar 2013 09:37:45 +0000 Subject: Btrfs: remove btrfs_try_spin_lock Remove a useless function declaration Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/locking.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index ca52681e5f4..b81e0e9a489 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -26,7 +26,6 @@ void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); -int btrfs_try_spin_lock(struct extent_buffer *eb); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); -- cgit v1.2.3-70-g09d2 From bc178622d40d87e75abc131007342429c9b03351 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 9 Mar 2013 15:18:39 +0000 Subject: btrfs: use rcu_barrier() to wait for bdev puts at unmount Doing this would reliably fail with -EBUSY for me: # mount /dev/sdb2 /mnt/scratch; umount /mnt/scratch; mkfs.btrfs -f /dev/sdb2 ... unable to open /dev/sdb2: Device or resource busy because mkfs.btrfs tries to open the device O_EXCL, and somebody still has it. Using systemtap to track bdev gets & puts shows a kworker thread doing a blkdev put after mkfs attempts a get; this is left over from the unmount path: btrfs_close_devices __btrfs_close_devices call_rcu(&device->rcu, free_device); free_device INIT_WORK(&device->rcu_work, __free_device); schedule_work(&device->rcu_work); so unmount might complete before __free_device fires & does its blkdev_put. Adding an rcu_barrier() to btrfs_close_devices() causes unmount to wait until all blkdev_put()s are done, and the device is truly free once unmount completes. Cc: stable@vger.kernel.org Signed-off-by: Eric Sandeen Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6b9cff42265..5989a92236f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -684,6 +684,12 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) __btrfs_close_devices(fs_devices); free_fs_devices(fs_devices); } + /* + * Wait for rcu kworkers under __btrfs_close_devices + * to finish all blkdev_puts so device is really + * free when umount is done. + */ + rcu_barrier(); return ret; } -- cgit v1.2.3-70-g09d2 From 492104c866cb1b62a11393adccb477f5cd2c7768 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 8 Mar 2013 15:41:02 -0500 Subject: Btrfs: return EIO if we have extent tree corruption The callers of lookup_inline_extent_info all handle getting an error back properly, so return an error if we have corruption instead of being a jerk and panicing. Still WARN_ON() since this is kind of crucial and I've been seeing it a bit too much recently for my taste, I think we're doing something wrong somewhere. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index aaee2b7fee7..350b9b18140 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1467,8 +1467,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, if (ret && !insert) { err = -ENOENT; goto out; + } else if (ret) { + err = -EIO; + WARN_ON(1); + goto out; } - BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); -- cgit v1.2.3-70-g09d2 From 720f1e2060138855b4a1b1e8aa642f9c7feb6750 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 6 Mar 2013 11:51:47 +0000 Subject: Btrfs: return as soon as possible when edquot happens If one of qgroup fails to reserve firstly, we should return immediately, it is unnecessary to continue check. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index aee4b1cc3d9..5471e47d655 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1525,21 +1525,23 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + qg->rfer + num_bytes > - qg->max_rfer) + qg->max_rfer) { ret = -EDQUOT; + goto out; + } if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && qg->reserved + qg->excl + num_bytes > - qg->max_excl) + qg->max_excl) { ret = -EDQUOT; + goto out; + } list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(ulist, glist->group->qgroupid, (uintptr_t)glist->group, GFP_ATOMIC); } } - if (ret) - goto out; /* * no limits exceeded, now record the reservation into all qgroups -- cgit v1.2.3-70-g09d2 From 7c2ec3f0730729f4829d01f7c19538d135f86712 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 13 Mar 2013 07:43:03 -0600 Subject: Btrfs: fix warning when creating snapshots Creating snapshot passes extent_root to commit its transaction, but it can lead to the warning of checking root for quota in the __btrfs_end_transaction() when someone else is committing the current transaction. Since we've recorded the needed root in trans_handle, just use it to get rid of the warning. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fedede1fe17..c4a1531c52d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -626,14 +626,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - /* - * the same root has to be passed to start_transaction and - * end_transaction. Subvolume quota depends on this. - */ - WARN_ON(trans->root != root); if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); + /* + * the same root has to be passed here between start_transaction + * and end_transaction. Subvolume quota depends on this. + */ + btrfs_qgroup_free(trans->root, trans->qgroup_reserved); trans->qgroup_reserved = 0; } -- cgit v1.2.3-70-g09d2 From 3b2775942d6ccb14342f3aae55f22fbbfea8db14 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 15 Mar 2013 08:46:39 -0600 Subject: Btrfs: fix warning of free_extent_map Users report that an extent map's list is still linked when it's actually going to be freed from cache. The story is that a) when we're going to drop an extent map and may split this large one into smaller ems, and if this large one is flagged as EXTENT_FLAG_LOGGING which means that it's on the list to be logged, then the smaller ems split from it will also be flagged as EXTENT_FLAG_LOGGING, and this is _not_ expected. b) we'll keep ems from unlinking the list and freeing when they are flagged with EXTENT_FLAG_LOGGING, because the log code holds one reference. The end result is the warning, but the truth is that we set the flag EXTENT_FLAG_LOGGING only during fsync. So clear flag EXTENT_FLAG_LOGGING for extent maps split from a large one. Reported-by: Johannes Hirte Reported-by: Darrick J. Wong Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 83c790d8403..7bdb47faa12 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -591,6 +591,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &flags); remove_extent_mapping(em_tree, em); if (no_splits) goto next; -- cgit v1.2.3-70-g09d2 From 0e401101db49959f5783f6ee9e676124b5a183ac Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 18 Mar 2013 11:40:19 -0400 Subject: ext4: fix memory leakage in mext_check_coverage Regression was introduced by following commit 8c854473 TESTCASE (git://oss.sgi.com/xfs/cmds/xfstests.git): #while true;do ./check 301 || break ;done Also fix potential memory leakage in get_ext_path() once ext4_ext_find_extent() have failed. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c1f15b203e9..bbae4ed15c3 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,16 +32,18 @@ */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **path) + struct ext4_ext_path **orig_path) { int ret = 0; + struct ext4_ext_path *path; - *path = ext4_ext_find_extent(inode, lblock, *path); - if (IS_ERR(*path)) { - ret = PTR_ERR(*path); - *path = NULL; - } else if ((*path)[ext_depth(inode)].p_ext == NULL) + path = ext4_ext_find_extent(inode, lblock, *orig_path); + if (IS_ERR(path)) + ret = PTR_ERR(path); + else if (path[ext_depth(inode)].p_ext == NULL) ret = -ENODATA; + else + *orig_path = path; return ret; } @@ -611,24 +613,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; + int ret = 0; ext4_lblk_t last = from + count; while (from < last) { *err = get_ext_path(inode, from, &path); if (*err) - return 0; + goto out; ext = path[ext_depth(inode)].p_ext; - if (!ext) { - ext4_ext_drop_refs(path); - return 0; - } - if (uninit != ext4_ext_is_uninitialized(ext)) { - ext4_ext_drop_refs(path); - return 0; - } + if (uninit != ext4_ext_is_uninitialized(ext)) + goto out; from += ext4_ext_get_actual_len(ext); ext4_ext_drop_refs(path); } - return 1; + ret = 1; +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return ret; } /** -- cgit v1.2.3-70-g09d2 From 83cdadd8b0559c93728d065d23ca3485fa567e54 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 22 Feb 2013 13:32:56 -0500 Subject: xfs: fix potential infinite loop in xfs_iomap_prealloc_size() If freesp == 0, we could end up in an infinite loop while squashing the preallocation. Break the loop when we've killed the prealloc entirely. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit e78c420bfc2608bb5f9a0b9165b1071c1e31166a) --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 912d83d8860..b0b0f448e84 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -413,7 +413,7 @@ xfs_iomap_prealloc_size( * have a large file on a small filesystem and the above * lowspace thresholds are smaller than MAXEXTLEN. */ - while (alloc_blocks >= freesp) + while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; } -- cgit v1.2.3-70-g09d2 From 3325beed46d8d14d873e94d89ea57ee900dec942 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Sun, 24 Feb 2013 13:04:37 -0600 Subject: xfs: fix xfs_iomap_eof_prealloc_initial_size type Fix the return type of xfs_iomap_eof_prealloc_initial_size() to xfs_fsblock_t to reflect the fact that the return value may be an unsigned 64 bits if XFS_BIG_BLKNOS is defined. Signed-off-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit e8108cedb1c5d1dc359690d18ca997e97a0061d2) --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index b0b0f448e84..5a30dd899d2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -325,7 +325,7 @@ xfs_iomap_eof_want_preallocate( * rather than falling short due to things like stripe unit/width alignment of * real extents. */ -STATIC int +STATIC xfs_fsblock_t xfs_iomap_eof_prealloc_initial_size( struct xfs_mount *mp, struct xfs_inode *ip, -- cgit v1.2.3-70-g09d2 From e001873853d87674dd5b3cfa2851885023616695 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 12 Mar 2013 23:30:34 +1100 Subject: xfs: ensure we capture IO errors correctly Failed buffer readahead can leave the buffer in the cache marked with an error. Most callers that then issue a subsequent read on the buffer do not zero the b_error field out, and so we may incorectly detect an error during IO completion due to the stale error value left on the buffer. Avoid this problem by zeroing the error before IO submission. This ensures that the only IO errors that are detected those captured from are those captured from bio submission or completion. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit c163f9a1760229a95d04e37b332de7d5c1c225cd) --- fs/xfs/xfs_buf.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4e8f0df82d0..8459b5d8cb7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1334,6 +1334,12 @@ _xfs_buf_ioapply( int size; int i; + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; + if (bp->b_flags & XBF_WRITE) { if (bp->b_flags & XBF_SYNCIO) rw = WRITE_SYNC; -- cgit v1.2.3-70-g09d2 From a517b608fa3d9b65930ef53ffe4a2f9800e10f7d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 18 Mar 2013 10:49:07 -0400 Subject: nfsd: only unhash DRC entries that are in the hashtable It's not safe to call hlist_del() on a newly initialized hlist_node. That leads to a NULL pointer dereference. Only do that if the entry is hashed. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 62c1ee128ae..18509bd6f58 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -102,7 +102,8 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { if (rp->c_type == RC_REPLBUFF) kfree(rp->c_replvec.iov_base); - hlist_del(&rp->c_hash); + if (!hlist_unhashed(&rp->c_hash)) + hlist_del(&rp->c_hash); list_del(&rp->c_lru); --num_drc_entries; kmem_cache_free(drc_slab, rp); -- cgit v1.2.3-70-g09d2 From ac534ff2d5508bdff1358a55d88053da729ff46b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 15 Mar 2013 09:16:29 -0400 Subject: nfsd: fix startup order in nfsd_reply_cache_init If we end up doing "goto out_nomem" in this function, we'll call nfsd_reply_cache_shutdown. That will attempt to walk the LRU list and free entries, but that list may not be initialized yet if the server is starting up for the first time. It's also possible for the shrinker to kick in before we've initialized the LRU list. Rearrange the initialization so that the LRU list_head and cache size are initialized before doing any of the allocations that might fail. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 18509bd6f58..ca05f6dc354 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -119,6 +119,10 @@ nfsd_reply_cache_free(struct svc_cacherep *rp) int nfsd_reply_cache_init(void) { + INIT_LIST_HEAD(&lru_head); + max_drc_entries = nfsd_cache_size_limit(); + num_drc_entries = 0; + register_shrinker(&nfsd_reply_cache_shrinker); drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), 0, 0, NULL); @@ -129,10 +133,6 @@ int nfsd_reply_cache_init(void) if (!cache_hash) goto out_nomem; - INIT_LIST_HEAD(&lru_head); - max_drc_entries = nfsd_cache_size_limit(); - num_drc_entries = 0; - return 0; out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); -- cgit v1.2.3-70-g09d2 From 1ada47d9468fe3907f7f9e00179168f5e2f90803 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 20 Mar 2013 09:39:42 -0400 Subject: ext4: fix ext4_evict_inode() racing against workqueue processing code Commit 84c17543ab56 (ext4: move work from io_end to inode) triggered a regression when running xfstest #270 when the file system is mounted with dioread_nolock. The problem is that after ext4_evict_inode() calls ext4_ioend_wait(), this guarantees that last io_end structure has been freed, but it does not guarantee that the workqueue structure, which was moved into the inode by commit 84c17543ab56, is actually finished. Once ext4_flush_completed_IO() calls ext4_free_io_end() on CPU #1, this will allow ext4_ioend_wait() to return on CPU #2, at which point the evict_inode() codepath can race against the workqueue code on CPU #1 accessing EXT4_I(inode)->i_unwritten_work to find the next item of work to do. Fix this by calling cancel_work_sync() in ext4_ioend_wait(), which will be renamed ext4_ioend_shutdown(), since it is only used by ext4_evict_inode(). Also, move the call to ext4_ioend_shutdown() until after truncate_inode_pages() and filemap_write_and_wait() are called, to make sure all dirty pages have been written back and flushed from the page cache first. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] cwq_activate_delayed_work+0x3b/0x7e *pdpt = 0000000030bc3001 *pde = 0000000000000000 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC Modules linked in: Pid: 6, comm: kworker/u:0 Not tainted 3.8.0-rc3-00013-g84c1754-dirty #91 Bochs Bochs EIP: 0060:[] EFLAGS: 00010046 CPU: 0 EIP is at cwq_activate_delayed_work+0x3b/0x7e EAX: 00000000 EBX: 00000000 ECX: f505fe54 EDX: 00000000 ESI: ed5b697c EDI: 00000006 EBP: f64b7e8c ESP: f64b7e84 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: 00000000 CR3: 30bc2000 CR4: 000006f0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 Process kworker/u:0 (pid: 6, ti=f64b6000 task=f64b4160 task.ti=f64b6000) Stack: f505fe00 00000006 f64b7e9c c01de3d7 f6435540 00000003 f64b7efc c01def1d f6435540 00000002 00000000 0000008a c16d0808 c040a10b c16d07d8 c16d08b0 f505fe00 c16d0780 00000000 00000000 ee153df4 c1ce4a30 c17d0e30 00000000 Call Trace: [] cwq_dec_nr_in_flight+0x71/0xfb [] process_one_work+0x5d8/0x637 [] ? ext4_end_bio+0x300/0x300 [] worker_thread+0x249/0x3ef [] kthread+0xd8/0xeb [] ? manage_workers+0x4bb/0x4bb [] ? trace_hardirqs_on+0x27/0x37 [] ret_from_kernel_thread+0x1b/0x28 [] ? __init_kthread_worker+0x71/0x71 Code: 01 83 15 ac ff 6c c1 00 31 db 89 c6 8b 00 a8 04 74 12 89 c3 30 db 83 05 b0 ff 6c c1 01 83 15 b4 ff 6c c1 00 89 f0 e8 42 ff ff ff <8b> 13 89 f0 83 05 b8 ff 6c c1 6c c1 00 31 c9 83 EIP: [] cwq_activate_delayed_work+0x3b/0x7e SS:ESP 0068:f64b7e84 CR2: 0000000000000000 ---[ end trace a1923229da53d8a4 ]--- Signed-off-by: "Theodore Ts'o" Cc: Jan Kara --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 4 ++-- fs/ext4/page-io.c | 12 +++++++++++- 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 167ff564bbf..3b83cd60479 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2617,7 +2617,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, extern int __init ext4_init_pageio(void); extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); -extern void ext4_ioend_wait(struct inode *); +extern void ext4_ioend_shutdown(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern void ext4_end_io_work(struct work_struct *work); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 65bbc9339ac..ea5f24ffa60 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -185,8 +185,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - ext4_ioend_wait(inode); - if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the @@ -216,6 +214,7 @@ void ext4_evict_inode(struct inode *inode) filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); goto no_delete; } @@ -225,6 +224,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); if (is_bad_inode(inode)) goto no_delete; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 809b31003ec..047a6de04a0 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -50,11 +50,21 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } -void ext4_ioend_wait(struct inode *inode) +/* + * This function is called by ext4_evict_inode() to make sure there is + * no more pending I/O completion work left to do. + */ +void ext4_ioend_shutdown(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); + /* + * We need to make sure the work structure is finished being + * used before we let the inode get destroyed. + */ + if (work_pending(&EXT4_I(inode)->i_unwritten_work)) + cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } static void put_io_page(struct ext4_io_page *io_page) -- cgit v1.2.3-70-g09d2 From 2b405bfa84063bfa35621d2d6879f52693c614b0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 20 Mar 2013 09:42:11 -0400 Subject: ext4: fix data=journal fast mount/umount hang In data=journal mode, if we unmount the file system before a transaction has a chance to complete, when the journal inode is being evicted, we can end up calling into jbd2_log_wait_commit() for the last transaction, after the journalling machinery has been shut down. Arguably we should adjust ext4_should_journal_data() to return FALSE for the journal inode, but the only place it matters is ext4_evict_inode(), and so to save a bit of CPU time, and to make the patch much more obviously correct by inspection(tm), we'll fix it by explicitly not trying to waiting for a journal commit when we are evicting the journal inode, since it's guaranteed to never succeed in this case. This can be easily replicated via: mount -t ext4 -o data=journal /dev/vdb /vdb ; umount /vdb ------------[ cut here ]------------ WARNING: at /usr/projects/linux/ext4/fs/jbd2/journal.c:542 __jbd2_log_start_commit+0xba/0xcd() Hardware name: Bochs JBD2: bad log_start_commit: 3005630206 3005630206 0 0 Modules linked in: Pid: 2909, comm: umount Not tainted 3.8.0-rc3 #1020 Call Trace: [] warn_slowpath_common+0x68/0x7d [] ? __jbd2_log_start_commit+0xba/0xcd [] warn_slowpath_fmt+0x2b/0x2f [] __jbd2_log_start_commit+0xba/0xcd [] jbd2_log_start_commit+0x24/0x34 [] ext4_evict_inode+0x71/0x2e3 [] evict+0x94/0x135 [] iput+0x10a/0x110 [] jbd2_journal_destroy+0x190/0x1ce [] ? bit_waitqueue+0x50/0x50 [] ext4_put_super+0x52/0x294 [] generic_shutdown_super+0x48/0xb4 [] kill_block_super+0x22/0x60 [] deactivate_locked_super+0x22/0x49 [] deactivate_super+0x30/0x33 [] mntput_no_expire+0x107/0x10c [] sys_umount+0x2cf/0x2e0 [] sys_oldumount+0x12/0x14 [] syscall_call+0x7/0xb ---[ end trace 6a954cc790501c1f ]--- jbd2_log_wait_commit: error: j_commit_request=-1289337090, tid=0 Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ea5f24ffa60..85e41a2a39a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -205,7 +205,8 @@ void ext4_evict_inode(struct inode *inode) * don't use page cache. */ if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT4_JOURNAL_INO) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; -- cgit v1.2.3-70-g09d2 From cf4ab538f1516606d3ae730dce15d6f33d96b7e1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Mar 2013 12:56:37 -0500 Subject: NFSv4: Fix the string length returned by the idmapper Functions like nfs_map_uid_to_name() and nfs_map_gid_to_group() are expected to return a string without any terminating NUL character. Regression introduced by commit 57e62324e469e092ecc6c94a7a86fe4bd6ac5172 (NFS: Store the legacy idmapper result in the keyring). Reported-by: Dave Chiluk Signed-off-by: Trond Myklebust Cc: Bryan Schumaker Cc: stable@vger.kernel.org [>=3.4] --- fs/nfs/idmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index dc0f98dfa71..c516da5873f 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -726,9 +726,9 @@ out1: return ret; } -static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data) +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) { - return key_instantiate_and_link(key, data, strlen(data) + 1, + return key_instantiate_and_link(key, data, datalen, id_resolver_cache->thread_keyring, authkey); } @@ -738,6 +738,7 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; + size_t len; int ret = -ENOKEY; /* ret = -ENOKEY */ @@ -747,13 +748,15 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, case IDMAP_CONV_NAMETOID: if (strcmp(upcall->im_name, im->im_name) != 0) break; - sprintf(id_str, "%d", im->im_id); - ret = nfs_idmap_instantiate(key, authkey, id_str); + /* Note: here we store the NUL terminator too */ + len = sprintf(id_str, "%d", im->im_id) + 1; + ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: if (upcall->im_id != im->im_id) break; - ret = nfs_idmap_instantiate(key, authkey, im->im_name); + len = strlen(im->im_name); + ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); break; default: ret = -EINVAL; -- cgit v1.2.3-70-g09d2 From 991f76f837bf22c5bb07261cfd86525a0a96650c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Mar 2013 23:25:24 +0800 Subject: sysfs: fix race between readdir and lseek While readdir() is running, lseek() may set filp->f_pos as zero, then may leave filp->private_data pointing to one sysfs_dirent object without holding its reference counter, so the sysfs_dirent object may be used after free in next readdir(). This patch holds inode->i_mutex to avoid the problem since the lock is always held in readdir path. Reported-by: Dave Jones Tested-by: Sasha Levin Cc: Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fbdff6be25..c9e16608f48 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1058,10 +1058,21 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; } +static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, .readdir = sysfs_readdir, .release = sysfs_dir_release, - .llseek = generic_file_llseek, + .llseek = sysfs_dir_llseek, }; -- cgit v1.2.3-70-g09d2 From e5110f411d2ee35bf8d202ccca2e89c633060dca Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Mar 2013 23:25:25 +0800 Subject: sysfs: handle failure path correctly for readdir() In case of 'if (filp->f_pos == 0 or 1)' of sysfs_readdir(), the failure from filldir() isn't handled, and the reference counter of the sysfs_dirent object pointed by filp->private_data will be released without clearing filp->private_data, so use after free bug will be triggered later. This patch returns immeadiately under the situation for fixing the bug, and it is reasonable to return from readdir() when filldir() fails. Reported-by: Dave Jones Tested-by: Sasha Levin Cc: Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c9e16608f48..e14512678c9 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1020,6 +1020,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } if (filp->f_pos == 1) { if (parent_sd->s_parent) @@ -1028,6 +1030,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } mutex_lock(&sysfs_mutex); for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); -- cgit v1.2.3-70-g09d2 From 4376c94618c26225e69e17b7c91169c45a90b292 Mon Sep 17 00:00:00 2001 From: fanchaoting Date: Thu, 21 Mar 2013 09:15:30 +0800 Subject: pnfs-block: removing DM device maybe cause oops when call dev_remove when pnfs block using device mapper,if umounting later,it maybe cause oops. we apply "1 + sizeof(bl_umount_request)" memory for msg->data, the memory maybe overflow when we do "memcpy(&dataptr [sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request))", because the size of bl_msg is more than 1 byte. Signed-off-by: fanchaoting Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayoutdm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index 737d839bc17..6fc7b5cae92 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -55,7 +55,8 @@ static void dev_remove(struct net *net, dev_t dev) bl_pipe_msg.bl_wq = &nn->bl_wq; memset(msg, 0, sizeof(*msg)); - msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); + msg->len = sizeof(bl_msg) + bl_msg.totallen; + msg->data = kzalloc(msg->len, GFP_NOFS); if (!msg->data) goto out; @@ -66,7 +67,6 @@ static void dev_remove(struct net *net, dev_t dev) memcpy(msg->data, &bl_msg, sizeof(bl_msg)); dataptr = (uint8_t *) msg->data; memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); - msg->len = sizeof(bl_msg) + bl_msg.totallen; add_wait_queue(&nn->bl_wq, &wq); if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { -- cgit v1.2.3-70-g09d2 From a073dbff359f4741013ae4b8395f5364c5e00b48 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 12:34:32 -0400 Subject: NFSv4.1: Fix a race in pNFS layoutcommit We need to clear the NFS_LSEG_LAYOUTCOMMIT bits atomically with the NFS_INO_LAYOUTCOMMIT bit, otherwise we may end up with situations where the two are out of sync. The first half of the problem is to ensure that pnfs_layoutcommit_inode clears the NFS_LSEG_LAYOUTCOMMIT bit through pnfs_list_write_lseg. We still need to keep the reference to those segments until the RPC call is finished, so in order to make it clear _where_ those references come from, we add a helper pnfs_list_write_lseg_done() that cleans up after pnfs_list_write_lseg. Signed-off-by: Trond Myklebust Acked-by: Benny Halevy Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 14 -------------- fs/nfs/pnfs.c | 19 ++++++++++++++++++- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b2671cb0f90..6ccdd4fd9b5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6416,22 +6416,8 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; - struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); - /* Matched by references in pnfs_set_layoutcommit */ - list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { - list_del_init(&lseg->pls_lc_list); - if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, - &lseg->pls_flags)) - pnfs_put_lseg(lseg); - } - - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); - put_rpccred(data->cred); kfree(data); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 48ac5aad625..3d900916fd4 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1746,11 +1746,27 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { if (lseg->pls_range.iomode == IOMODE_RW && - test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) list_add(&lseg->pls_lc_list, listp); } } +static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(inode)->flags; + + /* Matched by references in pnfs_set_layoutcommit */ + list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + pnfs_put_lseg(lseg); + } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) { pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); @@ -1795,6 +1811,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) if (nfss->pnfs_curr_ld->cleanup_layoutcommit) nfss->pnfs_curr_ld->cleanup_layoutcommit(data); + pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); } /* -- cgit v1.2.3-70-g09d2 From 24956804349ca0eadcdde032d65e8c00b4214096 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 13:03:00 -0400 Subject: NFSv4.1: Always clear the NFS_INO_LAYOUTCOMMIT in layoutreturn Note that clearing NFS_INO_LAYOUTCOMMIT is tricky, since it requires you to also clear the NFS_LSEG_LAYOUTCOMMIT bits from the layout segments. The only two sites that need to do this are the ones that call pnfs_return_layout() without first doing a layout commit. Signed-off-by: Trond Myklebust Acked-by: Benny Halevy Cc: stable@vger.kernel.org --- fs/nfs/nfs4filelayout.c | 1 - fs/nfs/pnfs.c | 35 +++++++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 49eeb044c10..4fb234d3aef 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -129,7 +129,6 @@ static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo) { if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return; - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); pnfs_return_layout(inode); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3d900916fd4..5044142c121 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -417,6 +417,16 @@ should_free_lseg(struct pnfs_layout_range *lseg_range, lo_seg_intersecting(lseg_range, recall_range); } +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + if (!atomic_dec_and_test(&lseg->pls_refcount)) + return false; + pnfs_layout_remove_lseg(lseg->pls_layout, lseg); + list_add(&lseg->pls_list, tmp_list); + return true; +} + /* Returns 1 if lseg is removed from list, 0 otherwise */ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) @@ -430,11 +440,8 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, */ dprintk("%s: lseg %p ref %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - pnfs_layout_remove_lseg(lseg->pls_layout, lseg); - list_add(&lseg->pls_list, tmp_list); + if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) rv = 1; - } } return rv; } @@ -777,6 +784,21 @@ send_layoutget(struct pnfs_layout_hdr *lo, return lseg; } +static void pnfs_clear_layoutcommit(struct inode *inode, + struct list_head *head) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg, *tmp; + + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return; + list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { + if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + continue; + pnfs_lseg_dec_and_remove_zero(lseg, head); + } +} + /* * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr * when the layout segment list is empty. @@ -808,6 +830,7 @@ _pnfs_return_layout(struct inode *ino) /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); + pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { @@ -820,8 +843,6 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); - lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; @@ -1458,7 +1479,6 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) dprintk("pnfs write error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) @@ -1613,7 +1633,6 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) dprintk("pnfs read error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) -- cgit v1.2.3-70-g09d2 From 240286725d854331422cb15957f8d9bf2741d4e3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 13:23:33 -0400 Subject: NFSv4.1: Add a helper pnfs_commit_and_return_layout In order to be able to safely return the layout in nfs4_proc_setattr, we need to block new uses of the layout, wait for all outstanding users of the layout to complete, commit the layout and then return it. This patch adds a helper in order to do all this safely. Signed-off-by: Trond Myklebust Cc: Boaz Harrosh --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pnfs.c | 27 +++++++++++++++++++++++++++ fs/nfs/pnfs.h | 6 ++++++ 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6ccdd4fd9b5..26431cf62dd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2632,7 +2632,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; if (pnfs_ld_layoutret_on_setattr(inode)) - pnfs_return_layout(inode); + pnfs_commit_and_return_layout(inode); nfs_fattr_init(fattr); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5044142c121..4bdffe0ba02 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -866,6 +866,33 @@ out: } EXPORT_SYMBOL_GPL(_pnfs_return_layout); +int +pnfs_commit_and_return_layout(struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + int ret; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo == NULL) { + spin_unlock(&inode->i_lock); + return 0; + } + pnfs_get_layout_hdr(lo); + /* Block new layoutgets and read/write to ds */ + lo->plh_block_lgets++; + spin_unlock(&inode->i_lock); + filemap_fdatawait(inode->i_mapping); + ret = pnfs_layoutcommit_inode(inode, true); + if (ret == 0) + ret = _pnfs_return_layout(inode); + spin_lock(&inode->i_lock); + lo->plh_block_lgets--; + spin_unlock(&inode->i_lock); + pnfs_put_layout_hdr(lo); + return ret; +} + bool pnfs_roc(struct inode *ino) { struct pnfs_layout_hdr *lo; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 94ba8041774..f5f8a470a64 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -219,6 +219,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); +int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_write_data *); void pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, @@ -407,6 +408,11 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } +static inline int pnfs_commit_and_return_layout(struct inode *inode) +{ + return 0; +} + static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) { -- cgit v1.2.3-70-g09d2 From 06ae43f34bcc07a0b6be8bf78a1c895bcd12c839 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Mar 2013 13:19:30 -0400 Subject: Don't bother with redoing rw_verify_area() from default_file_splice_from() default_file_splice_from() ends up calling vfs_write() (via very convoluted callchain). It's an overkill, since we already have done rw_verify_area() in the caller by the time we call vfs_write() we are under set_fs(KERNEL_DS), so access_ok() is also pointless. Add a new helper (__kernel_write()), use it instead of kernel_write() in there. Signed-off-by: Al Viro --- fs/internal.h | 5 +++++ fs/read_write.c | 25 +++++++++++++++++++++++++ fs/splice.c | 4 +++- 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index 507141fceb9..4be78237d89 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,3 +125,8 @@ extern int invalidate_inodes(struct super_block *, bool); * dcache.c */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); + +/* + * read_write.c + */ +extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); diff --git a/fs/read_write.c b/fs/read_write.c index a698eff457f..f7b5a23b804 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -17,6 +17,7 @@ #include #include #include "read_write.h" +#include "internal.h" #include #include @@ -417,6 +418,30 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof EXPORT_SYMBOL(do_sync_write); +ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + const char __user *p; + ssize_t ret; + + old_fs = get_fs(); + set_fs(get_ds()); + p = (__force const char __user *)buf; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + if (file->f_op->write) + ret = file->f_op->write(file, p, count, pos); + else + ret = do_sync_write(file, p, count, pos); + set_fs(old_fs); + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); + } + inc_syscw(current); + return ret; +} + ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; diff --git a/fs/splice.c b/fs/splice.c index 718bd005638..29e394e49dd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -31,6 +31,7 @@ #include #include #include +#include "internal.h" /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -1048,9 +1049,10 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, { int ret; void *data; + loff_t tmp = sd->pos; data = buf->ops->map(pipe, buf, 0); - ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); buf->ops->unmap(pipe, buf, data); return ret; -- cgit v1.2.3-70-g09d2 From f853c616883a8de966873a1dab283f1369e275a1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Mar 2013 09:52:19 -0400 Subject: cifs: ignore everything in SPNEGO blob after mechTypes We've had several reports of people attempting to mount Windows 8 shares and getting failures with a return code of -EINVAL. The default sec= mode changed recently to sec=ntlmssp. With that, we expect and parse a SPNEGO blob from the server in the NEGOTIATE reply. The current decode_negTokenInit function first parses all of the mechTypes and then tries to parse the rest of the negTokenInit reply. The parser however currently expects a mechListMIC or nothing to follow the mechTypes, but Windows 8 puts a mechToken field there instead to carry some info for the new NegoEx stuff. In practice, we don't do anything with the fields after the mechTypes anyway so I don't see any real benefit in continuing to parse them. This patch just has the kernel ignore the fields after the mechTypes. We'll probably need to reinstate some of this if we ever want to support NegoEx. Reported-by: Jason Burgess Reported-by: Yan Li Signed-off-by: Jeff Layton Cc: Signed-off-by: Steve French --- fs/cifs/asn1.c | 53 +++++------------------------------------------------ 1 file changed, 5 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index cfd1ce34e0b..1d36db11477 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -614,53 +614,10 @@ decode_negTokenInit(unsigned char *security_blob, int length, } } - /* mechlistMIC */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - /* Check if we have reached the end of the blob, but with - no mechListMic (e.g. NTLMSSP instead of KRB5) */ - if (ctx.error == ASN1_ERR_DEC_EMPTY) - goto decode_negtoken_exit; - cFYI(1, "Error decoding last part negTokenInit exit3"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - /* tag = 3 indicating mechListMIC */ - cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* sequence */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit5"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - } - - /* sequence of */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit 7"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* general string */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit9"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) - || (tag != ASN1_GENSTR)) { - cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - cFYI(1, "Need to call asn1_octets_decode() function for %s", - ctx.pointer); /* is this UTF-8 or ASCII? */ -decode_negtoken_exit: + /* + * We currently ignore anything at the end of the SPNEGO blob after + * the mechTypes have been parsed, since none of that info is + * used at the moment. + */ return 1; } -- cgit v1.2.3-70-g09d2 From d763448286377b8a0e3f179372e9e292bef3c337 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 11 Mar 2013 09:20:00 +0000 Subject: Btrfs: update to use fs_state bit Now that we use bit operation to check fs_state, update btrfs_free_fs_root()'s checker, otherwise we get back to memory leak case. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7d84651e850..127b23e8323 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3253,7 +3253,7 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); btrfs_free_log_root_tree(NULL, fs_info); } -- cgit v1.2.3-70-g09d2 From 835d974fabfa9bff4d173ad03c054ac2f673263f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Mar 2013 12:13:25 -0400 Subject: Btrfs: handle a bogus chunk tree nicely If you restore a btrfs-image file system and try to mount that file system we'll panic. That's because btrfs-image restores and just makes one big chunk to envelope the whole disk, since they are really only meant to be messed with by our btrfs-progs. So fix up btrfs_rmap_block and the callers of it for mount so that we no longer panic but instead just return an error and fail to mount. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 35 ++++++++++++++++++++++++++++++----- fs/btrfs/volumes.c | 13 ++++++++++++- 2 files changed, 42 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 350b9b18140..a8ff25aedca 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -257,7 +257,8 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -265,13 +266,17 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; while (nr--) { cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + kfree(logical); + return ret; + } } kfree(logical); @@ -7964,7 +7969,17 @@ int btrfs_read_block_groups(struct btrfs_root *root) * info has super bytes accounted for, otherwise we'll think * we have more space than we actually do. */ - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + goto error; + } /* * check for two cases, either we are full, and therefore @@ -8106,7 +8121,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + return ret; + } add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5989a92236f..2854c824ab6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4935,7 +4935,18 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, em = lookup_extent_mapping(em_tree, chunk_start, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start != chunk_start); + if (!em) { + printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + chunk_start); + return -EIO; + } + + if (em->start != chunk_start) { + printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + em->start, chunk_start); + free_extent_map(em); + return -EIO; + } map = (struct map_lookup *)em->bdev; length = em->len; -- cgit v1.2.3-70-g09d2 From 6113077cd319e747875ec71227d2b5cb54e08c76 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 19 Mar 2013 10:57:14 +0000 Subject: Btrfs: fix missing qgroup reservation before fallocating Steps to reproduce: mkfs.btrfs mount btrfs quota enable btrfs sub create /subv btrfs qgroup limit 10M /subv fallocate --length 20M /subv/data For the above example, fallocating will return successfully which is not expected, we try to fix it by doing qgroup reservation before fallocating. Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7bdb47faa12..1be25b92d63 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2142,6 +2142,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file->f_path.dentry->d_inode; struct extent_state *cached_state = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2169,6 +2170,11 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); + if (ret) + goto out_reserve_fail; + } /* * wait for ordered IO before we have any locks. We'll loop again @@ -2272,6 +2278,9 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, alloc_end - alloc_start); +out_reserve_fail: /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; -- cgit v1.2.3-70-g09d2 From d9abbf1c3131b679379762700201ae69367f3f62 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 20 Mar 2013 13:49:48 +0000 Subject: Btrfs: fix locking on ROOT_REPLACE operations in tree mod log To resolve backrefs, ROOT_REPLACE operations in the tree mod log are required to be tied to at least one KEY_REMOVE_WHILE_FREEING operation. Therefore, those operations must be enclosed by tree_mod_log_write_lock() and tree_mod_log_write_unlock() calls. Those calls are private to the tree_mod_log_* functions, which means that removal of the elements of an old root node must be logged from tree_mod_log_insert_root. This partly reverts and corrects commit ba1bfbd5 (Btrfs: fix a tree mod logging issue for root replacement operations). This fixes the brand-new version of xfstest 276 as of commit cfe73f71. Cc: stable@vger.kernel.org Signed-off-by: Jan Schmidt Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ecd25a1b4e5..ca9d8f1a3bb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -651,6 +651,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; + __tree_mod_log_free_eb(fs_info, old_root); + ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) goto out; @@ -736,7 +738,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) + unsigned long src_offset, int nr_items, int log_removal) { int ret; int i; @@ -750,10 +752,12 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, } for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (log_removal) { + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } ret = tree_mod_log_insert_key_locked(fs_info, dst, i + dst_offset, MOD_LOG_KEY_ADD); @@ -927,7 +931,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } - tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -1046,6 +1049,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); + tree_mod_log_free_eb(root->fs_info, buf); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1750,7 +1754,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_free_eb(root->fs_info, root->node); tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); @@ -2995,7 +2998,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3066,7 +3069,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, sizeof(struct btrfs_key_ptr)); tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + src_nritems - push_items, push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3218,12 +3221,18 @@ static noinline int split_node(struct btrfs_trans_handle *trans, int mid; int ret; u32 c_nritems; + int tree_mod_log_removal = 1; c = path->nodes[level]; WARN_ON(btrfs_header_generation(c) != trans->transid); if (c == root->node) { /* trying to split the root, lets make a new one */ ret = insert_new_root(trans, root, path, level + 1); + /* + * removal of root nodes has been logged by + * tree_mod_log_set_root_pointer due to locking + */ + tree_mod_log_removal = 0; if (ret) return ret; } else { @@ -3261,7 +3270,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid, + tree_mod_log_removal); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), -- cgit v1.2.3-70-g09d2 From 1dd05682b3ef6e70409e130bfd83e91770801589 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 21 Mar 2013 04:32:32 +0000 Subject: Btrfs: fix memory leak in btrfs_create_tree() We should free leaf and root before returning from the error handling code. Signed-off-by: Tsutomu Itoh Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 127b23e8323..6d19a0a554a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1291,6 +1291,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); + leaf = NULL; goto fail; } @@ -1334,11 +1335,16 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(leaf); + return root; + fail: - if (ret) - return ERR_PTR(ret); + if (leaf) { + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + } + kfree(root); - return root; + return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, -- cgit v1.2.3-70-g09d2 From 51f0885e5415b4cc6535e9cdcc5145bfbc134353 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 22 Mar 2013 11:44:04 -0700 Subject: vfs,proc: guarantee unique inodes in /proc Dave Jones found another /proc issue with his Trinity tool: thanks to the namespace model, we can have multiple /proc dentries that point to the same inode, aliasing directories in /proc//net/ for example. This ends up being a total disaster, because it acts like hardlinked directories, and causes locking problems. We rely on the topological sort of the inodes pointed to by dentries, and if we have aliased directories, that odering becomes unreliable. In short: don't do this. Multiple dentries with the same (directory) inode is just a bad idea, and the namespace code should never have exposed things this way. But we're kind of stuck with it. This solves things by just always allocating a new inode during /proc dentry lookup, instead of using "iget_locked()" to look up existing inodes by superblock and number. That actually simplies the code a bit, at the cost of potentially doing more inode [de]allocations. That said, the inode lookup wasn't free either (and did a lot of locking of inodes), so it is probably not that noticeable. We could easily keep the old lookup model for non-directory entries, but rather than try to be excessively clever this just implements the minimal and simplest workaround for the problem. Reported-and-tested-by: Dave Jones Analyzed-by: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a86aebc9ba7..869116c2afb 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -446,9 +446,10 @@ static const struct file_operations proc_reg_file_ops_no_compat = { struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { - struct inode *inode = iget_locked(sb, de->low_ino); + struct inode *inode = new_inode_pseudo(sb); - if (inode && (inode->i_state & I_NEW)) { + if (inode) { + inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->pde = de; @@ -476,7 +477,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_fop = de->proc_fops; } } - unlock_new_inode(inode); } else pde_put(de); return inode; -- cgit v1.2.3-70-g09d2 From e49dbbf3e770aa590a8a464ac4978a09027060b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2013 11:18:24 -0700 Subject: nfsd: fix bad offset use vfs_writev() updates the offset argument - but the code then passes the offset to vfs_fsync_range(). Since offset now points to the offset after what was just written, this is probably not what was intended Introduced by face15025ffdf664de95e86ae831544154d26c9c "nfsd: use vfs_fsync_range(), not O_SYNC, for stable writes". Signed-off-by: Kent Overstreet Cc: Al Viro Cc: "Eric W. Biederman" Cc: stable@vger.kernel.org Reviewed-by: Zach Brown Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2a7eb536de0..2b2e2396a86 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1013,6 +1013,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int host_err; int stable = *stablep; int use_wgather; + loff_t pos = offset; dentry = file->f_path.dentry; inode = dentry->d_inode; @@ -1025,7 +1026,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); set_fs(oldfs); if (host_err < 0) goto out_nfserr; -- cgit v1.2.3-70-g09d2 From 4adaa611020fa6ac65b0ac8db78276af4ec04e63 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 26 Mar 2013 13:07:00 -0400 Subject: Btrfs: fix race between mmap writes and compression Btrfs uses page_mkwrite to ensure stable pages during crc calculations and mmap workloads. We call clear_page_dirty_for_io before we do any crcs, and this forces any application with the file mapped to wait for the crc to finish before it is allowed to change the file. With compression on, the clear_page_dirty_for_io step is happening after we've compressed the pages. This means the applications might be changing the pages while we are compressing them, and some of those modifications might not hit the disk. This commit adds the clear_page_dirty_for_io before compression starts and makes sure to redirty the page if we have to fallback to uncompressed IO as well. Signed-off-by: Chris Mason Reported-by: Alexandre Oliva cc: stable@vger.kernel.org --- fs/btrfs/extent_io.c | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/inode.c | 14 ++++++++++++++ 3 files changed, 49 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5af646..cdee391fc7b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1257,6 +1257,39 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) GFP_NOFS); } +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + account_page_redirty(page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + return 0; +} + /* * helper function to set both pages and extents in the tree writeback */ diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6068a198556..258c9215685 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -325,6 +325,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long *map_len); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f26888825e..6a6e13c5308 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -353,6 +353,7 @@ static noinline int compress_file_range(struct inode *inode, int i; int will_compress; int compress_type = root->fs_info->compress_type; + int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ if ((end - start + 1) < 16 * 1024 && @@ -415,6 +416,17 @@ again: if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; + /* + * we need to call clear_page_dirty_for_io on each + * page in the range. Otherwise applications with the file + * mmap'd can wander in and change the page contents while + * we are compressing them. + * + * If the compression fails for any reason, we set the pages + * dirty again later on. + */ + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; ret = btrfs_compress_pages(compress_type, inode->i_mapping, start, total_compressed, pages, @@ -554,6 +566,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } + if (redirty) + extent_range_redirty_for_io(inode, start, end); add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; -- cgit v1.2.3-70-g09d2 From 7ea600b5314529f9d1b9d6d3c41cb26fce6a7a4a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Mar 2013 18:25:57 -0400 Subject: Nest rename_lock inside vfsmount_lock ... lest we get livelocks between path_is_under() and d_path() and friends. The thing is, wrt fairness lglocks are more similar to rwsems than to rwlocks; it is possible to have thread B spin on attempt to take lock shared while thread A is already holding it shared, if B is on lower-numbered CPU than A and there's a thread C spinning on attempt to take the same lock exclusive. As the result, we need consistent ordering between vfsmount_lock (lglock) and rename_lock (seq_lock), even though everything that takes both is going to take vfsmount_lock only shared. Spotted-by: Brad Spengler Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/dcache.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index fbfae008ba4..e8bc3420d63 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2542,7 +2542,6 @@ static int prepend_path(const struct path *path, bool slash = false; int error = 0; - br_read_lock(&vfsmount_lock); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; @@ -2572,8 +2571,6 @@ static int prepend_path(const struct path *path, if (!error && !slash) error = prepend(buffer, buflen, "/", 1); -out: - br_read_unlock(&vfsmount_lock); return error; global_root: @@ -2590,7 +2587,7 @@ global_root: error = prepend(buffer, buflen, "/", 1); if (!error) error = is_mounted(vfsmnt) ? 1 : 2; - goto out; + return error; } /** @@ -2617,9 +2614,11 @@ char *__d_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) return ERR_PTR(error); @@ -2636,9 +2635,11 @@ char *d_absolute_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, &root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error > 1) error = -EINVAL; @@ -2702,11 +2703,13 @@ char *d_path(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = path_with_deleted(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) res = ERR_PTR(error); - write_sequnlock(&rename_lock); path_put(&root); return res; } @@ -2830,6 +2833,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) get_fs_root_and_pwd(current->fs, &root, &pwd); error = -ENOENT; + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; @@ -2839,6 +2843,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) goto out; @@ -2859,6 +2864,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } } else { write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); } out: -- cgit v1.2.3-70-g09d2 From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 15 Mar 2013 01:45:51 -0700 Subject: userns: Don't allow creation if the user is chrooted Guarantee that the policy of which files may be access that is established by setting the root directory will not be violated by user namespaces by verifying that the root directory points to the root of the mount namespace at the time of user namespace creation. Changing the root is a privileged operation, and as a matter of policy it serves to limit unprivileged processes to files below the current root directory. For reasons of simplicity and comprehensibility the privilege to change the root directory is gated solely on the CAP_SYS_CHROOT capability in the user namespace. Therefore when creating a user namespace we must ensure that the policy of which files may be access can not be violated by changing the root directory. Anyone who runs a processes in a chroot and would like to use user namespace can setup the same view of filesystems with a mount namespace instead. With this result that this is not a practical limitation for using user namespaces. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 24 ++++++++++++++++++++++++ include/linux/fs_struct.h | 2 ++ kernel/user_namespace.c | 9 +++++++++ 3 files changed, 35 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb4..a3035223d42 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 729eded4b24..2b93a9a5a1e 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -50,4 +50,6 @@ static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, spin_unlock(&fs->lock); } +extern bool current_chrooted(void); + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b14f4d34204..0f1e4288457 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -61,6 +61,15 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + /* + * Verify that we can not violate the policy of which files + * may be accessed that is specified by the root directory, + * by verifing that the root directory is at the root of the + * mount namespace which allows all files to be accessed. + */ + if (current_chrooted()) + return -EPERM; + /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. -- cgit v1.2.3-70-g09d2 From 90563b198e4c6674c63672fae1923da467215f45 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Mar 2013 03:10:15 -0700 Subject: vfs: Add a mount flag to lock read only bind mounts When a read-only bind mount is copied from mount namespace in a higher privileged user namespace to a mount namespace in a lesser privileged user namespace, it should not be possible to remove the the read-only restriction. Add a MNT_LOCK_READONLY mount flag to indicate that a mount must remain read-only. CC: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 3 +++ include/linux/mount.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index a3035223d42..8505b5ece5d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1713,6 +1713,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; + if (mnt->mnt_flags & MNT_LOCK_READONLY) + return -EPERM; + if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else diff --git a/include/linux/mount.h b/include/linux/mount.h index d7029f4a191..73005f9957e 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -47,6 +47,8 @@ struct mnt_namespace; #define MNT_INTERNAL 0x4000 +#define MNT_LOCK_READONLY 0x400000 + struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ -- cgit v1.2.3-70-g09d2 From 132c94e31b8bca8ea921f9f96a57d684fa4ae0a9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Mar 2013 04:08:05 -0700 Subject: vfs: Carefully propogate mounts across user namespaces As a matter of policy MNT_READONLY should not be changable if the original mounter had more privileges than creator of the mount namespace. Add the flag CL_UNPRIVILEGED to note when we are copying a mount from a mount namespace that requires more privileges to a mount namespace that requires fewer privileges. When the CL_UNPRIVILEGED flag is set cause clone_mnt to set MNT_NO_REMOUNT if any of the mnt flags that should never be changed are set. This protects both mount propagation and the initial creation of a less privileged mount namespace. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 6 +++++- fs/pnode.c | 6 ++++++ fs/pnode.h | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 8505b5ece5d..968d4c5eae0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -798,6 +798,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + /* Don't allow unprivileged users to change mount flags */ + if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -2342,7 +2346,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_ALL | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { up_write(&namespace_sem); diff --git a/fs/pnode.c b/fs/pnode.c index 3e000a51ac0..8b29d2164da 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "internal.h" #include "pnode.h" @@ -220,6 +221,7 @@ static struct mount *get_source(struct mount *dest, int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, struct mount *source_mnt, struct list_head *tree_list) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *m, *child; int ret = 0; struct mount *prev_dest_mnt = dest_mnt; @@ -237,6 +239,10 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(source, source->mnt.mnt_root, type); if (IS_ERR(child)) { ret = PTR_ERR(child); diff --git a/fs/pnode.h b/fs/pnode.h index 19b853a3445..a0493d5ebfb 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -23,6 +23,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 +#define CL_UNPRIVILEGED 0x40 static inline void set_mnt_shared(struct mount *mnt) { -- cgit v1.2.3-70-g09d2 From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 24 Mar 2013 14:28:27 -0700 Subject: userns: Restrict when proc and sysfs can be mounted Only allow unprivileged mounts of proc and sysfs if they are already mounted when the user namespace is created. proc and sysfs are interesting because they have content that is per namespace, and so fresh mounts are needed when new namespaces are created while at the same time proc and sysfs have content that is shared between every instance. Respect the policy of who may see the shared content of proc and sysfs by only allowing new mounts if there was an existing mount at the time the user namespace was created. In practice there are only two interesting cases: proc and sysfs are mounted at their usual places, proc and sysfs are not mounted at all (some form of mount namespace jail). Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 21 +++++++++++++++++++++ fs/proc/root.c | 4 ++++ fs/sysfs/mount.c | 4 ++++ include/linux/user_namespace.h | 4 ++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 2 ++ 6 files changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 968d4c5eae0..d581e45c0a9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2763,6 +2763,27 @@ bool current_chrooted(void) return chrooted; } +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26ba..9c7fab1d23f 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec73..afd83273e6c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4ce00932493..b6b215f13b4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -26,6 +26,8 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + bool may_mount_sysfs; + bool may_mount_proc; }; extern struct user_namespace init_user_ns; @@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns) #endif +void update_mnt_policy(struct user_namespace *userns); + #endif /* _LINUX_USER_H */ diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03..8e635a18ab5 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .may_mount_sysfs = true, + .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0f1e4288457..a54f26f82eb 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -96,6 +96,8 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); + update_mnt_policy(ns); + return 0; } -- cgit v1.2.3-70-g09d2 From 3e84f48edfd33b2e209a117c11fb9ce637cc9b67 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Mar 2013 15:20:30 +0000 Subject: vfs/splice: Fix missed checks in new __kernel_write() helper Commit 06ae43f34bcc ("Don't bother with redoing rw_verify_area() from default_file_splice_from()") lost the checks to test existence of the write/aio_write methods. My apologies ;-/ Eventually, we want that in fs/splice.c side of things (no point repeating it for every buffer, after all), but for now this is the obvious minimal fix. Reported-by: Dave Jones Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/read_write.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index f7b5a23b804..e6ddc8dceb9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -424,6 +424,9 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t const char __user *p; ssize_t ret; + if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + return -EINVAL; + old_fs = get_fs(); set_fs(get_ds()); p = (__force const char __user *)buf; -- cgit v1.2.3-70-g09d2 From adaa4b8e4d47eeb114513c2f7a172929154b94bd Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 21 Mar 2013 14:30:23 +0000 Subject: Btrfs: fix EIO from btrfs send in is_extent_unchanged for punched holes When you take a snapshot, punch a hole where there has been data, then take another snapshot and try to send an incremental stream, btrfs send would give you EIO. That is because is_extent_unchanged had no support for holes being punched. With this patch, instead of returning EIO we just return 0 (== the extent is not unchanged) and we're good. Signed-off-by: Jan Schmidt Cc: Alexander Block Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 68da757615a..ed897dc1135 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3945,12 +3945,10 @@ static int is_extent_unchanged(struct send_ctx *sctx, found_key.type != key.type) { key.offset += right_len; break; - } else { - if (found_key.offset != key.offset + right_len) { - /* Should really not happen */ - ret = -EIO; - goto out; - } + } + if (found_key.offset != key.offset + right_len) { + ret = 0; + goto out; } key = found_key; } -- cgit v1.2.3-70-g09d2 From f4881bc7a83eff263789dd524b7c269d138d4af5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Mar 2013 16:03:35 -0400 Subject: Btrfs: fix space leak when we fail to reserve metadata space Dave reported a warning when running xfstest 275. We have been leaking delalloc metadata space when our reservations fail. This is because we were improperly calculating how much space to free for our checksum reservations. The problem is we would sometimes free up space that had already been freed in another thread and we would end up with negative usage for the delalloc space. This patch fixes the problem by calculating how much space the other threads would have already freed, and then calculate how much space we need to free had we not done the reservation at all, and then freeing any excess space. This makes xfstests 275 no longer have leaked space. Thanks Cc: stable@vger.kernel.org Reported-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a8ff25aedca..a22b5cc921a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4815,14 +4815,49 @@ out_fail: * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) + if (BTRFS_I(inode)->csum_bytes == csum_bytes) { calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + } else { + u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 bytes; + + /* + * This is tricky, but first we need to figure out how much we + * free'd from any free-ers that occured during this + * reservation, so we reset ->csum_bytes to the csum_bytes + * before we dropped our lock, and then call the free for the + * number of bytes that were freed while we were trying our + * reservation. + */ + bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; + BTRFS_I(inode)->csum_bytes = csum_bytes; + to_free = calc_csum_metadata_size(inode, bytes, 0); + + + /* + * Now we need to see how much we would have freed had we not + * been making this reservation and our ->csum_bytes were not + * artificially inflated. + */ + BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + bytes = csum_bytes - orig_csum_bytes; + bytes = calc_csum_metadata_size(inode, bytes, 0); + + /* + * Now reset ->csum_bytes to what it should be. If bytes is + * more than to_free then we would have free'd more space had we + * not had an artificially high ->csum_bytes, so we need to free + * the remainder. If bytes is the same or less then we don't + * need to do anything, the other free-ers did the correct + * thing. + */ + BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + if (bytes > to_free) + to_free = bytes - to_free; + else + to_free = 0; + } spin_unlock(&BTRFS_I(inode)->lock); if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); -- cgit v1.2.3-70-g09d2 From 6e137ed3f30574f314733d4b7a86ea6523232b14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:26:55 -0400 Subject: Btrfs: fix space accounting for unlink and rename We are way over-reserving for unlink and rename. Rename is just some random huge number and unlink accounts for tree log operations that don't actually happen during unlink, not to mention the tree log doesn't take from the trans block rsv anyway so it's completely useless. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6a6e13c5308..8cab424c75f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3693,11 +3693,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, * 1 for the dir item * 1 for the dir index * 1 for the inode ref - * 1 for the inode ref in the tree log - * 2 for the dir entries in the log * 1 for the inode */ - trans = btrfs_start_transaction(root, 8); + trans = btrfs_start_transaction(root, 5); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -8141,7 +8139,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. */ - trans = btrfs_start_transaction(root, 20); + trans = btrfs_start_transaction(root, 11); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; -- cgit v1.2.3-70-g09d2 From db1d607d3ca5cbb283cbb17d648cd7e8dc67cc7b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:29:11 -0400 Subject: Btrfs: hold the ordered operations mutex when waiting on ordered extents We need to hold the ordered_operations mutex while waiting on ordered extents since we splice and run the ordered extents list. We need to make sure anybody else who wants to wait on ordered extents does actually wait for them to be completed. This will keep us from bailing out of flushing in case somebody is already waiting on ordered extents to complete. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index dc08d77b717..005c45db699 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -557,6 +557,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); + mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); list_splice_init(&root->fs_info->ordered_extents, &splice); while (!list_empty(&splice)) { @@ -600,6 +601,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) cond_resched(); } + mutex_unlock(&root->fs_info->ordered_operations_mutex); } /* -- cgit v1.2.3-70-g09d2 From fdf30d1c1b386e1b73116cc7e0fb14e962b763b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:31:45 -0400 Subject: Btrfs: limit the global reserve to 512mb A user reported a problem where he was getting early ENOSPC with hundreds of gigs of free data space and 6 gigs of free metadata space. This is because the global block reserve was taking up the entire free metadata space. This is ridiculous, we have infrastructure in place to throttle if we start using too much of the global reserve, so instead of letting it get this huge just limit it to 512mb so that users can still get work done. This allowed the user to complete his rsync without issues. Thanks Cc: stable@vger.kernel.org Reported-and-tested-by: Stefan Priebe Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a22b5cc921a..0d8478700d7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4460,7 +4460,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = num_bytes; + block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + -- cgit v1.2.3-70-g09d2 From a7975026ff9ddf91ba190ae2b71699dd156395e3 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Mon, 25 Mar 2013 11:08:23 +0000 Subject: Btrfs: fix double free in the btrfs_qgroup_account_ref() The function btrfs_find_all_roots is responsible to allocate memory for 'roots' and free it if errors happen,so the caller should not free it again since the work has been done. Besides,'tmp' is allocated after the function btrfs_find_all_roots, so we can return directly if btrfs_find_all_roots() fails. Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Reviewed-by: Jan Schmidt Signed-off-by: Josef Bacik --- fs/btrfs/qgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5471e47d655..b44124dd237 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1153,7 +1153,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, sgn > 0 ? node->seq - 1 : node->seq, &roots); if (ret < 0) - goto out; + return ret; spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -1275,7 +1275,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = 0; unlock: spin_unlock(&fs_info->qgroup_lock); -out: ulist_free(roots); ulist_free(tmp); -- cgit v1.2.3-70-g09d2 From 39847c4d3d91f487f9ab3d083ee5d0f8419f105c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Mar 2013 08:08:20 +0000 Subject: Btrfs: fix wrong reservation of csums We reserve the space for csums only when we write data into a file, in the other cases, such as tree log, log replay, we don't do reservation, so we can use the reservation of the transaction handle just for the former. And for the latter, we should use the tree's own reservation. But the function - btrfs_csum_file_blocks() didn't differentiate between these two types of the cases, fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 2 -- fs/btrfs/inode.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ec160202be3..b7e529d2860 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -728,7 +728,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, return -ENOMEM; sector_sum = sums->sums; - trans->adding_csums = 1; again: next_offset = (u64)-1; found_next = 0; @@ -899,7 +898,6 @@ next_sector: goto again; } out: - trans->adding_csums = 0; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8cab424c75f..b88381582da 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1757,8 +1757,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { + trans->adding_csums = 1; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); + trans->adding_csums = 0; } return 0; } -- cgit v1.2.3-70-g09d2 From 82d130ff390be67d980d8b6f39e921c0b1d8d8e0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Mar 2013 08:12:15 +0000 Subject: Btrfs: fix wrong return value of btrfs_lookup_csum() If we don't find the expected csum item, but find a csum item which is adjacent to the specified extent, we should return -EFBIG, or we should return -ENOENT. But btrfs_lookup_csum() return -EFBIG even the csum item is not adjacent to the specified extent. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b7e529d2860..c4628a201cb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -118,9 +118,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; - if (csum_offset >= csums_in_item) { + if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); -- cgit v1.2.3-70-g09d2 From d8fe29e9dea8d7d61fd140d8779326856478fc62 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Mar 2013 08:09:34 -0600 Subject: Btrfs: don't drop path when printing out tree errors in scrub A user reported a panic where we were panicing somewhere in tree_backref_for_extent from scrub_print_warning. He only captured the trace but looking at scrub_print_warning we drop the path right before we mess with the extent buffer to print out a bunch of stuff, which isn't right. So fix this by dropping the path after we use the eb if we need to. Thanks, Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 53c3501fa4c..85e072b956d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -542,7 +542,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -558,7 +557,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret < 0 ? -1 : ref_level, ret < 0 ? -1 : ref_root); } while (ret != 1); + btrfs_release_path(path); } else { + btrfs_release_path(path); swarn.path = path; swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, -- cgit v1.2.3-70-g09d2