From eaae668d01e15435cf977cced3975ccc436257fc Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 15 Feb 2011 12:48:09 +0000 Subject: fs/inode: Fix kernel-doc format for inode_init_owner Signed-off-by: Ben Hutchings Signed-off-by: Al Viro --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 9910c039f02..16fefd373fc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1715,7 +1715,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) EXPORT_SYMBOL(init_special_inode); /** - * Init uid,gid,mode for new inode according to posix standards + * inode_init_owner - Init uid,gid,mode for new inode according to posix standards * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode -- cgit v1.2.3-70-g09d2 From e795b71799ff0b27365020c9ddaa25d0d83f99c8 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:25 -0700 Subject: userns: userns: check user namespace for task->file uid equivalence checks Cheat for now and say all files belong to init_user_ns. Next step will be to let superblocks belong to a user_ns, and derive inode_userns(inode) from inode->i_sb->s_user_ns. Finally we'll introduce more flexible arrangements. Changelog: Feb 15: make is_owner_or_cap take const struct inode Feb 23: make is_owner_or_cap bool [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 17 +++++++++++++++++ fs/namei.c | 21 ++++++++++++++++----- include/linux/fs.h | 9 +++++++-- 3 files changed, 40 insertions(+), 7 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 16fefd373fc..a21d5a938a1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * This is needed for the following functions: @@ -1733,3 +1734,19 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); + +/* + * return true if current either has CAP_FOWNER to the + * file, or owns the file. + */ +bool is_owner_or_cap(const struct inode *inode) +{ + struct user_namespace *ns = inode_userns(inode); + + if (current_user_ns() == ns && current_fsuid() == inode->i_uid) + return true; + if (ns_capable(ns, CAP_FOWNER)) + return true; + return false; +} +EXPORT_SYMBOL(is_owner_or_cap); diff --git a/fs/namei.c b/fs/namei.c index 5a9a6c3094d..dbb45a652ae 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -183,6 +183,9 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + if (current_user_ns() != inode_userns(inode)) + goto other_perms; + if (current_fsuid() == inode->i_uid) mode >>= 6; else { @@ -196,6 +199,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag mode >>= 3; } +other_perms: /* * If the DACs are ok we don't need any capability check. */ @@ -237,7 +241,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags, * Executable DACs are overridable if at least one exec bit is set. */ if (!(mask & MAY_EXEC) || execute_ok(inode)) - if (capable(CAP_DAC_OVERRIDE)) + if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) return 0; /* @@ -245,7 +249,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags, */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) - if (capable(CAP_DAC_READ_SEARCH)) + if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) return 0; return -EACCES; @@ -654,6 +658,7 @@ static inline int handle_reval_path(struct nameidata *nd) static inline int exec_permission(struct inode *inode, unsigned int flags) { int ret; + struct user_namespace *ns = inode_userns(inode); if (inode->i_op->permission) { ret = inode->i_op->permission(inode, MAY_EXEC, flags); @@ -666,7 +671,8 @@ static inline int exec_permission(struct inode *inode, unsigned int flags) if (ret == -ECHILD) return ret; - if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) + if (ns_capable(ns, CAP_DAC_OVERRIDE) || + ns_capable(ns, CAP_DAC_READ_SEARCH)) goto ok; return ret; @@ -1842,11 +1848,15 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) if (!(dir->i_mode & S_ISVTX)) return 0; + if (current_user_ns() != inode_userns(inode)) + goto other_userns; if (inode->i_uid == fsuid) return 0; if (dir->i_uid == fsuid) return 0; - return !capable(CAP_FOWNER); + +other_userns: + return !ns_capable(inode_userns(inode), CAP_FOWNER); } /* @@ -2440,7 +2450,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && + !ns_capable(inode_userns(dir), CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) diff --git a/include/linux/fs.h b/include/linux/fs.h index 12529e96635..9eebc646d14 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1457,8 +1457,13 @@ enum { #define put_fs_excl() atomic_dec(¤t->fs_excl) #define has_fs_excl() atomic_read(¤t->fs_excl) -#define is_owner_or_cap(inode) \ - ((current_fsuid() == (inode)->i_uid) || capable(CAP_FOWNER)) +/* + * until VFS tracks user namespaces for inodes, just make all files + * belong to init_user_ns + */ +extern struct user_namespace init_user_ns; +#define inode_userns(inode) (&init_user_ns) +extern bool is_owner_or_cap(const struct inode *inode); /* not quite ready to be deprecated, but... */ extern void lock_super(struct super_block *); -- cgit v1.2.3-70-g09d2 From 2e1496707560ecf98e9b0604622c0990f94861d3 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:26 -0700 Subject: userns: rename is_owner_or_cap to inode_owner_or_capable And give it a kernel-doc comment. [akpm@linux-foundation.org: btrfs changed in linux-next] Signed-off-by: Serge E. Hallyn Cc: "Eric W. Biederman" Cc: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/acl.c | 2 +- fs/attr.c | 4 ++-- fs/btrfs/acl.c | 2 +- fs/btrfs/ioctl.c | 4 ++-- fs/ext2/acl.c | 2 +- fs/ext2/ioctl.c | 6 +++--- fs/ext3/acl.c | 2 +- fs/ext3/ioctl.c | 6 +++--- fs/ext4/acl.c | 2 +- fs/ext4/ioctl.c | 8 ++++---- fs/fcntl.c | 2 +- fs/generic_acl.c | 2 +- fs/gfs2/file.c | 2 +- fs/hfsplus/ioctl.c | 2 +- fs/inode.c | 13 ++++++++----- fs/jffs2/acl.c | 2 +- fs/jfs/ioctl.c | 2 +- fs/jfs/xattr.c | 2 +- fs/logfs/file.c | 2 +- fs/namei.c | 2 +- fs/nilfs2/ioctl.c | 2 +- fs/ocfs2/acl.c | 2 +- fs/ocfs2/ioctl.c | 2 +- fs/reiserfs/ioctl.c | 4 ++-- fs/reiserfs/xattr_acl.c | 2 +- fs/ubifs/ioctl.c | 2 +- fs/utimes.c | 2 +- fs/xattr.c | 2 +- include/linux/fs.h | 2 +- security/selinux/hooks.c | 2 +- 30 files changed, 47 insertions(+), 44 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 33aa116732c..535ab6eccb1 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -323,7 +323,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; if (value) { /* update the cached acl value */ diff --git a/fs/attr.c b/fs/attr.c index 7ca41811afa..1007ed61631 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -59,7 +59,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : @@ -69,7 +69,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Check for setting the inode time. */ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9c949348510..de34bfad9ec 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -170,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, int ret; struct posix_acl *acl = NULL; - if (!is_owner_or_cap(dentry->d_inode)) + if (!inode_owner_or_capable(dentry->d_inode)) return -EPERM; if (!IS_POSIXACL(dentry->d_inode)) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5fdb2abc4fa..d1bace3df9b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -158,7 +158,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) FS_SYNC_FL | FS_DIRSYNC_FL)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; mutex_lock(&inode->i_mutex); @@ -1077,7 +1077,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, if (flags & ~BTRFS_SUBVOL_RDONLY) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; down_write(&root->fs_info->subvol_sem); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 7b4180554a6..abea5a17c76 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -406,7 +406,7 @@ ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EINVAL; if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - if (!is_owner_or_cap(dentry->d_inode)) + if (!inode_owner_or_capable(dentry->d_inode)) return -EPERM; if (value) { diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index e7431309bdc..f81e250ac5c 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -39,7 +39,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) return ret; - if (!is_owner_or_cap(inode)) { + if (!inode_owner_or_capable(inode)) { ret = -EACCES; goto setflags_out; } @@ -89,7 +89,7 @@ setflags_out: case EXT2_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *) arg); case EXT2_IOC_SETVERSION: - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; ret = mnt_want_write(filp->f_path.mnt); if (ret) @@ -115,7 +115,7 @@ setflags_out: if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index e4fa49e6c53..9d021c0d472 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -435,7 +435,7 @@ ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EINVAL; if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; if (value) { diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index fc080dd561f..f4090bd2f34 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -38,7 +38,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) unsigned int oldflags; unsigned int jflag; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) @@ -123,7 +123,7 @@ flags_out: __u32 generation; int err; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; err = mnt_want_write(filp->f_path.mnt); @@ -192,7 +192,7 @@ setversion_out: if (err) return err; - if (!is_owner_or_cap(inode)) { + if (!inode_owner_or_capable(inode)) { err = -EACCES; goto setrsvsz_out; } diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index e0270d1f8d8..21eacd7b7d7 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -433,7 +433,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EINVAL; if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; if (value) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index eb3bc2fe647..a84faa110bc 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) unsigned int oldflags; unsigned int jflag; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) @@ -146,7 +146,7 @@ flags_out: __u32 generation; int err; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; err = mnt_want_write(filp->f_path.mnt); @@ -298,7 +298,7 @@ mext_out: case EXT4_IOC_MIGRATE: { int err; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; err = mnt_want_write(filp->f_path.mnt); @@ -320,7 +320,7 @@ mext_out: case EXT4_IOC_ALLOC_DA_BLKS: { int err; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; err = mnt_want_write(filp->f_path.mnt); diff --git a/fs/fcntl.c b/fs/fcntl.c index 6c82e5bac03..22764c7c838 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -159,7 +159,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; /* required for strict SunOS emulation */ diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 06c48a89183..8f26d1a5891 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -74,7 +74,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value, return -EINVAL; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; if (value) { acl = posix_acl_from_xattr(value, size); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4074b952b05..b2682e073ee 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -221,7 +221,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) goto out_drop_write; error = -EACCES; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) goto out; error = 0; diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 508ce662ce1..fbaa6690c8e 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -47,7 +47,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) if (err) goto out; - if (!is_owner_or_cap(inode)) { + if (!inode_owner_or_capable(inode)) { err = -EACCES; goto out_drop_write; } diff --git a/fs/inode.c b/fs/inode.c index a21d5a938a1..0b3da4a7770 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1735,11 +1735,14 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, } EXPORT_SYMBOL(inode_init_owner); -/* - * return true if current either has CAP_FOWNER to the - * file, or owns the file. +/** + * inode_owner_or_capable - check current task permissions to inode + * @inode: inode being checked + * + * Return true if current either has CAP_FOWNER to the inode, or + * owns the file. */ -bool is_owner_or_cap(const struct inode *inode) +bool inode_owner_or_capable(const struct inode *inode) { struct user_namespace *ns = inode_userns(inode); @@ -1749,4 +1752,4 @@ bool is_owner_or_cap(const struct inode *inode) return true; return false; } -EXPORT_SYMBOL(is_owner_or_cap); +EXPORT_SYMBOL(inode_owner_or_capable); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 95b79672150..828a0e1ea43 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -402,7 +402,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, if (name[0] != '\0') return -EINVAL; - if (!is_owner_or_cap(dentry->d_inode)) + if (!inode_owner_or_capable(dentry->d_inode)) return -EPERM; if (value) { diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index afe222bf300..6f98a186677 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -72,7 +72,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) return err; - if (!is_owner_or_cap(inode)) { + if (!inode_owner_or_capable(inode)) { err = -EACCES; goto setflags_out; } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 3fa4c32272d..24838f1eeee 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -678,7 +678,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, struct posix_acl *acl; int rc; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; /* diff --git a/fs/logfs/file.c b/fs/logfs/file.c index e86376b87af..c2ad7028def 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -196,7 +196,7 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (IS_RDONLY(inode)) return -EROFS; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; err = get_user(flags, (int __user *)arg); diff --git a/fs/namei.c b/fs/namei.c index dbb45a652ae..fc858b1124c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2036,7 +2036,7 @@ static int may_open(struct path *path, int acc_mode, int flag) } /* O_NOATIME can only be set by the owner or superuser */ - if (flag & O_NOATIME && !is_owner_or_cap(inode)) + if (flag & O_NOATIME && !inode_owner_or_capable(inode)) return -EPERM; /* diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 95c04c2f2b3..f2469ba6246 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -113,7 +113,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, unsigned int flags, oldflags; int ret; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; if (get_user(flags, (int __user *)argp)) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 704f6b1742f..90f2729b7a5 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -497,7 +497,7 @@ static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; if (value) { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7a486819615..09de77ce002 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -82,7 +82,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, } status = -EACCES; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) goto bail_unlock; if (!S_ISDIR(inode->i_mode)) diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 79265fdc317..4e153051bc7 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -59,7 +59,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) break; - if (!is_owner_or_cap(inode)) { + if (!inode_owner_or_capable(inode)) { err = -EPERM; goto setflags_out; } @@ -103,7 +103,7 @@ setflags_out: err = put_user(inode->i_generation, (int __user *)arg); break; case REISERFS_IOC_SETVERSION: - if (!is_owner_or_cap(inode)) { + if (!inode_owner_or_capable(inode)) { err = -EPERM; break; } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 90d2fcb67a3..3dc38f1206f 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -26,7 +26,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, size_t jcreate_blocks; if (!reiserfs_posixacl(inode->i_sb)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; if (value) { diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 8aacd64957a..548acf494af 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -160,7 +160,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (IS_RDONLY(inode)) return -EROFS; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) diff --git a/fs/utimes.c b/fs/utimes.c index 179b5869065..ba653f3dc1b 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -95,7 +95,7 @@ static int utimes_common(struct path *path, struct timespec *times) if (IS_IMMUTABLE(inode)) goto mnt_drop_write_and_out; - if (!is_owner_or_cap(inode)) { + if (!inode_owner_or_capable(inode)) { error = inode_permission(inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; diff --git a/fs/xattr.c b/fs/xattr.c index 01bb8135e14..a19acdb81cd 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -59,7 +59,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return -EPERM; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && - (mask & MAY_WRITE) && !is_owner_or_cap(inode)) + (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) return -EPERM; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eebc646d14..4dda076c24a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1463,7 +1463,7 @@ enum { */ extern struct user_namespace init_user_ns; #define inode_userns(inode) (&init_user_ns) -extern bool is_owner_or_cap(const struct inode *inode); +extern bool inode_owner_or_capable(const struct inode *inode); /* not quite ready to be deprecated, but... */ extern void lock_super(struct super_block *); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c67f863d354..f9c3764e485 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2725,7 +2725,7 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name, if (!(sbsec->flags & SE_SBLABELSUPP)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; COMMON_AUDIT_DATA_INIT(&ad, FS); -- cgit v1.2.3-70-g09d2 From 250df6ed274d767da844a5d9f05720b804240197 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:36 +1100 Subject: fs: protect inode->i_state with inode->i_lock Protect inode state transitions and validity checks with the inode->i_lock. This enables us to make inode state transitions independently of the inode_lock and is the first step to peeling away the inode_lock from the code. This requires that __iget() is done atomically with i_state checks during list traversals so that we don't race with another thread marking the inode I_FREEING between the state check and grabbing the reference. Also remove the unlock_new_inode() memory barrier optimisation required to avoid taking the inode_lock when clearing I_NEW. Simplify the code by simply taking the inode->i_lock around the state change and wakeup. Because the wakeup is no longer tricky, remove the wake_up_inode() function and open code the wakeup where necessary. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/block_dev.c | 2 + fs/buffer.c | 2 +- fs/drop_caches.c | 9 ++- fs/fs-writeback.c | 44 ++++++++++---- fs/inode.c | 150 ++++++++++++++++++++++++++++++++--------------- fs/notify/inode_mark.c | 21 +++++-- fs/quota/dquot.c | 13 ++-- include/linux/fs.h | 2 +- include/linux/quotaops.h | 2 +- mm/filemap.c | 2 + mm/rmap.c | 1 + 11 files changed, 174 insertions(+), 74 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 88928701959..bc39b18cf3d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -56,9 +56,11 @@ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { spin_lock(&inode_lock); + spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); } diff --git a/fs/buffer.c b/fs/buffer.c index 2219a76e2ca..da666f3148f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1144,7 +1144,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and the global inode_lock. + * mapping->tree_lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) { diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 816f88e6b9c..6c6f73ba086 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -18,11 +18,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) - continue; - if (inode->i_mapping->nrpages == 0) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 59c6e495678..efd1ebe879c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -306,10 +306,12 @@ static void inode_wait_for_writeback(struct inode *inode) wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - while (inode->i_state & I_SYNC) { + while (inode->i_state & I_SYNC) { + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); } } @@ -333,6 +335,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) unsigned dirty; int ret; + spin_lock(&inode->i_lock); if (!atomic_read(&inode->i_count)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else @@ -348,6 +351,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of b_io. */ if (wbc->sync_mode != WB_SYNC_ALL) { + spin_unlock(&inode->i_lock); requeue_io(inode); return 0; } @@ -363,6 +367,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); ret = do_writepages(mapping, wbc); @@ -384,8 +389,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * write_inode() */ spin_lock(&inode_lock); + spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { @@ -395,6 +402,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); + spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { @@ -436,6 +444,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } } inode_sync_complete(inode); + spin_unlock(&inode->i_lock); return ret; } @@ -506,7 +515,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * kind does not need peridic writeout yet, and for the latter * kind writeout is handled by the freer. */ + spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); requeue_io(inode); continue; } @@ -515,10 +526,14 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. */ - if (inode_dirtied_after(inode, wbc->wb_start)) + if (inode_dirtied_after(inode, wbc->wb_start)) { + spin_unlock(&inode->i_lock); return 1; + } __iget(inode); + spin_unlock(&inode->i_lock); + pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); if (wbc->pages_skipped != pages_skipped) { @@ -724,7 +739,9 @@ static long wb_writeback(struct bdi_writeback *wb, if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); + spin_lock(&inode->i_lock); inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); } spin_unlock(&inode_lock); } @@ -1017,6 +1034,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) block_dump___mark_inode_dirty(inode); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1028,7 +1046,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * superblock list, based upon its state. */ if (inode->i_state & I_SYNC) - goto out; + goto out_unlock_inode; /* * Only add valid (hashed) inodes to the superblock's @@ -1036,11 +1054,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) - goto out; + goto out_unlock_inode; } if (inode->i_state & I_FREEING) - goto out; + goto out_unlock_inode; + spin_unlock(&inode->i_lock); /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). @@ -1065,7 +1084,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); } + goto out; } +out_unlock_inode: + spin_unlock(&inode->i_lock); out: spin_unlock(&inode_lock); @@ -1111,14 +1133,16 @@ static void wait_sb_inodes(struct super_block *sb) * we still have to wait for that writeout. */ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; + struct address_space *mapping = inode->i_mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); /* * We hold a reference to 'inode' so it couldn't have diff --git a/fs/inode.c b/fs/inode.c index 0b3da4a7770..14b12c4ee02 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -27,6 +27,17 @@ #include #include +/* + * inode locking rules. + * + * inode->i_lock protects: + * inode->i_state, inode->i_hash, __iget() + * + * Lock ordering: + * inode_lock + * inode->i_lock + */ + /* * This is needed for the following functions: * - inode_has_buffers @@ -137,15 +148,6 @@ int proc_nr_inodes(ctl_table *table, int write, } #endif -static void wake_up_inode(struct inode *inode) -{ - /* - * Prevent speculative execution through spin_unlock(&inode_lock); - */ - smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); -} - /** * inode_init_always - perform inode structure intialisation * @sb: superblock inode belongs to @@ -336,7 +338,7 @@ static void init_once(void *foo) } /* - * inode_lock must be held + * inode->i_lock must be held */ void __iget(struct inode *inode) { @@ -413,7 +415,9 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); hlist_add_head(&inode->i_hash, b); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); } EXPORT_SYMBOL(__insert_inode_hash); @@ -438,7 +442,9 @@ static void __remove_inode_hash(struct inode *inode) void remove_inode_hash(struct inode *inode) { spin_lock(&inode_lock); + spin_lock(&inode->i_lock); hlist_del_init(&inode->i_hash); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); } EXPORT_SYMBOL(remove_inode_hash); @@ -495,7 +501,9 @@ static void dispose_list(struct list_head *head) __inode_sb_list_del(inode); spin_unlock(&inode_lock); - wake_up_inode(inode); + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); destroy_inode(inode); } } @@ -518,10 +526,17 @@ void evict_inodes(struct super_block *sb) list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); continue; + } inode->i_state |= I_FREEING; + if (!(inode->i_state & (I_DIRTY | I_SYNC))) + inodes_stat.nr_unused--; + spin_unlock(&inode->i_lock); /* * Move the inode off the IO lists and LRU once I_FREEING is @@ -529,8 +544,6 @@ void evict_inodes(struct super_block *sb) */ list_move(&inode->i_lru, &dispose); list_del_init(&inode->i_wb_list); - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - inodes_stat.nr_unused--; } spin_unlock(&inode_lock); @@ -563,18 +576,26 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_lock(&inode_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); continue; + } if (inode->i_state & I_DIRTY && !kill_dirty) { + spin_unlock(&inode->i_lock); busy = 1; continue; } if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); busy = 1; continue; } inode->i_state |= I_FREEING; + if (!(inode->i_state & (I_DIRTY | I_SYNC))) + inodes_stat.nr_unused--; + spin_unlock(&inode->i_lock); /* * Move the inode off the IO lists and LRU once I_FREEING is @@ -582,8 +603,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) */ list_move(&inode->i_lru, &dispose); list_del_init(&inode->i_wb_list); - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - inodes_stat.nr_unused--; } spin_unlock(&inode_lock); @@ -641,8 +660,10 @@ static void prune_icache(int nr_to_scan) * Referenced or dirty inodes are still in use. Give them * another pass through the LRU as we canot reclaim them now. */ + spin_lock(&inode->i_lock); if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { + spin_unlock(&inode->i_lock); list_del_init(&inode->i_lru); inodes_stat.nr_unused--; continue; @@ -650,12 +671,14 @@ static void prune_icache(int nr_to_scan) /* recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { - list_move(&inode->i_lru, &inode_lru); inode->i_state &= ~I_REFERENCED; + spin_unlock(&inode->i_lock); + list_move(&inode->i_lru, &inode_lru); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); if (remove_inode_buffers(inode)) reap += invalidate_mapping_pages(&inode->i_data, @@ -666,11 +689,15 @@ static void prune_icache(int nr_to_scan) if (inode != list_entry(inode_lru.next, struct inode, i_lru)) continue; /* wrong inode or list_empty */ - if (!can_unuse(inode)) + spin_lock(&inode->i_lock); + if (!can_unuse(inode)) { + spin_unlock(&inode->i_lock); continue; + } } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); /* * Move the inode off the IO lists and LRU once I_FREEING is @@ -737,11 +764,13 @@ repeat: continue; if (!test(inode, data)) continue; + spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } __iget(inode); + spin_unlock(&inode->i_lock); return inode; } return NULL; @@ -763,11 +792,13 @@ repeat: continue; if (inode->i_sb != sb) continue; + spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } __iget(inode); + spin_unlock(&inode->i_lock); return inode; } return NULL; @@ -832,14 +863,23 @@ struct inode *new_inode(struct super_block *sb) inode = alloc_inode(sb); if (inode) { spin_lock(&inode_lock); - __inode_sb_list_add(inode); + spin_lock(&inode->i_lock); inode->i_state = 0; + spin_unlock(&inode->i_lock); + __inode_sb_list_add(inode); spin_unlock(&inode_lock); } return inode; } EXPORT_SYMBOL(new_inode); +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. + */ void unlock_new_inode(struct inode *inode) { #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -859,19 +899,11 @@ void unlock_new_inode(struct inode *inode) } } #endif - /* - * This is special! We do not need the spinlock when clearing I_NEW, - * because we're guaranteed that nobody else tries to do anything about - * the state of the inode when it is locked, as we just created it (so - * there can be no old holders that haven't tested I_NEW). - * However we must emit the memory barrier so that other CPUs reliably - * see the clearing of I_NEW after the other inode initialisation has - * completed. - */ - smp_mb(); + spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; - wake_up_inode(inode); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); @@ -900,9 +932,11 @@ static struct inode *get_new_inode(struct super_block *sb, if (set(inode, data)) goto set_failed; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); __inode_sb_list_add(inode); - inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -947,9 +981,11 @@ static struct inode *get_new_inode_fast(struct super_block *sb, old = find_inode_fast(sb, head, ino); if (!old) { inode->i_ino = ino; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); __inode_sb_list_add(inode); - inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -1034,15 +1070,19 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode_lock); - if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) + spin_lock(&inode->i_lock); + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { __iget(inode); - else + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); /* * Handle the case where s_op->clear_inode is not been * called yet, and somebody is calling igrab * while the inode is getting freed. */ inode = NULL; + } spin_unlock(&inode_lock); return inode; } @@ -1271,7 +1311,6 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - inode->i_state |= I_NEW; while (1) { struct hlist_node *node; struct inode *old = NULL; @@ -1281,16 +1320,23 @@ int insert_inode_locked(struct inode *inode) continue; if (old->i_sb != sb) continue; - if (old->i_state & (I_FREEING|I_WILL_FREE)) + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); continue; + } break; } if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); return 0; } __iget(old); + spin_unlock(&old->i_lock); spin_unlock(&inode_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { @@ -1308,8 +1354,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - inode->i_state |= I_NEW; - while (1) { struct hlist_node *node; struct inode *old = NULL; @@ -1320,16 +1364,23 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, continue; if (!test(old, data)) continue; - if (old->i_state & (I_FREEING|I_WILL_FREE)) + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); continue; + } break; } if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); return 0; } __iget(old); + spin_unlock(&old->i_lock); spin_unlock(&inode_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { @@ -1375,6 +1426,9 @@ static void iput_final(struct inode *inode) const struct super_operations *op = inode->i_sb->s_op; int drop; + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + if (op && op->drop_inode) drop = op->drop_inode(inode); else @@ -1386,21 +1440,23 @@ static void iput_final(struct inode *inode) if (!(inode->i_state & (I_DIRTY|I_SYNC))) { inode_lru_list_add(inode); } + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); return; } - WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); write_inode_now(inode, 1); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; __remove_inode_hash(inode); } - WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); /* * Move the inode off the IO lists and LRU once I_FREEING is @@ -1413,8 +1469,10 @@ static void iput_final(struct inode *inode) spin_unlock(&inode_lock); evict(inode); remove_inode_hash(inode); - wake_up_inode(inode); + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + spin_unlock(&inode->i_lock); destroy_inode(inode); } @@ -1611,9 +1669,8 @@ EXPORT_SYMBOL(inode_wait); * to recheck inode state. * * It doesn't matter if I_NEW is not set initially, a call to - * wake_up_inode() after removing from the hash list will DTRT. - * - * This is called with inode_lock held. + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. */ static void __wait_on_freeing_inode(struct inode *inode) { @@ -1621,6 +1678,7 @@ static void __wait_on_freeing_inode(struct inode *inode) DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); schedule(); finish_wait(wq, &wait.wait); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 4c29fcf557d..4dd53fb4412 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list) * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); continue; + } /* * If i_count is zero, the inode cannot have any watches and @@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list) * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. */ - if (!atomic_read(&inode->i_count)) + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); continue; + } need_iput_tmp = need_iput; need_iput = NULL; @@ -274,13 +279,17 @@ void fsnotify_unmount_inodes(struct list_head *list) __iget(inode); else need_iput_tmp = NULL; + spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. */ if ((&next_i->i_sb_list != list) && - atomic_read(&next_i->i_count) && - !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { - __iget(next_i); - need_iput = next_i; + atomic_read(&next_i->i_count)) { + spin_lock(&next_i->i_lock); + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + spin_unlock(&next_i->i_lock); } /* diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a2a622e079f..a1470fda366 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -902,18 +902,19 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + !atomic_read(&inode->i_writecount) || + !dqinit_needed(inode, type)) { + spin_unlock(&inode->i_lock); continue; + } #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif - if (!atomic_read(&inode->i_writecount)) - continue; - if (!dqinit_needed(inode, type)) - continue; - __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); iput(old_inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4dda076c24a..ed6fdcc1484 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1647,7 +1647,7 @@ struct super_operations { }; /* - * Inode state bits. Protected by inode_lock. + * Inode state bits. Protected by inode->i_lock * * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, * I_DIRTY_DATASYNC and I_DIRTY_PAGES. diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index eb354f6f26b..26f9e3612e0 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -277,7 +277,7 @@ static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) /* * Mark inode fully dirty. Since we are allocating blocks, inode * would become fully dirty soon anyway and it reportedly - * reduces inode_lock contention. + * reduces lock contention. */ mark_inode_dirty(inode); } diff --git a/mm/filemap.c b/mm/filemap.c index f807afda86f..499e9aa9145 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -99,7 +99,9 @@ * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (page_remove_rmap->set_page_dirty) + * ->inode->i_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (zap_pte_range->set_page_dirty) + * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * * (code doesn't rely on that order, so you could switch it around) diff --git a/mm/rmap.c b/mm/rmap.c index 4a8e99a0fb9..7dada045644 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,6 +32,7 @@ * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) * inode_lock (in set_page_dirty's __mark_inode_dirty) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, -- cgit v1.2.3-70-g09d2 From b2b2af8e614b4dcd8aca1369d82ce5ad0461a7b1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:37 +1100 Subject: fs: factor inode disposal We have a couple of places that dispose of inodes. factor the disposal into evict() to isolate this code and make it simpler to peel away the inode_lock from the code. While doing this, change the logic flow in iput_final() to separate the different cases that need to be handled to make the transitions the inode goes through more obvious. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/inode.c | 104 ++++++++++++++++++++++++------------------------------------- 1 file changed, 41 insertions(+), 63 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 14b12c4ee02..f752a959254 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -422,17 +422,6 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) } EXPORT_SYMBOL(__insert_inode_hash); -/** - * __remove_inode_hash - remove an inode from the hash - * @inode: inode to unhash - * - * Remove an inode from the superblock. - */ -static void __remove_inode_hash(struct inode *inode) -{ - hlist_del_init(&inode->i_hash); -} - /** * remove_inode_hash - remove an inode from the hash * @inode: inode to unhash @@ -462,10 +451,31 @@ void end_writeback(struct inode *inode) } EXPORT_SYMBOL(end_writeback); +/* + * Free the inode passed in, removing it from the lists it is still connected + * to. We remove any pages still attached to the inode and wait for any IO that + * is still in progress before finally destroying the inode. + * + * An inode must already be marked I_FREEING so that we avoid the inode being + * moved back onto lists if we race with other code that manipulates the lists + * (e.g. writeback_single_inode). The caller is responsible for setting this. + * + * An inode must already be removed from the LRU list before being evicted from + * the cache. This should occur atomically with setting the I_FREEING state + * flag, so no inodes here should ever be on the LRU when being evicted. + */ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(!list_empty(&inode->i_lru)); + + spin_lock(&inode_lock); + list_del_init(&inode->i_wb_list); + __inode_sb_list_del(inode); + spin_unlock(&inode_lock); + if (op->evict_inode) { op->evict_inode(inode); } else { @@ -477,6 +487,15 @@ static void evict(struct inode *inode) bd_forget(inode); if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); + + remove_inode_hash(inode); + + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); + BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + spin_unlock(&inode->i_lock); + + destroy_inode(inode); } /* @@ -495,16 +514,6 @@ static void dispose_list(struct list_head *head) list_del_init(&inode->i_lru); evict(inode); - - spin_lock(&inode_lock); - __remove_inode_hash(inode); - __inode_sb_list_del(inode); - spin_unlock(&inode_lock); - - spin_lock(&inode->i_lock); - wake_up_bit(&inode->i_state, __I_NEW); - spin_unlock(&inode->i_lock); - destroy_inode(inode); } } @@ -537,13 +546,7 @@ void evict_inodes(struct super_block *sb) if (!(inode->i_state & (I_DIRTY | I_SYNC))) inodes_stat.nr_unused--; spin_unlock(&inode->i_lock); - - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ list_move(&inode->i_lru, &dispose); - list_del_init(&inode->i_wb_list); } spin_unlock(&inode_lock); @@ -596,13 +599,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) if (!(inode->i_state & (I_DIRTY | I_SYNC))) inodes_stat.nr_unused--; spin_unlock(&inode->i_lock); - - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ list_move(&inode->i_lru, &dispose); - list_del_init(&inode->i_wb_list); } spin_unlock(&inode_lock); @@ -699,12 +696,7 @@ static void prune_icache(int nr_to_scan) inode->i_state |= I_FREEING; spin_unlock(&inode->i_lock); - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ list_move(&inode->i_lru, &freeable); - list_del_init(&inode->i_wb_list); inodes_stat.nr_unused--; } if (current_is_kswapd()) @@ -1434,16 +1426,16 @@ static void iput_final(struct inode *inode) else drop = generic_drop_inode(inode); + if (!drop && (sb->s_flags & MS_ACTIVE)) { + inode->i_state |= I_REFERENCED; + if (!(inode->i_state & (I_DIRTY|I_SYNC))) + inode_lru_list_add(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_lock); + return; + } + if (!drop) { - if (sb->s_flags & MS_ACTIVE) { - inode->i_state |= I_REFERENCED; - if (!(inode->i_state & (I_DIRTY|I_SYNC))) { - inode_lru_list_add(inode); - } - spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); - return; - } inode->i_state |= I_WILL_FREE; spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); @@ -1452,28 +1444,14 @@ static void iput_final(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; - __remove_inode_hash(inode); } inode->i_state |= I_FREEING; - spin_unlock(&inode->i_lock); - - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ inode_lru_list_del(inode); - list_del_init(&inode->i_wb_list); - - __inode_sb_list_del(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); + evict(inode); - remove_inode_hash(inode); - spin_lock(&inode->i_lock); - wake_up_bit(&inode->i_state, __I_NEW); - BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); - spin_unlock(&inode->i_lock); - destroy_inode(inode); } /** -- cgit v1.2.3-70-g09d2 From 02afc410f363f98ac4f186341e38dcec13fc0e60 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:38 +1100 Subject: fs: Lock the inode LRU list separately Introduce the inode_lru_lock to protect the inode_lru list. This lock is nested inside the inode->i_lock to allow the inode to be added to the LRU list in iput_final without needing to deal with lock inversions. This keeps iput_final() clean and neat. Further, where marking the inode I_FREEING and removing it from the LRU, move the LRU list manipulation within the inode->i_lock to keep the list manipulation consistent with iput_final. This also means that most of the open coded LRU list removal + unused inode accounting can now use the inode_lru_list_del() wrappers which cleans the code up further. However, this locking change means what the LRU traversal in prune_icache() inverts this lock ordering and needs to use trylock semantics on the inode->i_lock to avoid deadlocking. In these cases, if we fail to lock the inode we move it to the back of the LRU to prevent spinning on it. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/inode.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index f752a959254..b19cb6ee6ca 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -32,10 +32,13 @@ * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget() + * inode_lru_lock protects: + * inode_lru, inode->i_lru * * Lock ordering: * inode_lock * inode->i_lock + * inode_lru_lock */ /* @@ -85,6 +88,7 @@ static unsigned int i_hash_shift __read_mostly; */ static LIST_HEAD(inode_lru); +static DEFINE_SPINLOCK(inode_lru_lock); static struct hlist_head *inode_hashtable __read_mostly; /* @@ -356,18 +360,22 @@ EXPORT_SYMBOL(ihold); static void inode_lru_list_add(struct inode *inode) { + spin_lock(&inode_lru_lock); if (list_empty(&inode->i_lru)) { list_add(&inode->i_lru, &inode_lru); inodes_stat.nr_unused++; } + spin_unlock(&inode_lru_lock); } static void inode_lru_list_del(struct inode *inode) { + spin_lock(&inode_lru_lock); if (!list_empty(&inode->i_lru)) { list_del_init(&inode->i_lru); inodes_stat.nr_unused--; } + spin_unlock(&inode_lru_lock); } static inline void __inode_sb_list_add(struct inode *inode) @@ -543,10 +551,9 @@ void evict_inodes(struct super_block *sb) } inode->i_state |= I_FREEING; - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - inodes_stat.nr_unused--; + inode_lru_list_del(inode); spin_unlock(&inode->i_lock); - list_move(&inode->i_lru, &dispose); + list_add(&inode->i_lru, &dispose); } spin_unlock(&inode_lock); @@ -596,10 +603,9 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) } inode->i_state |= I_FREEING; - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - inodes_stat.nr_unused--; + inode_lru_list_del(inode); spin_unlock(&inode->i_lock); - list_move(&inode->i_lru, &dispose); + list_add(&inode->i_lru, &dispose); } spin_unlock(&inode_lock); @@ -623,7 +629,7 @@ static int can_unuse(struct inode *inode) /* * Scan `goal' inodes on the unused list for freeable ones. They are moved to a - * temporary list and then are freed outside inode_lock by dispose_list(). + * temporary list and then are freed outside inode_lru_lock by dispose_list(). * * Any inodes which are pinned purely because of attached pagecache have their * pagecache removed. If the inode has metadata buffers attached to @@ -645,6 +651,7 @@ static void prune_icache(int nr_to_scan) down_read(&iprune_sem); spin_lock(&inode_lock); + spin_lock(&inode_lru_lock); for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; @@ -653,11 +660,20 @@ static void prune_icache(int nr_to_scan) inode = list_entry(inode_lru.prev, struct inode, i_lru); + /* + * we are inverting the inode_lru_lock/inode->i_lock here, + * so use a trylock. If we fail to get the lock, just move the + * inode to the back of the list so we don't spin on it. + */ + if (!spin_trylock(&inode->i_lock)) { + list_move(&inode->i_lru, &inode_lru); + continue; + } + /* * Referenced or dirty inodes are still in use. Give them * another pass through the LRU as we canot reclaim them now. */ - spin_lock(&inode->i_lock); if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { spin_unlock(&inode->i_lock); @@ -676,17 +692,21 @@ static void prune_icache(int nr_to_scan) if (inode_has_buffers(inode) || inode->i_data.nrpages) { __iget(inode); spin_unlock(&inode->i_lock); + spin_unlock(&inode_lru_lock); spin_unlock(&inode_lock); if (remove_inode_buffers(inode)) reap += invalidate_mapping_pages(&inode->i_data, 0, -1); iput(inode); spin_lock(&inode_lock); + spin_lock(&inode_lru_lock); if (inode != list_entry(inode_lru.next, struct inode, i_lru)) continue; /* wrong inode or list_empty */ - spin_lock(&inode->i_lock); + /* avoid lock inversions with trylock */ + if (!spin_trylock(&inode->i_lock)) + continue; if (!can_unuse(inode)) { spin_unlock(&inode->i_lock); continue; @@ -703,6 +723,7 @@ static void prune_icache(int nr_to_scan) __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); + spin_unlock(&inode_lru_lock); spin_unlock(&inode_lock); dispose_list(&freeable); -- cgit v1.2.3-70-g09d2 From f283c86afe6aa70b733d1ecebad5d9464943b774 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:39 +1100 Subject: fs: remove inode_lock from iput_final and prune_icache Now that inode state changes are protected by the inode->i_lock and the inode LRU manipulations by the inode_lru_lock, we can remove the inode_lock from prune_icache and the initial part of iput_final(). instead of using the inode_lock to protect the inode during iput_final, use the inode->i_lock instead. This protects the inode against new references being taken while we change the inode state to I_FREEING, as well as preventing prune_icache from grabbing the inode while we are manipulating it. Hence we no longer need the inode_lock in iput_final prior to setting I_FREEING on the inode. For prune_icache, we no longer need the inode_lock to protect the LRU list, and the inodes themselves are protected against freeing races by the inode->i_lock. Hence we can lift the inode_lock from prune_icache as well. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/porting | 16 +++++++++++----- Documentation/filesystems/vfs.txt | 2 +- fs/inode.c | 17 +++-------------- fs/logfs/inode.c | 2 +- 5 files changed, 17 insertions(+), 22 deletions(-) (limited to 'fs/inode.c') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2e994efe12c..61b31acb917 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -128,7 +128,7 @@ alloc_inode: destroy_inode: dirty_inode: (must not sleep) write_inode: -drop_inode: !!!inode_lock!!! +drop_inode: !!!inode->i_lock!!! evict_inode: put_super: write write_super: read diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 0c986c9e851..6e29954851a 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -298,11 +298,14 @@ be used instead. It gets called whenever the inode is evicted, whether it has remaining links or not. Caller does *not* evict the pagecache or inode-associated metadata buffers; getting rid of those is responsibility of method, as it had been for ->delete_inode(). - ->drop_inode() returns int now; it's called on final iput() with inode_lock -held and it returns true if filesystems wants the inode to be dropped. As before, -generic_drop_inode() is still the default and it's been updated appropriately. -generic_delete_inode() is also alive and it consists simply of return 1. Note that -all actual eviction work is done by caller after ->drop_inode() returns. + + ->drop_inode() returns int now; it's called on final iput() with +inode->i_lock held and it returns true if filesystems wants the inode to be +dropped. As before, generic_drop_inode() is still the default and it's been +updated appropriately. generic_delete_inode() is also alive and it consists +simply of return 1. Note that all actual eviction work is done by caller after +->drop_inode() returns. + clear_inode() is gone; use end_writeback() instead. As before, it must be called exactly once on each call of ->evict_inode() (as it used to be for each call of ->delete_inode()). Unlike before, if you are using inode-associated @@ -395,6 +398,9 @@ Currently you can only have FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE set, so the i_size should not change when hole punching, even when puching the end of a file off. +-- +[mandatory] + -- [mandatory] ->get_sb() is gone. Switch to use of ->mount(). Typically it's just diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 306f0ae8df0..80815ed654c 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -254,7 +254,7 @@ or bottom half). should be synchronous or not, not all filesystems check this flag. drop_inode: called when the last access to the inode is dropped, - with the inode_lock spinlock held. + with the inode->i_lock spinlock held. This method should be either NULL (normal UNIX filesystem semantics) or "generic_delete_inode" (for filesystems that do not diff --git a/fs/inode.c b/fs/inode.c index b19cb6ee6ca..389f5a24759 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -650,7 +650,6 @@ static void prune_icache(int nr_to_scan) unsigned long reap = 0; down_read(&iprune_sem); - spin_lock(&inode_lock); spin_lock(&inode_lru_lock); for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; @@ -676,8 +675,8 @@ static void prune_icache(int nr_to_scan) */ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { - spin_unlock(&inode->i_lock); list_del_init(&inode->i_lru); + spin_unlock(&inode->i_lock); inodes_stat.nr_unused--; continue; } @@ -685,20 +684,18 @@ static void prune_icache(int nr_to_scan) /* recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; - spin_unlock(&inode->i_lock); list_move(&inode->i_lru, &inode_lru); + spin_unlock(&inode->i_lock); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&inode_lru_lock); - spin_unlock(&inode_lock); if (remove_inode_buffers(inode)) reap += invalidate_mapping_pages(&inode->i_data, 0, -1); iput(inode); - spin_lock(&inode_lock); spin_lock(&inode_lru_lock); if (inode != list_entry(inode_lru.next, @@ -724,7 +721,6 @@ static void prune_icache(int nr_to_scan) else __count_vm_events(PGINODESTEAL, reap); spin_unlock(&inode_lru_lock); - spin_unlock(&inode_lock); dispose_list(&freeable); up_read(&iprune_sem); @@ -1082,7 +1078,6 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { - spin_lock(&inode_lock); spin_lock(&inode->i_lock); if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { __iget(inode); @@ -1096,7 +1091,6 @@ struct inode *igrab(struct inode *inode) */ inode = NULL; } - spin_unlock(&inode_lock); return inode; } EXPORT_SYMBOL(igrab); @@ -1439,7 +1433,6 @@ static void iput_final(struct inode *inode) const struct super_operations *op = inode->i_sb->s_op; int drop; - spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_NEW); if (op && op->drop_inode) @@ -1452,16 +1445,13 @@ static void iput_final(struct inode *inode) if (!(inode->i_state & (I_DIRTY|I_SYNC))) inode_lru_list_add(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); return; } if (!drop) { inode->i_state |= I_WILL_FREE; spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); write_inode_now(inode, 1); - spin_lock(&inode_lock); spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; @@ -1470,7 +1460,6 @@ static void iput_final(struct inode *inode) inode->i_state |= I_FREEING; inode_lru_list_del(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); evict(inode); } @@ -1489,7 +1478,7 @@ void iput(struct inode *inode) if (inode) { BUG_ON(inode->i_state & I_CLEAR); - if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) iput_final(inode); } } diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 03b8c240aed..edfea7a3a74 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) return ret; } -/* called with inode_lock held */ +/* called with inode->i_lock held */ static int logfs_drop_inode(struct inode *inode) { struct logfs_super *super = logfs_super(inode->i_sb); -- cgit v1.2.3-70-g09d2 From 55fa6091d83160ca772fc37cebae45d42695a708 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:40 +1100 Subject: fs: move i_sb_list out from under inode_lock Protect the per-sb inode list with a new global lock inode_sb_list_lock and use it to protect the list manipulations and traversals. This lock replaces the inode_lock as the inodes on the list can be validity checked while holding the inode->i_lock and hence the inode_lock is no longer needed to protect the list. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/drop_caches.c | 9 +++++---- fs/fs-writeback.c | 21 +++++++++++---------- fs/inode.c | 43 +++++++++++++++++++++++-------------------- fs/internal.h | 2 ++ fs/notify/inode_mark.c | 20 ++++++++++---------- fs/quota/dquot.c | 28 ++++++++++++++++------------ 6 files changed, 67 insertions(+), 56 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 6c6f73ba086..98b77c89494 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -8,6 +8,7 @@ #include #include #include +#include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches; @@ -16,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -26,13 +27,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(toput_inode); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index efd1ebe879c..5de56a2182b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1123,7 +1123,7 @@ static void wait_sb_inodes(struct super_block *sb) */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); /* * Data integrity sync. Must wait for all pages under writeback, @@ -1143,14 +1143,15 @@ static void wait_sb_inodes(struct super_block *sb) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); + /* - * We hold a reference to 'inode' so it couldn't have - * been removed from s_inodes list while we dropped the - * inode_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it - * under inode_lock. So we keep the reference and iput - * it later. + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. */ iput(old_inode); old_inode = inode; @@ -1159,9 +1160,9 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(old_inode); } diff --git a/fs/inode.c b/fs/inode.c index 389f5a24759..785b1ab23ff 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -34,10 +34,15 @@ * inode->i_state, inode->i_hash, __iget() * inode_lru_lock protects: * inode_lru, inode->i_lru + * inode_sb_list_lock protects: + * sb->s_inodes, inode->i_sb_list * * Lock ordering: * inode_lock * inode->i_lock + * + * inode_sb_list_lock + * inode->i_lock * inode_lru_lock */ @@ -99,6 +104,8 @@ static struct hlist_head *inode_hashtable __read_mostly; */ DEFINE_SPINLOCK(inode_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); + /* * iprune_sem provides exclusion between the icache shrinking and the * umount path. @@ -378,26 +385,23 @@ static void inode_lru_list_del(struct inode *inode) spin_unlock(&inode_lru_lock); } -static inline void __inode_sb_list_add(struct inode *inode) -{ - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); -} - /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode_lock); - __inode_sb_list_add(inode); - spin_unlock(&inode_lock); + spin_lock(&inode_sb_list_lock); + list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); + spin_unlock(&inode_sb_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); -static inline void __inode_sb_list_del(struct inode *inode) +static inline void inode_sb_list_del(struct inode *inode) { + spin_lock(&inode_sb_list_lock); list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -481,9 +485,10 @@ static void evict(struct inode *inode) spin_lock(&inode_lock); list_del_init(&inode->i_wb_list); - __inode_sb_list_del(inode); spin_unlock(&inode_lock); + inode_sb_list_del(inode); + if (op->evict_inode) { op->evict_inode(inode); } else { @@ -539,7 +544,7 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; @@ -555,7 +560,7 @@ void evict_inodes(struct super_block *sb) spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); dispose_list(&dispose); @@ -584,7 +589,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { @@ -607,7 +612,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); dispose_list(&dispose); @@ -867,16 +872,14 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - spin_lock_prefetch(&inode_lock); + spin_lock_prefetch(&inode_sb_list_lock); inode = alloc_inode(sb); if (inode) { - spin_lock(&inode_lock); spin_lock(&inode->i_lock); inode->i_state = 0; spin_unlock(&inode->i_lock); - __inode_sb_list_add(inode); - spin_unlock(&inode_lock); + inode_sb_list_add(inode); } return inode; } @@ -945,7 +948,7 @@ static struct inode *get_new_inode(struct super_block *sb, inode->i_state = I_NEW; hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); - __inode_sb_list_add(inode); + inode_sb_list_add(inode); spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -994,7 +997,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, inode->i_state = I_NEW; hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); - __inode_sb_list_add(inode); + inode_sb_list_add(inode); spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the diff --git a/fs/internal.h b/fs/internal.h index 8318059b42c..7013ae0c88c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,6 +125,8 @@ extern long do_handle_open(int mountdirfd, /* * inode.c */ +extern spinlock_t inode_sb_list_lock; + extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 4dd53fb4412..fb3b3c5ef0e 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -29,6 +29,8 @@ #include #include "fsnotify.h" +#include "../internal.h" + /* * Recalculate the mask of events relevant to a given inode locked. */ @@ -237,15 +239,14 @@ out: * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @list: list of inodes being unmounted (sb->s_inodes) * - * Called with inode_lock held, protecting the unmounting super block's list - * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. - * We temporarily drop inode_lock, however, and CAN block. + * Called during unmount with no locks held, so needs to be safe against + * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. */ void fsnotify_unmount_inodes(struct list_head *list) { struct inode *inode, *next_i, *need_iput = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry_safe(inode, next_i, list, i_sb_list) { struct inode *need_iput_tmp; @@ -293,12 +294,11 @@ void fsnotify_unmount_inodes(struct list_head *list) } /* - * We can safely drop inode_lock here because we hold + * We can safely drop inode_sb_list_lock here because we hold * references on both inode and next_i. Also no new inodes - * will be added since the umount has begun. Finally, - * iprune_mutex keeps shrink_icache_memory() away. + * will be added since the umount has begun. */ - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); if (need_iput_tmp) iput(need_iput_tmp); @@ -310,7 +310,7 @@ void fsnotify_unmount_inodes(struct list_head *list) iput(inode); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a1470fda366..fcc8ae75d87 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -76,7 +76,7 @@ #include #include #include -#include /* for inode_lock, oddly enough.. */ +#include "../internal.h" /* ugh */ #include @@ -900,7 +900,7 @@ static void add_dquot_ref(struct super_block *sb, int type) int reserved = 0; #endif - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -915,19 +915,23 @@ static void add_dquot_ref(struct super_block *sb, int type) #endif __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(old_inode); __dquot_initialize(inode, type); - /* We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the inode_lock. - * We cannot iput the inode now as we can be holding the last - * reference and we cannot iput it under inode_lock. So we - * keep the reference and iput it later. */ + + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock We cannot iput the inode now as we can be + * holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ old_inode = inode; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(old_inode); #ifdef CONFIG_QUOTA_DEBUG @@ -1008,7 +1012,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, struct inode *inode; int reserved = 0; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already @@ -1022,7 +1026,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, remove_inode_dquot_ref(inode, type, tofree_head); } } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" -- cgit v1.2.3-70-g09d2 From a66979abad090b2765a6c6790c9fdeab996833f2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:41 +1100 Subject: fs: move i_wb_list out from under inode_lock Protect the inode writeback list with a new global lock inode_wb_list_lock and use it to protect the list manipulations and traversals. This lock replaces the inode_lock as the inodes on the list can be validity checked while holding the inode->i_lock and hence the inode_lock is no longer needed to protect the list. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/block_dev.c | 4 +-- fs/fs-writeback.c | 76 +++++++++++++++++++++++++++-------------------- fs/inode.c | 12 +++++--- fs/internal.h | 5 ++++ include/linux/writeback.h | 1 + mm/backing-dev.c | 8 ++--- mm/filemap.c | 8 ++--- mm/rmap.c | 4 +-- 8 files changed, 70 insertions(+), 48 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index bc39b18cf3d..2bbc0e62102 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -55,13 +55,13 @@ EXPORT_SYMBOL(I_BDEV); static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } static sector_t max_block(struct block_device *bdev) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5de56a2182b..ed800656356 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -175,6 +175,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) spin_unlock_bh(&bdi->wb_lock); } +/* + * Remove the inode from the writeback list it is on. + */ +void inode_wb_list_del(struct inode *inode) +{ + spin_lock(&inode_wb_list_lock); + list_del_init(&inode->i_wb_list); + spin_unlock(&inode_wb_list_lock); +} + + /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { /* - * Prevent speculative execution through spin_unlock(&inode_lock); + * Prevent speculative execution through + * spin_unlock(&inode_wb_list_lock); */ + smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } @@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { + assert_spin_locked(&inode_wb_list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } @@ -308,25 +324,23 @@ static void inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_lock. Either the - * caller has ref on the inode (either via __iget or via syscall against an fd) - * or the inode has I_WILL_FREE set (via generic_forget_inode) + * Write out an inode's dirty pages. Called under inode_wb_list_lock. Either + * the caller has an active reference on the inode or the inode has I_WILL_FREE + * set. * * If `wait' is set, wait on the writeout. * * The whole writeout design is quite complex and fragile. We want to avoid * starvation of particular inodes when others are being redirtied, prevent * livelocks, etc. - * - * Called under inode_lock. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -368,7 +382,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); ret = do_writepages(mapping, wbc); @@ -388,12 +402,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * due to delalloc, clear dirty metadata flags right before * write_inode() */ - spin_lock(&inode_lock); spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -401,7 +413,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { @@ -543,10 +555,10 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, */ redirty_tail(inode); } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); iput(inode); cond_resched(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; return 1; @@ -565,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -583,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); /* Leave any unwritten inodes on b_io */ } @@ -592,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } /* @@ -735,7 +747,7 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); @@ -743,7 +755,7 @@ static long wb_writeback(struct bdi_writeback *wb, inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } return wrote; @@ -1009,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; - bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -1033,7 +1044,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); - spin_lock(&inode_lock); spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1059,12 +1069,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (inode->i_state & I_FREEING) goto out_unlock_inode; - spin_unlock(&inode->i_lock); /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { + bool wakeup_bdi = false; bdi = inode_to_bdi(inode); if (bdi_cap_writeback_dirty(bdi)) { @@ -1081,18 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags) wakeup_bdi = true; } + spin_unlock(&inode->i_lock); + spin_lock(&inode_wb_list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + spin_unlock(&inode_wb_list_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); + return; } - goto out; } out_unlock_inode: spin_unlock(&inode->i_lock); -out: - spin_unlock(&inode_lock); - if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); @@ -1296,9 +1308,9 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); ret = writeback_single_inode(inode, &wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1320,9 +1332,9 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) { int ret; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); ret = writeback_single_inode(inode, wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); return ret; } EXPORT_SYMBOL(sync_inode); diff --git a/fs/inode.c b/fs/inode.c index 785b1ab23ff..239fdc08719 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include "internal.h" /* * inode locking rules. @@ -36,6 +37,8 @@ * inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list + * inode_wb_list_lock protects: + * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list * * Lock ordering: * inode_lock @@ -44,6 +47,9 @@ * inode_sb_list_lock * inode->i_lock * inode_lru_lock + * + * inode_wb_list_lock + * inode->i_lock */ /* @@ -105,6 +111,7 @@ static struct hlist_head *inode_hashtable __read_mostly; DEFINE_SPINLOCK(inode_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * iprune_sem provides exclusion between the icache shrinking and the @@ -483,10 +490,7 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - spin_lock(&inode_lock); - list_del_init(&inode->i_wb_list); - spin_unlock(&inode_lock); - + inode_wb_list_del(inode); inode_sb_list_del(inode); if (op->evict_inode) { diff --git a/fs/internal.h b/fs/internal.h index 7013ae0c88c..b29c46e4e32 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -127,6 +127,11 @@ extern long do_handle_open(int mountdirfd, */ extern spinlock_t inode_sb_list_lock; +/* + * fs-writeback.c + */ +extern void inode_wb_list_del(struct inode *inode); + extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0ead399e08b..3f5fee71832 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -10,6 +10,7 @@ struct backing_dev_info; extern spinlock_t inode_lock; +extern spinlock_t inode_wb_list_lock; /* * fs/fs-writeback.c diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 027100d3022..4b3e9f17ee2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -73,14 +73,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct inode *inode; nr_wb = nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); @@ -682,11 +682,11 @@ void bdi_destroy(struct backing_dev_info *bdi) if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } bdi_unregister(bdi); diff --git a/mm/filemap.c b/mm/filemap.c index 499e9aa9145..d8b34d1a107 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -80,8 +80,8 @@ * ->i_mutex * ->i_alloc_sem (various) * - * ->inode_lock - * ->sb_lock (fs/fs-writeback.c) + * inode_wb_list_lock + * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * * ->i_mmap_lock @@ -98,9 +98,9 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (page_remove_rmap->set_page_dirty) + * inode_wb_list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (zap_pte_range->set_page_dirty) + * inode_wb_list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * diff --git a/mm/rmap.c b/mm/rmap.c index 7dada045644..8da044a1db0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -31,12 +31,12 @@ * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) - * inode_lock (in set_page_dirty's __mark_inode_dirty) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, - * within inode_lock in __sync_single_inode) + * within inode_wb_list_lock in __sync_single_inode) * * (code doesn't rely on that order so it could be switched around) * ->tasklist_lock -- cgit v1.2.3-70-g09d2 From 67a23c494621ff1d5431c3bc320947865b224625 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:42 +1100 Subject: fs: rename inode_lock to inode_hash_lock All that remains of the inode_lock is protecting the inode hash list manipulation and traversals. Rename the inode_lock to inode_hash_lock to reflect it's actual function. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/inode.c | 111 +++++++++++++++++++++++++--------------------- fs/notify/inode_mark.c | 1 - fs/notify/mark.c | 1 - fs/notify/vfsmount_mark.c | 1 - fs/ntfs/inode.c | 4 +- include/linux/writeback.h | 1 - 6 files changed, 63 insertions(+), 56 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 239fdc08719..f9ee4928358 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -39,10 +39,10 @@ * sb->s_inodes, inode->i_sb_list * inode_wb_list_lock protects: * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * inode_hash_lock protects: + * inode_hashtable, inode->i_hash * * Lock ordering: - * inode_lock - * inode->i_lock * * inode_sb_list_lock * inode->i_lock @@ -50,6 +50,13 @@ * * inode_wb_list_lock * inode->i_lock + * + * inode_hash_lock + * inode_sb_list_lock + * inode->i_lock + * + * iunique_lock + * inode_hash_lock */ /* @@ -85,6 +92,8 @@ static unsigned int i_hash_mask __read_mostly; static unsigned int i_hash_shift __read_mostly; +static struct hlist_head *inode_hashtable __read_mostly; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* * Each inode can be on two separate lists. One is @@ -100,15 +109,6 @@ static unsigned int i_hash_shift __read_mostly; static LIST_HEAD(inode_lru); static DEFINE_SPINLOCK(inode_lru_lock); -static struct hlist_head *inode_hashtable __read_mostly; - -/* - * A simple spinlock to protect the list manipulations. - * - * NOTE! You also have to own the lock if you change - * the i_state of an inode while it is in use.. - */ -DEFINE_SPINLOCK(inode_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); @@ -433,11 +433,11 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_add_head(&inode->i_hash, b); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__insert_inode_hash); @@ -449,11 +449,11 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void remove_inode_hash(struct inode *inode) { - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_del_init(&inode->i_hash); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(remove_inode_hash); @@ -778,11 +778,15 @@ static struct inode *find_inode(struct super_block *sb, repeat: hlist_for_each_entry(inode, node, head, i_hash) { - if (inode->i_sb != sb) + spin_lock(&inode->i_lock); + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); continue; - if (!test(inode, data)) + } + if (!test(inode, data)) { + spin_unlock(&inode->i_lock); continue; - spin_lock(&inode->i_lock); + } if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; @@ -806,11 +810,15 @@ static struct inode *find_inode_fast(struct super_block *sb, repeat: hlist_for_each_entry(inode, node, head, i_hash) { - if (inode->i_ino != ino) + spin_lock(&inode->i_lock); + if (inode->i_ino != ino) { + spin_unlock(&inode->i_lock); continue; - if (inode->i_sb != sb) + } + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); continue; - spin_lock(&inode->i_lock); + } if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; @@ -924,7 +932,7 @@ void unlock_new_inode(struct inode *inode) EXPORT_SYMBOL(unlock_new_inode); /* - * This is called without the inode lock held.. Be careful. + * This is called without the inode hash lock held.. Be careful. * * We no longer cache the sb_flags in i_flags - see fs.h * -- rmk@arm.uk.linux.org @@ -941,7 +949,7 @@ static struct inode *get_new_inode(struct super_block *sb, if (inode) { struct inode *old; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode(sb, head, test, data); if (!old) { @@ -953,7 +961,7 @@ static struct inode *get_new_inode(struct super_block *sb, hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -966,7 +974,7 @@ static struct inode *get_new_inode(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); inode = old; wait_on_inode(inode); @@ -974,7 +982,7 @@ static struct inode *get_new_inode(struct super_block *sb, return inode; set_failed: - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); return NULL; } @@ -992,7 +1000,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, if (inode) { struct inode *old; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode_fast(sb, head, ino); if (!old) { @@ -1002,7 +1010,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -1015,7 +1023,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); inode = old; wait_on_inode(inode); @@ -1036,10 +1044,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino) struct hlist_node *node; struct inode *inode; + spin_lock(&inode_hash_lock); hlist_for_each_entry(inode, node, b, i_hash) { - if (inode->i_ino == ino && inode->i_sb == sb) + if (inode->i_ino == ino && inode->i_sb == sb) { + spin_unlock(&inode_hash_lock); return 0; + } } + spin_unlock(&inode_hash_lock); return 1; } @@ -1069,7 +1081,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) static unsigned int counter; ino_t res; - spin_lock(&inode_lock); spin_lock(&iunique_lock); do { if (counter <= max_reserved) @@ -1077,7 +1088,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); - spin_unlock(&inode_lock); return res; } @@ -1119,7 +1129,7 @@ EXPORT_SYMBOL(igrab); * * Otherwise NULL is returned. * - * Note, @test is called with the inode_lock held, so can't sleep. + * Note, @test is called with the inode_hash_lock held, so can't sleep. */ static struct inode *ifind(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), @@ -1127,15 +1137,15 @@ static struct inode *ifind(struct super_block *sb, { struct inode *inode; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data); if (inode) { - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); if (likely(wait)) wait_on_inode(inode); return inode; } - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); return NULL; } @@ -1159,14 +1169,14 @@ static struct inode *ifind_fast(struct super_block *sb, { struct inode *inode; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); inode = find_inode_fast(sb, head, ino); if (inode) { - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); wait_on_inode(inode); return inode; } - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); return NULL; } @@ -1189,7 +1199,7 @@ static struct inode *ifind_fast(struct super_block *sb, * * Otherwise NULL is returned. * - * Note, @test is called with the inode_lock held, so can't sleep. + * Note, @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) @@ -1217,7 +1227,7 @@ EXPORT_SYMBOL(ilookup5_nowait); * * Otherwise NULL is returned. * - * Note, @test is called with the inode_lock held, so can't sleep. + * Note, @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) @@ -1268,7 +1278,8 @@ EXPORT_SYMBOL(ilookup); * inode and this is returned locked, hashed, and with the I_NEW flag set. The * file system gets to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_lock held, so can't sleep. + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), @@ -1328,7 +1339,7 @@ int insert_inode_locked(struct inode *inode) while (1) { struct hlist_node *node; struct inode *old = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); hlist_for_each_entry(old, node, head, i_hash) { if (old->i_ino != ino) continue; @@ -1346,12 +1357,12 @@ int insert_inode_locked(struct inode *inode) inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); return 0; } __iget(old); spin_unlock(&old->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); @@ -1372,7 +1383,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct hlist_node *node; struct inode *old = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); hlist_for_each_entry(old, node, head, i_hash) { if (old->i_sb != sb) continue; @@ -1390,12 +1401,12 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); return 0; } __iget(old); spin_unlock(&old->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); @@ -1674,10 +1685,10 @@ static void __wait_on_freeing_inode(struct inode *inode) wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq, &wait.wait); - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); } static __initdata unsigned long ihash_entries; diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index fb3b3c5ef0e..07ea8d3e6ea 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -22,7 +22,6 @@ #include #include #include -#include /* for inode_lock */ #include diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 325185e514b..50c00856f73 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -91,7 +91,6 @@ #include #include #include -#include /* for inode_lock */ #include diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 85eebff6d0d..e86577d6c5c 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -23,7 +23,6 @@ #include #include #include -#include /* for inode_lock */ #include diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index a627ed82c0a..0b56c6b7ec0 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -54,7 +54,7 @@ * * Return 1 if the attributes match and 0 if not. * - * NOTE: This function runs with the inode_lock spin lock held so it is not + * NOTE: This function runs with the inode->i_lock spin lock held so it is not * allowed to sleep. */ int ntfs_test_inode(struct inode *vi, ntfs_attr *na) @@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na) * * Return 0 on success and -errno on error. * - * NOTE: This function runs with the inode_lock spin lock held so it is not + * NOTE: This function runs with the inode->i_lock spin lock held so it is not * allowed to sleep. (Hence the GFP_ATOMIC allocation.) */ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f5fee71832..17e7ccc322a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -9,7 +9,6 @@ struct backing_dev_info; -extern spinlock_t inode_lock; extern spinlock_t inode_wb_list_lock; /* -- cgit v1.2.3-70-g09d2 From 0b2d0724e26a335cd326eb7ad552c109116a8795 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Mar 2011 15:03:28 -0400 Subject: fs: simplify iget & friends Merge get_new_inode/get_new_inode_fast into iget5_locked/iget_locked as those were the only callers. Remove the internal ifind/ifind_fast helpers - ifind_fast only had a single caller, and ifind had two callers wanting it to do different things. Also clean up the comments in this area to focus on information important to a developer trying to use it, instead of overloading them with implementation details. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/inode.c | 262 ++++++++++++++++++++----------------------------------------- 1 file changed, 83 insertions(+), 179 deletions(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index f9ee4928358..05a1f75ae79 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -931,20 +931,42 @@ void unlock_new_inode(struct inode *inode) } EXPORT_SYMBOL(unlock_new_inode); -/* - * This is called without the inode hash lock held.. Be careful. +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if present it is return it with an increased reference count. This is + * a generalized version of iget_locked() for file systems where the inode + * number is not sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). * - * We no longer cache the sb_flags in i_flags - see fs.h - * -- rmk@arm.uk.linux.org + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. */ -static struct inode *get_new_inode(struct super_block *sb, - struct hlist_head *head, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), - void *data) +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) { + struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); + + if (inode) { + wait_on_inode(inode); + return inode; + } + inode = alloc_inode(sb); if (inode) { struct inode *old; @@ -986,16 +1008,34 @@ set_failed: destroy_inode(inode); return NULL; } +EXPORT_SYMBOL(iget5_locked); -/* - * get_new_inode_fast is the fast path version of get_new_inode, see the - * comment at iget_locked for details. +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * Search for the inode specified by @ino in the inode cache and if present + * return it with an increased reference count. This is for file systems + * where the inode number is sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). */ -static struct inode *get_new_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) +struct inode *iget_locked(struct super_block *sb, unsigned long ino) { + struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + if (inode) { + wait_on_inode(inode); + return inode; + } + inode = alloc_inode(sb); if (inode) { struct inode *old; @@ -1030,6 +1070,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, } return inode; } +EXPORT_SYMBOL(iget_locked); /* * search the inode cache for a matching inode number. @@ -1113,100 +1154,32 @@ struct inode *igrab(struct inode *inode) EXPORT_SYMBOL(igrab); /** - * ifind - internal function, you want ilookup5() or iget5(). + * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search - * @head: the head of the list to search + * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test - * @wait: if true wait for the inode to be unlocked, if false do not - * - * ifind() searches for the inode specified by @data in the inode - * cache. This is a generalized version of ifind_fast() for file systems where - * the inode number is not sufficient for unique identification of an inode. * + * Search for the inode specified by @hashval and @data in the inode cache. * If the inode is in the cache, the inode is returned with an incremented * reference count. * - * Otherwise NULL is returned. + * Note: I_NEW is not waited upon so you have to be very careful what you do + * with the returned inode. You probably should be using ilookup5() instead. * - * Note, @test is called with the inode_hash_lock held, so can't sleep. + * Note: @test is called with the inode_hash_lock held, so can't sleep. */ -static struct inode *ifind(struct super_block *sb, - struct hlist_head *head, int (*test)(struct inode *, void *), - void *data, const int wait) +struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) { + struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data); - if (inode) { - spin_unlock(&inode_hash_lock); - if (likely(wait)) - wait_on_inode(inode); - return inode; - } - spin_unlock(&inode_hash_lock); - return NULL; -} - -/** - * ifind_fast - internal function, you want ilookup() or iget(). - * @sb: super block of file system to search - * @head: head of the list to search - * @ino: inode number to search for - * - * ifind_fast() searches for the inode @ino in the inode cache. This is for - * file systems where the inode number is sufficient for unique identification - * of an inode. - * - * If the inode is in the cache, the inode is returned with an incremented - * reference count. - * - * Otherwise NULL is returned. - */ -static struct inode *ifind_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) -{ - struct inode *inode; - - spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino); - if (inode) { - spin_unlock(&inode_hash_lock); - wait_on_inode(inode); - return inode; - } spin_unlock(&inode_hash_lock); - return NULL; -} - -/** - * ilookup5_nowait - search for an inode in the inode cache - * @sb: super block of file system to search - * @hashval: hash value (usually inode number) to search for - * @test: callback used for comparisons between inodes - * @data: opaque data pointer to pass to @test - * - * ilookup5() uses ifind() to search for the inode specified by @hashval and - * @data in the inode cache. This is a generalized version of ilookup() for - * file systems where the inode number is not sufficient for unique - * identification of an inode. - * - * If the inode is in the cache, the inode is returned with an incremented - * reference count. Note, the inode lock is not waited upon so you have to be - * very careful what you do with the returned inode. You probably should be - * using ilookup5() instead. - * - * Otherwise NULL is returned. - * - * Note, @test is called with the inode_hash_lock held, so can't sleep. - */ -struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), void *data) -{ - struct hlist_head *head = inode_hashtable + hash(sb, hashval); - return ifind(sb, head, test, data, 0); + return inode; } EXPORT_SYMBOL(ilookup5_nowait); @@ -1217,24 +1190,24 @@ EXPORT_SYMBOL(ilookup5_nowait); * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * - * ilookup5() uses ifind() to search for the inode specified by @hashval and - * @data in the inode cache. This is a generalized version of ilookup() for - * file systems where the inode number is not sufficient for unique - * identification of an inode. - * - * If the inode is in the cache, the inode lock is waited upon and the inode is + * Search for the inode specified by @hashval and @data in the inode cache, + * and if the inode is in the cache, return the inode with an incremented + * reference count. Waits on I_NEW before returning the inode. * returned with an incremented reference count. * - * Otherwise NULL is returned. + * This is a generalized version of ilookup() for file systems where the + * inode number is not sufficient for unique identification of an inode. * - * Note, @test is called with the inode_hash_lock held, so can't sleep. + * Note: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode = ilookup5_nowait(sb, hashval, test, data); - return ifind(sb, head, test, data, 1); + if (inode) + wait_on_inode(inode); + return inode; } EXPORT_SYMBOL(ilookup5); @@ -1243,92 +1216,23 @@ EXPORT_SYMBOL(ilookup5); * @sb: super block of file system to search * @ino: inode number to search for * - * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. - * This is for file systems where the inode number is sufficient for unique - * identification of an inode. - * - * If the inode is in the cache, the inode is returned with an incremented - * reference count. - * - * Otherwise NULL is returned. + * Search for the inode @ino in the inode cache, and if the inode is in the + * cache, the inode is returned with an incremented reference count. */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); - - return ifind_fast(sb, head, ino); -} -EXPORT_SYMBOL(ilookup); - -/** - * iget5_locked - obtain an inode from a mounted file system - * @sb: super block of file system - * @hashval: hash value (usually inode number) to get - * @test: callback used for comparisons between inodes - * @set: callback used to initialize a new struct inode - * @data: opaque data pointer to pass to @test and @set - * - * iget5_locked() uses ifind() to search for the inode specified by @hashval - * and @data in the inode cache and if present it is returned with an increased - * reference count. This is a generalized version of iget_locked() for file - * systems where the inode number is not sufficient for unique identification - * of an inode. - * - * If the inode is not in cache, get_new_inode() is called to allocate a new - * inode and this is returned locked, hashed, and with the I_NEW flag set. The - * file system gets to fill it in before unlocking it via unlock_new_inode(). - * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. - */ -struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *data) -{ - struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; - inode = ifind(sb, head, test, data, 1); - if (inode) - return inode; - /* - * get_new_inode() will do the right thing, re-trying the search - * in case it had to block at any point. - */ - return get_new_inode(sb, head, test, set, data); -} -EXPORT_SYMBOL(iget5_locked); - -/** - * iget_locked - obtain an inode from a mounted file system - * @sb: super block of file system - * @ino: inode number to get - * - * iget_locked() uses ifind_fast() to search for the inode specified by @ino in - * the inode cache and if present it is returned with an increased reference - * count. This is for file systems where the inode number is sufficient for - * unique identification of an inode. - * - * If the inode is not in cache, get_new_inode_fast() is called to allocate a - * new inode and this is returned locked, hashed, and with the I_NEW flag set. - * The file system gets to fill it in before unlocking it via - * unlock_new_inode(). - */ -struct inode *iget_locked(struct super_block *sb, unsigned long ino) -{ - struct hlist_head *head = inode_hashtable + hash(sb, ino); - struct inode *inode; + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); - inode = ifind_fast(sb, head, ino); if (inode) - return inode; - /* - * get_new_inode_fast() will do the right thing, re-trying the search - * in case it had to block at any point. - */ - return get_new_inode_fast(sb, head, ino); + wait_on_inode(inode); + return inode; } -EXPORT_SYMBOL(iget_locked); +EXPORT_SYMBOL(ilookup); int insert_inode_locked(struct inode *inode) { -- cgit v1.2.3-70-g09d2 From b6d0ad686da95fa85ce0c583ec35017bf1583563 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 26 Mar 2011 13:27:47 -0700 Subject: fs: fix inode.c kernel-doc warning Fix inode.c kernel-doc fatal error: 2 comment sections have the same name: Error(fs/inode.c:1171): duplicate section name 'Note' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/inode.c') diff --git a/fs/inode.c b/fs/inode.c index 05a1f75ae79..5f4e11aaeb5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1167,7 +1167,7 @@ EXPORT_SYMBOL(igrab); * Note: I_NEW is not waited upon so you have to be very careful what you do * with the returned inode. You probably should be using ilookup5() instead. * - * Note: @test is called with the inode_hash_lock held, so can't sleep. + * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) -- cgit v1.2.3-70-g09d2